# Steam Australian Users Data 

Files expected (relative to this notebook):
- `data/australian_user_reviews.json.gz`
- `data/australian_users_items.json.gz`


In [1]:
import gzip
import ast
from pathlib import Path
import pandas as pd

# Folder that holds the two .json.gz input files. This is a relative path, so
# it is resolved against the kernel's current working directory (adjust if
# your folder structure is different).
DATA_DIR = Path('data')

def load_python_dicts_gz(path: Path, max_rows=None, verbose=True) -> pd.DataFrame:
    """Load a gzip-compressed text file holding one Python dict literal per line.

    Many McAuley Steam datasets are stored as one Python dict per line, using
    single quotes instead of strict JSON. This helper parses each line safely
    with ``ast.literal_eval`` (no arbitrary code is executed), then flattens
    the parsed records into a pandas DataFrame.

    Parameters
    ----------
    path : Path
        Path to the .json.gz file.
    max_rows : int or None
        Maximum number of records to return (useful for quick exploration).
        Blank lines are skipped and do not count towards the limit. A value
        of 0 or less yields an empty DataFrame. Set to None to load the full
        file.
    verbose : bool
        If True, print progress every 100k physical lines read.

    Returns
    -------
    pd.DataFrame
        One row per parsed record, as built by ``pd.json_normalize``.

    Raises
    ------
    OSError
        If the file cannot be opened or is not valid gzip data.
    ValueError, SyntaxError
        If a non-blank line is not a valid Python literal.
    """
    rows = []
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        # The limit below is only tested after a record has been appended, so
        # a non-positive cap has to be handled up front; otherwise one record
        # would slip through. The file is still opened first so that a bad
        # path fails the same way regardless of max_rows.
        if max_rows is not None and max_rows <= 0:
            return pd.json_normalize(rows)

        for i, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines carry no record; skip them.
                continue
            rows.append(ast.literal_eval(line))
            if max_rows is not None and len(rows) >= max_rows:
                break
            # i counts physical lines (blank ones included), not records.
            if verbose and i % 100_000 == 0:
                print(f"Read {i} lines from {path.name}...")

    df = pd.json_normalize(rows)
    return df


## australian_user_reviews.json.gz

Load the reviews file (capped at 100,000 rows; the output below shows the cap was not reached) and inspect its columns.

In [2]:
# Read the reviews file with a 100,000-record cap, report its dimensions and
# column names, then preview the first rows as the cell's output.
reviews_path = DATA_DIR.joinpath('australian_user_reviews.json.gz')
reviews = load_python_dicts_gz(reviews_path, max_rows=100_000)
print('reviews shape:', reviews.shape)
print('reviews columns:')
print(reviews.columns.tolist())
reviews.head()

reviews shape: (25799, 3)
reviews columns:
['user_id', 'user_url', 'reviews']


Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


## australian_users_items.json.gz

Load a sample of the items file and inspect its columns.

In [3]:
# Read only the first 100 records of the items file, report its dimensions
# and column names, then preview the first rows as the cell's output.
items_path = DATA_DIR.joinpath('australian_users_items.json.gz')
items = load_python_dicts_gz(items_path, max_rows=100)
print('items shape:', items.shape)
print('items columns:')
print(items.columns.tolist())
items.head()

items shape: (100, 5)
items columns:
['user_id', 'items_count', 'steam_id', 'user_url', 'items']


Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


### Notes
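- For quick exploration, the cells above cap the number of rows read: `max_rows=100_000` for the reviews file and `max_rows=100` for the items file.
- To load the **full** dataset for either file, pass `max_rows=None` (the helper's default) instead of the cap.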
- Be aware that loading the full files can take a while and use more memory.