In [5]:
# Utility: Profile JSON schema for a list of Steam API responses
from collections import defaultdict

def profile_json_schema(json_list, max_depth=2, prefix=""):
    field_types = defaultdict(set)
    field_counts = defaultdict(int)
    for obj in json_list:
        if not isinstance(obj, dict):
            continue
        for k, v in obj.items():
            field_types[prefix + k].add(type(v).__name__)
            field_counts[prefix + k] += 1
            if isinstance(v, dict) and max_depth > 1:
                sub_types, sub_counts = profile_json_schema([v], max_depth-1, prefix=prefix + k + ".")
                for subk, types in sub_types.items():
                    field_types[subk].update(types)
                for subk, count in sub_counts.items():
                    field_counts[subk] += count
            elif isinstance(v, list) and max_depth > 1 and v and isinstance(v[0], dict):
                sub_types, sub_counts = profile_json_schema(v, max_depth-1, prefix=prefix + k + "[]")
                for subk, types in sub_types.items():
                    field_types[subk].update(types)
                for subk, count in sub_counts.items():
                    field_counts[subk] += count
    return field_types, field_counts

In [None]:
# Utility: Profile JSON schema for a list of Steam API responses
from collections import defaultdict

def profile_json_schema(json_list, max_depth=2, prefix=""):
    field_types = defaultdict(set)
    field_counts = defaultdict(int)
    for obj in json_list:
        if not isinstance(obj, dict):
            continue
        for k, v in obj.items():
            field_types[prefix + k].add(type(v).__name__)
            field_counts[prefix + k] += 1
            if isinstance(v, dict) and max_depth > 1:
                sub_types, sub_counts = profile_json_schema([v], max_depth-1, prefix=prefix + k + ".")
                for subk, types in sub_types.items():
                    field_types[subk].update(types)
                for subk, count in sub_counts.items():
                    field_counts[subk] += count
            elif isinstance(v, list) and max_depth > 1 and v and isinstance(v[0], dict):
                sub_types, sub_counts = profile_json_schema(v, max_depth-1, prefix=prefix + k + "[]")
                for subk, types in sub_types.items():
                    field_types[subk].update(types)
                for subk, count in sub_counts.items():
                    field_counts[subk] += count
    return field_types, field_counts

# Example usage:
# field_types, field_counts = profile_json_schema(list_of_jsons)
# for field, types in field_types.items():
#     print(f"{field}: types={types}, present_in={field_counts[field]} objects")


# ActualGameSearch: Acquire, Explore, ETL, and Search

This notebook demonstrates the acquisition of product metadata and reviews, exploratory data analysis, ETL pipeline design, and basic search functionality for Actual Game Search V2.

**Outline:**
1. Import Required Libraries
2. Acquire Product Metadata and Reviews
3. Explore and Interact with the Data
4. Design ETL Pipeline Functions
5. Implement Search Functionality


In [3]:
# Ensure local pipeline package is importable (works in Codespaces and local)
import sys, os
nb_dir = os.getcwd()
pipeline_dir = os.path.abspath(os.path.join(nb_dir, ".."))  # .../pipeline
src_dir = os.path.join(pipeline_dir, "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)
print("Added to sys.path:", src_dir)

Added to sys.path: d:\Projects\ActualGameSearch_V2\pipeline\src


In [1]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np
import requests
import json
import os
from typing import List, Dict, Any


In [7]:
# Sample and profile recent Steam reviews for one sampled app
# Pick the first valid appid from the sample details
if not sample_details_df.empty:
    app_id = int(sample_details_df.iloc[0]["steam_appid"])
    print("Profiling reviews for app:", app_id)
    reviews_df = steam.get_reviews(app_id, count=100)
    print("Fetched", len(reviews_df), "reviews")
    review_jsons = reviews_df.to_dict(orient="records")
    r_types, r_counts = profile_json_schema(review_jsons, max_depth=2)
    print("Field presence and types in sampled reviews:")
    for field, types in sorted(r_types.items()):
        print(f"{field}: types={types}, present_in={r_counts[field]} objects")
else:
    print("No app details available to sample reviews.")

Profiling reviews for app: 2688110
Fetched 11 reviews
Field presence and types in sampled reviews:
app_id: types={'int'}, present_in=11 objects
author_steamid: types={'str'}, present_in=11 objects
language: types={'str'}, present_in=11 objects
recommendationid: types={'str'}, present_in=11 objects
review: types={'str'}, present_in=11 objects
timestamp_created: types={'int'}, present_in=11 objects
voted_up: types={'bool'}, present_in=11 objects
votes_funny: types={'int'}, present_in=11 objects
votes_up: types={'int'}, present_in=11 objects


## Persist Sampled Data for Reproducible ETL

Save the sampled app details and reviews to `pipeline/notebooks/data/` as both CSV and Feather for easy reuse.

In [None]:
# Save sampled app details and reviews to disk
import os
data_dir = os.path.abspath(os.path.join(os.getcwd(), "../data"))
os.makedirs(data_dir, exist_ok=True)

apps_path_csv = os.path.join(data_dir, "sampled_apps.csv")
apps_path_feather = os.path.join(data_dir, "sampled_apps.feather")
reviews_path_csv = os.path.join(data_dir, "sampled_reviews.csv")
reviews_path_feather = os.path.join(data_dir, "sampled_reviews.feather")

sample_details_df.to_csv(apps_path_csv, index=False)
sample_details_df.reset_index(drop=True).to_feather(apps_path_feather)
reviews_df.to_csv(reviews_path_csv, index=False)
reviews_df.reset_index(drop=True).to_feather(reviews_path_feather)

print(f"Saved {len(sample_details_df)} apps to {apps_path_csv} and {apps_path_feather}")
print(f"Saved {len(reviews_df)} reviews to {reviews_path_csv} and {reviews_path_feather}")

## Minimal ETL Transform Demo

Flatten key nested fields and show a normalized table preview.

In [None]:
# Example: Flatten genres and categories from app details
def extract_list_field(df, field, id_col="steam_appid"):
    rows = []
    for _, row in df.iterrows():
        appid = row[id_col]
        items = row.get(field, [])
        if isinstance(items, list):
            for item in items:
                if isinstance(item, dict) and "id" in item and "description" in item:
                    rows.append({id_col: appid, f"{field}_id": item["id"], f"{field}_desc": item["description"]})
    return pd.DataFrame(rows)

genres_df = extract_list_field(sample_details_df, "genres")
categories_df = extract_list_field(sample_details_df, "categories")

print("Genres table preview:")
display(genres_df.head())
print("Categories table preview:")
display(categories_df.head())

In [9]:
# Debug: Print current working directory to verify notebook path context
import os
print(os.getcwd())

d:\Projects\ActualGameSearch_V2\pipeline\notebooks


In [None]:
# Add pipeline/src to sys.path for local package imports
import sys
sys.path.append(r'd:\Projects\ActualGameSearch_V2\pipeline\src')

## 2. Acquire Product Metadata and Reviews

In this section, we will fetch or load product metadata and reviews from a data source or API, and load them into pandas DataFrames.

In [11]:
# Acquire a random sample of Steam app details and profile their JSON schema
from ags_pipeline.extract.steam_client import SteamClient

steam = SteamClient()

# Sample 20 apps for schema exploration (adjust n for more/less)
sample_details_df = steam.sample_apps_with_details(n=20)

# Convert each row to dict for schema profiling
sample_jsons = sample_details_df.to_dict(orient="records")

field_types, field_counts = profile_json_schema(sample_jsons, max_depth=2)

print("Field presence and types in sampled Steam app details:")
for field, types in sorted(field_types.items()):
    print(f"{field}: types={types}, present_in={field_counts[field]} objects")

ModuleNotFoundError: No module named 'ags_pipeline'

In [2]:
# Try to load sample metadata and reviews from local files (if available)
metadata_path = '../../AI-Agent-Workspace/Docs/Background/SteamSeeker-2023/Metadata_Census_May31_2023.feather'
reviews_path = '../../AI-Agent-Workspace/Docs/Background/SteamSeeker-2023/Review_Census_May31_2023.feather'

if os.path.exists(metadata_path) and os.path.exists(reviews_path):
    metadata_df = pd.read_feather(metadata_path)
    reviews_df = pd.read_feather(reviews_path)
    print(f"Loaded {len(metadata_df)} products and {len(reviews_df)} reviews from local files.")
else:
    # Placeholder: Fetch from API or download sample data
    print("Sample data files not found. Please provide local files or implement API acquisition.")
    metadata_df = pd.DataFrame()
    reviews_df = pd.DataFrame()


Sample data files not found. Please provide local files or implement API acquisition.


## 3. Explore and Interact with the Data

Let's perform exploratory data analysis: display sample records, check for missing values, and summarize key statistics.

In [3]:
# Display sample records
print("Sample product metadata:")
display(metadata_df.head())

print("Sample reviews:")
display(reviews_df.head())

# Check for missing values
print("\nMissing values in metadata:")
print(metadata_df.isnull().sum())

print("\nMissing values in reviews:")
print(reviews_df.isnull().sum())

# Summarize key statistics
print("\nMetadata summary:")
display(metadata_df.describe(include='all'))

print("\nReviews summary:")
display(reviews_df.describe(include='all'))


Sample product metadata:


Sample reviews:



Missing values in metadata:
Series([], dtype: float64)

Missing values in reviews:
Series([], dtype: float64)

Metadata summary:


ValueError: Cannot describe a DataFrame without columns

## 4. Design ETL Pipeline Functions

Develop Python functions to extract, transform, and load (ETL) product metadata and reviews, including data cleaning and normalization.

In [6]:
# Acquire a random sample of Steam app details and profile their JSON schema
from ags_pipeline.extract.steam_client import SteamClient

steam = SteamClient(delay=0.2)

# Sample 5 apps for schema exploration (smaller for quicker run)
sample_details_df = steam.sample_apps_with_details(n=5)

# Convert each row to dict for schema profiling
sample_jsons = sample_details_df.to_dict(orient="records")

field_types, field_counts = profile_json_schema(sample_jsons, max_depth=2)

print("Field presence and types in sampled Steam app details:")
for field, types in sorted(field_types.items()):
    print(f"{field}: types={types}, present_in={field_counts[field]} objects")

Field presence and types in sampled Steam app details:
about_the_game: types={'str'}, present_in=4 objects
achievements: types={'dict', 'float'}, present_in=4 objects
achievements.highlighted: types={'list'}, present_in=3 objects
achievements.total: types={'int'}, present_in=3 objects
background: types={'str'}, present_in=4 objects
background_raw: types={'str'}, present_in=4 objects
capsule_image: types={'str'}, present_in=4 objects
capsule_imagev5: types={'str'}, present_in=4 objects
categories: types={'list'}, present_in=4 objects
categories[]description: types={'str'}, present_in=21 objects
categories[]id: types={'int'}, present_in=21 objects
content_descriptors: types={'dict'}, present_in=4 objects
content_descriptors.ids: types={'list'}, present_in=4 objects
content_descriptors.notes: types={'NoneType', 'str'}, present_in=4 objects
demos: types={'float', 'list'}, present_in=4 objects
demos[]appid: types={'int'}, present_in=1 objects
demos[]description: types={'str'}, present_in=1 

## 5. Implement Search Functionality

Create functions to search the product metadata and reviews based on user queries, such as keyword or category search.

In [5]:
# Simple keyword search in product metadata
def search_products(metadata: pd.DataFrame, query: str, top_n: int = 10) -> pd.DataFrame:
    mask = metadata['name'].str.contains(query, case=False, na=False) | \
           metadata['short_description'].str.contains(query, case=False, na=False)
    return metadata[mask].head(top_n)

# Simple keyword search in reviews
def search_reviews(reviews: pd.DataFrame, query: str, top_n: int = 10) -> pd.DataFrame:
    mask = reviews['processed_review'].str.contains(query, case=False, na=False)
    return reviews[mask].head(top_n)

# Example usage:
print("\nSample product search for 'farming':")
display(search_products(metadata_clean, 'farming'))

print("\nSample review search for 'cozy':")
display(search_reviews(reviews_clean, 'cozy'))



Sample product search for 'farming':


KeyError: 'name'