## Data collection

### Task: Import modules and working path setup

In [1]:
# %pip install flask flask-cors flask-jwt-extended polars numpy scipy scikit-learn python-dotenv gunicorn pillow

In [2]:
import sys, os, json, gzip, csv, urllib.request, shutil
from pathlib import Path
import pandas as pd
import polars as pl
import pyarrow
import numpy as np

# Make the sibling ``../utilities`` folder importable so that the ``logger``
# and ``configurations`` modules imported right below can be found.
module_path = str(Path("..", "utilities").resolve())
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from logger import Logger
from configurations import Configurations


### Task: Get configuration variables and initializations

In [3]:
# Initialize the logger shared by every helper in this notebook
LOG_FILE = Configurations.LOG_PATH
logger = Logger(process_name="data_collection", log_file=LOG_FILE)

# Define the folder to store the raw data (created if missing)
RAW_DIR = Path(Configurations.DATA_RAW_PATH)
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Define the folder to store the processed data (created if missing)
PROCESSED_DIR = Path(Configurations.DATA_PROCESSED_PATH)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Amazon Reviews dataset, collected in 2023 by McAuley Lab
CATEGORIES = Configurations.CATEGORIES
CORES = Configurations.CORES
SPLITS = Configurations.SPLITS
BASE_URL = Configurations.BASE_URL
# URL template for the per-category item metadata; {category} is filled in below
meta_base_url = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_{category}.jsonl.gz"

# Build meta URL map from categories (category name -> metadata download URL)
meta_urls = {cat: meta_base_url.format(category=cat) for cat in CATEGORIES}

### Task: Define functions

#### Build download URL

In [4]:
def build_url(core: str, category: str, split: str) -> str:
    """Return the download URL of one review split.

    The URL is ``BASE_URL`` followed by the ``core`` folder, the fixed
    ``last_out_w_his`` folder and the file ``<category>.<split>.csv.gz``.
    """
    file_name = f"{category}.{split}.csv.gz"
    folder = f"{BASE_URL}/{core}/last_out_w_his"
    return folder + "/" + file_name

#### Local path for parquet

In [5]:
def local_path_for_parquet(core: str, category: str, split: str, sample: str = None, raw_dir=RAW_DIR) -> Path:
    """Return the local file path of one dataset split.

    Despite the name, the result depends on ``raw_dir``:

    * ``RAW_DIR``       -> ``<category>.<core>.<split>.csv.gz`` (downloaded file)
    * ``PROCESSED_DIR`` -> ``<category>.<core>.<split>[.<sample>].parquet``

    Args:
        core: Core label used in the file name.
        category: Category name; ``/`` is replaced by ``-`` for the file name.
        split: Split label used in the file name.
        sample: Optional sample label, only used for processed files.
        raw_dir: Either ``RAW_DIR`` or ``PROCESSED_DIR``. A plain string path
            is accepted as well and is compared after conversion to ``Path``.

    Raises:
        ValueError: If ``raw_dir`` is neither of the two known directories.
    """
    safe_cat = category.replace("/", "-")

    # Normalise first: a ``str`` never compares equal to a ``Path``, so a
    # string argument used to fall through to the ValueError below.
    try:
        target_dir = Path(raw_dir)
    except TypeError:
        raise ValueError(f"Invalid directory: {raw_dir}")

    if target_dir == RAW_DIR:
        return RAW_DIR / f"{safe_cat}.{core}.{split}.csv.gz"
    if target_dir == PROCESSED_DIR:
        if sample is None:
            return PROCESSED_DIR / f"{safe_cat}.{core}.{split}.parquet"
        return PROCESSED_DIR / f"{safe_cat}.{core}.{split}.{sample}.parquet"
    raise ValueError(f"Invalid directory: {raw_dir}")

#### Download files

In [6]:
def download_file(url: str, out_path: Path, max_retries: int = 3) -> None:
    """Download ``url`` to ``out_path`` with a bounded number of attempts.

    The payload is first written to ``<out_path>.part`` and only moved to
    ``out_path`` after the download finished, so ``out_path`` never holds a
    partial file. A non-empty existing ``out_path`` is treated as already
    downloaded and skipped.

    Args:
        url: Source URL.
        out_path: Final destination file.
        max_retries: Maximum number of download attempts.

    Raises:
        RuntimeError: If every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    if out_path.exists() and out_path.stat().st_size > 0:
        logger.log_info(f"Exists, skip: {out_path.name}")
        return

    tmp = str(out_path) + ".part"
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            logger.log_info(f"Downloading (attempt {attempt}/{max_retries}): {url}")
            urllib.request.urlretrieve(url, tmp)
            os.replace(tmp, out_path)
            logger.log_info(f"Saved: {out_path.name}")
            return
        except Exception as e:
            last_error = e
            logger.log_warning(f"Failed attempt {attempt} for {url}: {e}")
            # Do not leave a truncated ".part" file behind after a failure.
            try:
                os.remove(tmp)
            except OSError:
                pass
    raise RuntimeError(f"Exceeded retries: {url}") from last_error

#### Dataset helpers

In [7]:
import numpy as np, polars as pl

def sample_users_maintain_5core(df: pl.DataFrame, n_users: int, seed: int = 42) -> pl.DataFrame:
    """Return all rows of ``n_users`` randomly chosen users.

    Users are drawn uniformly without replacement. When ``n_users`` is at
    least the number of distinct users, ``df`` is returned unchanged.

    Note: this function only selects users; it does not re-check any
    per-item rating count afterwards (see ``enforce_5core`` for that).

    Args:
        df: Ratings frame with a ``user_id`` column.
        n_users: Number of users to keep.
        seed: Seed of the private random generator used for the draw.
    """
    # ``unique()`` gives no ordering guarantee, so sort the candidates:
    # otherwise the same seed could select different users on each run.
    all_users = df['user_id'].unique().sort().to_list()
    if n_users >= len(all_users):
        return df

    # Private generator: same stream as ``np.random.seed(seed)`` but without
    # resetting the global NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(seed)
    sampled = rng.choice(all_users, size=n_users, replace=False).tolist()
    return df.filter(pl.col('user_id').is_in(sampled))


def enforce_5core(df: pl.DataFrame, min_count: int = 5, max_iterations: int = 10):
    """Iteratively drop items and users with fewer than ``min_count`` ratings.

    Each pass first removes sparse items (``parent_asin``) and then sparse
    users (``user_id``). Passes repeat until a pass removes nothing or
    ``max_iterations`` passes have run, so the result is only guaranteed to
    satisfy the constraint if the loop converged within that budget.

    Args:
        df: Ratings frame with ``user_id`` and ``parent_asin`` columns.
        min_count: Minimum number of ratings per user and per item
            (default 5, i.e. the classic 5-core).
        max_iterations: Upper bound on the number of pruning passes.

    Returns:
        The pruned frame.
    """
    def _keep_frequent(frame: pl.DataFrame, key: str) -> pl.DataFrame:
        # Keep only the rows whose ``key`` value occurs at least min_count times.
        counts = frame.group_by(key).agg(pl.len().alias('n'))
        frequent = counts.filter(pl.col('n') >= min_count)[key].to_list()
        return frame.filter(pl.col(key).is_in(frequent))

    for _ in range(max_iterations):
        n_before = len(df)

        df = _keep_frequent(df, 'parent_asin')  # remove sparse items
        df = _keep_frequent(df, 'user_id')      # remove sparse users

        # Nothing removed in this pass: the constraint holds, stop early.
        if n_before == len(df):
            break

    return df


import polars as pl
import pandas as pd
from pathlib import Path


def save_dataset_to_parquet(csv_gz_path: Path, out_parquet_path: Path):
    """Convert a gzipped CSV split into a parquet file.

    Only the columns listed in ``Configurations.COLUMNS`` are kept, in that
    order. If ``out_parquet_path`` already exists the conversion is skipped.

    The parquet is first written to ``<out_parquet_path>.part`` and then
    moved into place, so an interrupted write cannot leave a truncated file
    that a later run would skip as "already done".

    Args:
        csv_gz_path: Source ``.csv.gz`` file.
        out_parquet_path: Destination parquet file.
    """
    # Check if exists
    if out_parquet_path.exists():
        logger.log_info(f"Skip: {out_parquet_path.name}")
        return

    # Read CSV.GZ; ``usecols`` avoids parsing columns that are dropped anyway,
    # the extra indexing keeps the column order of Configurations.COLUMNS.
    logger.log_info(f"Reading: {csv_gz_path.name}")
    columns = list(Configurations.COLUMNS)
    df = pl.from_pandas(
        pd.read_csv(csv_gz_path, compression='gzip', usecols=columns)[columns]
    )

    logger.log_info(f"  Shape: {df.shape}")
    logger.log_info(f"  Users: {df['user_id'].n_unique():,}")
    logger.log_info(f"  Items: {df['parent_asin'].n_unique():,}")

    # Save to Parquet through a temporary file
    tmp_path = out_parquet_path.with_name(out_parquet_path.name + ".part")
    try:
        df.to_pandas().to_parquet(tmp_path, engine='pyarrow', index=False)
        os.replace(tmp_path, out_parquet_path)
    finally:
        # After a successful replace the temp file is gone; this only fires
        # when writing failed or was interrupted.
        if tmp_path.exists():
            tmp_path.unlink()
    logger.log_info(f"Saved: {out_parquet_path.name}\n")

#### Custom dataset helpers

In [8]:
import polars as pl
from pathlib import Path
import json


# ============================================================================
# FUNCTION 1: Inspect Data
# ============================================================================

def inspect_data(df: pl.DataFrame, category: str):
    """Log summary statistics of a ratings frame and return per-user averages.

    Logged: total ratings, distinct users, distinct items, the mean number of
    ratings and of distinct items per user, and the 50/75/90/95th percentiles
    of both per-user counts.

    Args:
        df: Ratings frame with ``user_id`` and ``parent_asin`` columns.
        category: Label used in the log header only.

    Returns:
        Dict with ``avg_ratings_per_user`` and ``avg_items_per_user``.
    """
    rule = '=' * 70

    logger.log_info("\n" + rule)
    logger.log_info(f"INSPECTING DATA: {category}")
    logger.log_info(rule)

    # Global totals
    user_total = df['user_id'].n_unique()
    item_total = df['parent_asin'].n_unique()
    rating_total = len(df)

    logger.log_info(f"Total ratings: {rating_total:,}")
    logger.log_info(f"Total users:   {user_total:,}")
    logger.log_info(f"Total items:   {item_total:,}")

    # One row per user: number of ratings and number of distinct items
    per_user = df.group_by('user_id').agg([
        pl.len().alias('n_ratings'),
        pl.n_unique('parent_asin').alias('n_items')
    ])
    ratings_per_user = per_user['n_ratings']
    items_per_user = per_user['n_items']

    mean_ratings = ratings_per_user.mean()
    mean_items = items_per_user.mean()

    logger.log_info("\nPer-User Statistics:")
    logger.log_info(f"  Avg ratings per user: {mean_ratings:.2f}")
    logger.log_info(f"  Avg items per user:   {mean_items:.2f}")

    logger.log_info("\nDistribution:")
    for pct in (50, 75, 90, 95):
        fraction = pct / 100
        rating_q = ratings_per_user.quantile(fraction)
        item_q = items_per_user.quantile(fraction)
        logger.log_info(f"  {pct}th percentile: {rating_q:.0f} ratings, {item_q:.0f} items")

    logger.log_info(rule + "\n")

    summary = {}
    summary['avg_ratings_per_user'] = mean_ratings
    summary['avg_items_per_user'] = mean_items
    return summary

In [9]:
import polars as pl
import pandas as pd
from pathlib import Path
import json


def _write_eval_split_filtered_by_train(label: str, input_path: Path, output_path: Path, train_output: Path) -> None:
    """Restrict one evaluation split to the users and items of the filtered train set and save it."""
    logger.log_info(f"\n{label}:")

    # Read train users/items directly from train.filter.parquet so this step
    # does not depend on variables left over from another branch.
    logger.log_info(f"  Loading train users from: {train_output.name}")
    df_train_filtered = pl.read_parquet(train_output)
    train_users = df_train_filtered['user_id'].unique().to_list()
    train_items = df_train_filtered['parent_asin'].unique().to_list()

    logger.log_info(f"  Train users: {len(train_users):,}")
    logger.log_info(f"  Train items: {len(train_items):,}")

    # Read the split
    logger.log_info(f"  Reading: {input_path.name}")
    df = pl.read_parquet(input_path)
    logger.log_info(f"  Shape: {df.shape}")

    # Keep only rows whose user AND item both appear in the filtered train set
    df_filtered = df.filter(
        pl.col('user_id').is_in(train_users) &
        pl.col('parent_asin').is_in(train_items)
    )
    logger.log_info(f"  Filtered: {len(df_filtered):,} ratings, {df_filtered['user_id'].n_unique():,} users")

    # Save
    df_filtered.to_pandas().to_parquet(output_path, engine='pyarrow', index=False)
    logger.log_info(f"  Saved: {output_path.name}")
    logger.log_info(f"  Shape: {df_filtered.shape}")


def create_custom_filtered_datasets(category: str, rating_mult: float = 2.0, item_mult: float = 2.0):
    """
    Create the ``.filter.parquet`` variants of the 5core train/valid/test splits.

    TRAIN keeps only "active" users: users whose number of ratings is greater
    than ``rating_mult`` x the per-user average AND whose number of distinct
    items is greater than ``item_mult`` x the per-user average.
    VALID and TEST keep only rows whose user and item both occur in the
    filtered train set.

    Outputs that already exist are skipped individually.
    Only saves .filter.parquet files (no JSON).

    Args:
        category: Category name; ``/`` is replaced by ``-`` in file names.
        rating_mult: Multiplier applied to the average ratings per user.
        item_mult: Multiplier applied to the average distinct items per user.
    """

    safe_cat = category.replace('/', '-')

    logger.log_info(f"\nProcessing: {category}")
    logger.log_info(f"Multipliers: {rating_mult}x ratings, {item_mult}x items")

    # Paths
    train_input = PROCESSED_DIR / f"{safe_cat}.5core.train.parquet"
    valid_input = PROCESSED_DIR / f"{safe_cat}.5core.valid.parquet"
    test_input = PROCESSED_DIR / f"{safe_cat}.5core.test.parquet"

    train_output = PROCESSED_DIR / f"{safe_cat}.5core.train.filter.parquet"
    valid_output = PROCESSED_DIR / f"{safe_cat}.5core.valid.filter.parquet"
    test_output = PROCESSED_DIR / f"{safe_cat}.5core.test.filter.parquet"

    # Check if all exist
    if train_output.exists() and valid_output.exists() and test_output.exists():
        logger.log_info(f"Skip: All filter files exist")
        return

    # ========================================
    # TRAIN
    # ========================================

    if not train_output.exists():
        logger.log_info(f"\nTRAIN:")

        # Read
        logger.log_info(f"  Reading: {train_input.name}")
        df = pl.read_parquet(train_input)
        logger.log_info(f"  Shape: {df.shape}")

        # Stats: one row per user
        user_stats = df.group_by('user_id').agg([
            pl.len().alias('n_ratings'),
            pl.n_unique('parent_asin').alias('n_items')
        ])

        avg_r = user_stats['n_ratings'].mean()
        avg_i = user_stats['n_items'].mean()

        logger.log_info(f"  Avg ratings: {avg_r:.2f}")
        logger.log_info(f"  Avg items: {avg_i:.2f}")

        # Thresholds
        min_r = avg_r * rating_mult
        min_i = avg_i * item_mult

        logger.log_info(f"  Min ratings: {min_r:.2f}")
        logger.log_info(f"  Min items: {min_i:.2f}")

        # Filter: a user must exceed both thresholds
        active = (user_stats
            .filter(pl.col('n_ratings') > min_r)
            .filter(pl.col('n_items') > min_i)['user_id']
            .to_list()
        )

        logger.log_info(f"  Active users: {len(active):,} of {user_stats.height:,} ({len(active)/user_stats.height*100:.1f}%)")

        df_filtered = df.filter(pl.col('user_id').is_in(active))

        logger.log_info(f"  Filtered: {len(df_filtered):,} ratings, {df_filtered['user_id'].n_unique():,} users, {df_filtered['parent_asin'].n_unique():,} items")

        # Save
        df_filtered.to_pandas().to_parquet(train_output, engine='pyarrow', index=False)
        logger.log_info(f"  Saved: {train_output.name}")
        logger.log_info(f"  Shape: {df_filtered.shape}")

    else:
        logger.log_info(f"\nSkip: {train_output.name} exists")

    # ========================================
    # VALID
    # ========================================

    if not valid_output.exists():
        _write_eval_split_filtered_by_train("VALID", valid_input, valid_output, train_output)
    else:
        logger.log_info(f"\nSkip: {valid_output.name} exists")

    # ========================================
    # TEST
    # ========================================
    # Uses the same helper as VALID. The previous inline copy saved the frame
    # left over from an earlier branch instead of the filtered test rows and
    # relied on ``train_items`` that only the VALID branch defined.

    if not test_output.exists():
        _write_eval_split_filtered_by_train("TEST", test_input, test_output, train_output)
    else:
        logger.log_info(f"\nSkip: {test_output.name} exists")

    logger.log_info(f"\nCompleted: {category}")

In [10]:
def filter_active_users(df: pl.DataFrame, stats: dict, rating_multiplier: float = 2.0, item_multiplier: float = 2.0):
    """
    Keep only the rows of "active" users.

    A user is active when
    ``n_ratings > stats['avg_ratings_per_user'] * rating_multiplier`` AND
    ``n_items   > stats['avg_items_per_user']   * item_multiplier``,
    where ``n_items`` is the number of distinct ``parent_asin`` values.

    Args:
        df: Ratings frame with ``user_id`` and ``parent_asin`` columns.
        stats: Dict with ``avg_ratings_per_user`` and ``avg_items_per_user``.
        rating_multiplier: Factor applied to the average ratings per user.
        item_multiplier: Factor applied to the average items per user.

    Returns:
        Tuple ``(df_filtered, active_user_ids)``.
    """

    def _pct(part: int, whole: int) -> float:
        # Percentage for the log lines; 0.0 instead of ZeroDivisionError when
        # the previous step left no users (e.g. thresholds too strict).
        return part / whole * 100 if whole else 0.0

    logger.log_info(f"{'='*70}")
    logger.log_info(f"FILTERING ACTIVE USERS")
    logger.log_info(f"{'='*70}")

    min_ratings = stats['avg_ratings_per_user'] * rating_multiplier
    min_items = stats['avg_items_per_user'] * item_multiplier

    logger.log_info(f"Thresholds:")
    logger.log_info(f"  Min ratings: {min_ratings:.2f} (> {rating_multiplier}× avg)")
    logger.log_info(f"  Min items:   {min_items:.2f} (> {item_multiplier}× avg)")

    # Calculate per-user stats
    user_stats = df.group_by('user_id').agg([
        pl.len().alias('n_ratings'),
        pl.n_unique('parent_asin').alias('n_items')
    ])

    original_users = user_stats.height

    # Filter by ratings
    users_by_ratings = user_stats.filter(pl.col('n_ratings') > min_ratings)
    logger.log_info(f"\nStep 1: Filter by ratings > {min_ratings:.2f}")
    logger.log_info(f"  {original_users:,} → {users_by_ratings.height:,} users ({_pct(users_by_ratings.height, original_users):.1f}%)")

    # Filter by items
    active_users = users_by_ratings.filter(pl.col('n_items') > min_items)
    active_user_ids = active_users['user_id'].to_list()

    logger.log_info(f"\nStep 2: Filter by items > {min_items:.2f}")
    logger.log_info(f"  {users_by_ratings.height:,} → {len(active_user_ids):,} users ({_pct(len(active_user_ids), users_by_ratings.height):.1f}%)")

    # Apply filter to dataset
    df_filtered = df.filter(pl.col('user_id').is_in(active_user_ids))

    logger.log_info(f"\n{'='*70}")
    logger.log_info(f"RESULT:")
    logger.log_info(f"  Users:   {df_filtered['user_id'].n_unique():,}")
    logger.log_info(f"  Items:   {df_filtered['parent_asin'].n_unique():,}")
    logger.log_info(f"  Ratings: {len(df_filtered):,}")
    logger.log_info(f"  Shape:   {df_filtered.shape}")
    logger.log_info(f"{'='*70}\n")

    return df_filtered, active_user_ids

In [11]:
def _load_custom_train_users(users_output: Path, train_output: Path) -> list:
    """Return the filtered train user ids from the JSON side file, or from the train parquet if that file is missing."""
    if users_output.exists():
        with open(users_output, 'r') as f:
            return json.load(f)['train_users']
    # Fallback: derive the ids from the filtered train parquet itself.
    return pl.read_parquet(train_output)['user_id'].unique().to_list()


def _write_custom_eval_split(step: int, label: str, input_path: Path, output_path: Path, train_users: list) -> None:
    """Filter one evaluation split (valid/test) by the train users and save it as parquet."""
    logger.log_info(f"\n{'='*70}")
    logger.log_info(f"STEP {step}: CREATE {label}.CUSTOM")
    logger.log_info(f"{'='*70}")

    # Read the original split
    logger.log_info(f"Reading: {input_path.name}")
    df_split = pl.read_parquet(input_path)
    logger.log_info(f"  Loaded: {df_split.shape}")
    logger.log_info(f"  Users: {df_split['user_id'].n_unique():,}")

    # Filter by train users
    logger.log_info(f"\nFiltering by train users ({len(train_users):,})...")
    df_split_filtered = df_split.filter(pl.col('user_id').is_in(train_users))

    logger.log_info(f"  Filtered: {df_split_filtered['user_id'].n_unique():,} users")
    logger.log_info(f"  Ratings:  {len(df_split_filtered):,}")

    # Save
    df_split_filtered.to_pandas().to_parquet(output_path, engine='pyarrow', index=False)
    logger.log_info(f"  Saved: {output_path.name}")
    logger.log_info(f"  Shape: {df_split_filtered.shape}\n")


def process_category_custom(category: str, rating_mult: float = 2.0, item_mult: float = 2.0):
    """Create the ``.filter.parquet`` splits of one category via ``inspect_data`` / ``filter_active_users``.

    Step 1 filters the train split to active users and stores the user ids in
    a JSON side file. Steps 2 and 3 keep only those users in the valid and
    test splits. Outputs that already exist are skipped individually.

    Args:
        category: Category name; ``/`` is replaced by ``-`` in file names.
        rating_mult: Multiplier forwarded to ``filter_active_users``.
        item_mult: Multiplier forwarded to ``filter_active_users``.
    """
    safe_cat = category.replace('/', '-')

    logger.log_info(f"\n{'='*70}")
    logger.log_info(f"PROCESSING CATEGORY: {category}")
    logger.log_info(f"{'='*70}")

    # File paths
    train_input = PROCESSED_DIR / f"{safe_cat}.5core.train.parquet"
    valid_input = PROCESSED_DIR / f"{safe_cat}.5core.valid.parquet"
    test_input = PROCESSED_DIR / f"{safe_cat}.5core.test.parquet"

    train_output = PROCESSED_DIR / f"{safe_cat}.5core.train.filter.parquet"
    valid_output = PROCESSED_DIR / f"{safe_cat}.5core.valid.filter.parquet"
    test_output = PROCESSED_DIR / f"{safe_cat}.5core.test.filter.parquet"
    # The user list gets its own file. It used to share the path of
    # ``train_output``, so the JSON dump overwrote the parquet just written.
    users_output = PROCESSED_DIR / f"{safe_cat}.5core.train.filter.users.json"

    # Check if all outputs exist
    if all([train_output.exists(), valid_output.exists(), test_output.exists()]):
        logger.log_info(f"Skip: All custom files exist for {category}\n")
        return

    # ========================================
    # STEP 1: Process TRAIN
    # ========================================

    if not train_output.exists():
        logger.log_info(f"\n{'='*70}")
        logger.log_info(f"STEP 1: CREATE TRAIN.CUSTOM")
        logger.log_info(f"{'='*70}")

        # Read original train.parquet
        logger.log_info(f"Reading: {train_input.name}")
        df_train = pl.read_parquet(train_input)
        logger.log_info(f"  Loaded: {df_train.shape}")

        # Inspect
        stats = inspect_data(df_train, category)

        # Filter
        df_train_filtered, train_users = filter_active_users(
            df_train,
            stats,
            rating_mult,
            item_mult
        )

        # Save
        df_train_filtered.to_pandas().to_parquet(train_output, engine='pyarrow', index=False)
        logger.log_info(f"  Saved: {train_output.name}")
        logger.log_info(f"  Shape: {df_train_filtered.shape}\n")

        # Save users list
        with open(users_output, 'w') as f:
            json.dump({'train_users': train_users}, f)
        logger.log_info(f"  Saved: {users_output.name} ({len(train_users):,} users)\n")

    else:
        logger.log_info(f"\nSkip: {train_output.name} exists")
        # Load users for valid/test filtering
        train_users = _load_custom_train_users(users_output, train_output)

    # ========================================
    # STEP 2: Process VALID
    # ========================================
    # ``train_users`` is always bound at this point (either branch above sets
    # it), so no further reloading is needed before steps 2 and 3.

    if not valid_output.exists():
        _write_custom_eval_split(2, "VALID", valid_input, valid_output, train_users)
    else:
        logger.log_info(f"\nSkip: {valid_output.name} exists")

    # ========================================
    # STEP 3: Process TEST
    # ========================================

    if not test_output.exists():
        _write_custom_eval_split(3, "TEST", test_input, test_output, train_users)
    else:
        logger.log_info(f"\nSkip: {test_output.name} exists")

    logger.log_info(f"{'='*70}")
    logger.log_info(f"  COMPLETED: {category}")
    logger.log_info(f"{'='*70}\n")

In [12]:
def run_custom_filtering(rating_mult=2.0, item_mult=2.0):
    """
    Build the custom filtered datasets of every configured category.

    Calls ``create_custom_filtered_datasets`` once per entry of
    ``Configurations.CATEGORIES``. An exception raised for one category is
    logged and the loop continues with the next one.
    """
    rule = "=" * 70

    logger.log_info(rule)
    logger.log_info("CREATE CUSTOM FILTERED DATASETS")
    logger.log_info(rule + "\n")

    for category_name in Configurations.CATEGORIES:
        try:
            create_custom_filtered_datasets(category_name, rating_mult, item_mult)
        except Exception as exc:
            logger.log_exception(f"Error processing {category_name}: {exc}")

    logger.log_info("\n" + rule)
    logger.log_info("COMPLETED")
    logger.log_info(rule)

#### Filter dataset helpers

In [13]:
def create_n_sample(input_path: Path, n: int, item_mult: float = 1.0, n_name_out: str = None):
    """Create a user-based sample of one processed split.

    The input file name is parsed as ``<category>.<core>.<split>[...]`` and
    the result is written next to it as
    ``<category>.<core>.<split>.<sample name>.parquet``. An existing output
    is skipped.

    * train: keep the ``n`` users with the most ratings; if
      ``item_mult > 1.0`` additionally keep only items whose rating count in
      the sample is greater than ``item_mult`` x the average per item.
    * other splits: keep rows whose user and item both occur in the matching
      train sample, which must already exist.

    Args:
        input_path: Processed parquet file of one split.
        n: Number of users to keep for the train split.
        item_mult: Item popularity multiplier (train only).
        n_name_out: Sample name used in the output file name; defaults to
            ``str(n)`` when omitted.
    """
    # Parse input filename
    stem = input_path.stem  # e.g., "Electronics.5core.train.filter"
    parts = stem.split('.')

    # Extract components
    category = parts[0]
    core = parts[1] if len(parts) > 1 else '5core'
    split = parts[2] if len(parts) > 2 else 'train'

    # Without an explicit name fall back to n; formatting None used to
    # produce a file literally called "...None.parquet".
    sample_name = str(n) if n_name_out is None else n_name_out

    # Create output path
    output_path = input_path.parent / f"{category}.{core}.{split}.{sample_name}.parquet"

    # Check exists
    if output_path.exists():
        logger.log_info(f"Skip: {output_path.name}")
        return

    logger.log_info(f"\nSampling {n} users")
    logger.log_info(f"  Input:  {input_path.name}")
    logger.log_info(f"  Output: {output_path.name}")

    # Read input
    df = pl.read_parquet(input_path)
    total = df['user_id'].n_unique()

    logger.log_info(f"  Total users: {total:,}")

    # Handle train vs valid/test
    if split == 'train':
        # TRAIN: Sample top N active users
        if n >= total:
            df_sampled = df
        else:
            # Secondary sort on user_id makes the cut-off deterministic when
            # several users share the same rating count.
            activity = (
                df.group_by('user_id')
                .agg(pl.len().alias('n'))
                .sort(['n', 'user_id'], descending=[True, False])
            )
            users_n = activity.head(n)['user_id'].to_list()
            df_sampled = df.filter(pl.col('user_id').is_in(users_n))

        # Optional item popularity filter
        if item_mult > 1.0:
            logger.log_info(f"  Before item filter: {df_sampled.shape} shape, {len(df_sampled):,} ratings, {df_sampled['user_id'].n_unique():,}  users, {df_sampled['parent_asin'].n_unique():,} items")

            item_counts = df_sampled.group_by('parent_asin').agg(pl.len().alias('n'))
            avg_item = item_counts['n'].mean()
            min_item = avg_item * item_mult

            logger.log_info(f"  Avg ratings/item: {avg_item:.2f}")
            logger.log_info(f"  Min threshold: {min_item:.2f} ({item_mult}x avg)")

            popular = item_counts.filter(pl.col('n') > min_item)['parent_asin'].to_list()
            df_sampled = df_sampled.filter(pl.col('parent_asin').is_in(popular))

            logger.log_info(f"  After item filter: {df_sampled.shape} shape, {len(df_sampled):,} ratings, {df_sampled['user_id'].n_unique():,}  users, {df_sampled['parent_asin'].n_unique():,} items")

        # Count users on the final frame: the item filter above can remove
        # every rating of a selected user.
        logger.log_info(f"  Sampled: {df_sampled.shape} shape, {len(df_sampled):,} ratings, {df_sampled['user_id'].n_unique():,} users")
    else:
        # VALID/TEST: Filter by train users
        train_path = input_path.parent / f"{category}.{core}.train.{sample_name}.parquet"

        if not train_path.exists():
            logger.log_warning(f"  Train sample not found: {train_path.name}")
            logger.log_warning(f"  Create train sample first!")
            return

        # Get train users and items
        df_train = pl.read_parquet(train_path)
        users_n = df_train['user_id'].unique().to_list()
        items_n = df_train['parent_asin'].unique().to_list()

        # Filter
        df_sampled = df.filter(
            pl.col('user_id').is_in(users_n) &
            pl.col('parent_asin').is_in(items_n)
        )

        logger.log_info(f"  Filtered: {len(df_sampled):,} ratings, {df_sampled['user_id'].n_unique():,} users")

    # Save
    df_sampled.to_pandas().to_parquet(output_path, engine='pyarrow', index=False)
    logger.log_info(f"  Saved: {output_path.name}\n")

#### Meta data helpers

In [14]:
def _meta_path(category: str):
    """Get path to raw metadata file (``<category>.meta.jsonl.gz`` in ``RAW_DIR``).

    ``/`` in the category is replaced by ``-``, as the other path builders in
    this notebook do, so the name cannot point into a sub-directory.
    """
    safe_cat = category.replace('/', '-')
    return RAW_DIR / f"{safe_cat}.meta.jsonl.gz"

def download_meta(category: str, url: str = None):
    """Download raw metadata file of one category.

    The URL defaults to the entry of ``meta_urls`` for ``category``. A
    non-empty existing file is skipped. Failures are logged, not raised.

    The payload is written to ``<file>.part`` and renamed only after the
    download completed; otherwise an aborted download would leave a partial
    file that passes the "non-empty" skip check on the next run.

    Args:
        category: Category name.
        url: Optional explicit download URL.
    """
    url = url or meta_urls.get(category)
    if not url:
        logger.log_warning(f"[META] No URL for {category}")
        return

    dst = _meta_path(category)
    if dst.exists() and dst.stat().st_size > 0:
        logger.log_info(f"[META] Skip: {dst.name}")
        return

    logger.log_info(f"[META] Downloading: {category}")
    tmp = str(dst) + ".part"
    try:
        urllib.request.urlretrieve(url, tmp)
        os.replace(tmp, str(dst))
        logger.log_info(f"[META] {dst.name}")
    except Exception as e:
        logger.log_exception(f"[META] Failed: {e}")
        # Remove whatever was written before the failure.
        try:
            os.remove(tmp)
        except OSError:
            pass

def _process_images(img_list):
    """Keep first 3 images, hi_res and thumb only"""
    if not img_list or not isinstance(img_list, list):
        return []
    return [{"hi_res": img.get("hi_res"), "thumb": img.get("thumb")} 
            for img in img_list[:3] if isinstance(img, dict)]


def _process_description(desc, max_len=2000):
    """Merge list to string and truncate"""
    if isinstance(desc, list):
        desc = " ".join(str(d) for d in desc if d)
    elif not desc:
        return ""
    else:
        desc = str(desc)
    return desc[:max_len] + ("..." if len(desc) > max_len else "")


def _process_list_field(field, max_items):
    """Flatten and limit list fields"""
    if not isinstance(field, list):
        return []
    if field and isinstance(field[0], list):
        field = field[0]
    return field[:max_items]


def _extract_item_metadata(obj):
    """Build the reduced metadata record of one raw item.

    Scalar fields are copied with ``obj.get`` (``title`` and ``store``
    default to ``""`` when the key is absent); ``features`` is limited to 10
    and ``categories`` to 5 entries; ``description`` and ``images`` go
    through their dedicated helpers.
    """
    features = _process_list_field(obj.get("features"), 10)
    description = _process_description(obj.get("description"))
    categories = _process_list_field(obj.get("categories"), 5)
    images = _process_images(obj.get("images"))

    # Key order is kept as-is: it becomes the column order downstream.
    record = dict(
        parent_asin=obj.get("parent_asin"),
        title=obj.get("title", ""),
        price=obj.get("price"),
        average_rating=obj.get("average_rating"),
        rating_number=obj.get("rating_number"),
        features=features,
        description=description,
        categories=categories,
        images=images,
        store=obj.get("store", ""),
    )
    return record


def save_meta_for_training_ui(category: str):
    """
    Save the metadata of every item of a category as parquet.

    Reads the raw ``.jsonl.gz`` metadata line by line, keeps the records that
    have a ``parent_asin``, reduces each one with ``_extract_item_metadata``
    and writes ``<category>.meta.parquet`` into ``PROCESSED_DIR``.
    Lines that cannot be parsed or converted are skipped and counted.

    Returns:
        The output path (also when it already existed), or ``None`` when the
        raw file is missing or contains no usable record.
    """
    safe_cat = category.replace('/', '-')
    out_path = PROCESSED_DIR / f"{safe_cat}.meta.parquet"

    if out_path.exists():
        logger.log_info(f"[META] Skip as exist for metadata: {out_path.name}")
        return out_path

    # Load raw metadata
    fp = _meta_path(category)
    if not fp.exists():
        logger.log_warning(f"[META] Not found: {fp}")
        return None

    logger.log_info(f"[META] Reading: {fp.name}")

    rows = []
    skipped = 0
    with gzip.open(fp, "rt", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
                if obj.get("parent_asin"):
                    rows.append(_extract_item_metadata(obj))
            except Exception:
                # Best effort: one malformed line must not abort the import.
                # ``Exception`` (not a bare except) lets KeyboardInterrupt
                # and SystemExit through.
                skipped += 1
                continue

    if skipped:
        logger.log_warning(f"[META] Skipped {skipped:,} unreadable lines in {fp.name}")

    if not rows:
        logger.log_warning(f"[META] No data for {category}")
        return None

    # Save
    df = pd.DataFrame(rows)
    # Non-numeric values become NaN instead of raising
    for col in ['price', 'average_rating', 'rating_number']:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    for col in ['title', 'description', 'store']:
        df[col] = df[col].fillna('')

    df.to_parquet(out_path, index=False)
    logger.log_info(f"[META] {out_path.name}: {len(df):,} items")

    return out_path

### Task: Main execution logic

In [15]:
def run():
    """Run the whole data collection pipeline.

    1. For every core/category/split: download the ``.csv.gz`` file and
       convert it to parquet.
    2. For every category: download the metadata and convert it to parquet.
    3. For every core/category/split and every entry of
       ``Configurations.SAMPLE_SIZES`` except ``"full"``: create the sample.

    Errors of a single split or sample are logged and do not stop the run.
    """
    logger.log_info("="*70)
    logger.log_info("DATASET COLLECTION")
    logger.log_info("="*70 + "\n")

    # Valid/test samples are derived from the train sample, so 'train' has to
    # come first whatever order the configuration lists the splits in.
    # ``sorted`` is stable: the remaining splits keep their configured order.
    ordered_splits = sorted(Configurations.SPLITS, key=lambda name: name != 'train')

    for core in CORES:
        for cat in CATEGORIES:
            logger.log_info(f"\n{'='*70}")
            logger.log_info(f"{cat}")
            logger.log_info(f"{'='*70}")

            # Download, create processed parquet files for raw data
            for split in ordered_splits:
                logger.log_info(f"\nProcessing {split.upper()}...")

                url = build_url(core, cat, split)
                in_path = local_path_for_parquet(core, cat, split, raw_dir=RAW_DIR)
                out_path = local_path_for_parquet(core, cat, split, raw_dir=PROCESSED_DIR)
                try:
                    download_file(url, in_path)
                    save_dataset_to_parquet(in_path, out_path)
                except Exception as e:
                    logger.log_exception(f"{split} error: {e}")

    logger.log_info("\n" + "="*70)
    logger.log_info("METADATA")
    logger.log_info("="*70)

    for cat in CATEGORIES:
        download_meta(cat)
        save_meta_for_training_ui(cat)

    logger.log_info("\n" + "="*70)
    logger.log_info("CREATING SAMPLES FROM PARQUET DATA")
    logger.log_info("="*70)
    for core in CORES:
        for cat in CATEGORIES:
            for split in ordered_splits:
                for sample in Configurations.SAMPLE_SIZES:
                    if sample == "full":
                        continue
                    n = Configurations.SAMPLE_SIZES[sample]
                    logger.log_info(f"\ncat={cat} - split={split} - sample={sample} - n={n} sampling...")
                    in_path = local_path_for_parquet(core, cat, split, raw_dir=PROCESSED_DIR)
                    out_path = local_path_for_parquet(core, cat, split, sample, raw_dir=PROCESSED_DIR)
                    logger.log_info(f"Input: {in_path.name} \n → Output: {out_path.name}")
                    # Same error isolation as the download loop above.
                    try:
                        create_n_sample(in_path, n, Configurations.ITEM_MULTI, sample)
                    except Exception as e:
                        logger.log_exception(f"{split}/{sample} sampling error: {e}")

    logger.log_info("\n COMPLETED")

### Task: Unit test

#### Run the full pipeline

In [16]:
# Execute the pipeline defined in run(): downloads, parquet conversion,
# metadata and sample creation. Steps whose output already exists are skipped.
run()

2025-10-10 21:39:20,649 - INFO - DATASET COLLECTION

2025-10-10 21:39:20,651 - INFO - 
2025-10-10 21:39:20,651 - INFO - Electronics
2025-10-10 21:39:20,652 - INFO - 
Processing TRAIN...
2025-10-10 21:39:20,653 - INFO - Exists, skip: Electronics.5core.train.csv.gz
2025-10-10 21:39:20,653 - INFO - Skip: Electronics.5core.train.parquet
2025-10-10 21:39:20,653 - INFO - 
Processing VALID...
2025-10-10 21:39:20,653 - INFO - Exists, skip: Electronics.5core.valid.csv.gz
2025-10-10 21:39:20,655 - INFO - Skip: Electronics.5core.valid.parquet
2025-10-10 21:39:20,655 - INFO - 
Processing TEST...
2025-10-10 21:39:20,655 - INFO - Exists, skip: Electronics.5core.test.csv.gz
2025-10-10 21:39:20,656 - INFO - Skip: Electronics.5core.test.parquet
2025-10-10 21:39:20,656 - INFO - 
2025-10-10 21:39:20,656 - INFO - Beauty_and_Personal_Care
2025-10-10 21:39:20,657 - INFO - 
Processing TRAIN...
2025-10-10 21:39:20,657 - INFO - Exists, skip: Beauty_and_Personal_Care.5core.train.csv.gz
2025-10-10 21:39:20,658 -

#### Diagnostics

In [17]:
def diagnose_dataset(category: str, suffix: str = 'small'):
    """Log size, sparsity and train/valid overlap of one sampled dataset.

    Reads ``<category>.5core.<split>.<suffix>.parquet`` from
    ``PROCESSED_DIR`` for train, valid and test. Train is required; valid and
    test are reported only if their files exist. For valid, the number of
    users and items that also occur in train is logged, with a warning when
    some do not.

    Args:
        category: Category name; ``/`` is replaced by ``-`` in file names.
        suffix: Sample name that is part of the file names.
    """
    safe_cat = category.replace('/', '-')

    logger.log_info("="*70)
    logger.log_info(f"DIAGNOSTIC: {category} (suffix={suffix})")
    logger.log_info("="*70)

    # Load files
    train = PROCESSED_DIR / f"{safe_cat}.5core.train.{suffix}.parquet"
    valid = PROCESSED_DIR / f"{safe_cat}.5core.valid.{suffix}.parquet"
    test = PROCESSED_DIR / f"{safe_cat}.5core.test.{suffix}.parquet"

    if not train.exists():
        logger.log_error(f"File not found: {train.name}")
        return

    df_train = pl.read_parquet(train)
    df_valid = pl.read_parquet(valid) if valid.exists() else None
    df_test = pl.read_parquet(test) if test.exists() else None

    def _pct(part, whole):
        # Percentage for log lines; 0.0 for an empty split instead of
        # raising ZeroDivisionError.
        return part / whole * 100 if whole else 0.0

    # Stats
    def stats(df, name):
        u, i, r = df['user_id'].n_unique(), df['parent_asin'].n_unique(), len(df)
        # Sparsity = share of empty cells of the user x item matrix;
        # undefined (NaN) when the split has no users or no items.
        s = 1 - (r / (u * i)) if u and i else float('nan')
        logger.log_info(f"{name}: {r:,} ratings, {u:,} users, {i:,} items, sparsity {s:.2%}")
        return u, i, r

    stats(df_train, "TRAIN")

    if df_valid is not None:
        valid_u, valid_i, _ = stats(df_valid, "VALID")

        # Check overlap
        train_users = set(df_train['user_id'].unique())
        valid_users = set(df_valid['user_id'].unique())
        train_items = set(df_train['parent_asin'].unique())
        valid_items = set(df_valid['parent_asin'].unique())

        user_overlap = len(train_users & valid_users)
        item_overlap = len(train_items & valid_items)

        logger.log_info(f"\nOVERLAP:")
        logger.log_info(f"  Users: {user_overlap:,} / {valid_u:,} ({_pct(user_overlap, valid_u):.1f}%)")
        logger.log_info(f"  Items: {item_overlap:,} / {valid_i:,} ({_pct(item_overlap, valid_i):.1f}%)")

        if user_overlap < valid_u:
            logger.log_warning(f"  {valid_u - user_overlap:,} valid users NOT in train!")
        if item_overlap < valid_i:
            logger.log_warning(f"  {valid_i - item_overlap:,} valid items NOT in train!")

    if df_test is not None:
        stats(df_test, "TEST")

    logger.log_info("="*70 + "\n")

In [18]:
# Log the diagnostic report of the "big" sample for every configured category.
for cat in Configurations.CATEGORIES:
    diagnose_dataset(cat, "big")

2025-10-10 21:41:20,843 - INFO - DIAGNOSTIC: Electronics (suffix=big)
2025-10-10 21:41:20,911 - INFO - TRAIN: 85,890 ratings, 35,494 users, 82 items, sparsity 97.05%
2025-10-10 21:41:20,913 - INFO - VALID: 858 ratings, 858 users, 81 items, sparsity 98.77%
2025-10-10 21:41:20,926 - INFO - 
OVERLAP:
2025-10-10 21:41:20,926 - INFO -   Users: 858 / 858 (100.0%)
2025-10-10 21:41:20,926 - INFO -   Items: 81 / 81 (100.0%)
2025-10-10 21:41:20,927 - INFO - TEST: 675 ratings, 675 users, 76 items, sparsity 98.68%

2025-10-10 21:41:20,928 - INFO - DIAGNOSTIC: Beauty_and_Personal_Care (suffix=big)
2025-10-10 21:41:20,931 - INFO - TRAIN: 7,718 ratings, 6,613 users, 8 items, sparsity 85.41%
2025-10-10 21:41:20,931 - INFO - VALID: 66 ratings, 66 users, 8 items, sparsity 87.50%
2025-10-10 21:41:20,933 - INFO - 
OVERLAP:
2025-10-10 21:41:20,933 - INFO -   Users: 66 / 66 (100.0%)
2025-10-10 21:41:20,933 - INFO -   Items: 8 / 8 (100.0%)
2025-10-10 21:41:20,933 - INFO - TEST: 65 ratings, 65 users, 8 items,