# Phase 1: Data Collection

Fetch and cache games from the Chess.com API for the target player.

**Outputs:**
- `games.parquet` - Game metadata
- `positions.parquet` - Position-level data
- `game_aggregates.parquet` - Aggregated game features
- `sessions.json` - Session detection results
- `summary.json` - Analysis summary
- `opening_book.json` - Opening repertoire

In [None]:
# Parameters (injected by Papermill)
# The values below are defaults; a Papermill run may override any of them.
# Chess.com username of the player to analyze; checked by validate_parameters
# in the setup cell.
username = "default_user"
# How many days of history to request. 0 is converted to None before the
# baseline call and is reported as 'All history'.
days_back = 0
# Time classes forwarded to generate_player_baseline.
time_classes = ["rapid", "blitz"]
# Forwarded unchanged to generate_player_baseline(validate_opponents=...).
validate_opponents = True
# Cap forwarded as max_games; 0 is converted to None before the call
# (presumably meaning "no cap" — confirm in generate_player_baseline).
max_games_per_time_class = 0

In [None]:
# Setup
import sys
# Put the parent directory first on the module search path; this must happen
# before the `common` import below (presumably `common` lives one level up
# from this notebook — confirm against the repository layout).
sys.path.insert(0, '..')
from common import (
    setup_notebook, validate_parameters, print_section,
    get_user_data_dir, get_phase_dir, save_phase_output,
    generate_player_baseline, load_dataset_parquet,
    PROJECT_ROOT, DATA_DIR
)
import json
import pandas as pd
from pathlib import Path

# Notebook-wide initialization provided by `common`.
setup_notebook()
# Check the injected username before any data is fetched
# (NOTE(review): presumably raises on an invalid value — confirm in common).
validate_parameters(username)

In [None]:
# Build the baseline for the requested player.
# A single generate_player_baseline call covers fetching, caching,
# opponent validation, and analysis.
print_section(f"DATA COLLECTION: {username}")
print(f"Time classes: {time_classes}")
print(f"Days back: {days_back or 'All history'}")
print(f"Validate opponents: {validate_opponents}")
print()

# Optional limits use 0 as their "unset" default; pass None in that case.
actual_days_back = days_back or None
actual_max_games = max_games_per_time_class or None

baseline_kwargs = {
    "username": username,
    "output_dir": DATA_DIR / "other-users",
    "days_back": actual_days_back,
    "time_classes": time_classes,
    "validate_opponents": validate_opponents,
    "max_games": actual_max_games,
}
result = generate_player_baseline(**baseline_kwargs)

# Anything other than an explicit "success" status aborts the notebook.
if result.get("status") != "success":
    raise RuntimeError(f"Data collection failed: {result}")

print("\nData collection complete!")
print(f"Total games: {result['total_games']}")

In [None]:
# Load the generated parquet datasets from the player's data directory.
user_data_dir = get_user_data_dir(username)
print(f"Data directory: {user_data_dir}")


def _read_frame(filename):
    """Load one parquet dataset from ``user_data_dir`` into a DataFrame."""
    return pd.DataFrame(load_dataset_parquet(user_data_dir / filename))


# Game dataset
games_df = _read_frame("games.parquet")
print(f"Games loaded: {len(games_df)}")

# Positions dataset
positions_df = _read_frame("positions.parquet")
print(f"Positions loaded: {len(positions_df)}")

# Game aggregates
aggregates_df = _read_frame("game_aggregates.parquet")
print(f"Game aggregates loaded: {len(aggregates_df)}")

In [None]:
# Load sessions and summary
# Both files are JSON, so decode them explicitly as UTF-8 instead of relying
# on the platform's locale-dependent default text encoding.
with open(user_data_dir / "sessions.json", encoding="utf-8") as f:
    sessions_data = json.load(f)
print(f"Sessions detected: {len(sessions_data.get('sessions', []))}")

with open(user_data_dir / "summary.json", encoding="utf-8") as f:
    summary_data = json.load(f)

# Display summary
# The opponent-fairness counts are read with .get() and fall back to 'N/A';
# the remaining keys are required and raise KeyError if absent.
print("\nSummary:")
print(f"  Username: {summary_data['username']}")
print(f"  Total games: {summary_data['total_games']}")
print(f"  Fair opponent games: {summary_data.get('games_vs_fair_opponents', 'N/A')}")
print(f"  Banned opponent games: {summary_data.get('games_vs_banned_opponents', 'N/A')}")
print(f"  Unique opponents: {summary_data['unique_opponents']}")
print(f"  Positions extracted: {summary_data['positions_extracted']}")

In [None]:
# Record a phase manifest for downstream phases. No dataset files are copied
# in this cell; the manifest points at them through "source_dir".
phase_name = "phase1"
phase_dir = get_phase_dir(username, phase_name)

manifest = dict(
    username=username,
    source_dir=str(user_data_dir),
    total_games=len(games_df),
    total_positions=len(positions_df),
    time_classes=time_classes,
    days_back=actual_days_back,
    validate_opponents=validate_opponents,
    # Falls back to an empty mapping when the summary has no such key.
    games_by_time_class=summary_data.get("games_by_time_class", {}),
)
save_phase_output(username, phase_name, "manifest.json", manifest)

print("\nPhase 1 complete!")
print(f"Output directory: {phase_dir}")

In [None]:
# Print value counts for the main categorical columns of the games table.
print("\nGames Overview:")

# (heading, column) pairs, printed in this order.
overview_sections = [
    ("Time class distribution:", "time_class"),
    ("\nResult distribution:", "player_result"),
]
# The ban flag is only reported when the column exists in the dataset.
if "opponent_is_banned" in games_df.columns:
    overview_sections.append(("\nOpponent status:", "opponent_is_banned"))

for heading, column in overview_sections:
    print(heading)
    print(games_df[column].value_counts())