# Phase 2: Quick Analysis

Fast statistical analysis of game patterns without expensive engine evaluation.

**Inputs:**
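- Phase 1 outputs (games.parquet, sessions.json)
- Also read from the user data directory: opening_book.json and, when present, game_aggregates.parquet and games_cache.json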

**Outputs:**
- `elo_analysis.json` - Rating pattern analysis
- `elo_by_time_class.json` - Rating pattern analysis split by time class
- `result_patterns.json` - Win/loss/draw patterns
- `session_analysis.json` - Session pattern analysis
- `quick_stats.json` - Combined quick statistics

In [None]:
# Parameters (injected by Papermill)
# The value below is only a placeholder default; a Papermill run is expected
# to override it with the account to analyse.
username = "default_user"  # Chess.com username

In [None]:
# Setup
import sys
# Put the parent directory first on the import path. This has to run before
# the `from common import ...` below (presumably `common` lives one level up
# from this notebook -- confirm against the repository layout).
sys.path.insert(0, '..')
from common import (
    setup_notebook, validate_parameters, print_section, print_subsection,
    get_user_data_dir, get_phase_dir, save_phase_output, load_phase_output,
    load_baseline, load_dataset_parquet,
    analyze_elo_patterns, analyze_result_patterns, analyze_session_patterns,
    track_rating_over_time, analyze_rating_improvement,
    Glicko2Rating,
    PROJECT_ROOT
)
import json
import pandas as pd
import numpy as np
from dataclasses import asdict

# Project helpers from `common`: notebook-wide initialisation, then a check of
# the injected `username` parameter (exact behaviour is defined in `common`).
setup_notebook()
validate_parameters(username)

In [None]:
# Load data from Phase 1
print_section(f"QUICK ANALYSIS: {username}")

# Every input for this phase is read from the per-user data directory.
user_data_dir = get_user_data_dir(username)
games_path = user_data_dir / "games.parquet"
aggregates_path = user_data_dir / "game_aggregates.parquet"
sessions_path = user_data_dir / "sessions.json"

# Per-game table
games_df = pd.DataFrame(load_dataset_parquet(games_path))
print(f"Games loaded: {len(games_df)}")

# Optional per-game aggregates: left-joined on game_id so every game row is
# kept; clashing aggregate columns receive the '_agg' suffix.
try:
    aggregates_df = pd.DataFrame(load_dataset_parquet(aggregates_path))
    games_df = games_df.merge(
        aggregates_df,
        how='left',
        on='game_id',
        suffixes=('', '_agg'),
    )
except FileNotFoundError:
    print("No aggregates file found")
else:
    print("Merged with aggregates")

# Session data (consumed by the session analysis cell)
with open(sessions_path) as sessions_file:
    sessions_data = json.load(sessions_file)

# Reference baselines used for comparisons in later cells
trusted_baseline, cheater_baseline = (
    load_baseline(kind) for kind in ("trusted", "cheater")
)
print(f"Baselines loaded: trusted={bool(trusted_baseline)}, cheater={bool(cheater_baseline)}")

In [None]:
# Session Analysis
print_subsection("SESSION ANALYSIS")

# Both keys are optional in sessions.json; fall back to empty containers.
sessions = sessions_data.get('sessions', [])
session_patterns = sessions_data.get('patterns', {})
session_count = len(sessions)

print(f"Total sessions: {session_count}")
if session_patterns:
    # Missing pattern fields are reported as 0.
    avg_length = session_patterns.get('avg_session_length', 0)
    max_length = session_patterns.get('max_session_length', 0)
    avg_duration = session_patterns.get('avg_session_duration_minutes', 0)
    print("\n".join([
        f"Average session length: {avg_length:.1f} games",
        f"Max session length: {max_length} games",
        f"Average session duration: {avg_duration:.0f} minutes",
    ]))

# Summary carried into the combined quick stats and saved on its own.
session_analysis = dict(total_sessions=session_count, patterns=session_patterns)

In [None]:
# Rating Analysis (Chess.com Elo) - Separate by Time Class
print_subsection("RATING ANALYSIS")

# Distinct time classes present in the loaded games
time_classes_in_data = games_df['time_class'].unique().tolist()
print(f"Time classes: {time_classes_in_data}")

# Overall analysis: all games together, regardless of time class
elo_analysis = analyze_elo_patterns(games_df.to_dict('records'))

print(f"\nOverall Rating Stats:")
print(f"  Rating range: {elo_analysis.elo_min} - {elo_analysis.elo_max}")
print(f"  Rating change: {elo_analysis.elo_change:+d}")
print(f"  Win rate: {elo_analysis.win_rate:.1%}")
print(f"  Manipulation score: {elo_analysis.rating_manipulation_score:.3f}")

# Per-time-class analysis; results are stored as plain dicts keyed by time
# class so they can be serialised later.
elo_by_time_class = {}
for tc in time_classes_in_data:
    tc_games = games_df[games_df['time_class'] == tc].to_dict('records')
    if tc_games:
        tc_analysis = analyze_elo_patterns(tc_games)
        elo_by_time_class[tc] = asdict(tc_analysis)
        print(f"\n{tc.capitalize()} Rating Stats:")
        print(f"  Games: {tc_analysis.total_games}")
        print(f"  Rating: {tc_analysis.elo_min} - {tc_analysis.elo_max} (current: {tc_analysis.elo_end})")
        print(f"  Win rate: {tc_analysis.win_rate:.1%}")

# Compare to baselines
if trusted_baseline:
    elo_baseline = trusted_baseline.get('elo_baseline', {})
    trusted_manip = elo_baseline.get('manipulation_score_mean', 0)
    # No numeric default here: treating a missing maximum as 0 would make any
    # positive manipulation score look like it exceeds the trusted baseline.
    trusted_max = elo_baseline.get('manipulation_score_max')
    print(f"\nBaseline comparison:")
    if trusted_max is None:
        print(f"  Trusted avg: {trusted_manip:.3f}, max: n/a")
    else:
        print(f"  Trusted avg: {trusted_manip:.3f}, max: {trusted_max:.3f}")
        if elo_analysis.rating_manipulation_score > trusted_max:
            print(f"  WARNING: Manipulation score exceeds trusted baseline max!")

In [None]:
# Glicko-2 Rating Analysis
print_subsection("GLICKO-2 ANALYSIS")

# Glicko-2 works from the raw games cache (needs nested white/black dicts)
raw_games_cache_path = user_data_dir / "games_cache.json"
if not raw_games_cache_path.exists():
    # No cache: leave empty placeholders so later cells can still run.
    print("Raw games cache not found, skipping Glicko-2 analysis")
    rating_timeline = pd.DataFrame()
    improvement_analysis = None
else:
    with open(raw_games_cache_path) as cache_file:
        raw_cache = json.load(cache_file)
    raw_games = raw_cache.get('games', [])

    # Build the per-period rating timeline, then assess the trend over it
    rating_timeline = track_rating_over_time(raw_games, username)
    improvement_analysis = analyze_rating_improvement(rating_timeline)

    print(f"Rating periods: {len(rating_timeline)}")
    if not rating_timeline.empty:
        latest_period = rating_timeline.iloc[-1]
        print(f"Final Glicko-2: {latest_period['rating']:.0f} (RD: {latest_period['rd']:.0f})")
        if not improvement_analysis:
            print("Insufficient data for improvement analysis")
        else:
            print(f"Improvement trend: {improvement_analysis.trend}")
            print(f"Glicko-2 slope: {improvement_analysis.glicko2_slope:.1f} per 100 games")
            print(f"Elo slope: {improvement_analysis.elo_slope:.1f} per 100 games")

In [None]:
# Result Patterns
print_subsection("RESULT PATTERNS")

result_analysis = analyze_result_patterns(games_df.to_dict('records'))

# Win and loss totals are rebuilt from the per-termination breakdown
# (checkmate + resignation + timeout); draws come as a single total.
total_wins = sum((
    result_analysis.wins_by_checkmate,
    result_analysis.wins_by_resignation,
    result_analysis.wins_by_timeout,
))
total_losses = sum((
    result_analysis.losses_by_checkmate,
    result_analysis.losses_by_resignation,
    result_analysis.losses_by_timeout,
))
total_draws = result_analysis.draws_total

print(f"Win/Draw/Loss: {total_wins}/{total_draws}/{total_losses}")
termination_rates = (
    ("Checkmate", result_analysis.checkmate_rate),
    ("Resignation", result_analysis.resignation_rate),
    ("Timeout", result_analysis.timeout_rate),
)
for label, rate in termination_rates:
    print(f"{label} rate: {rate:.1%}")

# Compare to baselines
# NOTE(review): the player figure is `timeout_rate` while the baseline key is
# `timeout_win_rate_mean` -- confirm both measure the same quantity.
if trusted_baseline:
    timeout_baseline = trusted_baseline.get('timeout_baseline', {})
    trusted_timeout = timeout_baseline.get('timeout_win_rate_mean', 0)
    print(f"\nTimeout comparison:")
    print(f"  Player timeout rate: {result_analysis.timeout_rate:.1%}")
    print(f"  Trusted baseline: {trusted_timeout:.1%}")

In [None]:
# Opening Book Summary
print_subsection("OPENING BOOK")

# opening_book.json is handled like the other optional inputs of this
# notebook (aggregates, raw games cache): when it is absent the summary is
# skipped instead of aborting the run, and an empty book is kept so the cells
# that read `opening_book` afterwards still work.
opening_book_path = user_data_dir / "opening_book.json"
if not opening_book_path.exists():
    print("Opening book not found, skipping opening summary")
    opening_book = {}
else:
    with open(opening_book_path) as f:
        opening_book = json.load(f)

    # Handle both old format (ECO dict) and new format (positions dict)
    if 'positions' in opening_book:
        # New format with position counts
        positions = opening_book.get('positions', {})
        common_positions = opening_book.get('common_positions', {})
        print(f"Unique positions: {opening_book.get('num_unique_positions', len(positions))}")
        print(f"Common positions (3+ games): {opening_book.get('num_common_positions', len(common_positions))}")
        print(f"Average opening depth: {opening_book.get('opening_depth_avg', 0):.1f} plies")

        if common_positions:
            print(f"\nMost common positions:")
            # Five most frequent positions, highest count first
            sorted_positions = sorted(common_positions.items(), key=lambda x: x[1], reverse=True)[:5]
            for fen, count in sorted_positions:
                # Truncate FEN for display
                short_fen = fen[:40] + "..." if len(fen) > 40 else fen
                print(f"  {short_fen}: {count} games")
    else:
        # Old format with ECO codes
        print(f"Unique openings (ECO): {len(opening_book)}")

        # Top openings by number of games
        sorted_openings = sorted(opening_book.items(), key=lambda x: x[1].get('count', 0), reverse=True)[:10]
        print(f"\nTop 10 openings:")
        for eco, stats in sorted_openings:
            count = stats.get('count', 0)
            wins = stats.get('wins', 0)
            # Guard against division by zero for entries without games
            win_rate = wins / count if count > 0 else 0
            print(f"  {eco}: {count} games, {win_rate:.0%} win rate")

In [None]:
# Save quick stats
# Opening count must agree with the opening-book summary: in the new format
# the fallback is the number of entries under 'positions', not the number of
# top-level keys in the file.
if isinstance(opening_book, dict) and 'positions' in opening_book:
    opening_count = opening_book.get('num_unique_positions', len(opening_book['positions']))
elif isinstance(opening_book, dict):
    # Old format: one top-level entry per ECO code
    opening_count = opening_book.get('num_unique_positions', len(opening_book))
else:
    opening_count = len(opening_book)

# Combined summary of everything computed in this phase
quick_stats = {
    "username": username,
    "total_games": len(games_df),
    "time_classes": time_classes_in_data,
    "elo_analysis": asdict(elo_analysis),
    "elo_by_time_class": elo_by_time_class,
    "result_patterns": asdict(result_analysis),
    "session_analysis": session_analysis,
    "glicko2": {
        "periods": len(rating_timeline),
        # None when the Glicko-2 analysis was skipped or had too little data
        "improvement": asdict(improvement_analysis) if improvement_analysis else None,
    },
    "opening_count": opening_count,
}

# Write the combined file plus one file per individual analysis
save_phase_output(username, "phase2", "quick_stats.json", quick_stats)
save_phase_output(username, "phase2", "elo_analysis.json", asdict(elo_analysis))
save_phase_output(username, "phase2", "elo_by_time_class.json", elo_by_time_class)
save_phase_output(username, "phase2", "result_patterns.json", asdict(result_analysis))
save_phase_output(username, "phase2", "session_analysis.json", session_analysis)

print(f"\nPhase 2 complete!")

In [None]:
# Visualization: Rating over time
import matplotlib.pyplot as plt

# A line needs at least two rating periods; otherwise nothing is drawn.
has_enough_periods = (not rating_timeline.empty) and len(rating_timeline) > 1
if has_enough_periods:
    fig, ax = plt.subplots(figsize=(12, 5))

    periods = range(len(rating_timeline))
    glicko_rating = rating_timeline['rating']
    rating_deviation = rating_timeline['rd']

    # Glicko-2 rating with a shaded band of +/- one rating deviation
    ax.plot(periods, glicko_rating, 'b-', linewidth=2, label='Glicko-2')
    ax.fill_between(
        periods,
        glicko_rating - rating_deviation,
        glicko_rating + rating_deviation,
        alpha=0.2,
    )

    # Overlay the Chess.com Elo when the timeline carries it
    if 'elo_end' in rating_timeline.columns:
        chesscom_elo = rating_timeline['elo_end']
        ax.plot(periods, chesscom_elo, 'g--', linewidth=1, label='Chess.com Elo')

    ax.set_xlabel('Rating Period')
    ax.set_ylabel('Rating')
    ax.set_title(f'Rating Progression: {username}')
    ax.legend()
    plt.tight_layout()
    plt.show()