# Phase 3: Game Prioritization

Assign each game a suspicion score so that the most suspicious games can be prioritized for deep analysis.

**Inputs:**
- Phase 1: `games.parquet`, `game_aggregates.parquet` (optional; scoring falls back to `games.parquet` alone if it is missing)
- Phase 2: `quick_stats.json`

**Outputs:**
- `priority_scores.parquet` - Games with suspicion scores
- `high_priority_games.json` - List of games for deep analysis

In [None]:
# Parameters (injected by Papermill)
# The values below are defaults; a Papermill run may override any of them.
# Account whose games are scored; also selects the per-user data directory.
username = "default_user"  # Chess.com username
# Games scoring at or above this value are selected for deep analysis.
min_suspicion_score = 3  # Minimum score for deep analysis
# Cap applied to the above-threshold selection.
max_games_to_analyze = 50  # Maximum games for deep analysis

In [None]:
# Setup
import sys
sys.path.insert(0, '..')
from common import (
    setup_notebook, validate_parameters, print_section, print_subsection,
    get_user_data_dir, save_phase_output, load_phase_output,
    load_dataset_parquet,
)
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Shared notebook initialisation from `common`, run before any data is loaded.
setup_notebook()
# Presumably rejects an unusable username early -- confirm against `common`.
validate_parameters(username)

In [None]:
# Load data
print_section(f"GAME PRIORITIZATION: {username}")

user_data_dir = get_user_data_dir(username)

# Phase 1 game table: the base frame that every later cell works on
games_path = user_data_dir / "games.parquet"
games_df = pd.DataFrame(load_dataset_parquet(games_path))
print(f"Games loaded: {len(games_df)}")

# Per-game aggregates are optional: when the file is absent the notebook
# continues with the plain game table.
aggregates_path = user_data_dir / "game_aggregates.parquet"
try:
    aggregates_df = pd.DataFrame(load_dataset_parquet(aggregates_path))
except FileNotFoundError:
    print("No aggregates found, using games only")
else:
    # Left join keeps every game; clashing aggregate columns get an '_agg' suffix
    games_df = pd.merge(
        games_df,
        aggregates_df,
        how='left',
        on='game_id',
        suffixes=('', '_agg'),
    )
    print(f"Merged with aggregates: {len(aggregates_df)} rows")

# Phase 2 summary statistics; the current rating defaults to 1500 when absent
quick_stats = load_phase_output(username, "phase2", "quick_stats.json")
elo_analysis = quick_stats['elo_analysis']
avg_rating = elo_analysis.get('current_rating', 1500)

In [None]:
# Define suspicion scoring function
def _is_missing(value):
    """Return True when *value* is None or a scalar pandas/NumPy null (NaN, NaT, pd.NA)."""
    if value is None:
        return True
    # pd.isna returns an array for list-likes, so only test scalars.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _field(row, key, default):
    """Read ``row.get(key)``, substituting *default* when the value is absent or null."""
    value = row.get(key, default)
    return default if _is_missing(value) else value


def calculate_game_suspicion(row):
    """
    Calculate a suspicion score for a game.

    Higher scores indicate games that should be analyzed more closely.

    Scoring rules:
      +2  win against an opponent rated more than 100 points higher
      +1  win with a 'volatile' material trajectory
      +1  win after being more than 300 material units behind
      +1  'high' fragility category
      +1  average move time under 2
      +1  win by resignation in a game longer than 40
      -1  opponent is banned

    Null values (None, NaN, pd.NA) never contribute to the score. This matters
    because rows without a matching aggregate carry NaN after the left merge,
    and NaN is truthy in a plain ``if``.

    Args:
        row: Mapping-like object with a ``.get(key, default)`` method, such as
            a DataFrame row (``pandas.Series``) or a ``dict``.

    Returns:
        int: The suspicion score, never below 0.
    """
    score = 0
    won = _field(row, 'player_result', None) == 'win'

    # Win against higher-rated opponent (+2).
    # A missing column falls back to 1500, but a null rating skips the bonus
    # rather than being guessed.
    player_rating = row.get('player_rating', 1500)
    opponent_rating = row.get('opponent_rating', 1500)
    ratings_known = not (_is_missing(player_rating) or _is_missing(opponent_rating))
    if won and ratings_known and player_rating - opponent_rating < -100:
        score += 2

    # Large material swings in won game (+1 each, so up to +2)
    if won:
        if _field(row, 'material_trajectory', None) == 'volatile':
            score += 1
        if _field(row, 'max_material_behind', 0) > 300:
            score += 1

    # High fragility positions (+1)
    if _field(row, 'fragility_category', None) == 'high':
        score += 1

    # Suspicious time patterns (+1): very fast moves on average
    if _field(row, 'avg_move_time', 10) < 2:
        score += 1

    # Won by resignation in a long game (+1)
    if won and _field(row, 'termination', None) == 'resignation':
        if _field(row, 'game_length', 40) > 40:
            score += 1

    # Game vs banned opponent (-1, less interesting for cheat detection).
    # A null flag counts as "not banned".
    if bool(_field(row, 'opponent_is_banned', False)):
        score -= 1

    return max(0, score)

# Score every game row by row and store the result next to the game data
suspicion_scores = games_df.apply(calculate_game_suspicion, axis=1)
games_df['suspicion_score'] = suspicion_scores

# Report how many games landed on each score, lowest score first
print("\nSuspicion score distribution:")
score_distribution = games_df['suspicion_score'].value_counts().sort_index()
print(score_distribution)

In [None]:
# Sort and prioritize games
# Columns carried into the priority table, in output order
priority_cols = [
    'game_id', 'game_date', 'opponent_username',
    'player_rating', 'opponent_rating', 'player_result',
    'time_class', 'termination', 'game_length',
    'suspicion_score'
]

# Keep only the columns this dataset actually has, then rank the games
# from most to least suspicious.
available_cols = [name for name in priority_cols if name in games_df.columns]
prioritized_df = (
    games_df[available_cols]
    .copy()
    .sort_values(by='suspicion_score', ascending=False)
)

print("\nTop 20 games by suspicion score:")
display_cols = [
    name
    for name in ['game_date', 'opponent_username', 'player_result', 'suspicion_score']
    if name in prioritized_df.columns
]
top_games = prioritized_df[display_cols].head(20)
print(top_games.to_string())

In [None]:
# Select high-priority games for deep analysis
MIN_GAMES_FOR_ANALYSIS = 10  # Try to analyze at least this many games (never more than the cap)

# A zero or negative cap means "select nothing". Clamping also avoids
# DataFrame.head(-n), which returns every row except the last n.
selection_cap = max(0, max_games_to_analyze)

# prioritized_df is sorted by suspicion score, highest first, so head() keeps
# the most suspicious games.
above_threshold = prioritized_df[prioritized_df['suspicion_score'] >= min_suspicion_score]
high_priority = above_threshold.head(selection_cap)

print(f"\nGames selected for deep analysis:")
print(f"  Suspicion threshold: >= {min_suspicion_score}")
print(f"  Max games: {max_games_to_analyze}")
print(f"  High-suspicion games found: {len(high_priority)}")

# Ensure we have enough games for meaningful analysis. The fallback target is
# bounded by the games available AND by the user-supplied cap, so the final
# selection never exceeds max_games_to_analyze.
total_games = len(prioritized_df)
target_count = min(MIN_GAMES_FOR_ANALYSIS, total_games, selection_cap)

if len(high_priority) < target_count:
    print(f"\n  Only {len(high_priority)} high-suspicion games found.")
    print(f"  Falling back to top {target_count} games by suspicion score...")

    # Take top games by score (includes high-suspicion + next highest scores)
    high_priority = prioritized_df.head(target_count)

    # target_count <= selection_cap, so every above-threshold game was already
    # selected; the remainder are the games added from below the threshold.
    fallback_count = target_count - len(above_threshold)
    if fallback_count > 0:
        print(f"  Added {fallback_count} additional games below threshold")

print(f"\n  Final selection: {len(high_priority)} games")
if len(high_priority) > 0:
    print(f"  Score range: {high_priority['suspicion_score'].min()} - {high_priority['suspicion_score'].max()}")

In [None]:
# Visualization: Suspicion score distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
distribution_ax, scatter_ax = axes

# Left panel: number of games per score, with the selection threshold marked
score_counts = games_df['suspicion_score'].value_counts().sort_index()
distribution_ax.bar(score_counts.index, score_counts.values, color='steelblue')
distribution_ax.axvline(
    x=min_suspicion_score,
    color='red',
    linestyle='--',
    label=f'Threshold ({min_suspicion_score})',
)
distribution_ax.set(
    xlabel='Suspicion Score',
    ylabel='Number of Games',
    title='Suspicion Score Distribution',
)
distribution_ax.legend()

# Right panel: score against rating gap, drawn only when both ratings exist
if all(col in games_df.columns for col in ('player_rating', 'opponent_rating')):
    rating_diff = games_df['opponent_rating'] - games_df['player_rating']

    # Wins are green, losses red, anything else gray; a missing result
    # column is treated as all draws.
    result_colors = {'win': 'green', 'loss': 'red'}
    results = games_df.get('player_result', ['draw'] * len(games_df))
    colors = [result_colors.get(result, 'gray') for result in results]

    scatter_ax.scatter(rating_diff, games_df['suspicion_score'], c=colors, alpha=0.5)
    scatter_ax.set(
        xlabel='Rating Difference (Opponent - Player)',
        ylabel='Suspicion Score',
        title='Suspicion vs Rating Difference',
    )
    scatter_ax.axhline(y=min_suspicion_score, color='red', linestyle='--')

plt.tight_layout()
plt.show()

In [None]:
# Save outputs
# Full ranked table first, so it is written even if the summary below fails
save_phase_output(username, "phase3", "priority_scores.parquet", prioritized_df)

# Summary for the deep-analysis phase: the selected game ids plus the
# parameters that produced the selection.
high_priority_list = high_priority['game_id'].tolist()
selected_count = len(high_priority_list)
high_priority_data = dict(
    username=username,
    min_suspicion_score=min_suspicion_score,
    max_games=max_games_to_analyze,
    selected_count=selected_count,
    game_ids=high_priority_list,
)
save_phase_output(username, "phase3", "high_priority_games.json", high_priority_data)

print("\nPhase 3 complete!")
print(f"High priority games saved: {selected_count}")