# Phase 4b: Ken Regan Analysis (IPR & Z-Score)

Apply Ken Regan's cheat-detection methodology using the Intrinsic Performance Rating (IPR) and Z-scores.

**Inputs:**
- Phase 4a: engine_analysis.parquet, engine_positions.json

**Outputs:**
- `regan_analysis.parquet` - Per-game Regan metrics
- `suspicious_positions.json` - Flagged games with summary statistics (total games, flagged count, average and maximum Z-score)

In [None]:
# Parameters (injected by Papermill)
# The values assigned here are the fallbacks in effect when nothing is
# injected — presumably Papermill overrides them per run.
# NOTE(review): per-game flagging in this notebook is taken from the
# `is_suspicious` field returned by analyze_game_regan; `z_score_threshold`
# is not passed to that call — confirm whether it should be.
username = "default_user"  # Chess.com username
z_score_threshold = 2.0  # Z-score threshold for flagging

In [None]:
# Setup
import sys
sys.path.insert(0, '..')
from common import (
    setup_notebook, validate_parameters, print_section, print_subsection,
    get_user_data_dir, save_phase_output, load_phase_output,
    load_dataset_parquet, load_baseline,
    analyze_game_regan, ReganAnalysisResult,
)
import json
import pandas as pd
import numpy as np
from dataclasses import asdict

# Both helpers are imported from the project's `common` module above.
# NOTE(review): presumably applies notebook-wide configuration — confirm in `common`.
setup_notebook()
# NOTE(review): presumably rejects an unusable username before any data is
# loaded — confirm the exact checks in `common`.
validate_parameters(username)

In [None]:
# Load data
# Pull in the phase 4a artefacts and the comparison baselines. Missing
# phase 4a files are tolerated: empty placeholders are substituted so the
# later cells can still run.
print_section(f"REGAN ANALYSIS: {username}")

# Per-game engine summary (optional - phase 4a may not have run)
try:
    engine_df = load_phase_output(username, "phase4a", "engine_analysis.parquet")
except FileNotFoundError:
    engine_df = pd.DataFrame()
    print("No engine analysis found (phase 4a may not have completed)")
else:
    print(f"Engine analysis loaded: {len(engine_df)} games")

# Per-position engine detail, keyed under 'games'
try:
    engine_positions = load_phase_output(username, "phase4a", "engine_positions.json")
except FileNotFoundError:
    engine_positions = {"games": []}
    print("No position data found")
else:
    print(f"Position data loaded: {len(engine_positions.get('games', []))} games")

# Baselines for comparison
trusted_baseline = load_baseline("trusted")
cheater_baseline = load_baseline("cheater")

# Nothing to analyse only when both phase 4a sources came back empty
no_engine_data = engine_df.empty and not engine_positions.get('games')
if no_engine_data:
    print("\nWARNING: No engine analysis data available. Regan analysis will be skipped.")
    print("Run phase 4a (engine analysis) first to enable Regan analysis.")

In [None]:
# Ken Regan Analysis
# For every game that has engine-evaluated positions, run analyze_game_regan
# and collect one flat record per game in `regan_results`.
print_subsection("CALCULATING IPR AND Z-SCORES")

# Fallback rating used when a game's rating is missing or NaN
DEFAULT_ELO = 1500

regan_results = []

# Load games dataframe to get player ratings (game_id as str -> player_rating)
user_data_dir = get_user_data_dir(username)
try:
    games_df = pd.DataFrame(load_dataset_parquet(user_data_dir / "games.parquet"))
    game_ratings = dict(zip(games_df['game_id'].astype(str), games_df['player_rating']))
except Exception as exc:
    # Best-effort: the analysis still runs without ratings, but every game is
    # then scored against DEFAULT_ELO, so make that visible instead of silent.
    game_ratings = {}
    print(
        f"WARNING: Could not load player ratings ({type(exc).__name__}: {exc}); "
        f"using default Elo {DEFAULT_ELO} for all games"
    )

# Number of analysed games whose official Elo had to be defaulted
default_elo_games = 0

for game_data in engine_positions.get('games', []):
    game_id = game_data['game_id']
    positions = game_data.get('positions', [])

    # Games without analysed positions produce no record
    if not positions:
        continue

    # Get player's rating for this game (or fall back to DEFAULT_ELO)
    official_elo = game_ratings.get(str(game_id))
    if official_elo is None or pd.isna(official_elo):
        official_elo = DEFAULT_ELO
        default_elo_games += 1
    official_elo = int(official_elo)

    # Use analyze_game_regan for proper IPR and z-score calculation
    # The function expects positions with: best_move, move, eval_before, eval_after
    regan_result = analyze_game_regan(
        positions=positions,
        official_elo=official_elo,
        exclude_book_moves=0,  # Already filtered in engine analysis
    )

    # NOTE(review): the flag below comes straight from the result object;
    # z_score_threshold is not consulted here — confirm this is intended.
    regan_results.append({
        'game_id': game_id,
        'official_elo': regan_result.official_elo,
        'ipr': regan_result.ipr,
        'elo_difference': regan_result.elo_difference,
        'z_score': regan_result.z_score,
        'move_match_rate': regan_result.move_match_rate,
        'avg_partial_credit': regan_result.avg_partial_credit,
        'is_flagged': regan_result.is_suspicious,
        'suspicion_level': regan_result.suspicion_level,
        'total_moves': regan_result.num_positions,
        'acpl': game_data.get('acpl', 0),
    })

print(f"Analyzed {len(regan_results)} games")
if default_elo_games:
    # Z-scores for these games are relative to a placeholder rating
    print(f"  Note: {default_elo_games} games used the default Elo of {DEFAULT_ELO}")

In [None]:
# Display Regan analysis results
# Builds `regan_df` (one row per analysed game) and prints a text summary.
# When nothing was analysed, `regan_df` is an empty DataFrame so later cells
# can test `.empty`.
print_subsection("REGAN ANALYSIS RESULTS")

if not regan_results:
    regan_df = pd.DataFrame()
    print("No Regan analysis results.")
else:
    regan_df = pd.DataFrame(regan_results)

    # Column shortcuts used by the summary lines below
    z_col = regan_df['z_score']
    elo_diff_col = regan_df['elo_difference']
    flag_col = regan_df['is_flagged']

    print(f"\nSummary:")
    print(f"  Games analyzed: {len(regan_df)}")
    print(f"  Average official Elo: {regan_df['official_elo'].mean():.0f}")
    print(f"  Average IPR: {regan_df['ipr'].mean():.0f}")
    print(f"  Average Elo difference (IPR - Elo): {elo_diff_col.mean():.0f}")
    print(f"  Average Z-score: {z_col.mean():.2f}")
    print(f"  Average move match rate: {regan_df['move_match_rate'].mean():.1%}")
    print(f"  Games flagged: {flag_col.sum()}")

    # Table of flagged games, printed only when at least one exists
    flagged = regan_df[flag_col]
    if not flagged.empty:
        flagged_columns = ['game_id', 'official_elo', 'ipr', 'z_score', 'suspicion_level']
        print(f"\nFlagged games:")
        print(flagged[flagged_columns].to_string())

    # Count of games per suspicion level
    level_counts = regan_df['suspicion_level'].value_counts()
    print(f"\nSuspicion level breakdown:")
    print(level_counts.to_string())

    # Player-side figures, shown only when a trusted baseline was loaded
    if trusted_baseline:
        print(f"\nBaseline comparison:")
        print(f"  Player avg Z-score: {z_col.mean():.2f}")
        print(f"  Player avg IPR - Elo: {elo_diff_col.mean():.0f}")

In [None]:
# Correlate with rating improvement
# Reads the phase 2 quick stats and, when an improvement summary is present,
# prints the rating trend and slopes; warns if the mean Z-score is high
# while the trend is 'improving'.
print_subsection("RATING CORRELATION")

try:
    quick_stats = load_phase_output(username, "phase2", "quick_stats.json")
except FileNotFoundError:
    print("No rating data available for correlation.")
else:
    improvement = quick_stats.get('glicko2', {}).get('improvement')

    if not improvement:
        print("Insufficient data for improvement analysis")
    else:
        trend = improvement.get('trend', 'N/A')
        print(f"Rating improvement trend: {trend}")

        # Slopes are reported per 100 games; a missing key is shown as 0
        glicko_slope = improvement.get('glicko2_slope', 0)
        elo_slope = improvement.get('elo_slope', 0)
        print(f"Glicko-2 slope: {glicko_slope:.1f} per 100 games")
        print(f"Elo slope: {elo_slope:.1f} per 100 games")

        # High Z-scores during rapid improvement could be suspicious
        is_improving = trend == 'improving'
        if is_improving and not regan_df.empty:
            avg_z = regan_df['z_score'].mean()
            if avg_z > 1.5:
                print(f"\nWARNING: High Z-scores ({avg_z:.2f}) during improvement period")

In [None]:
# Save outputs
# Writes the per-game table and a JSON digest of the flagged games under
# phase4b. Nothing is written when there are no results.
has_results = not regan_df.empty
if has_results:
    save_phase_output(username, "phase4b", "regan_analysis.parquet", regan_df)

    # Flagged rows as plain records, followed by whole-run summary figures
    flagged_records = regan_df[regan_df['is_flagged']].to_dict('records')
    run_summary = {
        "total_games": len(regan_df),
        "flagged_count": int(regan_df['is_flagged'].sum()),
        "avg_z_score": float(regan_df['z_score'].mean()),
        "max_z_score": float(regan_df['z_score'].max()),
    }

    flagged_detail = {
        "username": username,
        "z_score_threshold": z_score_threshold,
        "flagged_games": flagged_records,
        "summary": run_summary,
    }
    save_phase_output(username, "phase4b", "suspicious_positions.json", flagged_detail)

print(f"\nPhase 4b complete!")

In [None]:
# Visualization
# Three side-by-side panels over the per-game results: Z-score histogram,
# IPR against official Elo, and best-move match-rate histogram.
# Skipped entirely when there are no results.
import matplotlib.pyplot as plt

if not regan_df.empty:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # Z-score distribution
    mean_z = regan_df['z_score'].mean()
    axes[0].hist(regan_df['z_score'], bins=20, color='steelblue', edgecolor='white')
    # Draw the threshold from the notebook parameter rather than a literal 2.0,
    # so the marker follows whatever value was injected for this run.
    axes[0].axvline(z_score_threshold, color='red', linestyle='--',
                    label=f"Threshold ({z_score_threshold})")
    axes[0].axvline(mean_z, color='green', linestyle='--', label=f"Mean ({mean_z:.2f})")
    axes[0].set_xlabel('Z-Score')
    axes[0].set_ylabel('Games')
    axes[0].set_title('Z-Score Distribution')
    axes[0].legend()

    # IPR vs Official Elo (flagged games in red, others in blue)
    axes[1].scatter(regan_df['official_elo'], regan_df['ipr'],
                    c=['red' if f else 'blue' for f in regan_df['is_flagged']], alpha=0.6)
    # Diagonal reference line spanning both axes' value ranges, padded by 50
    min_elo = min(regan_df['official_elo'].min(), regan_df['ipr'].min()) - 50
    max_elo = max(regan_df['official_elo'].max(), regan_df['ipr'].max()) + 50
    axes[1].plot([min_elo, max_elo], [min_elo, max_elo], 'k--', alpha=0.3, label='IPR = Elo')
    axes[1].set_xlabel('Official Elo')
    axes[1].set_ylabel('IPR (Intrinsic Performance Rating)')
    axes[1].set_title('IPR vs Official Elo')
    axes[1].legend()

    # Move match rate distribution
    mean_match = regan_df['move_match_rate'].mean()
    axes[2].hist(regan_df['move_match_rate'], bins=20, color='purple', edgecolor='white')
    axes[2].axvline(mean_match, color='red', linestyle='--',
                    label=f"Mean ({mean_match:.1%})")
    axes[2].set_xlabel('Move Match Rate')
    axes[2].set_ylabel('Games')
    axes[2].set_title('Best Move Match Rate')
    axes[2].legend()

    plt.tight_layout()
    plt.show()