# Phase 4c: Tablebase Analysis

Analyze endgame accuracy using the Lichess tablebase API.

**Inputs:**
- Phase 1: games.parquet, raw games
- Phase 3: high_priority_games.json

**Outputs:**
- `tablebase_accuracy.parquet` - Endgame accuracy per game
- `tablebase_consistency.json` - Overall tablebase consistency report

In [None]:
# Parameters (injected by Papermill)
# The values below are the notebook's defaults.
# `max_pieces` is the piece-count threshold used further down to decide when a
# position counts as tablebase-eligible.
username = "default_user"  # Chess.com username
max_pieces = 7  # Maximum pieces for tablebase lookup

In [None]:
# Setup
import sys
sys.path.insert(0, '..')
from common import (
    setup_notebook, validate_parameters, print_section, print_subsection,
    get_user_data_dir, save_phase_output, load_phase_output,
    load_dataset_parquet, load_cached_games_v2,
    analyze_endgame_accuracy,
    is_tablebase_position,
)
import chess
import chess.pgn
import io
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

# Both helpers are imported from the project's `common` module; their behavior
# is not visible in this notebook.
setup_notebook()
# NOTE(review): presumably rejects an unusable username before any data is
# loaded — confirm against `common.validate_parameters`.
validate_parameters(username)

In [None]:
# Load data
print_section(f"TABLEBASE ANALYSIS: {username}")

user_data_dir = get_user_data_dir(username)

# Per-game table written by Phase 1
games_parquet_path = user_data_dir / "games.parquet"
games_df = pd.DataFrame(load_dataset_parquet(games_parquet_path))
print(f"Games loaded: {len(games_df)}")

# Raw game records are needed for their PGN text; the second element of the
# returned pair is not used in this notebook.
all_games_raw, _ = load_cached_games_v2(user_data_dir)
print(f"Raw games loaded: {len(all_games_raw)}")

# Priority ids from Phase 3 are only used to flag results; every game with an
# eligible endgame is analyzed regardless, so a missing file is not an error.
priority_game_ids = set()
try:
    priority_data = load_phase_output(username, "phase3", "high_priority_games.json")
except FileNotFoundError:
    print("No priority data, analyzing all games with endgames")
else:
    priority_game_ids = set(priority_data['game_ids'])
    print(f"High priority games: {len(priority_game_ids)}")

In [None]:
# Helper to get PGN from raw game
def get_game_pgn(game_id: str, all_games: list) -> "chess.pgn.Game | None":
    """Return the parsed PGN of the raw game identified by ``game_id``.

    Args:
        game_id: Identifier to look for in each raw game's ``'url'`` field.
            Non-string values are converted with ``str``.
        all_games: Raw game records (dicts) with ``'url'`` and ``'pgn'`` keys.

    Returns:
        The result of ``chess.pgn.read_game`` for the matching record, or
        ``None`` when ``game_id`` is missing/blank or no record with a
        non-empty PGN matches.

    Matching prefers a record whose URL equals ``game_id`` or whose last path
    segment equals it. Only if no such record exists does it fall back to the
    first record whose URL merely contains ``game_id``. Records with an empty
    PGN are never returned.
    """
    needle = "" if game_id is None else str(game_id).strip()
    if not needle:
        # An empty string is a substring of every URL; without this guard the
        # first game that has a PGN would be returned for an id-less row.
        return None

    fallback_pgn = None
    for game in all_games:
        url = game.get('url', '') or ''
        pgn_str = game.get('pgn', '')
        if not pgn_str or needle not in url:
            continue

        # Exact match on the whole URL or its trailing segment wins outright,
        # so id "123" is not confused with ".../1234".
        if url == needle or url.rstrip('/').rsplit('/', 1)[-1] == needle:
            return chess.pgn.read_game(io.StringIO(pgn_str))

        # Remember the first loose (substring) match as a last resort.
        if fallback_pgn is None:
            fallback_pgn = pgn_str

    if fallback_pgn is not None:
        return chess.pgn.read_game(io.StringIO(fallback_pgn))
    return None

# Find games with tablebase-eligible endgames
# Each entry is (game_id, parsed PGN game, games_df row) for a game whose final
# position has at most `max_pieces` pieces on the board.
games_with_endgames = []

for _, row in games_df.iterrows():
    game_id = row.get('game_id', '')

    # Skip rows without a usable id. An empty string is a substring of every
    # URL, so looking it up would pair this row with an unrelated raw game.
    if isinstance(game_id, str):
        id_missing = not game_id.strip()
    else:
        id_missing = bool(pd.isna(game_id))
    if id_missing:
        continue

    pgn = get_game_pgn(game_id, all_games_raw)

    if pgn is None:
        continue

    # Replay the main line to reach the final position
    board = pgn.board()
    for move in pgn.mainline_moves():
        board.push(move)

    # Only the final position is checked: a game that dips to `max_pieces`
    # or fewer is expected to still be there at the end.
    piece_count = len(board.piece_map())
    if piece_count <= max_pieces:
        games_with_endgames.append((game_id, pgn, row))

print(f"Games with tablebase-eligible endgames: {len(games_with_endgames)}")

In [None]:
# Run tablebase analysis
# For every eligible game, collect the positions from the first one with at
# most `max_pieces` pieces onward, score them with `analyze_endgame_accuracy`,
# and append one summary dict per game to `tablebase_results`.
print_subsection("ANALYZING ENDGAMES")

tablebase_results = []

for game_id, pgn, row in tqdm(games_with_endgames, desc="Analyzing endgames"):
    try:
        # Determine player color (case-insensitive match on the White header).
        # NOTE(review): any game where `username` is not White is treated as
        # Black, including games where the user is neither player.
        white = pgn.headers.get('White', '').lower()
        player_is_white = username.lower() == white
        player_color = chess.WHITE if player_is_white else chess.BLACK

        # Extract boards and moves from the game
        board = pgn.board()
        boards = []
        moves = []

        in_tablebase = False
        for move in pgn.mainline_moves():
            # Check if we've entered tablebase territory; once entered, every
            # remaining position is kept.
            if not in_tablebase and len(board.piece_map()) <= max_pieces:
                in_tablebase = True

            if in_tablebase:
                # Store the position *before* the move together with the move
                boards.append(board.copy())
                moves.append(move)

            board.push(move)

        if not boards:
            # The threshold was reached only by the final move: nothing to score
            continue

        # Analyze endgame accuracy using the correct API
        try:
            result = analyze_endgame_accuracy(
                boards=boards,
                moves=moves,
                player_color=player_color,
            )
        finally:
            # Rate limit to be nice to Lichess API. Done in `finally` so a
            # failed lookup is also followed by a pause instead of an
            # immediate next request.
            time.sleep(0.1)

        if result:
            tablebase_results.append({
                'game_id': game_id,
                'is_priority': game_id in priority_game_ids,
                'player_result': row.get('player_result', ''),
                'tablebase_moves': result.get('player_moves', 0),
                'correct_moves': result.get('optimal_moves', 0),
                'accuracy': result.get('accuracy', 0),
                'mistakes': len(result.get('mistakes', [])),
            })

    except Exception as e:
        # Best-effort batch: report the failure and move on to the next game
        print(f"Error analyzing {game_id}: {e}")
        continue

print(f"\nAnalyzed {len(tablebase_results)} endgames")

In [None]:
# Display results
# Builds `tb_df` (one row per analyzed game, or an empty frame) and prints
# aggregate accuracy figures.
print_subsection("TABLEBASE RESULTS")

if not tablebase_results:
    tb_df = pd.DataFrame()
    print("No tablebase positions found in analyzed games.")
else:
    tb_df = pd.DataFrame(tablebase_results)
    game_count = len(tb_df)

    print(f"Games with tablebase positions: {game_count}")

    if {'tablebase_moves', 'correct_moves'}.issubset(tb_df.columns):
        moves_total = tb_df['tablebase_moves'].sum()
        moves_correct = tb_df['correct_moves'].sum()
        if moves_total > 0:
            overall_ratio = moves_correct / moves_total
        else:
            overall_ratio = 0

        print(f"\nOverall tablebase accuracy:")
        print(f"  Total tablebase moves: {moves_total}")
        print(f"  Correct moves: {moves_correct}")
        print(f"  Accuracy: {overall_ratio:.1%}")

        # Perfect games: every counted move was a correct one
        flawless_mask = tb_df['correct_moves'] == tb_df['tablebase_moves']
        flawless_count = len(tb_df[flawless_mask])
        print(f"\nPerfect endgames: {flawless_count} / {game_count} ({flawless_count/game_count:.1%})")

    # Show games with mistakes
    if 'mistakes' in tb_df.columns:
        flawed_games = tb_df[tb_df['mistakes'] > 0]
        if not flawed_games.empty:
            print(f"\nGames with tablebase mistakes: {len(flawed_games)}")

In [None]:
# Generate consistency report
# Produces the `consistency` dict that is later saved as JSON.
print_subsection("TABLEBASE CONSISTENCY")

if not tablebase_results:
    consistency = {"username": username, "games_analyzed": 0}
else:
    # Column sums, falling back to a single zero when a column is absent
    tb_moves_sum = tb_df.get('tablebase_moves', pd.Series([0])).sum()
    correct_sum = tb_df.get('correct_moves', pd.Series([0])).sum()

    # Games in which every counted move was correct
    if 'correct_moves' in tb_df:
        perfect_mask = (tb_df.get('correct_moves', pd.Series()) ==
                        tb_df.get('tablebase_moves', pd.Series()))
        perfect_total = int(perfect_mask.sum())
    else:
        perfect_total = 0

    # Calculate consistency metrics
    consistency = {
        "username": username,
        "games_analyzed": len(tb_df),
        "total_tablebase_moves": int(tb_moves_sum),
        "total_correct": int(correct_sum),
        # max(1, ...) keeps the division defined when no moves were counted
        "overall_accuracy": float(correct_sum / max(1, tb_moves_sum)),
        "perfect_games": perfect_total,
    }

    print(f"Consistency Report:")
    for key, value in consistency.items():
        # Floats are ratios and are shown as percentages; everything else as-is
        shown = f"{value:.2%}" if isinstance(value, float) else f"{value}"
        print(f"  {key}: {shown}")

In [None]:
# Save outputs
# The per-game table is written only when it has rows; the consistency report
# is always written. Write order: parquet first, then JSON.
pending_outputs = []
if not tb_df.empty:
    pending_outputs.append(("tablebase_accuracy.parquet", tb_df))
pending_outputs.append(("tablebase_consistency.json", consistency))

for output_name, output_payload in pending_outputs:
    save_phase_output(username, "phase4c", output_name, output_payload)

print(f"\nPhase 4c complete!")

In [None]:
# Visualization
# Left panel: histogram of per-game tablebase accuracy.
# Right panel: mean accuracy grouped by the game's result.
import matplotlib.pyplot as plt

if not tb_df.empty and 'correct_moves' in tb_df.columns and 'tablebase_moves' in tb_df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Accuracy per game, recomputed from the move counts. Kept in a local
    # Series so the `accuracy` column already stored in `tb_df` is not
    # overwritten. Zero-move games divide by 1 and therefore score 0.
    game_accuracy = tb_df['correct_moves'] / tb_df['tablebase_moves'].replace(0, 1)
    mean_accuracy = game_accuracy.mean()
    axes[0].hist(game_accuracy, bins=10, color='steelblue', edgecolor='white')
    axes[0].axvline(mean_accuracy, color='red', linestyle='--', label=f"Mean: {mean_accuracy:.1%}")
    axes[0].set_xlabel('Tablebase Accuracy')
    axes[0].set_ylabel('Games')
    axes[0].set_title('Endgame Accuracy Distribution')
    axes[0].legend()

    # Accuracy by game result
    if 'player_result' in tb_df.columns:
        result_accuracy = game_accuracy.groupby(tb_df['player_result']).mean()

        if not result_accuracy.empty:
            # groupby sorts the result labels, so colours must be looked up by
            # label rather than assigned by position.
            # NOTE(review): assumes labels such as 'win' / 'draw' / 'loss';
            # anything else falls back to a neutral colour — confirm against
            # the Phase 1 schema.
            result_colors = {
                'win': 'green', 'won': 'green',
                'draw': 'gray',
                'loss': 'red', 'lose': 'red', 'lost': 'red',
            }
            bar_colors = [
                result_colors.get(str(label).lower(), 'steelblue')
                for label in result_accuracy.index
            ]
            result_accuracy.plot(kind='bar', ax=axes[1], color=bar_colors)
            axes[1].set_xlabel('Game Result')
            axes[1].set_ylabel('Average Tablebase Accuracy')
            axes[1].set_title('Accuracy by Game Result')
            axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=0)

    plt.tight_layout()
    plt.show()