In [None]:
%pip install tqdm

In [None]:
import os
import sys
import json
import glob
from typing import Dict, List, Tuple, Any
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# ============================================================================
# Configuration
# ============================================================================

# Directory that evaluate_model searches for prediction files named "*_<model>.json".
PREDICTIONS_DIR = "/Volumes/main_dev/dld_ml_anticheat_test/anticheat_test_volume/pgc_wwcd/pgc_results/pgc2025_predictions/"
# Directory that evaluate_model searches for ground-truth CSVs whose filename contains the match UUID.
DATA_PATH = "/Volumes/main_dev/dld_ml_anticheat_test/anticheat_test_volume/pgc_wwcd/pgc_features/prod/"

# Phase-wise metrics are reported for phases 1..NUM_PHASES; predictions with any
# other phase value are left out of the phase table.
NUM_PHASES = 10
# Model names to evaluate; each name is the filename suffix used to find its prediction files.
MODELS = ["WeightedCox", "RuleBased_HP", "RuleBased_WhiteZone", "RuleBased_BlueZone"]

# Echo the active configuration so the cell output records what was evaluated.
print(f"Predictions directory: {PREDICTIONS_DIR}")
print(f"Ground truth data path: {DATA_PATH}")
print(f"Models to evaluate: {MODELS}")

In [None]:
# ============================================================================
# Data Loading Functions
# ============================================================================

def load_prediction_json(json_path: str) -> Dict[str, Any]:
    """
    Read one prediction file and return its decoded JSON content.

    Args:
        json_path: Path to a UTF-8 encoded JSON file

    Returns:
        The parsed JSON document
    """
    with open(json_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)


def load_ground_truth_csv(csv_path: str) -> pd.DataFrame:
    """
    Read a ground truth CSV file into a DataFrame.

    Args:
        csv_path: Path to the CSV file

    Returns:
        DataFrame with the file's contents, parsed with pandas defaults
    """
    return pd.read_csv(csv_path)


def match_predictions_with_truth(match_id: str, predictions: Dict, ground_truth: pd.DataFrame) -> List[Dict]:
    """
    Match predictions with ground truth data.

    For every predicted time point, the winning squad is looked up in the ground
    truth (rows with squad_win == 1 at the same time point, compared after
    rounding to 6 decimals). The predicted probabilities are restricted to the
    squads flagged alive and renormalized to sum to 1.0.

    A time point is dropped when:
      - the ground truth has no row for it,
      - the ground truth has no winner row for it, or
      - the winner is not among the alive squads that have a probability.

    Args:
        match_id: Match ID string
        predictions: Prediction data from JSON (format: {time_point_str: {...}});
            each value is read for the keys 'phase', 'probabilities' and 'is_alive'
        ground_truth: Ground truth DataFrame with 'time_point', 'squad_win' and
            'squad_number' columns

    Returns:
        List of matched records, each with the keys 'match_id', 'time_point',
        'phase', 'winner_squad', 'probabilities' (normalized, alive squads only),
        'is_alive' and 'num_alive'
    """
    matched_data = []

    if not predictions:
        return matched_data

    # Loop-invariant work, done once instead of once per predicted time point:
    # round the ground-truth time points and index the winner of each time point.
    rounded_time_points = ground_truth['time_point'].round(6)
    winner_mask = ground_truth['squad_win'] == 1
    winner_by_time = {}
    for gt_time, squad_number in zip(
        rounded_time_points[winner_mask],
        ground_truth.loc[winner_mask, 'squad_number'],
    ):
        # setdefault keeps the first winner row (in DataFrame order) per time point.
        winner_by_time.setdefault(gt_time, squad_number)

    for time_point_str, pred_data in predictions.items():
        time_point = float(time_point_str)

        # No ground-truth winner at this time point: nothing to score against.
        lookup_key = round(time_point, 6)
        if lookup_key not in winner_by_time:
            continue

        winner_squad = int(winner_by_time[lookup_key])
        phase = pred_data.get('phase', None)
        probabilities_raw = pred_data.get('probabilities', {})
        is_alive_raw = pred_data.get('is_alive', {})

        # JSON object keys are always strings, so convert squad numbers back to int.
        probabilities = {int(k): float(v) for k, v in probabilities_raw.items()}
        is_alive = {int(k): bool(v) for k, v in is_alive_raw.items()}

        # Keep only alive squads; a squad missing from is_alive counts as not alive.
        alive_probs = {k: v for k, v in probabilities.items() if is_alive.get(k, False)}
        total_prob = sum(alive_probs.values())

        if total_prob > 0:
            # Renormalize so the alive squads' probabilities sum to 1.0.
            probabilities_normalized = {k: v / total_prob for k, v in alive_probs.items()}
        elif alive_probs:
            # No usable probability mass: fall back to a uniform distribution over alive squads.
            uniform_prob = 1.0 / len(alive_probs)
            probabilities_normalized = {k: uniform_prob for k in alive_probs}
        else:
            probabilities_normalized = {}

        # Count alive squads (all squads flagged alive, with or without a probability).
        num_alive = sum(is_alive.values())

        # Skip if the winner has no normalized probability (data inconsistency).
        if winner_squad not in probabilities_normalized:
            continue

        matched_data.append({
            'match_id': match_id,
            'time_point': time_point,
            'phase': phase,
            'winner_squad': winner_squad,
            'probabilities': probabilities_normalized,
            'is_alive': is_alive,
            'num_alive': num_alive,
        })

    return matched_data


print("Data loading functions defined")

In [None]:
# ============================================================================
# Metric Calculation Functions
# ============================================================================

def compute_accuracy(probs_dict: Dict[int, float], winner_squad: int) -> float:
    """
    Score a single prediction as a hit or a miss.

    The predicted winner is the squad with the highest probability (the first
    such squad in dictionary order when several are tied).

    Args:
        probs_dict: Dictionary mapping squad_number (int) to probability (normalized)
        winner_squad: Actual winner squad number (int)

    Returns:
        1.0 if the predicted winner is the actual winner, 0.0 otherwise
        (including when probs_dict is empty)
    """
    if probs_dict:
        top_squad = max(probs_dict, key=probs_dict.get)
        return float(top_squad == winner_squad)
    return 0.0


def compute_log_loss(probs_dict: Dict[int, float], winner_squad: int, eps: float = 1e-15) -> float:
    """
    Compute the log loss (cross-entropy) of one prediction.

    Log Loss = -log(P(winner)), with P(winner) clipped to [eps, 1 - eps].

    Args:
        probs_dict: Dictionary mapping squad_number (int) to probability (normalized)
        winner_squad: Actual winner squad number (int)
        eps: Clipping margin that keeps the logarithm finite

    Returns:
        Log loss value, or infinity when the winner has no entry in probs_dict
    """
    if winner_squad in probs_dict:
        # Clipping bounds the loss from above (prob == 0) and away from zero (prob == 1).
        clipped = np.clip(probs_dict[winner_squad], eps, 1.0 - eps)
        return -np.log(clipped)

    # Winner not in probabilities (should not happen after filtering)
    return float('inf')


def compute_brier_score(probs_dict: Dict[int, float], winner_squad: int) -> float:
    """
    Compute the multi-class Brier score of one prediction.

    Brier Score = sum_i (p_i - y_i)^2
    where y_i = 1 if squad_i is the winner, 0 otherwise. The sum runs over the
    squads present in probs_dict only.

    Args:
        probs_dict: Dictionary mapping squad_number (int) to probability (normalized)
        winner_squad: Actual winner squad number (int)

    Returns:
        Brier score value (0.0 for an empty probs_dict)
    """
    squared_error_total = 0.0
    for squad in probs_dict:
        target = 1.0 if squad == winner_squad else 0.0
        squared_error_total += (probs_dict[squad] - target) ** 2
    return squared_error_total


def compute_ece(all_predictions: List[Dict], n_bins: int = 10) -> float:
    """
    Compute Expected Calibration Error (ECE) of the top prediction.

    Each prediction contributes its confidence (the highest probability) and
    whether that top squad was the actual winner. Confidences are grouped into
    n_bins equal-width bins (lower, upper] over [0, 1]; ECE is the
    sample-weighted mean of |accuracy - average confidence| across the bins.

    Args:
        all_predictions: List of prediction dictionaries with the keys
            'probabilities' and 'winner_squad'
        n_bins: Number of bins for calibration

    Returns:
        ECE value (0.0 when no prediction has any probabilities)
    """
    scored = []
    for record in all_predictions:
        squad_probs = record['probabilities']
        actual_winner = record['winner_squad']

        if squad_probs:
            top_squad = max(squad_probs, key=squad_probs.get)
            hit = 1.0 if top_squad == actual_winner else 0.0
            scored.append((squad_probs[top_squad], hit))

    if not scored:
        return 0.0

    confidences = np.array([confidence for confidence, _ in scored])
    correctness = np.array([hit for _, hit in scored])
    total_samples = len(scored)

    # Equal-width bin edges; each bin is open on the left and closed on the right.
    edges = np.linspace(0, 1, n_bins + 1)

    ece = 0.0
    for lower, upper in zip(edges[:-1], edges[1:]):
        members = (confidences > lower) & (confidences <= upper)
        bin_size = members.sum()
        if bin_size == 0:
            continue

        bin_accuracy = correctness[members].mean()
        bin_confidence = confidences[members].mean()

        # Weight the calibration gap by the share of samples in this bin.
        ece += (bin_size / total_samples) * abs(bin_accuracy - bin_confidence)

    return ece


print("Metric calculation functions defined")

In [None]:
# ============================================================================
# Top-K Accuracy Functions
# ============================================================================

def compute_topk_accuracy_all(all_predictions: List[Dict], k: int) -> Tuple[float, int]:
    """
    Compute winner-prediction accuracy over ALL time points where exactly K teams are alive.

    Every qualifying time point counts as one sample, so matches with more
    qualifying time points carry more weight.

    Args:
        all_predictions: List of prediction dictionaries
        k: Number of alive teams to filter for (e.g., 4 or 8)

    Returns:
        Tuple of (accuracy, num_samples); (0.0, 0) when nothing qualifies
    """
    hits = 0
    samples = 0

    for record in all_predictions:
        # Only time points with exactly K alive teams are scored.
        if record['num_alive'] == k:
            squad_probs = record['probabilities']
            actual_winner = record['winner_squad']

            if squad_probs:
                samples += 1
                if max(squad_probs, key=squad_probs.get) == actual_winner:
                    hits += 1

    if samples:
        return hits / samples, samples
    return 0.0, 0


def compute_topk_accuracy_first(all_predictions: List[Dict], k: int) -> Tuple[float, int]:
    """
    Compute winner-prediction accuracy on the FIRST time point per match with exactly K teams alive.

    For each match only the earliest qualifying time point is scored, so every
    match contributes at most one sample.

    Args:
        all_predictions: List of prediction dictionaries
        k: Number of alive teams to filter for (e.g., 4 or 8)

    Returns:
        Tuple of (accuracy, num_samples); (0.0, 0) when nothing qualifies
    """
    # match_id -> earliest prediction seen so far with exactly K alive teams
    earliest_by_match = {}

    for record in all_predictions:
        if record['num_alive'] == k:
            match_key = record['match_id']
            candidate_time = record['time_point']
            current = earliest_by_match.get(match_key)

            # On equal time points the record seen first is kept.
            if current is None or candidate_time < current['time_point']:
                earliest_by_match[match_key] = record

    hits = 0
    samples = 0

    for record in earliest_by_match.values():
        squad_probs = record['probabilities']
        actual_winner = record['winner_squad']

        if squad_probs:
            samples += 1
            if max(squad_probs, key=squad_probs.get) == actual_winner:
                hits += 1

    if samples:
        return hits / samples, samples
    return 0.0, 0


def compute_topk_accuracy_avg(all_predictions: List[Dict], k: int) -> Tuple[float, int]:
    """
    Compute winner-prediction accuracy as the AVERAGE of per-match accuracies.

    Accuracy is first computed inside each match over its time points with
    exactly K teams alive, then averaged across matches so each match counts once.

    Args:
        all_predictions: List of prediction dictionaries
        k: Number of alive teams to filter for (e.g., 4 or 8)

    Returns:
        Tuple of (accuracy, num_samples = num_matches); (0.0, 0) when nothing qualifies
    """
    # match_id -> qualifying predictions of that match
    by_match = {}
    for record in all_predictions:
        if record['num_alive'] == k:
            by_match.setdefault(record['match_id'], []).append(record)

    per_match_accuracy = []
    for records in by_match.values():
        hits = 0
        samples = 0

        for record in records:
            squad_probs = record['probabilities']
            actual_winner = record['winner_squad']

            if squad_probs:
                samples += 1
                if max(squad_probs, key=squad_probs.get) == actual_winner:
                    hits += 1

        # A match whose qualifying predictions are all empty contributes nothing.
        if samples > 0:
            per_match_accuracy.append(hits / samples)

    if not per_match_accuracy:
        return 0.0, 0

    return np.mean(per_match_accuracy), len(per_match_accuracy)


print("Top-K accuracy functions defined")

In [None]:
# ============================================================================
# Phase-wise Evaluation
# ============================================================================

def compute_phase_wise_metrics(all_predictions: List[Dict]) -> pd.DataFrame:
    """
    Compute Accuracy, Log-loss, ECE and Brier-score separately for each phase.

    Predictions are bucketed by their 'phase' value; predictions with no phase
    or a phase outside 1..NUM_PHASES are ignored. Accuracy, log loss and Brier
    score are the mean of the per-prediction values; ECE is computed once over
    all predictions of the phase. A phase without predictions gets NaN for
    every metric.

    Args:
        all_predictions: List of all matched prediction dictionaries

    Returns:
        DataFrame with metrics as rows ('accuracy', 'log_loss', 'ece',
        'brier_score') and phases as columns ('Phase1' .. 'Phase<NUM_PHASES>')
    """
    by_phase = defaultdict(list)
    for record in all_predictions:
        phase_id = record.get('phase')
        if phase_id is None:
            continue
        if 1 <= phase_id <= NUM_PHASES:
            by_phase[phase_id].append(record)

    metric_names = ('accuracy', 'log_loss', 'ece', 'brier_score')
    columns = {name: [] for name in metric_names}
    labels = []

    for phase_id in range(1, NUM_PHASES + 1):
        labels.append(f'Phase{phase_id}')
        records = by_phase.get(phase_id, [])

        if not records:
            # Keep the phase column but mark every metric as missing.
            for name in metric_names:
                columns[name].append(np.nan)
            continue

        columns['accuracy'].append(np.mean([
            compute_accuracy(record['probabilities'], record['winner_squad'])
            for record in records
        ]))

        columns['log_loss'].append(np.mean([
            compute_log_loss(record['probabilities'], record['winner_squad'])
            for record in records
        ]))

        # ECE is a property of the whole set, not an average of per-sample values.
        columns['ece'].append(compute_ece(records))

        columns['brier_score'].append(np.mean([
            compute_brier_score(record['probabilities'], record['winner_squad'])
            for record in records
        ]))

    # Built phase-by-row, then transposed so metrics are rows and phases are columns.
    return pd.DataFrame(columns, index=labels).T


print("Phase-wise evaluation function defined")


In [None]:
# ============================================================================
# Main Evaluation Loop
# ============================================================================

def evaluate_model(model_name: str) -> Dict[str, Any]:
    """
    Evaluate a single model on all of its prediction files.

    Prediction files are found as "*_<model_name>.json" in PREDICTIONS_DIR. For
    each file the ground truth CSV is located in DATA_PATH by the UUID at the
    end of the match ID. Prediction files without a ground truth CSV are skipped
    and the number of skipped files is reported.

    Args:
        model_name: Name of the model (e.g., 'WeightedCox')

    Returns:
        Dictionary with evaluation results: 'model_name', 'phase_metrics'
        (DataFrame), the top4_*/top8_* accuracies with their sample counts, and
        'total_predictions'. Returns None when no prediction files are found or
        no prediction could be matched with ground truth.
    """
    print(f"\nEvaluating {model_name}...")

    # Find all prediction JSON files for this model
    prediction_files = sorted(glob.glob(os.path.join(PREDICTIONS_DIR, f"*_{model_name}.json")))
    print(f"  Found {len(prediction_files)} prediction files")

    if len(prediction_files) == 0:
        print(f"  Warning: No prediction files found for {model_name}")
        return None

    all_matched_predictions = []
    skipped_without_ground_truth = 0

    # Process each prediction file
    for pred_file in tqdm(prediction_files, desc=f"  Processing {model_name}"):
        pred_data = load_prediction_json(pred_file)
        match_id = pred_data['match_id']
        predictions = pred_data['predictions']

        # Extract UUID from match_id (last part after the last dot)
        # match_id format: match.bro.custom.es_as-pgc25gs_01.steam.normal.as.2025.11.28.13.UUID
        # We need the UUID part to match with CSV filename: pgc_2025_UUID.csv
        uuid = match_id.split('.')[-1]

        # glob returns matches in arbitrary order; sort so that the chosen file
        # is the same on every run when more than one CSV contains the UUID.
        csv_files = sorted(glob.glob(os.path.join(DATA_PATH, f"*{uuid}*.csv")))

        if len(csv_files) == 0:
            # Count the skip so a missing ground truth file does not go unnoticed.
            skipped_without_ground_truth += 1
            continue

        ground_truth = load_ground_truth_csv(csv_files[0])

        matched = match_predictions_with_truth(match_id, predictions, ground_truth)
        all_matched_predictions.extend(matched)

    if skipped_without_ground_truth > 0:
        print(f"  Warning: Skipped {skipped_without_ground_truth} prediction files without a ground truth CSV")

    print(f"  Total matched predictions: {len(all_matched_predictions)}")

    if len(all_matched_predictions) == 0:
        print(f"  Warning: No matched predictions for {model_name}")
        return None

    evaluation = {
        'model_name': model_name,
        'phase_metrics': compute_phase_wise_metrics(all_matched_predictions),
    }

    # Accuracy with exactly K teams alive, aggregated three ways: over all time
    # points, over the first time point per match, and averaged per match.
    for k in (4, 8):
        all_acc, all_n = compute_topk_accuracy_all(all_matched_predictions, k=k)
        first_acc, first_n = compute_topk_accuracy_first(all_matched_predictions, k=k)
        avg_acc, avg_n = compute_topk_accuracy_avg(all_matched_predictions, k=k)

        evaluation[f'top{k}_all_accuracy'] = all_acc
        evaluation[f'top{k}_all_samples'] = all_n
        evaluation[f'top{k}_first_accuracy'] = first_acc
        evaluation[f'top{k}_first_samples'] = first_n
        evaluation[f'top{k}_avg_accuracy'] = avg_acc
        evaluation[f'top{k}_avg_matches'] = avg_n

    evaluation['total_predictions'] = len(all_matched_predictions)
    return evaluation


print("Main evaluation function defined")

In [None]:
# ============================================================================
# Run Evaluation
# ============================================================================
# Run evaluation on all models
# Maps model name -> result dictionary returned by evaluate_model.
results = {}

for model in MODELS:
    result = evaluate_model(model)
    # evaluate_model returns None when a model has no prediction files or no
    # matched predictions; such models are left out of every report below.
    if result is not None:
        results[model] = result

print(f"\n{'='*60}")
print(f"Evaluation complete for {len(results)} models")
print(f"{'='*60}")

# Display results for each model: total matched predictions, the phase-wise
# metric table, and the accuracy with exactly 4 / 8 teams alive.
for model_name, result in results.items():
    print(f"\n{'='*60}")
    print(f"Model: {model_name}")
    print(f"{'='*60}")
    print(f"\nTotal predictions: {result['total_predictions']}")
    
    print(f"\n{'-'*60}")
    print("Phase-wise Metrics:")
    print(f"{'-'*60}")
    print(result['phase_metrics'].round(4))
    
    # "All time points" counts individual time points, while "First per match"
    # and "Avg per match" count each match once, so the n values differ.
    print(f"\n{'-'*60}")
    print("Top-K Accuracy (3 Methods):")
    print(f"{'-'*60}")
    print("\nTop-4 Accuracy:")
    print(f"  All time points:  {result['top4_all_accuracy']:.4f} (n={result['top4_all_samples']} samples)")
    print(f"  First per match:  {result['top4_first_accuracy']:.4f} (n={result['top4_first_samples']} matches)")
    print(f"  Avg per match:    {result['top4_avg_accuracy']:.4f} (n={result['top4_avg_matches']} matches)")
    
    print("\nTop-8 Accuracy:")
    print(f"  All time points:  {result['top8_all_accuracy']:.4f} (n={result['top8_all_samples']} samples)")
    print(f"  First per match:  {result['top8_first_accuracy']:.4f} (n={result['top8_first_samples']} matches)")
    print(f"  Avg per match:    {result['top8_avg_accuracy']:.4f} (n={result['top8_avg_matches']} matches)")
    print()

# Model Comparison
# Each table below takes one metric row from every model's phase_metrics and
# places the models side by side: rows are phases, columns are models.
print("\n" + "="*60)
print("Phase-wise Accuracy Comparison")
print("="*60)
accuracy_comparison = pd.DataFrame({
    model_name: result['phase_metrics'].loc['accuracy']
    for model_name, result in results.items()
})
print(accuracy_comparison.round(4))

print("\n" + "="*60)
print("Phase-wise Log Loss Comparison")
print("="*60)
logloss_comparison = pd.DataFrame({
    model_name: result['phase_metrics'].loc['log_loss']
    for model_name, result in results.items()
})
print(logloss_comparison.round(4))

print("\n" + "="*60)
print("Phase-wise ECE Comparison")
print("="*60)
ece_comparison = pd.DataFrame({
    model_name: result['phase_metrics'].loc['ece']
    for model_name, result in results.items()
})
print(ece_comparison.round(4))

print("\n" + "="*60)
print("Phase-wise Brier Score Comparison")
print("="*60)
brier_comparison = pd.DataFrame({
    model_name: result['phase_metrics'].loc['brier_score']
    for model_name, result in results.items()
})
print(brier_comparison.round(4))

# Summary statistics
print("\n" + "="*60)
print("Overall Performance Summary (Averaged Across All Phases)")
print("="*60)

# Maps model name -> {summary column label: value}; becomes one row per model.
summary_stats = {}
for model_name, result in results.items():
    phase_metrics = result['phase_metrics']
    
    # The "Avg ..." values are unweighted means over the phase columns: every
    # phase counts equally regardless of how many predictions it contains, and
    # phases with NaN metrics are skipped by pandas' default mean().
    summary_stats[model_name] = {
        'Avg Accuracy': phase_metrics.loc['accuracy'].mean(),
        'Avg Log Loss': phase_metrics.loc['log_loss'].mean(),
        'Avg ECE': phase_metrics.loc['ece'].mean(),
        'Avg Brier Score': phase_metrics.loc['brier_score'].mean(),
        'Top-4 (First)': result['top4_first_accuracy'],
        'Top-4 (Avg)': result['top4_avg_accuracy'],
        'Top-8 (First)': result['top8_first_accuracy'],
        'Top-8 (Avg)': result['top8_avg_accuracy'],
    }

# Transpose so that models are rows and summary metrics are columns.
summary_df = pd.DataFrame(summary_stats).T
print(summary_df.round(4))

# Save results (optional)
# Writing is off by default; nothing is written to disk unless this flag is True.
SAVE_RESULTS = False  # Set to True to save

if SAVE_RESULTS:
    output_dir = "/Volumes/main_dev/dld_ml_anticheat_test/anticheat_test_volume/pgc_wwcd/pgc_results/pgc2025_evaluation/"
    os.makedirs(output_dir, exist_ok=True)
    
    # Save summary statistics
    # NOTE: output_dir already ends with "/", so the printed paths below show a
    # double slash; the files themselves are written via os.path.join.
    summary_df.to_csv(os.path.join(output_dir, "summary_statistics_fixed.csv"))
    print(f"\nSaved summary statistics to {output_dir}/summary_statistics_fixed.csv")
    
    # Save each model's phase-wise metrics
    for model_name, result in results.items():
        filename = f"phase_metrics_{model_name}_fixed.csv"
        result['phase_metrics'].to_csv(os.path.join(output_dir, filename))
        print(f"Saved {model_name} phase metrics to {output_dir}/{filename}")
    
    print("\nAll results saved successfully!")
else:
    print("\nSet SAVE_RESULTS = True to save results to disk")

