In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load processed data
df = pd.read_csv('../data/processed/odds_processed.csv')

print("=== BIAS ANALYSIS ===\n")

# 1. HOME BIAS ANALYSIS
print("1. HOME ADVANTAGE BIAS")
print("-" * 50)

# Overall home win rate
actual_home_win_rate = df['Outcome_H'].mean()
print(f"Actual home win rate: {actual_home_win_rate:.2%}")

# Average implied probability for home wins
avg_implied_home_prob = df['B365_true_prob_H'].mean()
print(f"Average implied home win probability: {avg_implied_home_prob:.2%}")

# Bias = difference
home_bias = avg_implied_home_prob - actual_home_win_rate
print(f"Home bias: {home_bias:+.2%}")

if home_bias > 0:
    print("→ Market OVERVALUES home teams")
else:
    print("→ Market UNDERVALUES home teams")

print("\n")

# 2. FAVORITE BIAS ANALYSIS
print("2. FAVORITE BIAS")
print("-" * 50)

# Group by favorite strength
favorite_analysis = df.groupby('Match_Type').agg({
    'B365_true_prob_H': 'mean',
    'B365_true_prob_A': 'mean',
    'Outcome_H': 'mean',
    'Outcome_A': 'mean',
    'FTR': 'count'
}).round(3)

favorite_analysis.columns = ['Avg_Prob_H', 'Avg_Prob_A', 'Actual_H', 'Actual_A', 'Count']
print(favorite_analysis)

print("\n")

# 3. CALIBRATION ANALYSIS
print("3. PROBABILITY CALIBRATION")
print("-" * 50)

# Bin probabilities and check actual outcomes
def calibration_analysis(df, outcome_col, prob_col, n_bins=10):
    """
    Check if predicted probabilities match actual outcomes
    """
    df_temp = df[[prob_col, outcome_col]].dropna()
    
    # Create probability bins
    df_temp['prob_bin'] = pd.cut(df_temp[prob_col], bins=n_bins)
    
    calibration = df_temp.groupby('prob_bin', observed=True).agg({
        prob_col: ['mean', 'count'],  # Changed: now using list for multiple aggregations
        outcome_col: 'mean'
    }).reset_index()
    
    # Flatten multi-level columns
    calibration.columns = ['Bin', 'Predicted_Prob', 'Count', 'Actual_Rate']
    
    return calibration

# Analyze home wins
home_calibration = calibration_analysis(df, 'Outcome_H', 'B365_true_prob_H')
print("\nHome Win Calibration:")
print(home_calibration)

# Calculate calibration error
home_calibration['Error'] = home_calibration['Predicted_Prob'] - home_calibration['Actual_Rate']
avg_error = home_calibration['Error'].abs().mean()
print(f"\nAverage calibration error: {avg_error:.3f}")

=== BIAS ANALYSIS ===

1. HOME ADVANTAGE BIAS
--------------------------------------------------
Actual home win rate: 44.39%
Average implied home win probability: 43.98%
Home bias: -0.40%
→ Market UNDERVALUES home teams


2. FAVORITE BIAS
--------------------------------------------------
                Avg_Prob_H  Avg_Prob_A  Actual_H  Actual_A  Count
Match_Type                                                       
Close                0.363       0.354     0.353     0.350   1227
Heavy_Favorite       0.516       0.275     0.535     0.260   1992
Medium               0.390       0.339     0.379     0.329   1083


3. PROBABILITY CALIBRATION
--------------------------------------------------


ValueError: Length mismatch: Expected axis has 3 elements, new values have 4 elements