In [1]:
#imports
import pandas as pd
import pickle
from scipy.stats import poisson
import numpy as np

## Load Data

In [2]:
dict_table = pickle.load(open('./data/dict_table', 'rb'))
df_historical_data = pd.read_csv('./data/clean_copa_america_matches.csv')
df_fixture = pd.read_csv('./data/clean_copa_america_fixture.csv')
metrics_df = pd.read_excel('./data/team_metrics.xlsx').set_index('Team')
ovrl_metrics_df = pd.read_excel('./data/ovrl_team_ratings.xlsx').set_index('Team')

In [3]:
# Process historical data
df_home = df_historical_data[['HomeTeam', 'HomeGoals', 'AwayGoals']]
df_away = df_historical_data[['AwayTeam', 'HomeGoals', 'AwayGoals']]

df_home = df_home.rename(columns={'HomeTeam': 'Team', 'HomeGoals': 'GoalsScored', 'AwayGoals': 'GoalsConceded'})
df_away = df_away.rename(columns={'AwayTeam': 'Team', 'HomeGoals': 'GoalsConceded', 'AwayGoals': 'GoalsScored'})

df_team_strength = pd.concat([df_home, df_away], ignore_index=True).groupby(['Team']).mean()
df_team_strength = df_team_strength.add_suffix('_historical')
df_team_strength.loc['Canada'] = {'GoalsScored_historical': 0.7, 'GoalsConceded_historical': 1.2}

metrics_df = metrics_df.add_suffix('_recent')
df_team_strength = df_team_strength.merge(metrics_df, left_index=True, right_index=True, how='left')
df_team_strength = df_team_strength.merge(ovrl_metrics_df, left_index=True, right_index=True, how='left')
df_team_strength.dropna(inplace=True)

In [4]:
# Normalize team names
def normalize_team_name(name):
    return name.replace("(H)", "").strip()

df_team_strength.index = df_team_strength.index.map(normalize_team_name)
for group in dict_table:
    dict_table[group]['Team'] = dict_table[group]['Team'].map(normalize_team_name)
df_fixture['home'] = df_fixture['home'].map(normalize_team_name)
df_fixture['away'] = df_fixture['away'].map(normalize_team_name)

In [5]:
# Normalize the overall score
def normalize_score(score, max_score=100, target_range=10):
    return (score / max_score) * target_range

# Function to predict points
def predict_points(home, away):
    if home in df_team_strength.index and away in df_team_strength.index:
        try:
            # Extract historical and recent metrics
            goals_scored_home_historical = df_team_strength.at[home, 'GoalsScored_historical']
            goals_conceded_home_historical = df_team_strength.at[home, 'GoalsConceded_historical']
            goals_scored_away_historical = df_team_strength.at[away, 'GoalsScored_historical']
            goals_conceded_away_historical = df_team_strength.at[away, 'GoalsConceded_historical']

            goals_scored_home_recent = df_team_strength.at[home, 'Goals Scored_recent']
            goals_conceded_home_recent = df_team_strength.at[home, 'Goals Conceded_recent']
            goals_scored_away_recent = df_team_strength.at[away, 'Goals Scored_recent']
            goals_conceded_away_recent = df_team_strength.at[away, 'Goals Conceded_recent']

            ovrl_score_home = normalize_score(df_team_strength.at[home, 'OvrlScore'])
            ovrl_score_away = normalize_score(df_team_strength.at[away, 'OvrlScore'])
        except KeyError as e:
            print(f"Missing data for team: {e}")
            return (None, None)
        except ValueError as e:
            print(f"Invalid data for team: {e}")
            return (None, None)

        # Check for NaN values
        if any(pd.isnull([goals_scored_home_historical, goals_conceded_home_historical, 
                          goals_scored_away_historical, goals_conceded_away_historical,
                          goals_scored_home_recent, goals_conceded_home_recent, 
                          goals_scored_away_recent, goals_conceded_away_recent, 
                          ovrl_score_home, ovrl_score_away])):
            print(f"NaN values found for teams: {home} or {away}")
            return (None, None)

        # Define weighting factors for historical and recent metrics
        historical_weight = 0.15
        recent_weight = 0.25
        ovrl_weight = 0.6

        # Calculate lambda values incorporating historical, recent, and overall score
        lamb_home = (historical_weight * goals_scored_home_historical + 
                     recent_weight * goals_scored_home_recent + 
                     ovrl_weight * ovrl_score_home) * \
                    (historical_weight * goals_conceded_away_historical + 
                     recent_weight * goals_conceded_away_recent)

        lamb_away = (historical_weight * goals_scored_away_historical + 
                     recent_weight * goals_scored_away_recent + 
                     ovrl_weight * ovrl_score_away) * \
                    (historical_weight * goals_conceded_home_historical + 
                     recent_weight * goals_conceded_home_recent)

        # Ensure lambdas are non-negative
        lamb_home = max(0, lamb_home)
        lamb_away = max(0, lamb_away)

        # Initialize probabilities
        prob_home, prob_away, prob_draw = 0, 0, 0
        
        # Calculate probabilities of different match outcomes
        for x in range(0, 11):  # Number of goals home team
            for y in range(0, 11):  # Number of goals away team
                p = poisson.pmf(x, lamb_home) * poisson.pmf(y, lamb_away)
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_home += p
                else:
                    prob_away += p

        # Normalize probabilities to sum to 1
        total_prob = prob_home + prob_draw + prob_away
        prob_home /= total_prob
        prob_draw /= total_prob
        prob_away /= total_prob

        # Randomly determine match outcome based on probabilities
        outcome = np.random.choice(['home', 'draw', 'away'], p=[prob_home, prob_draw, prob_away])
        
        if outcome == 'home':
            return 3, 0
        elif outcome == 'away':
            return 0, 3
        else:
            return 1, 1
    else:
        print(f"Teams not found in index: {home} or {away}")
        return (None, None)


In [6]:
# Function to handle group stage matches
def handle_group_stage(df_fixture_group, dict_table, group):
    teams_in_group = dict_table[group]['Team'].values
    df_fixture_group_6 = df_fixture_group[df_fixture_group['home'].isin(teams_in_group) | df_fixture_group['away'].isin(teams_in_group)].copy()
    for index, row in df_fixture_group_6.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        if points_home is not None and points_away is not None:
            dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += points_home
            dict_table[group].loc[dict_table[group]['Team'] == away, 'Pts'] += points_away
    dict_table[group] = dict_table[group].sort_values('Pts', ascending=False).reset_index(drop=True)
    dict_table[group] = dict_table[group][['Team', 'Pts']]
    dict_table[group] = dict_table[group].round(0)

In [7]:
# Function to get the winner of knockout matches
def get_winner(df_fixture_updated):
    for index, row in df_fixture_updated.iterrows():
        home, away = row['home'], row['away']
        if home not in df_team_strength.index or away not in df_team_strength.index:
            print(f"Teams not found in index: {home} or {away}")
            df_fixture_updated.loc[index, 'winner'] = 'Unknown'
            df_fixture_updated.loc[index, 'loser'] = 'Unknown'
            continue
        points_home, points_away = predict_points(home, away)
        if points_home > points_away:
            winner = home
            loser = away
        else:
            winner = away
            loser = home
        df_fixture_updated.loc[index, 'winner'] = winner
        df_fixture_updated.loc[index, 'loser'] = loser
    return df_fixture_updated


In [8]:
# Function to update the fixture table
def update_table(df_fixture_round_1, df_fixture_round_2):
    for index, row in df_fixture_round_1.iterrows():
        winner = df_fixture_round_1.loc[index, 'winner']
        loser = df_fixture_round_1.loc[index, 'loser']
        match = df_fixture_round_1.loc[index, 'score']
        df_fixture_round_2.replace({f'Winner {match}': winner, f'Loser {match}': loser}, inplace=True)
    df_fixture_round_2.loc[:, 'winner'] = '?'
    df_fixture_round_2.loc[:, 'loser'] = '?'
    return df_fixture_round_2

In [9]:
# Function to simulate the entire tournament
def simulate_tournament():
    # Create copies of the fixtures for the simulation
    df_fixture_group = df_fixture[:24].copy()
    df_fixture_quarter = df_fixture[24:28].copy()
    df_fixture_semi = df_fixture[28:30].copy()
    df_fixture_final = df_fixture[30:].copy()
    
    # Reset the points for all teams in each group
    for group in dict_table:
        dict_table[group]['Pts'] = 0

    # Handle group stage matches
    for group in dict_table:
        handle_group_stage(df_fixture_group, dict_table, group)

    # Determine group winners and runners-up
    for group in dict_table:
        group_winner = dict_table[group].loc[0, 'Team']
        runners_up = dict_table[group].loc[1, 'Team']
        df_fixture_quarter.replace({f'Winner {group}': group_winner, f'Runner-up {group}': runners_up}, inplace=True)

    # Initialize the 'winner' and 'loser' columns
    df_fixture_quarter.loc[:, 'winner'] = '?'
    df_fixture_quarter.loc[:, 'loser'] = '?'
    df_fixture_semi.loc[:, 'winner'] = '?'
    df_fixture_semi.loc[:, 'loser'] = '?'
    df_fixture_final.loc[:, 'winner'] = '?'
    df_fixture_final.loc[:, 'loser'] = '?'

    # Get the winners for quarter-finals, semi-finals, and finals
    get_winner(df_fixture_quarter)
    update_table(df_fixture_quarter, df_fixture_semi)
    get_winner(df_fixture_semi)
    update_table(df_fixture_semi, df_fixture_final)
    final_winner = get_winner(df_fixture_final).loc[df_fixture_final.index[0], 'winner']
    
    return final_winner

In [14]:
# Run multiple simulations
num_simulations = 10000
winners = []

for _ in range(num_simulations):
    winners.append(simulate_tournament())

# Calculate the probability of each team winning the tournament
winner_counts = pd.Series(winners).value_counts()
probabilities = winner_counts / num_simulations

print("Probabilities of each team winning the tournament:")
print(probabilities)

Probabilities of each team winning the tournament:
Uruguay          0.1370
Colombia         0.1295
Argentina        0.1292
Paraguay         0.1065
Peru             0.0852
Brazil           0.0829
United States    0.0828
Chile            0.0820
Ecuador          0.0383
Mexico           0.0334
Jamaica          0.0295
Venezuela        0.0213
Panama           0.0165
Costa Rica       0.0127
Bolivia          0.0109
Canada           0.0023
Name: count, dtype: float64
