In [48]:
#imports
import pandas as pd
import pickle
from scipy.stats import poisson
import numpy as np

In [49]:
dict_table = pickle.load(open('./data/dict_table','rb'))
df_historical_data = pd.read_csv('./data/clean_copa_america_matches.csv')
df_fixture = pd.read_csv('./data/clean_copa_america_fixture.csv')


In [50]:
# Process historical data
df_home = df_historical_data[['HomeTeam', 'HomeGoals', 'AwayGoals']]
df_away = df_historical_data[['AwayTeam', 'HomeGoals', 'AwayGoals']]

df_home = df_home.rename(columns={'HomeTeam': 'Team', 'HomeGoals': 'GoalsScored', 'AwayGoals': 'GoalsConceded'})
df_away = df_away.rename(columns={'AwayTeam': 'Team', 'HomeGoals': 'GoalsConceded', 'AwayGoals': 'GoalsScored'})

df_team_strength = pd.concat([df_home, df_away], ignore_index=True).groupby(['Team']).mean()

# Load team metrics from the Excel file
metrics_df = pd.read_excel('./data/team_metrics.xlsx')
metrics_df = metrics_df.set_index('Team')

# Ensure the columns have the correct suffixes after merging
df_team_strength = df_team_strength.add_suffix('_historical')
df_team_strength.loc['Canada'] = {'GoalsScored_historical': 0.7, 'GoalsConceded_historical': 1.2}

metrics_df = metrics_df.add_suffix('_recent')

# Merge the historical data with the recent metrics data
df_team_strength = df_team_strength.merge(metrics_df, left_index=True, right_index=True, how='left')

# Load overall metrics and merge them
ovrl_metrics_df = pd.read_excel('./data/ovrl_team_ratings.xlsx')
ovrl_metrics_df = ovrl_metrics_df.set_index('Team')
df_team_strength = df_team_strength.merge(ovrl_metrics_df, left_index=True, right_index=True, how='left')

df_team_strength.dropna(inplace=True)

df_team_strength


Unnamed: 0_level_0,GoalsScored_historical,GoalsConceded_historical,Points Per Game_recent,Win Rate_recent,Goals Scored_recent,Goals Conceded_recent,Goal Difference_recent,OvrlScore
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Argentina,1.797753,0.842697,1.5,0.5,2.0,0.5,1.5,83.0
Bolivia,0.75,1.734375,2.625,0.875,0.875,1.5,-0.625,66.0
Brazil,1.969388,0.744898,1.125,0.25,1.375,1.375,0.0,80.0
Chile,1.482353,1.141176,1.875,0.5,1.25,0.75,0.5,75.0
Colombia,1.184783,0.869565,1.25,0.375,2.125,0.625,1.5,78.0
Costa Rica,1.0,1.823529,1.5,0.5,2.125,1.5,0.625,74.0
Ecuador,1.153846,1.646154,2.0,0.625,1.375,0.875,0.5,73.0
Jamaica,0.0,1.5,1.0,0.25,1.5,1.0,0.5,70.0
Mexico,1.375,1.291667,2.0,0.625,1.875,1.375,0.5,77.0
Panama,1.333333,3.333333,1.625,0.5,2.0,0.75,1.25,70.0


In [51]:
import numpy as np
import pandas as pd

# Monte Carlo simulation for match outcomes
def montecarlo_simulate_match(home, away, num_simulations=10000):
    if home in df_team_strength.index and away in df_team_strength.index:
        # Get the overall scores
        home_ovrl_score = df_team_strength.at[home, 'OvrlScore']
        away_ovrl_score = df_team_strength.at[away, 'OvrlScore']

        # Calculate expected goals using the overall scores
        lamb_home = min(np.exp(home_ovrl_score), 10)
        lamb_away = min(np.exp(away_ovrl_score), 10)

        # Validate the lambda values
        if lamb_home <= 0 or np.isnan(lamb_home):
            lamb_home = 0.01  # Set to a small positive value
        if lamb_away <= 0 or np.isnan(lamb_away):
            lamb_away = 0.01  # Set to a small positive value

        home_wins = 0
        away_wins = 0
        draws = 0

        for _ in range(num_simulations):
            home_goals = np.random.poisson(lamb_home)
            away_goals = np.random.poisson(lamb_away)

            if home_goals > away_goals:
                home_wins += 1
            elif home_goals < away_goals:
                away_wins += 1
            else:
                draws += 1

        total_matches = home_wins + away_wins + draws
        prob_home = home_wins / total_matches
        prob_away = away_wins / total_matches
        prob_draw = draws / total_matches

        # Normalize probabilities to ensure they sum to 1
        total_prob = prob_home + prob_away + prob_draw
        prob_home /= total_prob
        prob_away /= total_prob
        prob_draw /= total_prob

        if not np.isclose(prob_home + prob_away + prob_draw, 1.0):
            print(f"ERROR: Probabilities do not sum to 1 for {home} vs {away}")
            print(f"Probabilities: home={prob_home}, away={prob_away}, draw={prob_draw}")

        return prob_home, prob_away, prob_draw
    else:
        print(f"ERROR: One or both teams not found in df_team_strength: {home}, {away}")
        return 0, 0, 0

# Example usage
# Make sure df_team_strength is defined with your data
home_team = 'Brazil'
away_team = 'Argentina'
prob_home, prob_away, prob_draw = montecarlo_simulate_match(home_team, away_team)
print(f"Probability of {home_team} winning: {prob_home}")
print(f"Probability of {away_team} winning: {prob_away}")
print(f"Probability of a draw: {prob_draw}")


Probability of Brazil winning: 0.4514
Probability of Argentina winning: 0.4618
Probability of a draw: 0.0868


In [52]:
# Example usage
home_team = 'Brazil'
away_team = 'Argentina'
prob_home, prob_away, prob_draw = montecarlo_simulate_match(home_team, away_team)
print(f"Probability of {home_team} winning: {prob_home}")
print(f"Probability of {away_team} winning: {prob_away}")
print(f"Probability of a draw: {prob_draw}")

Probability of Brazil winning: 0.45380000000000004
Probability of Argentina winning: 0.45630000000000004
Probability of a draw: 0.08990000000000001


In [53]:
# Function to determine winners of knockout matches
def get_winner(df_fixture_updated, num_simulations=1):
    for index, row in df_fixture_updated.iterrows():
        home, away = row['home'], row['away']
        prob_home, prob_away, prob_draw = montecarlo_simulate_match(home, away, num_simulations)
        
        # Determine winner based on highest probability
        if prob_home > prob_away and prob_home > prob_draw:
            winner = home
        elif prob_away > prob_home and prob_away > prob_draw:
            winner = away
        else:
            # In case of a draw, randomly select the winner
            winner = np.random.choice([home, away])
        
        df_fixture_updated.loc[index, 'winner'] = winner
        df_fixture_updated
    return df_fixture_updated

In [54]:
def update_table(df_fixture_round_1, df_fixture_round_2, losers=None):
    if losers is None:
        losers = {}
    for index, row in df_fixture_round_1.iterrows():
        winner = df_fixture_round_1.loc[index, 'winner']
        match = df_fixture_round_1.loc[index, 'score']
        home = df_fixture_round_1.loc[index, 'home']
        away = df_fixture_round_1.loc[index, 'away']
        
        # Determine the loser
        if winner == home:
            loser = away
        else:
            loser = home
        
        # Update the fixtures with winners and track losers
        df_fixture_round_2.replace({f'Winner {match}': winner, f'Loser {match}': loser}, inplace=True)
        losers[match] = loser
        
        # print(f"Match {match}: Winner = {winner}, Loser = {loser}")
        
    df_fixture_round_2['winner'] = '?'
    return df_fixture_round_2, losers


In [55]:
# df_team_strength.loc['Canada'] = {'GoalsScored_historical': 0.7, 'GoalsConceded_historical': 1.2}


In [56]:
df_team_strength

Unnamed: 0_level_0,GoalsScored_historical,GoalsConceded_historical,Points Per Game_recent,Win Rate_recent,Goals Scored_recent,Goals Conceded_recent,Goal Difference_recent,OvrlScore
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Argentina,1.797753,0.842697,1.5,0.5,2.0,0.5,1.5,83.0
Bolivia,0.75,1.734375,2.625,0.875,0.875,1.5,-0.625,66.0
Brazil,1.969388,0.744898,1.125,0.25,1.375,1.375,0.0,80.0
Chile,1.482353,1.141176,1.875,0.5,1.25,0.75,0.5,75.0
Colombia,1.184783,0.869565,1.25,0.375,2.125,0.625,1.5,78.0
Costa Rica,1.0,1.823529,1.5,0.5,2.125,1.5,0.625,74.0
Ecuador,1.153846,1.646154,2.0,0.625,1.375,0.875,0.5,73.0
Jamaica,0.0,1.5,1.0,0.25,1.5,1.0,0.5,70.0
Mexico,1.375,1.291667,2.0,0.625,1.875,1.375,0.5,77.0
Panama,1.333333,3.333333,1.625,0.5,2.0,0.75,1.25,70.0


In [57]:
def simulate_tournament(num_simulations):
    winners = []
    for sim_num in range(num_simulations):
        print(f"Starting simulation {sim_num + 1}")
        
        # Reset fixture stages
        df_fixture_group_22_copy = df_fixture[:24].copy()
        df_fixture_quarter_copy = df_fixture[24:28].copy()
        df_fixture_semi_copy = df_fixture[28:30].copy()
        df_fixture_final_copy = df_fixture[30:].copy()
        
        # Create a deep copy of dict_table to reset points for each simulation
        dict_table_copy = {group: df.copy() for group, df in dict_table.items()}
        
        # Process group stage
        for group in dict_table_copy:
            teams_in_group = dict_table_copy[group]['Team'].values
            df_fixture_group_6 = df_fixture_group_22_copy[df_fixture_group_22_copy['home'].isin(teams_in_group)]
            for index, row in df_fixture_group_6.iterrows():
                home, away = row['home'], row['away']
                result = montecarlo_simulate_match(home, away)
                if result is None:
                    print(f"Skipping match {home} vs {away} due to missing team strength")
                    continue
                
                prob_home, prob_away, prob_draw = result
                
                if not np.isclose(prob_home + prob_away + prob_draw, 1.0):
                    print(f"ERROR: Probabilities do not sum to 1 before choice for {home} vs {away}")
                    print(f"Probabilities: home={prob_home}, away={prob_away}, draw={prob_draw}")
                
                 # Determine the match outcome based on highest probability
                if prob_home > prob_away and prob_home > prob_draw:
                    dict_table_copy[group].loc[dict_table_copy[group]['Team'] == home, 'Pts'] += 3
                    print(f"Match {home} vs {away}: {home} wins with probability {prob_home:.2f}")
                elif prob_away > prob_home and prob_away > prob_draw:
                    dict_table_copy[group].loc[dict_table_copy[group]['Team'] == away, 'Pts'] += 3
                    print(f"Match {home} vs {away}: {away} wins with probability {prob_away:.2f}")
                else:
                    dict_table_copy[group].loc[dict_table_copy[group]['Team'] == home, 'Pts'] += 1
                    dict_table_copy[group].loc[dict_table_copy[group]['Team'] == away, 'Pts'] += 1
                    print(f"Match {home} vs {away}: Draw with probability {prob_draw:.2f}")

            dict_table_copy[group] = dict_table_copy[group].sort_values('Pts', ascending=False).reset_index(drop=True)
            dict_table_copy[group] = dict_table_copy[group][['Team', 'Pts']]
            dict_table_copy[group] = dict_table_copy[group].round(0)
        
        # Process knockout stages
        losers = {}
        for group in dict_table_copy:
            group_winner = dict_table_copy[group].loc[0, 'Team']
            runners_up = dict_table_copy[group].loc[1, 'Team']
            df_fixture_quarter_copy.replace({f'Winner {group}': group_winner, f'Runner-up {group}': runners_up}, inplace=True)
        
        df_fixture_quarter_copy['winner'] = '?'
        df_fixture_quarter_copy = get_winner(df_fixture_quarter_copy)
        print(f"Quarter-final results: {df_fixture_quarter_copy}")
        
        df_fixture_semi_copy, losers = update_table(df_fixture_quarter_copy, df_fixture_semi_copy, losers)
        df_fixture_semi_copy = get_winner(df_fixture_semi_copy)
        print(f"Semi-final results: {df_fixture_semi_copy}")
        
        df_fixture_final_copy, losers = update_table(df_fixture_semi_copy, df_fixture_final_copy, losers)
        df_fixture_final_copy = get_winner(df_fixture_final_copy)
        print(f"Final results: {df_fixture_final_copy}")
        
        # Reset index to ensure we can access the first row
        df_fixture_final_copy.reset_index(drop=True, inplace=True)
        
        if not df_fixture_final_copy.empty:
            winner = df_fixture_final_copy.loc[0, 'winner']
            print(f"Simulation {sim_num + 1} winner: {winner}")
            winners.append(winner)
        else:
            print("ERROR: df_fixture_final_copy is empty. Skipping this simulation.")
    
    winner_counts = pd.Series(winners).value_counts(normalize=True)
    return winner_counts


In [59]:
# Run the tournament simulation
tournament_results = simulate_tournament(num_simulations=1)
print(tournament_results)

Starting simulation 1
Match Argentina vs Canada: Canada wins with probability 0.47
Match Peru vs Chile: Peru wins with probability 0.46
Match Peru vs Canada: Canada wins with probability 0.46
Match Chile vs Argentina: Chile wins with probability 0.46
Match Argentina vs Peru: Argentina wins with probability 0.46
Match Canada vs Chile: Chile wins with probability 0.46
Match Ecuador vs Venezuela: Ecuador wins with probability 0.46
Match Mexico vs Jamaica: Mexico wins with probability 0.46
Match Ecuador vs Jamaica: Ecuador wins with probability 0.46
Match Venezuela vs Mexico: Venezuela wins with probability 0.46
Match Mexico vs Ecuador: Mexico wins with probability 0.46
Match Jamaica vs Venezuela: Venezuela wins with probability 0.46
Match Uruguay vs Panama: Uruguay wins with probability 0.46
Match Panama vs United States: United States wins with probability 0.46
Match Uruguay vs Bolivia: Draw with probability 0.09
Match Bolivia vs Panama: Bolivia wins with probability 0.46
Match Colombia 