In [1]:
#imports
import pandas as pd
import pickle
from scipy.stats import poisson

In [2]:
dict_table = pickle.load(open('./data/dict_table','rb'))
df_historical_data = pd.read_csv('./data/clean_copa_america_matches.csv')
df_fixture = pd.read_csv('./data/clean_copa_america_fixture.csv')

In [3]:
# Process historical data
df_home = df_historical_data[['HomeTeam', 'HomeGoals', 'AwayGoals']]
df_away = df_historical_data[['AwayTeam', 'HomeGoals', 'AwayGoals']]

df_home = df_home.rename(columns={'HomeTeam': 'Team', 'HomeGoals': 'GoalsScored', 'AwayGoals': 'GoalsConceded'})
df_away = df_away.rename(columns={'AwayTeam': 'Team', 'HomeGoals': 'GoalsConceded', 'AwayGoals': 'GoalsScored'})

df_team_strength = pd.concat([df_home, df_away], ignore_index=True).groupby(['Team']).mean()

# Load team metrics from the Excel file
metrics_df = pd.read_excel('./data/team_metrics.xlsx')
metrics_df = metrics_df.set_index('Team')

# Ensure the columns have the correct suffixes after merging
df_team_strength = df_team_strength.add_suffix('_historical')
df_team_strength.loc['Canada'] = {'GoalsScored_historical': 0.7, 'GoalsConceded_historical': 1.2}

metrics_df = metrics_df.add_suffix('_recent')

# Merge the historical data with the recent metrics data
df_team_strength = df_team_strength.merge(metrics_df, left_index=True, right_index=True, how='left')

# Load overall metrics and merge them
ovrl_metrics_df = pd.read_excel('./data/ovrl_team_ratings.xlsx')
ovrl_metrics_df = ovrl_metrics_df.set_index('Team')
df_team_strength = df_team_strength.merge(ovrl_metrics_df, left_index=True, right_index=True, how='left')

df_team_strength.dropna(inplace=True)

df_team_strength


Unnamed: 0_level_0,GoalsScored_historical,GoalsConceded_historical,Points Per Game_recent,Win Rate_recent,Goals Scored_recent,Goals Conceded_recent,Goal Difference_recent,OvrlScore
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Argentina,1.797753,0.842697,1.5,0.5,2.0,0.5,1.5,83.0
Bolivia,0.75,1.734375,2.625,0.875,0.875,1.5,-0.625,66.0
Brazil,1.969388,0.744898,1.125,0.25,1.375,1.375,0.0,80.0
Chile,1.482353,1.141176,1.875,0.5,1.25,0.75,0.5,75.0
Colombia,1.184783,0.869565,1.25,0.375,2.125,0.625,1.5,78.0
Costa Rica,1.0,1.823529,1.5,0.5,2.125,1.5,0.625,74.0
Ecuador,1.153846,1.646154,2.0,0.625,1.375,0.875,0.5,73.0
Jamaica,0.0,1.5,1.0,0.25,1.5,1.0,0.5,70.0
Mexico,1.375,1.291667,2.0,0.625,1.875,1.375,0.5,77.0
Panama,1.333333,3.333333,1.625,0.5,2.0,0.75,1.25,70.0


## Predicting Points

In [4]:
import numpy as np

def predict_points(home, away):
    if home in df_team_strength.index and away in df_team_strength.index:
        try:
            # Extract historical and recent metrics
            goals_scored_home_historical = df_team_strength.at[home, 'GoalsScored_historical']
            goals_conceded_home_historical = df_team_strength.at[home, 'GoalsConceded_historical']
            goals_scored_away_historical = df_team_strength.at[away, 'GoalsScored_historical']
            goals_conceded_away_historical = df_team_strength.at[away, 'GoalsConceded_historical']

            goals_scored_home_recent = df_team_strength.at[home, 'Goals Scored_recent']
            goals_conceded_home_recent = df_team_strength.at[home, 'Goals Conceded_recent']
            goals_scored_away_recent = df_team_strength.at[away, 'Goals Scored_recent']
            goals_conceded_away_recent = df_team_strength.at[away, 'Goals Conceded_recent']

            ovrl_score_home = df_team_strength.at[home, 'OvrlScore']
            ovrl_score_away = df_team_strength.at[away, 'OvrlScore']
        except KeyError as e:
            print(f"Missing data for team: {e}")
            return (None, None)
        except ValueError as e:
            print(f"Invalid data for team: {e}")
            return (None, None)

        # Check for NaN values
        if any(pd.isnull([goals_scored_home_historical, goals_conceded_home_historical, 
                          goals_scored_away_historical, goals_conceded_away_historical,
                          goals_scored_home_recent, goals_conceded_home_recent, 
                          goals_scored_away_recent, goals_conceded_away_recent, 
                          ovrl_score_home, ovrl_score_away])):
            print(f"NaN values found for teams: {home} or {away}")
            return (None, None)

        # Define weighting factors for historical and recent metrics
        historical_weight = 0.5
        recent_weight = 0.5
        ovrl_weight = 0.2  # Adjust as necessary

        # Calculate lambda values incorporating historical, recent, and overall score
        lamb_home = (historical_weight * goals_scored_home_historical + 
                     recent_weight * goals_scored_home_recent) * \
                    (historical_weight * goals_conceded_away_historical + 
                     recent_weight * goals_conceded_away_recent) * \
                    (1 + ovrl_weight * max(0, ovrl_score_home - ovrl_score_away))

        lamb_away = (historical_weight * goals_scored_away_historical + 
                     recent_weight * goals_scored_away_recent) * \
                    (historical_weight * goals_conceded_home_historical + 
                     recent_weight * goals_conceded_home_recent) * \
                    (1 + ovrl_weight * max(0, ovrl_score_away - ovrl_score_home))

        # Ensure lambdas are non-negative
        lamb_home = max(0, lamb_home)
        lamb_away = max(0, lamb_away)

        # Debug print statements
        print(f"lamb_home for {home} vs {away}: {lamb_home}")
        print(f"lamb_away for {home} vs {away}: {lamb_away}")

        # Initialize probabilities
        prob_home, prob_away, prob_draw = 0, 0, 0
        
        # Calculate probabilities of different match outcomes
        for x in range(0, 11):  # Number of goals home team
            for y in range(0, 11):  # Number of goals away team
                p = poisson.pmf(x, lamb_home) * poisson.pmf(y, lamb_away)
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_home += p
                else:
                    prob_away += p

        # Normalize probabilities to sum to 1
        total_prob = prob_home + prob_draw + prob_away
        prob_home /= total_prob
        prob_draw /= total_prob
        prob_away /= total_prob

        # Debug print statements
        print(f"prob_home: {prob_home}, prob_draw: {prob_draw}, prob_away: {prob_away}")

        # Randomly determine match outcome based on probabilities
        outcome = np.random.choice(['home', 'draw', 'away'], p=[prob_home, prob_draw, prob_away])
        
        if outcome == 'home':
            return 3, 0
        elif outcome == 'away':
            return 0, 3
        else:
            return 1, 1
    else:
        print(f"Teams not found in index: {home} or {away}")
        return (None, None)



In [5]:
print(predict_points('Colombia', 'United States',))
print(predict_points('Argentina', 'Mexico'))
print(predict_points('Brazil', 'Ecuador'))

lamb_home for Colombia vs United States: 3.0247735507246376
lamb_away for Colombia vs United States: 1.3077445652173911
prob_home: 0.719550210358217, prob_draw: 0.14566690477986505, prob_away: 0.13478288486191797
(3, 0)
lamb_home for Argentina vs Mexico: 5.570037453183522
lamb_away for Argentina vs Mexico: 1.0909410112359552
prob_home: 0.947443567730045, prob_draw: 0.033676328098033115, prob_away: 0.01888010417192192
(3, 0)
lamb_home for Brazil vs Ecuador: 5.059029631083202
lamb_away for Brazil vs Ecuador: 1.340223950156986
prob_home: 0.9054391188342964, prob_draw: 0.055379030372783716, prob_away: 0.03918185079291995
(3, 0)


In [6]:
df_fixture

Unnamed: 0,home,score,away,year
0,Argentina,Match 1,Canada,2024
1,Peru,Match 2,Chile,2024
2,Peru,Match 10,Canada,2024
3,Chile,Match 9,Argentina,2024
4,Argentina,Match 17,Peru,2024
5,Canada,Match 18,Chile,2024
6,Ecuador,Match 4,Venezuela,2024
7,Mexico,Match 3,Jamaica,2024
8,Ecuador,Match 12,Jamaica,2024
9,Venezuela,Match 11,Mexico,2024


## Predicting Copa America

## Group Stage

In [7]:
df_fixture_group_22 = df_fixture[:24].copy()
df_fixture_quarter = df_fixture[24:28].copy()
df_fixture_semi = df_fixture[28:30].copy()
df_fixture_final = df_fixture[30:].copy()

In [8]:
df_fixture_final

Unnamed: 0,home,score,away,year
30,Loser Match 29,Match 31,Loser Match 30,2024
31,Winner Match 29,Match 32,Winner Match 30,2024


In [9]:
# Ensure 'Pts' column exists and is initialized to 0 for all teams
for group in dict_table:
    if 'Pts' not in dict_table[group].columns:
        dict_table[group]['Pts'] = 0

# Function to handle group stage matches
def handle_group_stage(df_fixture_group, dict_table, group):
    teams_in_group = dict_table[group]['Team'].values
    df_fixture_group_6 = df_fixture_group[df_fixture_group['home'].isin(teams_in_group)]
    for index, row in df_fixture_group_6.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        
        # Check for valid points
        if points_home is not None and points_away is not None:
            # Update points in the dictionary table
            dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += points_home
            dict_table[group].loc[dict_table[group]['Team'] == away, 'Pts'] += points_away

    # Sort the teams by points and reset the index
    dict_table[group] = dict_table[group].sort_values('Pts', ascending=False).reset_index(drop=True)
    dict_table[group] = dict_table[group][['Team', 'Pts']]
    dict_table[group] = dict_table[group].round(0)

# Process each group
for group in dict_table:
    handle_group_stage(df_fixture_group_22, dict_table, group)

# Display the results
for group in dict_table:
    print(f"Group {group} standings:")
    print(dict_table[group])


lamb_home for Argentina vs Canada: 10.182724719101124
lamb_away for Argentina vs Canada: 0.8643609550561798
prob_home: 0.9975930554260118, prob_draw: 0.0017998014656120608, prob_away: 0.0006071431083761046
lamb_home for Peru vs Chile: 1.0735207612456747
lamb_away for Peru vs Chile: 2.050871972318339
prob_home: 0.1919118622224298, prob_draw: 0.2090286764135266, prob_away: 0.5990594613640436
lamb_home for Peru vs Canada: 2.0293382352941176
lamb_away for Peru vs Canada: 1.4495735294117646
prob_home: 0.5105223896715249, prob_draw: 0.2146171965774204, prob_away: 0.2748604137510547
lamb_home for Chile vs Argentina: 0.9171802709847985
lamb_away for Chile vs Argentina: 4.668443489755453
prob_home: 0.025649557532602472, prob_draw: 0.04821721425573672, prob_away: 0.9261332282116608
lamb_home for Argentina vs Peru: 5.701097157964309
lamb_away for Argentina vs Peru: 0.7621777924653007
prob_home: 0.9702065362295665, prob_draw: 0.021071985293825077, prob_away: 0.008721478476608407
lamb_home for Cana

In [10]:
dict_table

{'Group A':         Team  Pts
 0  Argentina    9
 1      Chile    4
 2     Canada    2
 3       Peru    1,
 'Group B':         Team  Pts
 0     Mexico    9
 1    Ecuador    4
 2    Jamaica    2
 3  Venezuela    1,
 'Group C':                 Team  Pts
 0            Uruguay    6
 1             Panama    3
 2  United States (H)    0
 3            Bolivia    0,
 'Group D':          Team  Pts
 0    Colombia    9
 1      Brazil    6
 2  Costa Rica    3
 3    Paraguay    0}

## Quarter Finals

In [11]:
df_fixture_quarter

Unnamed: 0,home,score,away,year
24,Winner Group A,Match 25,Runner-up Group B,2024
25,Winner Group B,Match 26,Runner-up Group A,2024
26,Winner Group C,Match 27,Runner-up Group D,2024
27,Winner Group D,Match 28,Runner-up Group C,2024


In [12]:
for group in dict_table:
    print(group)
    group_winner = dict_table[group].loc[0, 'Team']
    runners_up = dict_table[group].loc[1, 'Team']

    print(group_winner, runners_up)
    
    # Replace the values in df_fixture_quarter
    df_fixture_quarter.replace({f'Winner {group}': group_winner,
                                f'Runner-up {group}': runners_up}, inplace=True)

# After replacing values, set the 'winner' column to '?'
df_fixture_quarter['winner'] = '?'

# Print the DataFrame to verify changes
print(df_fixture_quarter)

Group A
Argentina Chile
Group B
Mexico Ecuador
Group C
Uruguay Panama
Group D
Colombia Brazil
         home     score     away  year winner
24  Argentina  Match 25  Ecuador  2024      ?
25     Mexico  Match 26    Chile  2024      ?
26    Uruguay  Match 27   Brazil  2024      ?
27   Colombia  Match 28   Panama  2024      ?


In [13]:
def get_winner(df_fixture_updated):
    for index, row in df_fixture_updated.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        if points_home > points_away:
            winner = home
        else:
            winner = away
        df_fixture_updated.loc[index, 'winner'] = winner
    return df_fixture_updated

In [14]:
get_winner(df_fixture_quarter)


lamb_home for Argentina vs Ecuador: 7.181039325842696
lamb_away for Argentina vs Ecuador: 0.8488683016421781
prob_home: 0.9867700381494486, prob_draw: 0.009447460956394215, prob_away: 0.0037825008941570416
lamb_home for Mexico vs Chile: 2.1512132352941173
lamb_away for Mexico vs Chile: 1.8215686274509806
prob_home: 0.46088480935662995, prob_draw: 0.2053394928121683, prob_away: 0.33377569783120176
lamb_home for Uruguay vs Brazil: 1.8040701382297109
lamb_away for Uruguay vs Brazil: 1.8112192987660178
prob_home: 0.3891392094403353, prob_draw: 0.21883226554284776, prob_away: 0.3920285250168169
lamb_home for Colombia vs Panama: 8.784714673913046
lamb_away for Colombia vs Panama: 1.2454710144927534
prob_home: 0.9894466818824434, prob_draw: 0.007121418617708825, prob_away: 0.003431899499847674


Unnamed: 0,home,score,away,year,winner
24,Argentina,Match 25,Ecuador,2024,Argentina
25,Mexico,Match 26,Chile,2024,Mexico
26,Uruguay,Match 27,Brazil,2024,Brazil
27,Colombia,Match 28,Panama,2024,Colombia


In [15]:
def update_table(df_fixture_round_1, df_fixture_round_2):
    for index, row in df_fixture_round_1.iterrows():
        winner = df_fixture_round_1.loc[index, 'winner']
        match = df_fixture_round_1.loc[index, 'score']
        df_fixture_round_2.replace({f'Winner {match}':winner}, inplace=True)
    df_fixture_round_2['winner'] = '?'
    return df_fixture_round_2

In [16]:
update_table(df_fixture_quarter, df_fixture_semi)


Unnamed: 0,home,score,away,year,winner
28,Argentina,Match 29,Mexico,2024,?
29,Brazil,Match 30,Colombia,2024,?


In [17]:
get_winner(df_fixture_semi)


lamb_home for Argentina vs Mexico: 5.570037453183522
lamb_away for Argentina vs Mexico: 1.0909410112359552
prob_home: 0.947443567730045, prob_draw: 0.033676328098033115, prob_away: 0.01888010417192192
lamb_home for Brazil vs Colombia: 1.749441964285714
lamb_away for Brazil vs Colombia: 1.7541003493788823
prob_home: 0.38770620871908934, prob_draw: 0.22267763749555977, prob_away: 0.3896161537853509


Unnamed: 0,home,score,away,year,winner
28,Argentina,Match 29,Mexico,2024,Argentina
29,Brazil,Match 30,Colombia,2024,Colombia


In [18]:
update_table(df_fixture_semi, df_fixture_final)


Unnamed: 0,home,score,away,year,winner
30,Loser Match 29,Match 31,Loser Match 30,2024,?
31,Argentina,Match 32,Colombia,2024,?


In [19]:
get_winner(df_fixture_final)


Teams not found in index: Loser Match 29 or Loser Match 30


TypeError: '>' not supported between instances of 'NoneType' and 'NoneType'