In [2]:
import logging
import warnings

import numpy as np
import pandas as pd

import helpers.hdbg as hdbg
import research_amp.soccer_prediction.models as rasoprmo
import research_amp.soccer_prediction.preproccesing as rasoprpr
import research_amp.soccer_prediction.utils as rasoprut
from scipy.special import factorial
from scipy.optimize import minimize


pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

In [5]:
# Define the S3 Buckets, dataset path and local directory for download.
bucket = "cryptokaizen-data-test"
dataset_path = "kaizen_ai/soccer_prediction/datasets/OSF_football/"
local_dir = "datasets/OSF_football"
# Download data from S3.
rasoprut.download_data_from_s3(
    bucket_name=bucket, dataset_path=dataset_path, local_path=local_dir
)
# Load the data from S3 into pandas dataframe objects.
dataframes = rasoprut.load_data_to_dataframe(local_path=local_dir)

In [6]:
data = dataframes["ISDBv2_df"]

In [7]:
# Convert date column to datetime
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
# Define half-life period (e.g., 180 days)
half_life_period = 180
# Calculate the age of each match in days (relative to the most recent match)
data['Days_Ago'] = (data['Date'].max() - data['Date']).dt.days
# Calculate time weights
data['Time_Weight'] = 0.5 ** (data['Days_Ago'] / half_life_period)
# Generate unique team identifiers.
teams = pd.Series(data['HT'].tolist() + data['AT'].tolist()).unique()
team_to_id = {team: idx for idx, team in enumerate(teams)}
# Map teams to unique identifiers.
data['HT_id'] = data['HT'].map(team_to_id)
data['AT_id'] = data['AT'].map(team_to_id)
# Display the first few rows of the dataset
print(data.head())

     Sea   Lge       Date              HT             AT  HS  AS  GD WDL  \
0  00-01  GER1 2000-08-11        Dortmund  Hansa Rostock   1   0   1   W   
1  00-01  GER1 2000-08-12   Bayern Munich  Hertha Berlin   4   1   3   W   
2  00-01  GER1 2000-08-12        Freiburg  VfB Stuttgart   4   0   4   W   
3  00-01  GER1 2000-08-12    Hamburger SV    Munich 1860   2   2   0   D   
4  00-01  GER1 2000-08-12  Kaiserslautern         Bochum   0   1  -1   L   

   Days_Ago   Time_Weight  HT_id  AT_id  
0      6165  4.894661e-11      0     17  
1      6164  4.913546e-11      1     12  
2      6164  4.913546e-11      2     13  
3      6164  4.913546e-11      3     10  
4      6164  4.913546e-11      4     11  


In [8]:
def bivariate_poisson_log_likelihood(params, data):
    c, h, rho, *strengths = params
    log_likelihood = 0 
    for _, row in data.iterrows():
        i, j, goals_i, goals_j, time_weight = row['HT_id'], row['AT_id'], row['HS'], row['AS'], row['Time_Weight']
        lambda_i = np.exp(c + strengths[i] + h)
        lambda_j = np.exp(c + strengths[j] - h)
        cov = rho * np.sqrt(lambda_i * lambda_j)
        # Calculate joint probability
        joint_prob = 0
        for k in range(min(goals_i, goals_j) + 1):
            P_goals_i = (lambda_i**goals_i * np.exp(-lambda_i)) / factorial(goals_i)
            P_goals_j = (lambda_j**goals_j * np.exp(-lambda_j)) / factorial(goals_j)
            joint_prob = P_goals_i * P_goals_j
        log_likelihood += time_weight * np.log(joint_prob)
    return -log_likelihood 

In [9]:
# Number of teams
num_teams = len(teams)
# Initial parameters: [c, h, rho, *strengths]
initial_params = [0, 0, 0.1] + [1] * num_teams

In [10]:
# Select the data for the league and season.
final_data = data[(data['Lge'] == 'ENG5') & 
                     ((data['Sea'] == '07-08') | 
                      (data['Sea'] == '06-07') | 
                      (data['Sea'] == '08-09'))]
# Set optimization options.
options = {
    'maxiter': 10,  
    'disp': True      
}
# Optimize parameters using the BFGS algorithm with options.
result = minimize(bivariate_poisson_log_likelihood, initial_params, args=(final_data.iloc[:552],), method='L-BFGS-B', options=options)
optimized_params = result.x
print("Optimized Parameters:", optimized_params)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1471     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  8.45210D-04    |proj g|=  6.31690D-04


 This problem is unconstrained.



At iterate    1    f=  6.46688D-04    |proj g|=  1.21498D-04

At iterate    2    f=  6.32424D-04    |proj g|=  4.50321D-05

At iterate    3    f=  6.29604D-04    |proj g|=  1.10370D-05
Optimized Parameters:
At iterate    4    f=  6.26630D-04    |proj g|=  7.91921D-06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 1471      4      6      1     0     0   7.919D-06   6.266D-04
  F =   6.2662969146201955E-004

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
 [-0.72511524  0.09506051  0.1        ...  1.          1.
  1.        ]


In [16]:
optimized_params[4]

1.0

In [18]:
def calculate_match_outcomes(df, params, *,
    max_goals: int = 10,
    apply_dixon_coles: bool = False,
    rho: float = -0.2,) -> pd.DataFrame:
    """
    Calculate match outcome probabilities.
    """
    c, h, rho, *strengths = params 
    # Calculate Lambda_HS and Lambda_AS for each row in the dataframe
    df["Lambda_HS"] = np.exp(c + df["HT_id"].apply(lambda x: strengths[x]) + h)
    df["Lambda_AS"] = np.exp(c + df["AT_id"].apply(lambda x: strengths[x]) - h)
    # Define probabilities.
    home_goals_probs = np.array(
        [
            np.exp(-df["Lambda_HS"]) * df["Lambda_HS"] ** i / np.math.factorial(i)
            for i in range(max_goals)
        ]
    )
    # Calculate Poisson probabilities for Away team goals.
    away_goals_probs = np.array(
        [
            np.exp(-df["Lambda_AS"]) * df["Lambda_AS"] ** i / np.math.factorial(i)
            for i in range(max_goals)
        ]
    )
    # Initialize probabilities.
    prob_home_win = np.zeros(len(df))
    prob_away_win = np.zeros(len(df))
    prob_draw = np.zeros(len(df))
    # Calculate the probabilities of home win, away win, and draw.
    for i in range(max_goals):
        for j in range(max_goals):
            prob = home_goals_probs[i] * away_goals_probs[j]
            if apply_dixon_coles:
                prob *= dixon_coles_adjustment(
                    i, j, df["Lambda_HS"], df["Lambda_AS"], rho
                )
            prob_home_win += np.where(i > j, prob, 0)
            prob_away_win += np.where(i < j, prob, 0)
            prob_draw += np.where(i == j, prob, 0)
    # Add probabilities to the DataFrame.
    df["prob_home_win"] = prob_home_win
    df["prob_away_win"] = prob_away_win
    df["prob_draw"] = prob_draw
    # Predict the outcomes based on probabilities.
    df["predicted_outcome"] = np.where(
        df["prob_home_win"] > df["prob_away_win"],
        "home_win",
        np.where(df["prob_away_win"] > df["prob_home_win"], "away_win", "draw"),
    )
    # Calculate actual outcomes for comparison.
    df["actual_outcome"] = np.where(
        df["HS"] > df["AS"],
        "home_win",
        np.where(df["HS"] < df["AS"], "away_win", "draw"),
    )
    # Round off the predicted goals to integers.
    df["Lambda_HS"] = df["Lambda_HS"].round().astype(int)
    df["Lambda_AS"] = df["Lambda_AS"].round().astype(int)
    print(df.head())
    return df

In [20]:
final_df = calculate_match_outcomes(final_data[553:], optimized_params)
# Evaluate model.
rasoprut.evaluate_model_predictions(
        final_df["actual_outcome"], final_df["predicted_outcome"]
    )

         Sea   Lge       Date            HT                AT  HS  AS  GD WDL  \
73783  07-08  ENG5 2007-08-11  Crawley Town         Stevenage   2   1   1   W   
73784  07-08  ENG5 2007-08-11     Droylsden         Salisbury   0   0   0   D   
73785  07-08  ENG5 2007-08-11     Ebbsfleet         Northwich   2   1   1   W   
73786  07-08  ENG5 2007-08-11       Farsley  Stafford Rangers   1   0   1   W   
73787  07-08  ENG5 2007-08-11        Histon            Burton   2   2   0   D   

       Days_Ago   Time_Weight  HT_id  AT_id  Lambda_HS  Lambda_AS  \
73783      3609  9.211887e-07    776    773          1          1   
73784      3609  9.211887e-07    913    917          1          1   
73785      3609  9.211887e-07    914    829          1          1   
73786      3609  9.211887e-07    915    831          1          1   
73787      3609  9.211887e-07    916    767          1          1   

       prob_home_win  prob_away_win  prob_draw predicted_outcome  \
73783       0.384695       0.3