In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the merged match table, spell out the '%' in the two save-percentage
# column names, then order the rows by match date.
df = pd.read_csv('../../data/final_data/final_data.csv', parse_dates=['Date'])
df = df.rename(
    columns={
        'FBREFSave%_Home': 'FBREFSavePerc_Home',
        'FBREFSave%_Away': 'FBREFSavePerc_Away',
    }
)
df = df.sort_values('Date')

In [3]:
# Map every column whose name starts with '538' to the same name with
# '538' replaced by 'FTE', apply the mapping, and display it.
rename_cols = {
    name: name.replace('538', 'FTE')
    for name in df.columns
    if name.startswith('538')
}
df = df.rename(columns=rename_cols)
rename_cols

{'538Spi_Home': 'FTESpi_Home',
 '538Spi_Away': 'FTESpi_Away',
 '538Prob_Home': 'FTEProb_Home',
 '538Prob_Away': 'FTEProb_Away',
 '538Importance_Home': 'FTEImportance_Home',
 '538Importance_Away': 'FTEImportance_Away',
 '538xg_Home': 'FTExg_Home',
 '538xg_Away': 'FTExg_Away'}

In [4]:
# Display the column labels of df as they stand at this point in the notebook.
df.columns

Index(['Team_Home', 'Team_Away', 'USxg_Home', 'USxg_Away', 'Date',
       'USProb_Home', 'USProb_Away', 'Season', 'League', 'Goals_Home',
       'Goals_Away', 'HalfGoals_Home', 'HalfGoals_Away', 'OddsB365_Home',
       'OddsB365_Draw', 'OddsB365_Away', 'OddsBW_Home', 'OddsBW_Draw',
       'OddsBW_Away', 'OddsIW_Home', 'OddsIW_Draw', 'OddsIW_Away',
       'OddsPS_Home', 'OddsPS_Draw', 'OddsPS_Away', 'OddsWH_Home',
       'OddsWH_Draw', 'OddsWH_Away', 'OddsVC_Home', 'OddsVC_Draw',
       'OddsVC_Away', 'OddsPSC_Home', 'OddsPSC_Draw', 'OddsPSC_Away',
       'FTESpi_Home', 'FTESpi_Away', 'FTEProb_Home', 'FTEProb_Away',
       'FTEImportance_Home', 'FTEImportance_Away', 'FTExg_Home', 'FTExg_Away',
       'FBREFPossession_Home', 'FBREFSavePerc_Home', 'FBREFAst_Home',
       'FBREFPK_Home', 'FBREFPKatt_Home', 'FBREFSh_Home', 'FBREFSoT_Home',
       'FBREFCrdY_Home', 'FBREFCrdR_Home', 'FBREFxg_Home', 'FBREFxa_Home',
       'FBREFSCA_Home', 'FBREFGCA_Home', 'FBREFPossession_Away',
       'FBREF

In [5]:
# Column stubs; each one is expected to exist in df as '<stub>_Home' and
# '<stub>_Away' so that a home-minus-away differential can be derived from it.
differentials_to_be_calculated = [
    'FBREFORating', 'FBREFDRating',
    'FBREFORating2', 'FBREFDRating2',
]

In [20]:
from collections import defaultdict
    
def calculate_differentials_using_pandas(df, differentials_to_be_calculated):
    """Derive per-match features from each team's running average differential.

    Steps:
      1. Tag every row with ``GameID`` (the frame's index label) and drop the
         columns whose name ends in ``'Draw'``.
      2. For every stub in ``differentials_to_be_calculated`` add
         ``<stub>Diff_Home`` / ``<stub>Diff_Away`` (own value minus opponent's).
      3. Reshape to one row per (game, side) with ``pd.wide_to_long``.
      4. Per (League, Season, Team), in chronological order, compute
         ``Avg<stub>Diff``: the expanding mean of the differential shifted by
         one game, so a row only sees earlier games (0 for the first game).
         ``Game#`` is the 0-based count of that team's earlier games.
      5. Re-join the Home and Away rows of each game and add
         ``<stub>_Var`` = home running average minus away running average.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match with a unique index. Must contain ``Date``,
        ``Season``, ``League``, ``Team_Home``/``Team_Away`` and a
        ``_Home``/``_Away`` pair for every requested stub. The frame is not
        modified.
    differentials_to_be_calculated : iterable of str
        Column stubs to build differentials for. May be empty.

    Returns
    -------
    pandas.DataFrame
        One row per match, sorted by ``GameID`` with a fresh 0..n-1 index.
        Reshaped columns carry a ``_Home`` / ``_Away`` suffix.

    Raises
    ------
    AssertionError
        If some stub has only one of its ``_Home`` / ``_Away`` columns.
    """
    group_keys = ['League', 'Season', 'Team']

    # assign() returns a new frame, so the caller's df does not gain a GameID column.
    df = df.assign(GameID=df.index)

    # Draw-only columns have no Home/Away counterpart and cannot be reshaped.
    df = df.drop(columns=[col for col in df.columns if col.endswith('Draw')])

    for dc in differentials_to_be_calculated:
        df[f'{dc}Diff_Home'] = df[f'{dc}_Home'] - df[f'{dc}_Away']
        df[f'{dc}Diff_Away'] = df[f'{dc}_Away'] - df[f'{dc}_Home']

    # Count how many sides each stub has. rsplit keeps any underscore that is
    # part of the stub itself, so the stub still matches its columns.
    side_counts = {}
    for col in df.columns:
        if col.endswith(('_Home', '_Away')):
            stub = col.rsplit('_', 1)[0]
            side_counts[stub] = side_counts.get(stub, 0) + 1

    unpaired = sorted(stub for stub, count in side_counts.items() if count != 2)
    assert not unpaired, f'columns without exactly one _Home/_Away pair: {unpaired}'

    df_long = pd.wide_to_long(
        df,
        list(side_counts),
        i=['GameID'],
        j='isHome',
        sep='_',
        # Only the two side markers are valid suffixes.
        suffix=r'(?:Home|Away)',
    )

    # Running statistics must follow match date; GameID only breaks ties.
    df_long = df_long.reset_index().sort_values(['Date', 'GameID'])

    for dc in differentials_to_be_calculated:
        df_long[f'Avg{dc}Diff'] = df_long.groupby(group_keys)[f'{dc}Diff'].transform(
            # shift(1) excludes the current game from its own average.
            lambda x: x.expanding().mean().shift(1, fill_value=0)
        )

    # 0-based position of the game within the team's league-season; does not
    # depend on any differential column, so it also works for an empty list.
    df_long['Game#'] = df_long.groupby(group_keys).cumcount()

    new_df_home = df_long[df_long['isHome'] == 'Home'].drop(columns='isHome')
    new_df_away = df_long[df_long['isHome'] == 'Away'].drop(columns='isHome')

    final_df = pd.merge(
        new_df_home,
        new_df_away,
        how='inner',
        on=['GameID', 'Date', 'Season', 'League'],
        suffixes=('_Home', '_Away'),
    )

    for dc in differentials_to_be_calculated:
        final_df[f'{dc}_Var'] = final_df[f'Avg{dc}Diff_Home'] - final_df[f'Avg{dc}Diff_Away']

    return final_df.sort_values(['GameID']).reset_index(drop=True)
    

In [21]:
# df = calculate_differentials_using_pandas(df, differentials_to_be_calculated)

In [22]:
from sklearn.metrics import brier_score_loss

import statsmodels.formula.api as smf
import statsmodels.api as sm

In [23]:
# Build the modelling frame, then add the binary target: 1 when the home
# side scored more goals than the away side, 0 otherwise (draws included).
pred_df = calculate_differentials_using_pandas(df, differentials_to_be_calculated)
pred_df['Win'] = pred_df['Goals_Home'].gt(pred_df['Goals_Away']) * 1

In [24]:
# Display the column labels of the modelling frame.
pred_df.columns

Index(['GameID', 'League', 'Season', 'Date', 'Team_Home', 'USxg_Home',
       'USProb_Home', 'Goals_Home', 'HalfGoals_Home', 'OddsB365_Home',
       'OddsBW_Home', 'OddsIW_Home', 'OddsPS_Home', 'OddsWH_Home',
       'OddsVC_Home', 'OddsPSC_Home', 'FTESpi_Home', 'FTEProb_Home',
       'FTEImportance_Home', 'FTExg_Home', 'FBREFPossession_Home',
       'FBREFSavePerc_Home', 'FBREFAst_Home', 'FBREFPK_Home',
       'FBREFPKatt_Home', 'FBREFSh_Home', 'FBREFSoT_Home', 'FBREFCrdY_Home',
       'FBREFCrdR_Home', 'FBREFxg_Home', 'FBREFxa_Home', 'FBREFSCA_Home',
       'FBREFGCA_Home', 'FBREFORating_Home', 'FBREFDRating_Home',
       'FBREFORating2_Home', 'FBREFDRating2_Home', 'FBREFORatingDiff_Home',
       'FBREFDRatingDiff_Home', 'FBREFORating2Diff_Home',
       'FBREFDRating2Diff_Home', 'AvgFBREFORatingDiff_Home',
       'AvgFBREFDRatingDiff_Home', 'AvgFBREFORating2Diff_Home',
       'AvgFBREFDRating2Diff_Home', 'Game#_Home', 'Team_Away', 'USxg_Away',
       'USProb_Away', 'Goals_Away', 'Half

In [25]:
# Keep only matches where both the home and the away 'Game#' value is at least 5.
pred_df = pred_df.loc[lambda d: d['Game#_Home'].ge(5) & d['Game#_Away'].ge(5)]

In [27]:
# Every column whose name ends in 'Var' becomes a predictor.
cols_for_pred = [name for name in pred_df.columns if name.endswith('Var')]

# Alternative hand-written interaction terms, kept for reference:
# cols_for_pred = [
#     'I((1.5 * AvgFBREFORating_Home * AvgFBREFDRating_Away) - (1.1 * AvgFBREFORating_Away * AvgFBREFDRating_Home))',
#     'I((FBREFORating2_Home * AvgFBREFDRating2_Away) - (AvgFBREFORating2_Away * AvgFBREFDRating2_Home) + 0.371)',
# ]

glm_str = f"Win ~ {'+'.join(cols_for_pred)}"
print(glm_str)

# Fit a binomial GLM on the 2017-2018 and 2018-2019 seasons only.
training_rows = pred_df.loc[lambda d: d['Season'].isin(['2017-2018', '2018-2019'])]
result_glm = smf.glm(
    glm_str,
    data=training_rows,
    family=sm.families.Binomial(),
).fit()
result_glm.summary()

Win ~ FBREFORating_Var+FBREFDRating_Var+FBREFORating2_Var+FBREFDRating2_Var


0,1,2,3
Dep. Variable:,Win,No. Observations:,3159.0
Model:,GLM,Df Residuals:,3154.0
Model Family:,Binomial,Df Model:,4.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2005.5
Date:,"Wed, 12 May 2021",Deviance:,4010.9
Time:,18:46:35,Pearson chi2:,3160.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2040,0.038,-5.402,0.000,-0.278,-0.130
FBREFORating_Var,0.0106,0.323,0.033,0.974,-0.621,0.643
FBREFDRating_Var,-2.5206,0.479,-5.265,0.000,-3.459,-1.582
FBREFORating2_Var,0.5982,0.183,3.265,0.001,0.239,0.957
FBREFDRating2_Var,0.0293,0.220,0.133,0.894,-0.403,0.461


In [28]:
# Observed 'Win' labels for the 2019-2020 season rows.
pred_df.loc[lambda d: d['Season'].isin(['2019-2020']), 'Win']

3858    0
3862    1
3865    1
3866    1
3867    0
       ..
5372    0
5373    0
5374    1
5375    0
5376    0
Name: Win, Length: 1480, dtype: int64

In [29]:
# Fitted-model predictions for the 2019-2020 season rows.
result_glm.predict(pred_df.loc[lambda d: d['Season'].isin(['2019-2020'])])

3858    0.492431
3862    0.457540
3865    0.536669
3866    0.346782
3867    0.302780
          ...   
5372    0.475778
5373    0.563686
5374    0.417290
5375    0.384783
5376    0.475084
Length: 1480, dtype: float64

In [30]:
# Brier score of the fitted model's predictions on the 2019-2020 season rows.
validation_df = pred_df.loc[lambda d: d['Season'].isin(['2019-2020'])]
validation_brier = brier_score_loss(validation_df['Win'], result_glm.predict(validation_df))
print("Validation Brier Score: ", validation_brier)


Validation Brier Score:  0.2280651550886035


In [31]:
from sklearn.metrics import accuracy_score

# Accuracy on the 2019-2020 season rows, labelling a prediction as 1 when it exceeds 0.5.
holdout_df = pred_df.loc[lambda d: d['Season'].isin(['2019-2020'])]
holdout_labels = result_glm.predict(holdout_df).gt(0.5) * 1
print("Validation Accuracy Score: ", accuracy_score(holdout_df['Win'], holdout_labels))


Validation Accuracy Score:  0.6135135135135135
