In [None]:
# This allows us to show the full screen width

from IPython.display import display, HTML

display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

In [None]:
# Load library

import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf # GLM
from sklearn.linear_model import GammaRegressor
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.metrics import confusion_matrix, mean_squared_error, mean_gamma_deviance
import csv
from bevel.bevel.linear_ordinal_regression import OrderedLogit
import matplotlib.pyplot as plt
from scipy.stats import shapiro

# Odds Model

In [None]:
# Load Odds data

Odds = pd.read_csv("/Users/lorenzoleoni/Desktop/Materiale Personale/Database/SerieA_Odds.csv", sep=';', decimal=".")

Odds.head()

In [None]:
# Variable Full Time Result and rename columns

# Outcome from the score line: 'H' home win, 'D' draw, anything else 'A'
full_time_result = np.select(
    [Odds['Home_Goals'] > Odds['Away_Goals'], Odds['Home_Goals'] == Odds['Away_Goals']],
    ['H', 'D'],
    default='A',
)
Odds.insert(5, 'FTR', full_time_result)

# Shorter names for the three bookmaker quotes
short_bet_names = {'Win_Home_Bet': 'HBet', 'Draw_Bet': 'DBet', 'Win_Away_Bet': 'ABet'}
Odds.rename(columns=short_bet_names, inplace=True)

In [None]:
# Drop the 28 matchweek (models are based on 27 matchweeks)

# Positions 265..274 are the rows being removed
matchweek_28_rows = Odds.index[265:275]
Odds.drop(index=matchweek_28_rows, inplace=True)

In [None]:
# Probabilities from odds

# Inverse quotes, normalised by their sum so the three probabilities add up to 1
inv_home = 1/Odds['HBet']
inv_draw = 1/(Odds['DBet'])
inv_away = 1/(Odds['ABet'])
inv_total = inv_home + inv_draw + inv_away

Odds['Hprob_Odds'] = inv_home/inv_total
Odds['Dprob_Odds'] = inv_draw/inv_total
# Away probability is taken as the remainder
Odds['Aprob_Odds'] = 1 - Odds['Hprob_Odds'] - Odds['Dprob_Odds']
Odds.head()

In [None]:
# Prediction from odds

# 'H' or 'D' only when that probability is strictly the largest; every other case is 'A'
home_is_top = (Odds['Hprob_Odds'] > Odds['Dprob_Odds']) & (Odds['Hprob_Odds'] > Odds['Aprob_Odds'])
draw_is_top = (Odds['Dprob_Odds'] > Odds['Hprob_Odds']) & (Odds['Dprob_Odds'] > Odds['Aprob_Odds'])

Odds['Odds_pred'] = np.select([home_is_top, draw_is_top], ['H', 'D'], default='A')
Odds.head()

In [None]:
# Confusion matrix

pd.crosstab(Odds['FTR'], Odds['Odds_pred'],dropna= True)

In [None]:
# Precision for Odds model (share of matches whose result was predicted correctly)

# Computed from the data rather than hand-copying counts from the confusion matrix,
# so the value cannot go stale when the input file changes.
p_odds = (Odds['FTR'] == Odds['Odds_pred']).mean()
p_odds

# Ordered Logit Model with TM Value Constant

In [None]:
# Load dataset with all statistics for each game

SerieA = pd.read_csv("/Users/lorenzoleoni/Desktop/Materiale Personale/Database/SerieA_21_22.csv", sep=';', decimal=",", parse_dates=['Match_Date'],encoding = 'unicode_escape')
SerieA.head()

In [None]:
# Variable Win

# 'D' for equal scores, 'H' when the home side scored more, otherwise 'A'
home_goals = SerieA.loc[:, 'Home_Score']
away_goals = SerieA.loc[:, 'Away_Score']
SerieA['Win'] = np.select([home_goals == away_goals, home_goals > away_goals], ['D', 'H'], default='A')

In [None]:
# Load TM_Value for each team

TM_Value = pd.read_csv("/Users/lorenzoleoni/Desktop/Materiale Personale/Database/TM_Value_SerieA21_22.csv", sep=';')
TM_Value.head()

In [None]:
# Variable TM_Value_Home_Team

# Rename the key column of TM_Value IN PLACE so it matches SerieA's home-team column.
# The away-team cell relies on this rename having happened (it renames 'Home_Team' onward),
# so the two cells must run in this order, once each.
TM_Value.rename(columns={'Team':'Home_Team'}, inplace=True)
# pd.merge defaults to an inner join: matches whose home team is missing from TM_Value are dropped.
SerieA = pd.merge(SerieA,TM_Value,on='Home_Team')
# The merged-in 'TM_Value' column is the home side's value.
SerieA.rename(columns={'TM_Value':'TM_Value_Home_Team'}, inplace=True)

In [None]:
# Variable TM_Value_Away_Team

# TM_Value's key column is expected to be called 'Home_Team' at this point (renamed in place
# by the home-team cell); rename it again so it matches SerieA's away-team column.
TM_Value.rename(columns={'Home_Team':'Away_Team'}, inplace=True)
# pd.merge defaults to an inner join: matches whose away team is missing from TM_Value are dropped.
SerieA = pd.merge(SerieA,TM_Value,on='Away_Team')
# The merged-in 'TM_Value' column is the away side's value.
SerieA.rename(columns={'TM_Value':'TM_Value_Away_Team'}, inplace=True)

In [None]:
# Distribution ratio TM value

plt.figure(figsize=(14,7)) # Make it 14x7 inch

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old names were
# later removed, so use whichever spelling this matplotlib installation provides.
grid_style = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'
plt.style.use(grid_style) # nice and clean grid

plt.hist(SerieA['TM_Value_Home_Team']/SerieA['TM_Value_Away_Team'], bins=30, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('Ratio TM Value', fontsize=35)
plt.xlabel('Distribution', fontsize=28)
plt.ylabel('Frequency', fontsize=28)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [None]:
# Distribution log ratio TM value

plt.figure(figsize=(14,7)) # Make it 14x7 inch

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old names were
# later removed, so use whichever spelling this matplotlib installation provides.
grid_style = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'
plt.style.use(grid_style) # nice and clean grid

plt.hist(np.log(SerieA['TM_Value_Home_Team']/SerieA['TM_Value_Away_Team']), bins=30, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('Log Ratio TM Value', fontsize=35)
plt.xlabel('Distribution', fontsize=28)
plt.ylabel('Frequency', fontsize=28)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [None]:
# Variable log_ratio_Value

SerieA['log_ratio_Value'] = np.log(SerieA.loc[:,'TM_Value_Home_Team']/SerieA.loc[:,'TM_Value_Away_Team'])

In [None]:
# Subset Games

Games = SerieA.loc[:,['Matchweek','Match_Date','Home_Team','Away_Team','Home_Score','Away_Score','Win','log_ratio_Value','Cup_Home','Cup_Away']].copy()
Games.head()

In [None]:
# Delete even rows (duplicate games)

# Put the rows back in index order before thinning them out.
Games = Games.sort_index(axis=0)
# Keeps positions 0, 2, 4, ... and discards positions 1, 3, 5, ...
# NOTE(review): this only removes duplicates if every game appears exactly twice in
# adjacent rows at this point — confirm against the merged SerieA frame.
Games = Games.iloc[::2]
# Later cells walk the rows in order and advance a matchweek counter whenever 'Matchweek' increases.
Games = Games.sort_values(by='Matchweek', ascending=True)

In [None]:
# Variable Winvalue

# Ordinal coding of the result: home win 2, draw 1, everything else 0
result = Games.loc[:, 'Win']
Games['Winvalue'] = np.select([result == 'H', result == 'D'], [2, 1], default=0)

In [None]:
# Create dfs Team Performances, Goals made and conceded and Pythagorean Expectation (it refers up to previous matches)

# One row per matchweek, one column per team (columns follow first appearance as home team).
N_MATCHWEEKS = 27
team_names = Games['Home_Team'].unique()
# Derive the width from the data instead of hard-coding 20 teams.
table_shape = (N_MATCHWEEKS, len(team_names))

Teams = pd.DataFrame(np.zeros(table_shape), columns=team_names)
Goals_for = pd.DataFrame(np.zeros(table_shape, dtype=int), columns=team_names)
Goals_against = pd.DataFrame(np.zeros(table_shape, dtype=int), columns=team_names)

In [None]:
# Dictionary with column indexs for each team

names = Games['Home_Team'].unique()

# Team name -> position of that team's column (order of first appearance as home team)
mydict = {team: int(position) for position, team in enumerate(names)}
    

In [None]:
# Calculate Team Performances (it refers to the last two games),  Goals made,  Goals conceded and Pythagorean Expectation

# Layout: row k of Teams / Goals_for / Goals_against is each team's state BEFORE the matches of
# matchweek index k (0-based) are played, so every match writes its outcome into row k+1.

# Performance scores are kept inside this closed interval.
PERFORMANCE_MIN, PERFORMANCE_MAX = -2, 2

# (home change, away change) applied to the performance score, keyed by Winvalue
# (2 = home win, 1 = draw, 0 = away win).
# NOTE(review): a draw lowers BOTH teams' score by 0.5 — kept as in the original; confirm intended.
PERFORMANCE_CHANGE = {
    2: (1, -1),
    1: (-0.5, -0.5),
    0: (-1, 1),
}

# POSTPONED MATCHES: at matchweek index k these teams have no value in row k yet,
# so their state is carried over unchanged from row k-1.
CARRY_FORWARD_TEAMS = {
    19: ['Udinese', 'Salernitana'],
    20: ['Fiorentina', 'Udinese', 'Bologna', 'Internazionale',
         'Atalanta', 'Torino', 'Salernitana', 'Venezia'],
}


def _clamp_performance(score):
    """Return score limited to [PERFORMANCE_MIN, PERFORMANCE_MAX]."""
    return min(max(score, PERFORMANCE_MIN), PERFORMANCE_MAX)


k = 0

for i in range(len(Games)):

    # A larger Matchweek than the previous row means the next matchweek has started.
    # `i > 0` is tested first so that iloc[i-1] never wraps around to the last row.
    if i > 0 and Games['Matchweek'].iloc[i-1] < Games['Matchweek'].iloc[i]:
        k += 1

    # The tables have 27 rows (0..26) and each match writes row k+1, so k stops at 25.
    if k >= 26:
        continue

    # Repeating this for every match of the matchweek is harmless: row k is only read below.
    for team in CARRY_FORWARD_TEAMS.get(k, ()):
        for table in (Teams, Goals_for, Goals_against):
            table.loc[k, team] = table.loc[k-1, team]

    home = mydict[Games['Home_Team'].iloc[i]]
    away = mydict[Games['Away_Team'].iloc[i]]
    home_score = Games['Home_Score'].iloc[i]
    away_score = Games['Away_Score'].iloc[i]

    # Running totals of goals scored and conceded
    Goals_for.iloc[k+1, home] = Goals_for.iloc[k, home] + home_score
    Goals_for.iloc[k+1, away] = Goals_for.iloc[k, away] + away_score
    Goals_against.iloc[k+1, home] = Goals_against.iloc[k, home] + away_score
    Goals_against.iloc[k+1, away] = Goals_against.iloc[k, away] + home_score

    # Performance score: previous value plus the result-dependent change, clamped.
    # Same values as the original per-result if/elif branches for scores that start at 0.
    home_change, away_change = PERFORMANCE_CHANGE[Games['Winvalue'].iloc[i]]
    Teams.iloc[k+1, home] = _clamp_performance(Teams.iloc[k, home] + home_change)
    Teams.iloc[k+1, away] = _clamp_performance(Teams.iloc[k, away] + away_change)

Teams.head(20)

In [None]:
# Subset Games

Games = Games[['Matchweek','Match_Date','Home_Team','Away_Team','Win','log_ratio_Value','Cup_Home','Cup_Away','Winvalue']].reset_index(drop = True)

In [None]:
# Add variables Performance and goal difference for home and away teams created before

# Performance scores move in steps of 0.5, so create those columns as float from the start
# (filling an int column with fractional values triggers a dtype upcast warning in recent pandas).
Games['Home_Performance'] = 0.0
Games['Away_Performance'] = 0.0
Games['Home_Diff_Goals'] = 0
Games['Away_Diff_Goals'] = 0

k = 0

for i in range(len(Games)):

    # A larger Matchweek than the previous row means the next matchweek has started.
    # `i > 0` is tested first so that iloc[i-1] never wraps around to the last row.
    if i > 0 and Games['Matchweek'].iloc[i-1] < Games['Matchweek'].iloc[i]:
        k += 1

    # Only rows 0..26 are read from the tables; later matchweeks keep the initial zeros.
    if k > 26:
        continue

    home = mydict[Games['Home_Team'].iloc[i]]
    away = mydict[Games['Away_Team'].iloc[i]]

    # Row k is the state before matchweek index k is played (see the cell that fills Teams).
    Games.loc[i,'Home_Performance'] = Teams.iloc[k, home]
    Games.loc[i,'Away_Performance'] = Teams.iloc[k, away]

    Games.loc[i,'Home_Diff_Goals'] = Goals_for.iloc[k, home] - Goals_against.iloc[k, home]
    Games.loc[i,'Away_Diff_Goals'] = Goals_for.iloc[k, away] - Goals_against.iloc[k, away]
    
#Games

In [None]:
# Save pandas for statistical inference in R

Games.to_csv("/Users/lorenzoleoni/Desktop/Materiale Personale/Database/OLM.csv",index=False)

## MAIN + DIFF

In [None]:
# ORDERED LOGIT MODEL WITH TM VALUES COSTANT MAIN + DIFF

TMValue_Costant = OrderedLogit()

Regressors = Games[['log_ratio_Value','Home_Performance','Away_Performance', 'Home_Diff_Goals','Away_Diff_Goals']]

TMValue_Costant.fit(Regressors, Games['Winvalue'])

TMValue_Costant.print_summary()

In [None]:
# Predicted probabilities Main + Diff

coef = TMValue_Costant.coef_


def _cumulative_prob(threshold):
    # Logistic of (threshold - linear predictor); terms are subtracted one by one, left to right.
    # presumably coef[0:5] are the slopes and coef[5:7] the two cut-points — confirm against bevel
    return 1/(1+np.exp(-(threshold
                         -coef[0]*Games['log_ratio_Value']
                         -coef[1]*Games['Home_Performance']
                         -coef[2]*Games['Away_Performance']
                         -coef[3]*Games['Home_Diff_Goals']
                         -coef[4]*Games['Away_Diff_Goals'])))


# Away win = first cumulative probability
Games['Aprob_TMValue_Costant'] = _cumulative_prob(coef[5])

# Draw = second cumulative probability minus the away-win probability
Games['Dprob_TMValue_Costant'] = _cumulative_prob(coef[6]) - Games['Aprob_TMValue_Costant']

# Home win = remainder
Games['Hprob_TMValue_Costant'] = 1 - Games['Aprob_TMValue_Costant'] - Games['Dprob_TMValue_Costant']

In [None]:
# Result prediction Main + Diff

Maxprob = Games[['Aprob_TMValue_Costant','Dprob_TMValue_Costant','Hprob_TMValue_Costant']].max(axis=1)

Games['TMValue_Costant_pred']=np.where(Maxprob == Games['Aprob_TMValue_Costant'],'A',\
                               np.where(Maxprob == Games['Dprob_TMValue_Costant'],'D','H'))

In [None]:
# Confusion matrix Main + Diff

pd.crosstab(Games['Win'], Games['TMValue_Costant_pred'],dropna= True)

In [None]:
# Precision for model TMValue_costant Main + Diff (share of correctly predicted results)

# Computed from the data rather than hand-copying the confusion-matrix diagonal,
# so the value cannot go stale when the data or the model changes.
p_TMValue_Costant = (Games['Win'] == Games['TMValue_Costant_pred']).mean()

p_TMValue_Costant

## MAIN + CUP

In [None]:
# ORDERED LOGIT MODEL WITH TM VALUES COSTANT MAIN + CUP

TMValue_Costant = OrderedLogit()

Regressors = Games[['log_ratio_Value','Home_Performance','Away_Performance', 'Cup_Home','Cup_Away']]

TMValue_Costant.fit(Regressors, Games['Winvalue'])

TMValue_Costant.print_summary()

In [None]:
# Predicted probabilities Main + Cup

coef = TMValue_Costant.coef_


def _cumulative_prob(threshold):
    # Logistic of (threshold - linear predictor); terms are subtracted one by one, left to right.
    # presumably coef[0:5] are the slopes and coef[5:7] the two cut-points — confirm against bevel
    return 1/(1+np.exp(-(threshold
                         -coef[0]*Games['log_ratio_Value']
                         -coef[1]*Games['Home_Performance']
                         -coef[2]*Games['Away_Performance']
                         -coef[3]*Games['Cup_Home']
                         -coef[4]*Games['Cup_Away'])))


# Away win = first cumulative probability
Games['Aprob_TMValue_Costant'] = _cumulative_prob(coef[5])

# Draw = second cumulative probability minus the away-win probability
Games['Dprob_TMValue_Costant'] = _cumulative_prob(coef[6]) - Games['Aprob_TMValue_Costant']

# Home win = remainder
Games['Hprob_TMValue_Costant'] = 1 - Games['Aprob_TMValue_Costant'] - Games['Dprob_TMValue_Costant']

In [None]:
# Result prediction Main + Cup

Maxprob = Games[['Aprob_TMValue_Costant','Dprob_TMValue_Costant','Hprob_TMValue_Costant']].max(axis=1)

Games['TMValue_Costant_pred']=np.where(Maxprob == Games['Aprob_TMValue_Costant'],'A',\
                               np.where(Maxprob == Games['Dprob_TMValue_Costant'],'D','H'))

In [None]:
# Confusion matrix Main + Cup

pd.crosstab(Games['Win'], Games['TMValue_Costant_pred'],dropna= True)

In [None]:
# Precision for model TMValue_costant Main + Cup (share of correctly predicted results)

# Computed from the data rather than hand-copying the confusion-matrix diagonal,
# so the value cannot go stale when the data or the model changes.
p_TMValue_Costant = (Games['Win'] == Games['TMValue_Costant_pred']).mean()

p_TMValue_Costant

## MAIN (Best)

In [None]:
# ORDERED LOGIT MODEL WITH TM VALUES COSTANT MAIN

TMValue_Costant = OrderedLogit()

Regressors = Games[['log_ratio_Value','Home_Performance','Away_Performance']]

TMValue_Costant.fit(Regressors, Games['Winvalue'])

TMValue_Costant.print_summary()

In [None]:
# Predicted probabilities Main

coef = TMValue_Costant.coef_


def _cumulative_prob(threshold):
    # Logistic of (threshold - linear predictor); terms are subtracted one by one, left to right.
    # presumably coef[0:3] are the slopes and coef[3:5] the two cut-points — confirm against bevel
    return 1/(1+np.exp(-(threshold
                         -coef[0]*Games['log_ratio_Value']
                         -coef[1]*Games['Home_Performance']
                         -coef[2]*Games['Away_Performance'])))


# Away win = first cumulative probability
Games['Aprob_TMValue_Costant'] = _cumulative_prob(coef[3])

# Draw = second cumulative probability minus the away-win probability
Games['Dprob_TMValue_Costant'] = _cumulative_prob(coef[4]) - Games['Aprob_TMValue_Costant']

# Home win = remainder
Games['Hprob_TMValue_Costant'] = 1 - Games['Aprob_TMValue_Costant'] - Games['Dprob_TMValue_Costant']

In [None]:
# Result prediction Main

Maxprob = Games[['Aprob_TMValue_Costant','Dprob_TMValue_Costant','Hprob_TMValue_Costant']].max(axis=1)

Games['TMValue_Costant_pred']=np.where(Maxprob == Games['Aprob_TMValue_Costant'],'A',\
                               np.where(Maxprob == Games['Dprob_TMValue_Costant'],'D','H'))

In [None]:
# Confusion matrix Main

pd.crosstab(Games['Win'], Games['TMValue_Costant_pred'],dropna= True)

In [None]:
# Precision for model TMValue_costant Main (share of correctly predicted results)

# Computed from the data rather than hand-copying the confusion-matrix diagonal,
# so the value cannot go stale when the data or the model changes.
p_TMValue_Costant = (Games['Win'] == Games['TMValue_Costant_pred']).mean()

p_TMValue_Costant

# Ordered Logit Model with TM Value for each Lineup

In [None]:
# Lineups and TM Value

ln_TM = pd.read_csv("/Users/lorenzoleoni/Desktop/Materiale Personale/Database/ln_TM.csv", sep=',', parse_dates=['Matchday'])

ln_TM.sort_values(by=['Matchday','Team'], ascending=[True,True], inplace=True)

ln_TM.reset_index(drop=True, inplace=True)

ln_TM

In [None]:
# Drop matchweek 28 (the analysis is based on 27 matchweek)

# Everything played after this date is excluded.
last_kept_day = pd.to_datetime('2022-02-28')

# One vectorised drop instead of dropping row by row inside a Python loop.
# Still done in place and without resetting the index, exactly as before.
ln_TM.drop(ln_TM.index[ln_TM['Matchday'] > last_kept_day], inplace=True)

In [None]:
# Create columns for TM value of lineups for each match

Games['ln_TM_Home'] = [0]*len(Games)

Games['ln_TM_Away'] = [0]*len(Games)

In [None]:
# Real contribution of a player in the lineup value of the team in each match

ln_TM['TMValue_Contribution'] = (ln_TM['TMValue']*ln_TM['Min'])/90

In [None]:
Tot_TMValue_Lineups = ln_TM.groupby(['Matchday','Team'])['TMValue_Contribution'].sum().reset_index()

Tot_TMValue_Lineups.head()

In [None]:
# Sum of the value of each player in the lineups in each match

# (matchday, team) -> total lineup value. Built once so that every game is a dictionary
# lookup instead of a full scan of Tot_TMValue_Lineups.
lineup_value = {
    (row.Matchday, row.Team): row.TMValue_Contribution
    for row in Tot_TMValue_Lineups.itertuples(index=False)
}

# Games without a matching lineup keep 0, as before.
Games['ln_TM_Home'] = [lineup_value.get(key, 0) for key in zip(Games['Match_Date'], Games['Home_Team'])]
Games['ln_TM_Away'] = [lineup_value.get(key, 0) for key in zip(Games['Match_Date'], Games['Away_Team'])]

Games.head()

In [None]:
# Distribution ratio ln TM value

plt.figure(figsize=(14,7)) # Make it 14x7 inch

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old names were
# later removed, so use whichever spelling this matplotlib installation provides.
grid_style = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'
plt.style.use(grid_style) # nice and clean grid

plt.hist(Games['ln_TM_Home']/Games['ln_TM_Away'], bins=30, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('Ratio ln TM Value', fontsize=35)

plt.xlabel('Distribution', fontsize=28)

plt.ylabel('Frequency', fontsize=28)

plt.xticks(fontsize=20)

plt.yticks(fontsize=20)

plt.show()

In [None]:
# Distribution log ratio ln TM value

plt.figure(figsize=(14,7)) # Make it 14x7 inch

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old names were
# later removed, so use whichever spelling this matplotlib installation provides.
grid_style = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'
plt.style.use(grid_style) # nice and clean grid

plt.hist(np.log(Games['ln_TM_Home']/Games['ln_TM_Away']), bins=30, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('Log Ratio ln TM Value', fontsize=35)

plt.xlabel('Distribution', fontsize=28)

plt.ylabel('Frequency', fontsize=28)

plt.xticks(fontsize=20)

plt.yticks(fontsize=20)

plt.show()

In [None]:
# Variable log_ratio_ln_TM

Games['log_ratio_ln_TM'] = np.log(Games.loc[:,'ln_TM_Home']/Games.loc[:,'ln_TM_Away'])

In [None]:
# Save pandas for statistical inference in R

Games.to_csv("/Users/lorenzoleoni/Desktop/Materiale Personale/Database/OLM_ln.csv",index=False)

## MAIN + CUP

In [None]:
# ORDERED LOGIT MODEL WITH TM VALUE FOR EACH LINEUPS MAIN + CUP

TMValue_Lineups = OrderedLogit()

Regressors = Games[['log_ratio_ln_TM','Home_Performance','Away_Performance','Cup_Home','Cup_Away']]

TMValue_Lineups.fit(Regressors, Games['Winvalue'])

TMValue_Lineups.print_summary()

In [None]:
# Predicted probabilities Main + Cup

coef = TMValue_Lineups.coef_


def _cumulative_prob(threshold):
    # Logistic of (threshold - linear predictor); terms are subtracted one by one, left to right.
    # presumably coef[0:5] are the slopes and coef[5:7] the two cut-points — confirm against bevel
    return 1/(1+np.exp(-(threshold
                         -coef[0]*Games['log_ratio_ln_TM']
                         -coef[1]*Games['Home_Performance']
                         -coef[2]*Games['Away_Performance']
                         -coef[3]*Games['Cup_Home']
                         -coef[4]*Games['Cup_Away'])))


# Away win = first cumulative probability
Games['Aprob_TMValue_Lineups'] = _cumulative_prob(coef[5])

# Draw = second cumulative probability minus the away-win probability
Games['Dprob_TMValue_Lineups'] = _cumulative_prob(coef[6]) - Games['Aprob_TMValue_Lineups']

# Home win = remainder
Games['Hprob_TMValue_Lineups'] = 1 - Games['Aprob_TMValue_Lineups'] - Games['Dprob_TMValue_Lineups']

In [None]:
# Result prediction Main + Cup

Maxprob = Games[['Aprob_TMValue_Lineups','Dprob_TMValue_Lineups','Hprob_TMValue_Lineups']].max(axis=1)

Games['TMValue_Lineups_pred']=np.where(Maxprob == Games['Aprob_TMValue_Lineups'],'A',\
                               np.where(Maxprob == Games['Dprob_TMValue_Lineups'],'D','H'))

In [None]:
# Confusion matrix Main + Cup

pd.crosstab(Games['Win'], Games['TMValue_Lineups_pred'],dropna= True)

In [None]:
# Precision for model TMValue_lineups Main + Cup (share of correctly predicted results)

# Computed from the data rather than hand-copying the confusion-matrix diagonal,
# so the value cannot go stale when the data or the model changes.
p_TMValue_Lineups = (Games['Win'] == Games['TMValue_Lineups_pred']).mean()

p_TMValue_Lineups

## MAIN

In [None]:
# ORDERED LOGIT MODEL WITH TM VALUE FOR EACH LINEUPS MAIN

TMValue_Lineups = OrderedLogit()

Regressors = Games[['log_ratio_ln_TM','Home_Performance','Away_Performance']]

TMValue_Lineups.fit(Regressors, Games['Winvalue'])

TMValue_Lineups.print_summary()

In [None]:
# Predicted probabilities Main

coef = TMValue_Lineups.coef_


def _cumulative_prob(threshold):
    # Logistic of (threshold - linear predictor); terms are subtracted one by one, left to right.
    # presumably coef[0:3] are the slopes and coef[3:5] the two cut-points — confirm against bevel
    return 1/(1+np.exp(-(threshold
                         -coef[0]*Games['log_ratio_ln_TM']
                         -coef[1]*Games['Home_Performance']
                         -coef[2]*Games['Away_Performance'])))


# Away win = first cumulative probability
Games['Aprob_TMValue_Lineups'] = _cumulative_prob(coef[3])

# Draw = second cumulative probability minus the away-win probability
Games['Dprob_TMValue_Lineups'] = _cumulative_prob(coef[4]) - Games['Aprob_TMValue_Lineups']

# Home win = remainder
Games['Hprob_TMValue_Lineups'] = 1 - Games['Aprob_TMValue_Lineups'] - Games['Dprob_TMValue_Lineups']

In [None]:
# Result prediction Main

Maxprob = Games[['Aprob_TMValue_Lineups','Dprob_TMValue_Lineups','Hprob_TMValue_Lineups']].max(axis=1)

Games['TMValue_Lineups_pred']=np.where(Maxprob == Games['Aprob_TMValue_Lineups'],'A',\
                               np.where(Maxprob == Games['Dprob_TMValue_Lineups'],'D','H'))

In [None]:
# Confusion matrix Main

pd.crosstab(Games['Win'], Games['TMValue_Lineups_pred'],dropna= True)

In [None]:
# Precision for model TMValue_lineups Main (share of correctly predicted results)

# Computed from the data rather than hand-copying the confusion-matrix diagonal,
# so the value cannot go stale when the data or the model changes.
p_TMValue_Lineups = (Games['Win'] == Games['TMValue_Lineups_pred']).mean()

p_TMValue_Lineups

## MAIN + DIFF (Best)

In [None]:
# ORDERED LOGIT MODEL WITH TM VALUE FOR EACH LINEUPS MAIN + DIFF

TMValue_Lineups = OrderedLogit()

Regressors = Games[['log_ratio_ln_TM','Home_Performance','Away_Performance','Home_Diff_Goals','Away_Diff_Goals']]

TMValue_Lineups.fit(Regressors, Games['Winvalue'])

TMValue_Lineups.print_summary()

In [None]:
# Predicted probabilities Main + Diff

coef = TMValue_Lineups.coef_


def _cumulative_prob(threshold):
    # Logistic of (threshold - linear predictor); terms are subtracted one by one, left to right.
    # presumably coef[0:5] are the slopes and coef[5:7] the two cut-points — confirm against bevel
    return 1/(1+np.exp(-(threshold
                         -coef[0]*Games['log_ratio_ln_TM']
                         -coef[1]*Games['Home_Performance']
                         -coef[2]*Games['Away_Performance']
                         -coef[3]*Games['Home_Diff_Goals']
                         -coef[4]*Games['Away_Diff_Goals'])))


# Away win = first cumulative probability
Games['Aprob_TMValue_Lineups'] = _cumulative_prob(coef[5])

# Draw = second cumulative probability minus the away-win probability
Games['Dprob_TMValue_Lineups'] = _cumulative_prob(coef[6]) - Games['Aprob_TMValue_Lineups']

# Home win = remainder
Games['Hprob_TMValue_Lineups'] = 1 - Games['Aprob_TMValue_Lineups'] - Games['Dprob_TMValue_Lineups']

In [None]:
# Result prediction Main + Diff

Maxprob = Games[['Aprob_TMValue_Lineups','Dprob_TMValue_Lineups','Hprob_TMValue_Lineups']].max(axis=1)

Games['TMValue_Lineups_pred']=np.where(Maxprob == Games['Aprob_TMValue_Lineups'],'A',\
                               np.where(Maxprob == Games['Dprob_TMValue_Lineups'],'D','H'))

In [None]:
# Confusion matrix Main + Diff

pd.crosstab(Games['Win'], Games['TMValue_Lineups_pred'],dropna= True)

In [None]:
# Precision for model TMValue_lineups Main + Diff (share of correctly predicted results)

# Computed from the data rather than hand-copying the confusion-matrix diagonal,
# so the value cannot go stale when the data or the model changes.
p_TMValue_Lineups = (Games['Win'] == Games['TMValue_Lineups_pred']).mean()

p_TMValue_Lineups

# Ordered Logit Model with Elo

In [None]:
# Load Elo data

Elo = pd.read_csv("/Users/lorenzoleoni/Desktop/Materiale Personale/Database/Elo.csv", sep=';', decimal=",", parse_dates=['Date'])

Elo

In [None]:
# Save Elo for each home and away team

def _elo_before(team, match_date):
    """Return the rating in the row just before the team's first Elo row dated on/after match_date.

    The team's rows are filtered once per lookup (not once per loop step) and scanned in
    their stored order; column 2 is read as the date and column 1 as the rating.

    Edge cases are unchanged from the original loop: if match_date is not later than the
    team's first date, index -1 selects the team's LAST row; if match_date is later than
    every date, the scan runs off the end and raises IndexError.
    """
    team_elo = Elo[Elo['Team'] == team]

    k = 0
    while match_date > team_elo.iloc[k, 2]: # 2 -> Date
        k += 1

    return team_elo.iloc[k-1, 1] # 1 -> Elo


Games['Elo_Home'] = [_elo_before(team, day) for team, day in zip(Games['Home_Team'], Games['Match_Date'])]

Games['Elo_Away'] = [_elo_before(team, day) for team, day in zip(Games['Away_Team'], Games['Match_Date'])]

In [None]:
# Distribution ratio Elo

plt.figure(figsize=(14,7)) # Make it 14x7 inch

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old names were
# later removed, so use whichever spelling this matplotlib installation provides.
grid_style = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'
plt.style.use(grid_style) # nice and clean grid

plt.hist(Games['Elo_Home']/Games['Elo_Away'], bins=30, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('Ratio Elo', fontsize=35)

plt.xlabel('Distribution', fontsize=28)

plt.ylabel('Frequency', fontsize=28)

plt.xticks(fontsize=20)

plt.yticks(fontsize=20)

plt.show()

In [None]:
# Variable ratio_Elo: plain home/away Elo ratio (no logarithm is applied here)

Games['ratio_Elo'] = Games['Elo_Home']/Games['Elo_Away']

In [None]:
# Save pandas for statistical inference in R

Games.to_csv("/Users/lorenzoleoni/Desktop/Materiale Personale/Database/OLM_elo.csv",index=False)

## MAIN + CUP

In [None]:
# ORDERED LOGIT MODEL WITH ELO MAIN + CUP

# NOTE: this rebinds the name `Elo`, which until now held the ratings table read from Elo.csv.
# After this cell, the cell that fills Elo_Home / Elo_Away no longer sees that table;
# reload Elo.csv before re-running it.
Elo = OrderedLogit()

# Regressors: lineup-value log ratio, both performance scores, Elo ratio and the two cup columns.
Regressors = Games[['log_ratio_ln_TM','Home_Performance','Away_Performance','ratio_Elo','Cup_Home','Cup_Away']]

# Target is the ordinal result coding (0 = away win, 1 = draw, 2 = home win).
Elo.fit(Regressors, Games['Winvalue'])

Elo.print_summary()

In [None]:
# Predicted probabilities Main + Cup

coef = Elo.coef_


def _cumulative_prob(threshold):
    # Logistic of (threshold - linear predictor); terms are subtracted one by one, left to right.
    # presumably coef[0:6] are the slopes and coef[6:8] the two cut-points — confirm against bevel
    return 1/(1+np.exp(-(threshold
                         -coef[0]*Games['log_ratio_ln_TM']
                         -coef[1]*Games['Home_Performance']
                         -coef[2]*Games['Away_Performance']
                         -coef[3]*Games['ratio_Elo']
                         -coef[4]*Games['Cup_Home']
                         -coef[5]*Games['Cup_Away'])))


# Away win = first cumulative probability
Games['Aprob_Elo'] = _cumulative_prob(coef[6])

# Draw = second cumulative probability minus the away-win probability
Games['Dprob_Elo'] = _cumulative_prob(coef[7]) - Games['Aprob_Elo']

# Home win = remainder
Games['Hprob_Elo'] = 1 - Games['Aprob_Elo'] - Games['Dprob_Elo']

In [None]:
# Result prediction Main + Cup

Maxprob = Games[['Aprob_Elo','Dprob_Elo','Hprob_Elo']].max(axis=1)

Games['Elo_pred']=np.where(Maxprob == Games['Aprob_Elo'],'A',\
                               np.where(Maxprob == Games['Dprob_Elo'],'D','H'))

In [None]:
# Confusion matrix Main + Cup

pd.crosstab(Games['Win'], Games['Elo_pred'],dropna= True)

In [None]:
# Precision for model Elo Main + Cup

p_Elo = (63+3+78)/265

p_Elo

## MAIN

In [None]:
# ORDERED LOGIT MODEL WITH ELO MAIN

Elo = OrderedLogit()

Regressors = Games[['log_ratio_ln_TM','Home_Performance','Away_Performance','ratio_Elo']]

Elo.fit(Regressors, Games['Winvalue'])

Elo.print_summary()

In [None]:
# Predicted probabilities Main

Games['Aprob_Elo'] = 1/(1+np.exp(-(Elo.coef_[4]\
                               -Elo.coef_[0]*Games['log_ratio_ln_TM']\
                               -Elo.coef_[1]*Games['Home_Performance']\
                               -Elo.coef_[2]*Games['Away_Performance']\
                               -Elo.coef_[3]*Games['ratio_Elo'])))

Games['Dprob_Elo'] = 1/(1+np.exp(-(Elo.coef_[5]\
                               -Elo.coef_[0]*Games['log_ratio_ln_TM']\
                               -Elo.coef_[1]*Games['Home_Performance']\
                               -Elo.coef_[2]*Games['Away_Performance']\
                               -Elo.coef_[3]*Games['ratio_Elo'])))\
                               -Games['Aprob_Elo']

Games['Hprob_Elo'] = 1 - Games['Aprob_Elo'] - Games['Dprob_Elo']

In [None]:
# Result prediction Main

Maxprob = Games[['Aprob_Elo','Dprob_Elo','Hprob_Elo']].max(axis=1)

Games['Elo_pred']=np.where(Maxprob == Games['Aprob_Elo'],'A',\
                               np.where(Maxprob == Games['Dprob_Elo'],'D','H'))

In [None]:
# Confusion matrix Main

pd.crosstab(Games['Win'], Games['Elo_pred'],dropna= True)

In [None]:
# Precision for model Elo Main

p_Elo = (64+2+77)/265

p_Elo

## MAIN + DIFF (Best)

In [None]:
# ORDERED LOGIT MODEL WITH ELO MAIN + DIFF

Elo = OrderedLogit()

Regressors = Games[['log_ratio_ln_TM','Home_Performance','Away_Performance','ratio_Elo','Home_Diff_Goals','Away_Diff_Goals']]

Elo.fit(Regressors, Games['Winvalue'])

Elo.print_summary()

In [None]:
# Predicted probabilities Main + Diff

Games['Aprob_Elo'] = 1/(1+np.exp(-(Elo.coef_[6]\
                               -Elo.coef_[0]*Games['log_ratio_ln_TM']\
                               -Elo.coef_[1]*Games['Home_Performance']\
                               -Elo.coef_[2]*Games['Away_Performance']\
                               -Elo.coef_[3]*Games['ratio_Elo']\
                               -Elo.coef_[4]*Games['Home_Diff_Goals']\
                               -Elo.coef_[5]*Games['Away_Diff_Goals'])))

Games['Dprob_Elo'] = 1/(1+np.exp(-(Elo.coef_[7]\
                               -Elo.coef_[0]*Games['log_ratio_ln_TM']\
                               -Elo.coef_[1]*Games['Home_Performance']\
                               -Elo.coef_[2]*Games['Away_Performance']\
                               -Elo.coef_[3]*Games['ratio_Elo']\
                               -Elo.coef_[4]*Games['Home_Diff_Goals']\
                               -Elo.coef_[5]*Games['Away_Diff_Goals'])))\
                               -Games['Aprob_Elo']

Games['Hprob_Elo'] = 1 - Games['Aprob_Elo'] - Games['Dprob_Elo']

In [None]:
# Result prediction Main + Diff

Maxprob = Games[['Aprob_Elo','Dprob_Elo','Hprob_Elo']].max(axis=1)

Games['Elo_pred']=np.where(Maxprob == Games['Aprob_Elo'],'A',\
                               np.where(Maxprob == Games['Dprob_Elo'],'D','H'))

In [None]:
# Confusion matrix Main + Diff

pd.crosstab(Games['Win'], Games['Elo_pred'],dropna= True)

In [None]:
# Precision for model Elo 

p_Elo = (68+6+77)/265

p_Elo

# Standings (ATTENTION: before each section, run the cells of the model you want to visualize)

In [None]:
# Variable Pts for home and away team

Odds['HPts'] = np.where(Odds['FTR']=='H',3,np.where(Odds['FTR']=='D',1,0))
Odds['APts'] = np.where(Odds['FTR']=='A',3,np.where(Odds['FTR']=='D',1,0))

In [None]:
# Variable Expected Pts for home and away team

Odds['xHPts'] = 3*Odds['Hprob_Odds'] + Odds['Dprob_Odds']
Odds['xAPts'] = 3*Odds['Aprob_Odds'] + Odds['Dprob_Odds']

In [None]:
# Standings for home games

Home_Teams = Odds.groupby('Home_Team')[['HPts','xHPts']].sum().reset_index()
Home_Games = Odds.groupby('Home_Team')['Date'].size().reset_index()
Home_Teams = pd.merge(Home_Teams,Home_Games, on = 'Home_Team')
Home_Teams.rename(columns={'HPts':'Pts','xHPts':'xPts','Home_Team':'Team', 'Date':'Games'},inplace=True)

In [None]:
# Standings for away games

Away_Teams = Odds.groupby('Away_Team')[['APts','xAPts']].sum().reset_index()
Away_Games = Odds.groupby('Away_Team')['Date'].size().reset_index()
Away_Teams = pd.merge(Away_Teams,Away_Games, on = 'Away_Team')
Away_Teams.rename(columns={'APts':'Pts','xAPts':'xPts','Away_Team':'Team','Date':'Games'},inplace=True)

In [None]:
# Standings

St_Odds = pd.merge(Home_Teams,Away_Teams,on='Team')
St_Odds['Pts'] = St_Odds['Pts_x'] + St_Odds['Pts_y']
St_Odds['xPts_Odds'] = St_Odds['xPts_x'] + St_Odds['xPts_y']
St_Odds['Games'] = St_Odds['Games_x'] + St_Odds['Games_y']
St_Odds.drop(['Pts_x','Pts_y','xPts_x','xPts_y','Games_x','Games_y'],axis=1,inplace=True)
St_Odds['Rank'] = pd.to_numeric(St_Odds['Pts'].rank(ascending=False),downcast='integer')
St_Odds['xRank_Odds'] = pd.to_numeric(St_Odds['xPts_Odds'].rank(ascending=False),downcast='integer')
St_Odds.sort_values(by='Pts',ascending=False,inplace=True)
G = St_Odds['Games']
St_Odds.drop('Games',axis=1,inplace=True)
St_Odds.insert(1,'Games',G)

In [None]:
# Variable xPts for home and away team model TMValue_Costant

St_TMValue_Costant = Games[['Home_Team','Away_Team','Aprob_TMValue_Costant','Dprob_TMValue_Costant','Hprob_TMValue_Costant']].copy()

St_TMValue_Costant['xHPts'] = 3*St_TMValue_Costant['Hprob_TMValue_Costant'] + St_TMValue_Costant['Dprob_TMValue_Costant']
St_TMValue_Costant['xAPts'] = 3*St_TMValue_Costant['Aprob_TMValue_Costant'] + St_TMValue_Costant['Dprob_TMValue_Costant']
St_TMValue_Costant.drop(['Aprob_TMValue_Costant','Dprob_TMValue_Costant','Hprob_TMValue_Costant'],axis=1,inplace=True)

In [None]:
# Standings for home games model TMValue_Costant

Home_Teams = St_TMValue_Costant.groupby('Home_Team')['xHPts'].sum().reset_index()
Home_Teams.rename(columns={'xHPts':'xPts','Home_Team':'Team'},inplace=True)

In [None]:
# Standings for away games model TMValue_Costant

Away_Teams = St_TMValue_Costant.groupby('Away_Team')['xAPts'].sum().reset_index()
Away_Teams.rename(columns={'xAPts':'xPts','Away_Team':'Team'},inplace=True)

In [None]:
# Standings

St_TMValue_Costant = pd.merge(Home_Teams,Away_Teams,on='Team')
St_TMValue_Costant['xPts_TMValue_Costant'] = St_TMValue_Costant['xPts_x'] + St_TMValue_Costant['xPts_y']
St_TMValue_Costant.drop(['xPts_x','xPts_y'],axis=1,inplace=True)
St_TMValue_Costant['xRank_TMValue_Costant'] = pd.to_numeric(St_TMValue_Costant['xPts_TMValue_Costant'].rank(ascending=False),downcast='integer')
St = pd.merge(St_Odds,St_TMValue_Costant,on='Team')
St.sort_values(by='Pts',ascending=False,inplace=True)

In [None]:
# Variable xPts for home and away team model TMValue_Lineups

St_TMValue_Lineups = Games[['Home_Team','Away_Team','Aprob_TMValue_Lineups','Dprob_TMValue_Lineups','Hprob_TMValue_Lineups']].copy()

St_TMValue_Lineups['xHPts'] = 3*St_TMValue_Lineups['Hprob_TMValue_Lineups'] + St_TMValue_Lineups['Dprob_TMValue_Lineups']
St_TMValue_Lineups['xAPts'] = 3*St_TMValue_Lineups['Aprob_TMValue_Lineups'] + St_TMValue_Lineups['Dprob_TMValue_Lineups']
St_TMValue_Lineups.drop(['Aprob_TMValue_Lineups','Dprob_TMValue_Lineups','Hprob_TMValue_Lineups'],axis=1,inplace=True)

In [None]:
# Standings for home games model TMValue_Lineups

Home_Teams = St_TMValue_Lineups.groupby('Home_Team')['xHPts'].sum().reset_index()
Home_Teams.rename(columns={'xHPts':'xPts','Home_Team':'Team'},inplace=True)

In [None]:
# Standings for away games model TMValue_Lineups

Away_Teams = St_TMValue_Lineups.groupby('Away_Team')['xAPts'].sum().reset_index()
Away_Teams.rename(columns={'xAPts':'xPts','Away_Team':'Team'},inplace=True)

In [None]:
# Standings

St_TMValue_Lineups = pd.merge(Home_Teams,Away_Teams,on='Team')
St_TMValue_Lineups['xPts_TMValue_Lineups'] = St_TMValue_Lineups['xPts_x'] + St_TMValue_Lineups['xPts_y']
St_TMValue_Lineups.drop(['xPts_x','xPts_y'],axis=1,inplace=True)
St_TMValue_Lineups['xRank_TMValue_Lineups'] = pd.to_numeric(St_TMValue_Lineups['xPts_TMValue_Lineups'].rank(ascending=False),downcast='integer')
St = pd.merge(St,St_TMValue_Lineups,on='Team')
St.sort_values(by='Pts',ascending=False,inplace=True)

In [None]:
# Variable xPts for home and away team model Elo

St_Elo = Games[['Home_Team','Away_Team','Aprob_Elo','Dprob_Elo','Hprob_Elo']].copy()

St_Elo['xHPts'] = 3*St_Elo['Hprob_Elo'] + St_Elo['Dprob_Elo']
St_Elo['xAPts'] = 3*St_Elo['Aprob_Elo'] + St_Elo['Dprob_Elo']
St_Elo.drop(['Aprob_Elo','Dprob_Elo','Hprob_Elo'],axis=1,inplace=True)

In [None]:
# Standings for home games model TMValue_Lineups

Home_Teams = St_Elo.groupby('Home_Team')['xHPts'].sum().reset_index()
Home_Teams.rename(columns={'xHPts':'xPts','Home_Team':'Team'},inplace=True)

In [None]:
# Standings for away games model TMValue_Lineups

Away_Teams = St_Elo.groupby('Away_Team')['xAPts'].sum().reset_index()
Away_Teams.rename(columns={'xAPts':'xPts','Away_Team':'Team'},inplace=True)

In [None]:
# Standings

St_Elo = pd.merge(Home_Teams,Away_Teams,on='Team')
St_Elo['xPts_Elo'] = St_Elo['xPts_x'] + St_Elo['xPts_y']
St_Elo.drop(['xPts_x','xPts_y'],axis=1,inplace=True)
St_Elo['xRank_Elo'] = pd.to_numeric(St_Elo['xPts_Elo'].rank(ascending=False),downcast='integer')
St = pd.merge(St,St_Elo,on='Team')
St.sort_values(by='Pts',ascending=False,inplace=True)
St

In [None]:
# Variables Score for each model
# One zero-filled score column per model, overwritten by the score calculation below.

for score_column in ['Score_Odds','Score_TMValue_Costant','Score_TMValue_Lineups','Score_Elo']:
    St[score_column] = [0]*len(St)

In [None]:
# Calculation Score
#
# For every team and every model:
#     score = 20 - |Rank - xRank_<model>|
# except that teams whose real Rank is 1.5 or 15 (the two values singled out by the
# scoring rule; presumably tied positions in the real table) receive the full 20
# when the model's expected rank is within one place of the real rank.
#
# Bug fixed: the Rank == 1.5 case used to be a stand-alone `if`, followed by
# `if Rank == 15: ... else: ...`. The trailing `else` therefore also ran for the
# Rank == 1.5 rows and overwrote their score with the plain 20 - |gap| formula, so
# the one-place tolerance was never applied to them.
#
# The calculation is done column-wise, which also avoids writing float scores one cell
# at a time into the integer columns created above.

tolerant_ranks = St['Rank'].isin([1.5, 15])

for model in ['Odds','TMValue_Costant','TMValue_Lineups','Elo']:
    rank_gap = (St['Rank'] - St['xRank_' + model]).abs()
    # full marks inside the tolerance band, linear penalty everywhere else
    St['Score_' + model] = np.where(tolerant_ranks & (rank_gap <= 1), 20, 20 - rank_gap)

In [None]:
# Score Model Odds: total of the per-team scores

St['Score_Odds'].sum()

In [None]:
# Score Model TMValue_Costant: total of the per-team scores

St['Score_TMValue_Costant'].sum()

In [None]:
# Score Model TMValue_Lineups: total of the per-team scores

St['Score_TMValue_Lineups'].sum()

In [None]:
# Score Model Elo: total of the per-team scores
# (reflects whichever Elo specification last wrote the *_Elo probability columns)

St['Score_Elo'].sum()

# Generalized Linear Models for npxG_Expected

In [None]:
Milan = Games[(Games['Home_Team']=='Milan')|(Games['Away_Team']=='Milan')].copy()

Milan = Milan.reset_index(drop = True)

Milan.head()

In [None]:
# Subset games to exctract milan and opponent usefull statistics

SerieA_Milan = SerieA[(SerieA.loc[:,'Home_Team']=='Milan')|(SerieA.loc[:,'Away_Team']=='Milan')].copy()
SerieA_Milan.sort_values(by='Matchweek',inplace=True)
SerieA_Milan.reset_index(drop=True, inplace=True)

In [None]:
SerieA_Milan.to_csv('/Users/lorenzoleoni/Desktop/Materiale Personale/Database/SerieA_Milan.csv',index=False)

In [None]:
# Extraction of useful statistics
#
# Copies, for each matchweek, selected statistics of Milan and of its opponent from
# SerieA_Milan (one row per team per match) into the Milan table (one row per match).
# Row k of Milan receives the values of Matchweek k+1.
# NOTE(review): this assumes Milan's rows are ordered so that row k is matchweek k+1,
# and that every matchweek has both a Milan row and an opponent row — confirm.

# 'H' when Milan is the home team, 'A' otherwise
Milan.loc[:,'Home_Away'] = np.where(Milan.loc[:,'Home_Team']=='Milan','H','A')

# i walks the rows of SerieA_Milan, k is the row of Milan being filled
i = 0

k = 0

# Destination columns, created as integer zeros and overwritten inside the loop
Milan['npxG_Expected'] = [0]*len(Milan)

Milan['Cmp_percent_Passes'] = [0]*len(Milan)

Milan['Att_Dribbles'] = [0]*len(Milan)

Milan['Str_Corner_Kicks'] = [0]*len(Milan)

Milan['Def_Touches'] = [0]*len(Milan)

Milan['Att_Pressures'] = [0]*len(Milan)

Milan['Opponent_percent_Pressures'] = [0]*len(Milan)

Milan['Opponent_Att_Long'] = [0]*len(Milan)

Milan['Fls'] = [0]*len(Milan)

# Stops once as many matches as the highest matchweek number have been filled
while k < max(SerieA_Milan['Matchweek']):
        
    # Only opponent rows trigger a fill, so k advances once per match; the values
    # themselves are looked up by Matchweek == k+1, not by row i.
    if SerieA_Milan.loc[i,'Team']!='Milan':
        
        # Opponent statistic
        Milan.loc[k,'Opponent_percent_Pressures'] = SerieA_Milan[(SerieA_Milan['Matchweek']==k+1)&(SerieA_Milan['Team']!='Milan')]['percent_Pressures'].iloc[0]
        
        # Milan statistics
        Milan.loc[k,'Cmp_percent_Passes'] = SerieA_Milan[(SerieA_Milan['Matchweek']==k+1)&(SerieA_Milan['Team']=='Milan')]['Cmp_percent_Passes'].iloc[0]
       
        Milan.loc[k,'npxG_Expected'] = SerieA_Milan[(SerieA_Milan['Matchweek']==k+1)&(SerieA_Milan['Team']=='Milan')]['npxG_Expected'].iloc[0]
        
        # source column is 'Att 3rd_Pressures'
        Milan.loc[k,'Att_Pressures'] = SerieA_Milan[(SerieA_Milan['Matchweek']==k+1)&(SerieA_Milan['Team']=='Milan')]['Att 3rd_Pressures'].iloc[0]
        
        Milan.loc[k,'Att_Dribbles'] = SerieA_Milan[(SerieA_Milan['Matchweek']==k+1)&(SerieA_Milan['Team']=='Milan')]['Att_Dribbles'].iloc[0]
        
        Milan.loc[k,'Str_Corner_Kicks'] = SerieA_Milan[(SerieA_Milan['Matchweek']==k+1)&(SerieA_Milan['Team']=='Milan')]['Str_Corner_Kicks'].iloc[0]
        
        # source column is 'Def 3rd_Touches'
        Milan.loc[k,'Def_Touches'] = SerieA_Milan[(SerieA_Milan['Matchweek']==k+1)&(SerieA_Milan['Team']=='Milan')]['Def 3rd_Touches'].iloc[0]
        
        # source column is 'Fls_9'
        Milan.loc[k,'Fls'] = SerieA_Milan[(SerieA_Milan['Matchweek']==k+1)&(SerieA_Milan['Team']=='Milan')]['Fls_9'].iloc[0]
        
        # Opponent statistic
        Milan.loc[k,'Opponent_Att_Long'] = SerieA_Milan[(SerieA_Milan['Matchweek']==k+1)&(SerieA_Milan['Team']!='Milan')]['Att_Long'].iloc[0]
        
        k += 1
        
    i += 1

Milan.head()

In [None]:
# Save variables for Milan and Opponent in each match
#
# Games stores every covariate from the home/away point of view. For each match this
# cell re-expresses them from Milan's point of view: the Milan_* column takes the home
# value when Milan is the home team and the away value otherwise, and the Opponent_*
# column takes the other one.
#
# The columns are filled with one vectorised np.where per column instead of a Python
# loop writing cell by cell into integer-initialised columns: same values, no row-wise
# .loc writes, and no float-into-int assignments.

milan_is_home = Milan['Home_Team']=='Milan'

# (new column, source when Milan is home, source when Milan is away),
# listed in the order in which the columns are created
perspective_columns = [
    ('Milan_Performance',    'Home_Performance', 'Away_Performance'),
    ('Opponent_Performance', 'Away_Performance', 'Home_Performance'),
    ('Milan_Elo',            'Elo_Home',         'Elo_Away'),
    ('Opponent_Elo',         'Elo_Away',         'Elo_Home'),
    ('Milan_Cup',            'Cup_Home',         'Cup_Away'),
    ('Opponent_Cup',         'Cup_Away',         'Cup_Home'),
    ('Milan_Diff_Goals',     'Home_Diff_Goals',  'Away_Diff_Goals'),
    ('Opponent_Diff_Goals',  'Away_Diff_Goals',  'Home_Diff_Goals'),
    ('Opponent_ln_TM',       'ln_TM_Away',       'ln_TM_Home'),
    ('Milan_ln_TM',          'ln_TM_Home',       'ln_TM_Away'),
]

for new_column, source_if_home, source_if_away in perspective_columns:
    Milan[new_column] = np.where(milan_is_home, Milan[source_if_home], Milan[source_if_away])

# Recompute the two ratio covariates as Milan vs opponent; this overwrites the
# home-vs-away versions inherited from Games.
Milan['log_ratio_ln_TM'] = np.log(Milan['Milan_ln_TM']) - np.log(Milan['Opponent_ln_TM'])
Milan['ratio_Elo'] = Milan['Milan_Elo']/Milan['Opponent_Elo']

# Remove the home/away-oriented inputs and the outputs of the match-result models
Milan.drop(['Cup_Home','Cup_Away','Home_Performance','Away_Performance','ln_TM_Home',\
            'ln_TM_Away','Elo_Home','Elo_Away','Home_Diff_Goals','Away_Diff_Goals',\
            'Aprob_TMValue_Costant','Dprob_TMValue_Costant','Hprob_TMValue_Costant',\
           'TMValue_Costant_pred','Aprob_TMValue_Lineups','Dprob_TMValue_Lineups',\
           'Hprob_TMValue_Lineups','TMValue_Lineups_pred','Aprob_Elo','Dprob_Elo',\
            'Hprob_Elo','Elo_pred'], axis=1,inplace=True)
Milan.info()

In [None]:
# Distribution npxG_Expected
# Histogram (20 bins) of Milan's npxG_Expected over all its matches.

plt.figure(figsize=(14,7)) # Make it 14x7 inch

# The style is switched after the figure is created and before the axes are drawn
plt.style.use('seaborn-whitegrid') # nice and clean grid

plt.hist(Milan['npxG_Expected'], bins=20, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('npxG_Expected', fontsize=35) 

plt.xlabel('Distribution', fontsize=28) 

plt.ylabel('Frequency', fontsize=28)

plt.xticks(fontsize=20)

plt.yticks(fontsize=20)

plt.show()

## FORWARD SELECTION

In [None]:
# Train data first half of the season

train_data = Milan.loc[(Milan['Matchweek']<=19).values].copy()
train_data.drop(['Matchweek','Match_Date','Home_Team','Away_Team','Win','Winvalue','Home_Away',\
                 'Milan_Elo','Opponent_Elo','Opponent_ln_TM','Milan_ln_TM'], axis=1, inplace=True)

In [None]:
# Create gamma regression object

mod = GammaRegressor()

In [None]:
# Forward Selection

forward_selection = sfs(mod, k_features=5, forward=True, floating=True, verbose=2, scoring='neg_mean_gamma_deviance')
forward_selection = forward_selection.fit(train_data.loc[:, train_data.columns != 'npxG_Expected'],train_data['npxG_Expected'])

In [None]:
# Features selected

feat_names = list(forward_selection.k_feature_names_)
print(feat_names)

In [None]:
# Gamma regression with the features selected
# Gamma GLM with log link fitted on the first-half training data.
# NOTE(review): the regressors are typed into the formula by hand rather than taken
# from feat_names, so the formula must be edited if the forward selection result changes.

npxG_for = smf.glm(formula='npxG_Expected ~ ratio_Elo + Def_Touches + Att_Pressures + Opponent_percent_Pressures + Milan_Performance', data = train_data, \
             family=sm.families.Gamma(link=sm.families.links.log())).fit()
print(npxG_for.summary())

In [None]:
# AIC of the forward-selection model

npxG_for.aic

In [None]:
# Distribution of the response residuals (observed - fitted) of the forward-selection model

plt.figure(figsize=(14,7)) # Make it 14x7 inch

plt.style.use('seaborn-whitegrid') # nice and clean grid

plt.hist(npxG_for.resid_response, bins=10, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('Residuals Forward Selection', fontsize=35) 

plt.xlabel('Distribution', fontsize=28) 

plt.ylabel('Frequency', fontsize=28)

plt.xticks(fontsize=20)

plt.yticks(fontsize=20)

plt.show()

In [None]:
# Shapiro-Wilk test, H0 : Normal Residuals

shapiro(npxG_for.resid_response)

In [None]:
# Test data: second half of the season (Matchweek > 19), all columns kept

test_data = Milan.loc[(Milan['Matchweek']>19).values].copy()

In [None]:
# Prediction of npxG_Expected from test dataset, shown next to the observed values

test_data['npxG_pred'] = npxG_for.predict(test_data)
test_data[['npxG_Expected','npxG_pred']]

In [None]:
# Mean gamma deviance of the forward-selection model on the test data

D_npxG_for = mean_gamma_deviance(test_data['npxG_Expected'],test_data['npxG_pred'])
D_npxG_for

## GAMMA REGRESSION (Best)

In [None]:
# Gamma GLM with log link on hand-picked regressors: features taken from the forward
# selection and from their correlation (r) with the response, plus the interaction
# Opponent_percent_Pressures*Def_Touches.
# Author's note: with the canonical link function (inverse power) the CS and the
# deviance are better, but the model is not theoretically correct, so the log link is kept.

npxG = smf.glm(formula='npxG_Expected ~ Opponent_percent_Pressures*Def_Touches + Att_Pressures + Milan_Performance + Milan_Cup', data = train_data, \
             family=sm.families.Gamma(link=sm.families.links.log())).fit()
print(npxG.summary())

In [None]:
# AIC model gamma regression

npxG.aic

In [None]:
# Scatter plot of Opponent_percent_Pressures vs npxG_Expected on the training data

plt.figure(figsize=(14,7)) # Make it 14x7 inch

plt.style.use('seaborn-whitegrid') # nice and clean grid

plt.scatter(train_data['Opponent_percent_Pressures'], train_data['npxG_Expected'], s = 100, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('O%P vs npxG_Expected', fontsize=35) 

plt.xlabel('Opponent_percent_Pressures', fontsize=28) 

plt.ylabel('npxG_Expected', fontsize=28)

plt.xticks(fontsize=20)

plt.yticks(fontsize=20)

plt.show()

In [None]:
# Scatter plot of Att_Pressures vs npxG_Expected on the training data

plt.figure(figsize=(14,7)) # Make it 14x7 inch

plt.style.use('seaborn-whitegrid') # nice and clean grid

plt.scatter(train_data['Att_Pressures'], train_data['npxG_Expected'], s = 100, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('AP vs npxG_Expected', fontsize=35) 

plt.xlabel('Att_Pressures', fontsize=28) 

plt.ylabel('npxG_Expected', fontsize=28)

plt.xticks(fontsize=20)

plt.yticks(fontsize=20)

plt.show()

In [None]:
# Shapiro-Wilk test, H0 : Normal Residuals

shapiro(npxG.resid_response)

In [None]:
# Residuals distribution from the model npxG

# Histogram (10 bins) of the response residuals (observed - fitted)

plt.figure(figsize=(14,7)) # Make it 14x7 inch

plt.style.use('seaborn-whitegrid') # nice and clean grid

plt.hist(npxG.resid_response, bins = 10, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('Residuals Gamma Regression', fontsize=35) 

plt.xlabel('Distribution', fontsize=28) 

plt.ylabel('Frequency', fontsize=28)

plt.xticks(fontsize=20)

plt.yticks(fontsize=20)

plt.show()

In [None]:
# Test data: second half of the season (Matchweek > 19), rebuilt from Milan

test_data = Milan.loc[(Milan['Matchweek']>19).values].copy()

In [None]:
# Prediction of npxG_Expected from test dataset, shown next to the observed values

test_data['npxG_pred'] = npxG.predict(test_data)
test_data[['npxG_Expected','npxG_pred']]

In [None]:
# Mean gamma deviance of model npxG on the test data

D_npxG = mean_gamma_deviance(test_data['npxG_Expected'],test_data['npxG_pred'])
D_npxG

## OUTLIER REMOVAL

In [None]:
# Drop the outlier with the residual value above 0.7 in train data

train_data.drop(np.where(npxG.resid_response > 0.7)[0][0], axis=0,inplace=True)

In [None]:
# Opponent_percent_Pressures*Def_Touches + Milan_Performance + Milan_Cup + Att_Pressures, if it's used the canonical link function (inverse power) we obtain a better CS and deviance

npxG_res = smf.glm(formula='npxG_Expected ~ Opponent_percent_Pressures*Def_Touches + Def_Touches + Att_Pressures + Milan_Performance + Milan_Cup', data = train_data, \
             family=sm.families.Gamma(link=sm.families.links.log())).fit()
print(npxG_res.summary())

In [None]:
# AIC model outlier removal

npxG_res.aic

In [None]:
# Distribution residuals from model npxG_Expected forward selection

plt.figure(figsize=(14,7)) # Make it 14x7 inch

plt.style.use('seaborn-whitegrid') # nice and clean grid

plt.hist(npxG_res.resid_response, bins=10, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('Residuals Outlier Removal ', fontsize=35) 

plt.xlabel('Distribution', fontsize=28) 

plt.ylabel('Frequency', fontsize=28)

plt.xticks(fontsize=20)

plt.yticks(fontsize=20)

plt.show()

In [None]:
# Shapiro-Wilk test, H0 : Normal Residuals

shapiro(npxG_res.resid_response)

In [None]:
# Test data

test_data = Milan.loc[(Milan['Matchweek']>19).values].copy()

In [None]:
# Prediction of npxG_Expected from test dataset

test_data['npxG_pred'] = npxG_res.predict(test_data)
test_data[['npxG_Expected','npxG_pred']]

In [None]:
# Deviance model npxG_res

D_npxG_res = mean_gamma_deviance(test_data['npxG_Expected'],test_data['npxG_pred'])
D_npxG_res

## INVERSE GAUSSIAN

In [None]:
train_data = Milan.loc[(Milan['Matchweek']<=19).values].copy()

In [None]:
# Features selected from the forward selection and correlation with r, if it's used the canonical link function (inverse power) we obtain a better CS and deviance

npxG_inv = smf.glm(formula='npxG_Expected ~ Opponent_percent_Pressures*Def_Touches + Att_Pressures + Milan_Performance + Milan_Cup', data = train_data, \
             family=sm.families.InverseGaussian(link=sm.families.links.log())).fit()
print(npxG.summary())

In [None]:
# AIC model inverse Gaussian

npxG_inv.aic

In [None]:
# Distribution residuals from model npxG_Expected forward selection

plt.figure(figsize=(14,7)) # Make it 14x7 inch

plt.style.use('seaborn-whitegrid') # nice and clean grid

plt.hist(npxG_inv.resid_response, bins=10, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)

plt.title('Residuals Inverse Gamma', fontsize=35) 

plt.xlabel('Distribution', fontsize=28) 

plt.ylabel('Frequency', fontsize=28)

plt.xticks(fontsize=20)

plt.yticks(fontsize=20)

plt.show()

In [None]:
# Shapiro-Wilk test, H0 : Normal Residuals

shapiro(npxG_inv.resid_response)

In [None]:
# Test data

test_data = Milan.loc[(Milan['Matchweek']>19).values].copy()

In [None]:
# Prediction of npxG_Expected from test dataset

test_data['npxG_pred'] = npxG_inv.predict(test_data)
test_data[['npxG_Expected','npxG_pred']]

In [None]:
# Deviance model npxG_inv

D_npxG_inv = mean_gamma_deviance(test_data['npxG_Expected'],test_data['npxG_pred'])
D_npxG_inv

## PREDICTIONS

In [None]:
# Prediction of npxG_Expected for the game Naples VS Milan using mean value of the Rossoneri and opponent team and the model npxG

data = {'Opponent_percent_Pressures' : SerieA[SerieA['Team']=='Napoli']['percent_Pressures'].mean(), 'Def_Touches':Milan['Def_Touches'].mean(),\
        'Milan_Performance':[Milan.loc[26,'Milan_Performance']-0.5], 'Milan_Cup':1,'Att_Pressures':Milan['Att_Pressures'].mean()}
NapVSMil = pd.DataFrame(data = data)
npxG.predict(NapVSMil) # the real one is 1.1 from FBref

In [None]:
# Prediction of npxG_Expected for the game Naples VS Milan using real data of the Rossoneri and opponent team and the model npxG

data = {'Opponent_percent_Pressures' : 37.1, 'Def_Touches':202,\
        'Milan_Performance':[Milan.loc[26,'Milan_Performance']-0.5], 'Milan_Cup':1,'Att_Pressures':69}
NapVSMil = pd.DataFrame(data = data)
npxG.predict(NapVSMil) # the real one is 1.1 from FBref

In [None]:
# Check the Milan_Perfomance in order to fix the correct value for the next game

performance = Milan.loc[26,'Milan_Performance']-0.5
performance

In [None]:
# Update of the statistics for forecasting Milan VS Empoli, adding new row of stats from their last match

data = {'Opponent_percent_Pressures' : SerieA[(SerieA['Team']=='Empoli')]['percent_Pressures'].reset_index(drop=True), 'Def_Touches':Milan['Def_Touches'],\
        'Milan_Performance':Milan['Milan_Performance'], 'Milan_Cup':Milan['Milan_Cup'],'Att_Pressures':Milan['Att_Pressures']}
df = pd.DataFrame(data = data)
df.loc[27,:] = [32.5,202,2,0,69]
df

In [None]:
# Prediction of npxG_Expected for the game Milan VS Empoli using mean value of the Rossoneri and opponent team and the model npxG

data = {'Opponent_percent_Pressures' : df['Opponent_percent_Pressures'].mean(), 'Def_Touches':df['Def_Touches'].mean(),\
        'Milan_Performance':[df.loc[27,'Milan_Performance']], 'Milan_Cup':[df.loc[27,'Milan_Cup']],'Att_Pressures':df['Att_Pressures'].mean()}
MilVSEmp = pd.DataFrame(data = data)
npxG.predict(MilVSEmp) # the real one is 0.7 from FBref

In [None]:
# Prediction of npxG_Expected for the game Milan VS Milan using real data of the Rossoneri and opponent team and the model npxG

data = {'Opponent_percent_Pressures' : 31.7, 'Def_Touches':211,\
        'Milan_Performance':[df.loc[27,'Milan_Performance']], 'Milan_Cup':[df.loc[27,'Milan_Cup']],'Att_Pressures':50}
MilVSEmp = pd.DataFrame(data = data)
npxG.predict(MilVSEmp) # the real one is 0.7 from FBref

In [None]:
# Update of the statistics for forecasting Cagliari VS Milan, adding new row of stats from their last match

data = {'Opponent_percent_Pressures' : SerieA[(SerieA['Team']=='Cagliari')]['percent_Pressures'].reset_index(drop=True), 'Def_Touches':Milan['Def_Touches'],\
        'Milan_Performance':Milan['Milan_Performance'], 'Milan_Cup':Milan['Milan_Cup'],'Att_Pressures':Milan['Att_Pressures']}
df = pd.DataFrame(data = data)
df.loc[27,:] = [29.7,202,2,0,69]
df.loc[28,:] = [31.3,211,2,0,50]
df

In [None]:
# Prediction of npxG_Expected for the game Cagliari VS Milan using mean value of the Rossoneri and opponent team and the model npxG

data = {'Opponent_percent_Pressures' : df['Opponent_percent_Pressures'].mean(), 'Def_Touches':df['Def_Touches'].mean(),\
        'Milan_Performance':[df.loc[28,'Milan_Performance']], 'Milan_Cup':[df.loc[28,'Milan_Cup']],'Att_Pressures':df['Att_Pressures'].mean()}
CagVSMil = pd.DataFrame(data = data)
npxG.predict(CagVSMil) # the real one is 2.4 from FBref

In [None]:
# Prediction of npxG_Expected for the game Cagliari VS Milan using real data of the Rossoneri and opponent team and the model npxG

data = {'Opponent_percent_Pressures' : 35.1, 'Def_Touches':203,\
        'Milan_Performance':[2], 'Milan_Cup':0,'Att_Pressures':33}
CagVSMil = pd.DataFrame(data = data)
npxG.predict(CagVSMil) # the real one is 2.4 from FBref