In [212]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
from sklearn.decomposition import PCA

In [3]:
epl_19 = pd.read_csv('./2019:2020/E0.csv', encoding='cp1252')

In [4]:
epl_19['FTR'] = epl_19['FTR'].replace({'H':1, 'D':0.5, 'A':0})

In [5]:
epl_19['HTR'] = epl_19['HTR'].replace({'H':1, 'D':0.5, 'A':0})

In [6]:
home_team_odds = epl_19[['B365H', 'BWH', 'IWH', 'PSH', 'WHH', 'VCH']]

In [7]:
home_team_odds

Unnamed: 0,B365H,BWH,IWH,PSH,WHH,VCH
0,1.14,1.14,1.15,1.15,1.12,1.14
1,12.00,11.50,11.00,11.68,13.00,12.00
2,1.95,1.95,1.97,2.04,2.00,2.00
3,2.62,2.65,2.65,2.71,2.70,2.70
4,3.00,3.20,3.10,3.21,3.10,3.20
...,...,...,...,...,...,...
375,3.25,3.20,3.15,3.25,3.20,3.20
376,1.08,1.05,1.07,1.08,1.07,1.05
377,7.50,7.50,7.50,7.79,8.00,8.00
378,2.15,2.20,2.20,2.18,2.15,2.15


In [8]:
home_team_odds.var(axis=1)

0      0.000120
1      0.448067
2      0.001230
3      0.001337
4      0.006950
         ...   
375    0.001417
376    0.000187
377    0.061350
378    0.000617
379    0.002950
Length: 380, dtype: float64

In [9]:
epl_19['home_team_var'] = home_team_odds.var(axis=1)

In [10]:
draw_odds = epl_19[['B365D', 'BWD', 'IWD', 'PSD', 'WHD', 'VCD']]

In [11]:
epl_19['draw_var'] = draw_odds.var(axis=1)

In [12]:
away_team_odds = epl_19[['B365A', 'BWA', 'IWA', 'PSA', 'WHA', 'VCA']]

In [13]:
epl_19['away_team_var'] = away_team_odds.var(axis=1)

In [14]:
home_team_odds_c = epl_19[['B365CH', 'BWCH', 'IWCH', 'PSCH', 'WHCH', 'VCCH']]

In [15]:
epl_19['home_team_var_c'] = home_team_odds_c.var(axis=1)

In [16]:
draw_odds_c = epl_19[['B365CD', 'BWCD', 'IWCD', 'PSCD', 'WHCD', 'VCCD']]

In [17]:
epl_19['draw_var_c'] = draw_odds_c.var(axis=1)

In [18]:
away_team_odds_c = epl_19[['B365CA', 'BWCA', 'IWCA', 'PSCA', 'WHCA', 'VCCA']]

In [19]:
epl_19['away_team_var_c'] = away_team_odds_c.var(axis=1)

In [20]:
epl_19

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,home_team_var,draw_var,away_team_var,home_team_var_c,draw_var_c,away_team_var_c
0,E0,09/08/2019,20:00,Liverpool,Norwich,4,1,1.0,4,0,...,1.99,2.07,1.90,1.99,0.000120,0.681267,4.010417,0.000187,0.634817,2.796817
1,E0,10/08/2019,12:30,West Ham,Man City,0,5,0.0,0,1,...,2.07,1.98,1.97,1.92,0.448067,0.107200,0.000227,0.254017,0.136867,0.000107
2,E0,10/08/2019,15:00,Bournemouth,Sheffield United,1,1,0.5,0,0,...,2.00,1.96,1.96,1.92,0.001230,0.001600,0.018667,0.000427,0.003000,0.016950
3,E0,10/08/2019,15:00,Burnley,Southampton,3,0,1.0,0,0,...,1.90,2.07,1.86,2.02,0.001337,0.002950,0.000817,0.002187,0.003017,0.001667
4,E0,10/08/2019,15:00,Crystal Palace,Everton,0,0,0.5,0,0,...,2.03,2.08,1.96,1.93,0.006950,0.004400,0.001430,0.002417,0.008417,0.000550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,E0,26/07/2020,16:00,Leicester,Man United,0,2,0.0,0,0,...,1.94,2.05,1.86,2.02,0.001417,0.010067,0.001667,0.006817,0.008417,0.001667
376,E0,26/07/2020,16:00,Man City,Norwich,5,0,1.0,2,0,...,2.06,1.88,2.02,1.84,0.000187,1.846667,46.645400,0.000350,9.066667,63.766667
377,E0,26/07/2020,16:00,Newcastle,Liverpool,1,3,0.0,1,1,...,2.03,2.00,1.95,1.92,0.061350,0.038350,0.000470,0.185150,0.045667,0.001670
378,E0,26/07/2020,16:00,Southampton,Sheffield United,3,1,1.0,0,1,...,2.03,1.96,1.98,1.89,0.000617,0.008000,0.008417,0.000417,0.009867,0.011800


In [22]:
epl_19 = epl_19.drop(columns=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'Referee'])

In [39]:
X, y = epl_19.drop(columns=['FTR']).values, epl_19['FTR'].values

In [42]:
pca = PCA(0.95)

In [43]:
pca.fit(X)
pca.transform(X)[:5,]

array([[ 64.09382248,   8.66146027,  -6.35934765,   1.17168397,
         -9.89015137,  -3.11117921,  -0.90987013,   1.87970037],
       [-20.69350465,  33.45656129,   5.43596806,  -7.43075608,
         -1.27394066,  -3.23596368,   2.86132163,  -0.27865011],
       [ -3.8220932 ,  -6.41354824,  -0.97545861,  -3.52699294,
          2.41304248,  -4.99424955,   4.76015339,  -1.97084308],
       [ -9.30285354,  -3.07456856,  -2.47148633,  -3.07316978,
         -0.14121565,   3.07011075,   3.87480249,  -3.00458068],
       [-11.8678015 ,  -2.39750512,  -3.23607243,  -5.91979495,
          3.4823253 ,  -3.61546435,  -3.21695626,   2.87914013]])

In [74]:
b_19 = pd.read_csv('./2019:2020/D1.csv', encoding='cp1252')

In [75]:
f_19 = pd.read_csv('./2019:2020/F1.csv', encoding='cp1252')

In [76]:
i_19 = pd.read_csv('./2019:2020/I1.csv', encoding='cp1252')

In [77]:
s_19 = pd.read_csv('./2019:2020/SP1.csv', encoding='cp1252')

In [78]:
for league in [b_19, f_19, i_19, s_19]:
    
    home_team_odds = pd.DataFrame()
    draw_odds = pd.DataFrame()
    away_team_odds = pd.DataFrame()
    home_team_odds_c = pd.DataFrame()
    draw_odds_c = pd.DataFrame()
    away_team_odds_c = pd.DataFrame()
    
    league['FTR'] = league['FTR'].replace({'H':1, 'D':0.5, 'A':0})

    league['HTR'] = league['HTR'].replace({'H':1, 'D':0.5, 'A':0})

    home_team_odds = league[['B365H', 'BWH', 'IWH', 'PSH', 'WHH', 'VCH']]

    league['home_team_var'] = home_team_odds.var(axis=1)

    draw_odds = league[['B365D', 'BWD', 'IWD', 'PSD', 'WHD', 'VCD']]

    league['draw_var'] = draw_odds.var(axis=1)

    away_team_odds = league[['B365A', 'BWA', 'IWA', 'PSA', 'WHA', 'VCA']]

    league['away_team_var'] = away_team_odds.var(axis=1)

    home_team_odds_c = league[['B365CH', 'BWCH', 'IWCH', 'PSCH', 'WHCH', 'VCCH']]

    league['home_team_var_c'] = home_team_odds_c.var(axis=1)

    draw_odds_c = league[['B365CD', 'BWCD', 'IWCD', 'PSCD', 'WHCD', 'VCCD']]

    league['draw_var_c'] = draw_odds_c.var(axis=1)

    away_team_odds_c = league[['B365CA', 'BWCA', 'IWCA', 'PSCA', 'WHCA', 'VCCA']]

    league['away_team_var_c'] = away_team_odds_c.var(axis=1)

In [79]:
b_19 = b_19.drop(columns=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam'])

In [80]:
f_19 = f_19.drop(columns=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam'])

In [81]:
i_19 = i_19.drop(columns=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam'])

In [82]:
s_19 = s_19.drop(columns=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam'])

In [83]:
epl_19

Unnamed: 0,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,home_team_var,draw_var,away_team_var,home_team_var_c,draw_var_c,away_team_var_c
0,4,1,1.0,4,0,1.0,15,12,7,5,...,1.99,2.07,1.90,1.99,0.000120,0.681267,4.010417,0.000187,0.634817,2.796817
1,0,5,0.0,0,1,0.0,5,14,3,9,...,2.07,1.98,1.97,1.92,0.448067,0.107200,0.000227,0.254017,0.136867,0.000107
2,1,1,0.5,0,0,0.5,13,8,3,3,...,2.00,1.96,1.96,1.92,0.001230,0.001600,0.018667,0.000427,0.003000,0.016950
3,3,0,1.0,0,0,0.5,10,11,4,3,...,1.90,2.07,1.86,2.02,0.001337,0.002950,0.000817,0.002187,0.003017,0.001667
4,0,0,0.5,0,0,0.5,6,10,2,3,...,2.03,2.08,1.96,1.93,0.006950,0.004400,0.001430,0.002417,0.008417,0.000550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,0,2,0.0,0,0,0.5,14,7,3,3,...,1.94,2.05,1.86,2.02,0.001417,0.010067,0.001667,0.006817,0.008417,0.001667
376,5,0,1.0,2,0,1.0,31,5,10,4,...,2.06,1.88,2.02,1.84,0.000187,1.846667,46.645400,0.000350,9.066667,63.766667
377,1,3,0.0,1,1,0.5,3,14,2,6,...,2.03,2.00,1.95,1.92,0.061350,0.038350,0.000470,0.185150,0.045667,0.001670
378,3,1,1.0,0,1,0.0,13,5,4,3,...,2.03,1.96,1.98,1.89,0.000617,0.008000,0.008417,0.000417,0.009867,0.011800


In [84]:
b_19

Unnamed: 0,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,home_team_var,draw_var,away_team_var,home_team_var_c,draw_var_c,away_team_var_c
0,2,2,0.5,1,2,0.0,17,6,7,3,...,2.04,1.93,1.98,1.91,0.001267,0.537617,3.175000,0.001497,7.924167e-01,7.669400
1,5,1,1.0,1,1,0.5,23,5,10,1,...,1.98,2.04,1.91,1.97,0.000360,0.091200,2.368067,0.000950,8.261500e-01,17.832017
2,3,0,1.0,0,0,0.5,19,19,8,5,...,1.97,2.06,1.90,1.99,0.001150,0.006000,0.000817,0.033787,2.366583e-31,0.070800
3,3,2,1.0,2,2,0.5,13,11,4,6,...,2.15,1.91,2.03,1.85,0.000337,0.044017,0.468817,0.000337,1.148000e-01,0.310417
4,1,3,0.0,0,1,0.0,23,12,10,7,...,1.95,2.11,1.89,2.00,0.000417,0.012000,0.017667,0.000550,8.416667e-03,0.028667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,1,0,1.0,1,0,1.0,13,6,2,1,...,2.09,1.97,1.99,1.89,0.000067,0.002017,0.268067,0.000737,6.581667e-02,0.102017
302,2,1,1.0,1,0,1.0,22,4,8,2,...,2.00,2.05,1.93,1.95,0.000177,0.024600,0.034817,0.001027,4.666667e-02,0.310667
303,3,0,1.0,1,0,1.0,11,25,7,4,...,1.84,2.15,1.80,2.07,0.010467,0.004000,0.001217,0.018417,6.400000e-03,0.002750
304,6,1,1.0,3,0,1.0,18,6,9,2,...,1.97,2.05,1.89,1.98,0.000777,0.017737,0.033150,0.000657,2.721667e-02,0.014817


In [85]:
f_19

Unnamed: 0,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,home_team_var,draw_var,away_team_var,home_team_var_c,draw_var_c,away_team_var_c
0,0,3,0.0,0,2,0.0,7,13,1,3,...,2.06,2.01,2.01,1.87,0.005000,0.012267,0.002110,0.013617,0.010667,0.003417
1,0,2,0.0,0,0,0.5,10,8,2,5,...,2.06,1.97,1.98,1.89,0.000227,0.016817,0.032417,0.001977,0.010150,0.112000
2,3,1,1.0,3,1,1.0,14,8,4,3,...,1.80,2.17,1.77,2.11,0.000480,0.005787,0.008417,0.001417,0.002737,0.013067
3,1,1,0.5,1,0,1.0,16,13,5,3,...,2.12,1.83,2.08,1.79,0.001777,0.006800,0.004817,0.001417,0.003617,0.000867
4,1,2,0.0,1,2,0.0,15,12,7,4,...,2.16,1.82,2.09,1.79,0.026400,0.009267,0.001017,0.005200,0.000817,0.001017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,2,1,1.0,0,1,0.0,12,14,5,4,...,2.09,1.90,2.01,1.84,0.005867,0.009217,0.000417,0.012867,0.012067,0.007017
275,1,0,1.0,1,0,1.0,12,12,3,4,...,2.05,1.93,1.99,1.86,0.000630,0.011217,0.012000,0.001867,0.009187,0.018777
276,1,1,0.5,0,0,0.5,11,9,4,2,...,2.07,1.98,1.94,1.91,0.001617,0.011800,0.003267,0.002267,0.004550,0.004000
277,5,0,1.0,2,0,1.0,16,11,9,1,...,2.00,2.00,1.94,1.91,0.001667,0.008977,0.005417,0.001950,0.002200,0.010150


In [86]:
i_19

Unnamed: 0,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,home_team_var,draw_var,away_team_var,home_team_var_c,draw_var_c,away_team_var_c
0,0,1,0.0,0,1,0.0,5,9,4,4,...,1.93,2.12,1.83,2.04,1.356267,0.010750,0.000347,0.770667,0.014417,0.000310
1,3,4,0.0,1,2,0.0,10,8,6,6,...,1.94,2.14,1.80,2.09,0.054950,0.006467,0.000337,0.011710,0.007350,0.001137
2,1,0,1.0,0,0,0.5,11,6,6,0,...,1.91,2.09,1.85,2.04,0.023417,0.005417,0.000667,0.007217,0.003267,0.001550
3,0,1,0.0,0,0,0.5,9,8,5,3,...,2.12,1.85,2.08,1.82,0.000387,0.007617,0.021427,0.001307,0.004400,0.080667
4,3,3,0.5,2,2,0.5,17,6,9,4,...,1.99,2.20,1.92,1.97,0.000747,0.014067,0.364600,0.003667,0.018617,0.245760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1,3,0.0,1,1,0.5,7,20,5,11,...,2.07,1.98,1.98,1.89,0.075067,0.006400,0.000670,0.340067,0.047417,0.000427
376,1,1,0.5,1,0,1.0,13,13,5,11,...,2.11,1.88,2.04,1.84,0.000417,0.027267,0.002600,0.000227,0.003467,0.012800
377,3,0,1.0,3,0,1.0,4,13,4,3,...,2.00,2.10,1.87,2.01,0.000880,0.028417,0.037400,0.004590,0.038737,0.029800
378,3,4,0.0,2,2,0.5,12,18,8,11,...,1.92,2.11,1.85,2.02,0.000627,0.018520,0.044750,0.000577,0.026200,0.066667


In [87]:
s_19

Unnamed: 0,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,home_team_var,draw_var,away_team_var,home_team_var_c,draw_var_c,away_team_var_c
0,1,0,1.0,0,0,0.5,11,11,5,2,...,2.02,2.03,1.91,1.98,0.040000,0.000267,0.001817,0.055467,0.005217,0.003200
1,1,3,0.0,0,1,0.0,7,17,4,11,...,2.00,2.20,1.82,2.06,0.119667,0.000067,0.003760,0.100417,0.055467,0.003777
2,1,1,0.5,0,0,0.5,14,12,6,3,...,1.96,2.12,1.89,2.00,0.000040,0.007150,0.031417,0.000440,0.004017,0.013000
3,2,1,1.0,1,0,1.0,16,11,4,5,...,2.12,1.88,2.07,1.83,0.003750,0.001537,0.001720,0.002320,0.004497,0.000667
4,0,1,0.0,0,0,0.5,13,4,2,2,...,1.95,2.06,1.90,1.99,0.001417,0.002467,0.011067,0.003667,0.001617,0.236417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,4,0,1.0,1,0,1.0,11,11,6,3,...,2.04,1.99,1.93,1.94,0.001867,0.026417,0.036017,0.000747,0.015950,0.083067
376,2,2,0.5,1,1,0.5,15,7,7,4,...,2.06,1.93,2.00,1.88,0.014270,0.026817,0.011657,0.006377,0.012817,0.002417
377,1,0,1.0,0,0,0.5,13,13,4,4,...,1.91,2.08,1.86,2.01,0.026467,0.019217,0.003897,0.040067,0.017867,0.000307
378,2,2,0.5,1,1,0.5,12,12,5,4,...,1.89,2.11,1.84,2.04,0.000947,0.019067,0.023467,0.001147,0.017800,0.054417


In [88]:
matches_19 = epl_19.append([b_19, f_19, i_19, s_19])

In [89]:
matches_19

Unnamed: 0,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,home_team_var,draw_var,away_team_var,home_team_var_c,draw_var_c,away_team_var_c
0,4,1,1.0,4,0,1.0,15,12,7,5,...,1.99,2.07,1.90,1.99,0.000120,0.681267,4.010417,0.000187,0.634817,2.796817
1,0,5,0.0,0,1,0.0,5,14,3,9,...,2.07,1.98,1.97,1.92,0.448067,0.107200,0.000227,0.254017,0.136867,0.000107
2,1,1,0.5,0,0,0.5,13,8,3,3,...,2.00,1.96,1.96,1.92,0.001230,0.001600,0.018667,0.000427,0.003000,0.016950
3,3,0,1.0,0,0,0.5,10,11,4,3,...,1.90,2.07,1.86,2.02,0.001337,0.002950,0.000817,0.002187,0.003017,0.001667
4,0,0,0.5,0,0,0.5,6,10,2,3,...,2.03,2.08,1.96,1.93,0.006950,0.004400,0.001430,0.002417,0.008417,0.000550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,4,0,1.0,1,0,1.0,11,11,6,3,...,2.04,1.99,1.93,1.94,0.001867,0.026417,0.036017,0.000747,0.015950,0.083067
376,2,2,0.5,1,1,0.5,15,7,7,4,...,2.06,1.93,2.00,1.88,0.014270,0.026817,0.011657,0.006377,0.012817,0.002417
377,1,0,1.0,0,0,0.5,13,13,4,4,...,1.91,2.08,1.86,2.01,0.026467,0.019217,0.003897,0.040067,0.017867,0.000307
378,2,2,0.5,1,1,0.5,12,12,5,4,...,1.89,2.11,1.84,2.04,0.000947,0.019067,0.023467,0.001147,0.017800,0.054417


In [92]:
e_20 = pd.read_csv('./2020:2021/E0.csv', encoding='cp1252')

In [93]:
b_20 = pd.read_csv('./2020:2021/D1.csv', encoding='cp1252')

In [94]:
f_20 = pd.read_csv('./2020:2021/F1.csv', encoding='cp1252')

In [95]:
i_20 = pd.read_csv('./2020:2021/I1.csv', encoding='cp1252')

In [96]:
s_20 = pd.read_csv('./2020:2021/SP1.csv', encoding='cp1252')

In [97]:
for league in [e_20, b_20, f_20, i_20, s_20]:
    
    home_team_odds = pd.DataFrame()
    draw_odds = pd.DataFrame()
    away_team_odds = pd.DataFrame()
    home_team_odds_c = pd.DataFrame()
    draw_odds_c = pd.DataFrame()
    away_team_odds_c = pd.DataFrame()
    
    league['FTR'] = league['FTR'].replace({'H':1, 'D':0.5, 'A':0})

    league['HTR'] = league['HTR'].replace({'H':1, 'D':0.5, 'A':0})

    home_team_odds = league[['B365H', 'BWH', 'IWH', 'PSH', 'WHH', 'VCH']]

    league['home_team_var'] = home_team_odds.var(axis=1)

    draw_odds = league[['B365D', 'BWD', 'IWD', 'PSD', 'WHD', 'VCD']]

    league['draw_var'] = draw_odds.var(axis=1)

    away_team_odds = league[['B365A', 'BWA', 'IWA', 'PSA', 'WHA', 'VCA']]

    league['away_team_var'] = away_team_odds.var(axis=1)

    home_team_odds_c = league[['B365CH', 'BWCH', 'IWCH', 'PSCH', 'WHCH', 'VCCH']]

    league['home_team_var_c'] = home_team_odds_c.var(axis=1)

    draw_odds_c = league[['B365CD', 'BWCD', 'IWCD', 'PSCD', 'WHCD', 'VCCD']]

    league['draw_var_c'] = draw_odds_c.var(axis=1)

    away_team_odds_c = league[['B365CA', 'BWCA', 'IWCA', 'PSCA', 'WHCA', 'VCCA']]

    league['away_team_var_c'] = away_team_odds_c.var(axis=1)

In [98]:
e_20 = e_20.drop(columns=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'Referee'])

In [99]:
b_20 = b_20.drop(columns=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam'])

In [100]:
f_20 = f_20.drop(columns=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam'])

In [101]:
i_20 = i_20.drop(columns=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam'])

In [102]:
s_20 = s_20.drop(columns=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam'])

In [103]:
matches_20 = e_20.append([b_20, f_20, i_20, s_20])

In [104]:
matches_20

Unnamed: 0,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,home_team_var,draw_var,away_team_var,home_team_var_c,draw_var_c,away_team_var_c
0,0,3,0.0,0,1,0.0,5,13,2,6,...,2.13,1.92,2.02,1.87,0.140600,0.039897,0.000337,0.042817,0.006200,0.000560
1,1,0,1.0,1,0,1.0,5,9,3,5,...,1.85,2.18,1.79,2.12,0.011617,0.014217,0.001150,0.002267,0.023417,0.002617
2,4,3,1.0,3,2,1.0,22,6,6,3,...,1.90,2.16,1.84,2.04,0.001070,0.218750,0.562400,0.000520,0.024017,1.078400
3,0,2,0.0,0,0,0.5,15,15,3,2,...,2.09,1.91,2.02,1.86,0.000150,0.023200,0.007000,0.001817,0.023467,0.006217
4,0,3,0.0,0,0,0.5,7,13,1,7,...,1.95,2.01,1.91,1.97,0.014417,0.025350,0.000750,0.002067,0.023267,0.002817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,0,1,0.0,0,0,0.5,7,11,1,1,...,2.07,1.96,1.98,1.88,0.010600,0.020280,0.000107,0.062217,0.018590,0.000547
376,2,1,1.0,0,1,0.0,14,6,4,2,...,2.02,2.02,1.95,1.92,0.000187,0.028467,0.288400,0.001417,0.013000,0.017217
377,1,2,0.0,1,0,1.0,10,14,4,3,...,1.95,2.06,1.90,1.97,0.205000,0.032067,0.000107,0.508600,0.085350,0.000177
378,0,0,0.5,0,0,0.5,9,11,3,2,...,1.84,2.23,1.77,2.12,0.009190,0.002017,0.002337,0.002667,0.001417,0.005107


In [105]:
matches = matches_19.append([matches_20])

In [107]:
matches = matches.reset_index(drop=True)

In [120]:
matches

Unnamed: 0,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,home_team_var,draw_var,away_team_var,home_team_var_c,draw_var_c,away_team_var_c
0,4,1,1.0,4,0,1.0,15,12,7,5,...,1.99,2.07,1.90,1.99,0.000120,0.681267,4.010417,0.000187,0.634817,2.796817
1,0,5,0.0,0,1,0.0,5,14,3,9,...,2.07,1.98,1.97,1.92,0.448067,0.107200,0.000227,0.254017,0.136867,0.000107
2,1,1,0.5,0,0,0.5,13,8,3,3,...,2.00,1.96,1.96,1.92,0.001230,0.001600,0.018667,0.000427,0.003000,0.016950
3,3,0,1.0,0,0,0.5,10,11,4,3,...,1.90,2.07,1.86,2.02,0.001337,0.002950,0.000817,0.002187,0.003017,0.001667
4,0,0,0.5,0,0,0.5,6,10,2,3,...,2.03,2.08,1.96,1.93,0.006950,0.004400,0.001430,0.002417,0.008417,0.000550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3546,0,1,0.0,0,0,0.5,7,11,1,1,...,2.07,1.96,1.98,1.88,0.010600,0.020280,0.000107,0.062217,0.018590,0.000547
3547,2,1,1.0,0,1,0.0,14,6,4,2,...,2.02,2.02,1.95,1.92,0.000187,0.028467,0.288400,0.001417,0.013000,0.017217
3548,1,2,0.0,1,0,1.0,10,14,4,3,...,1.95,2.06,1.90,1.97,0.205000,0.032067,0.000107,0.508600,0.085350,0.000177
3549,0,0,0.5,0,0,0.5,9,11,3,2,...,1.84,2.23,1.77,2.12,0.009190,0.002017,0.002337,0.002667,0.001417,0.005107


In [121]:
matches.dropna(inplace=True)

In [122]:
matches

Unnamed: 0,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,home_team_var,draw_var,away_team_var,home_team_var_c,draw_var_c,away_team_var_c
0,4,1,1.0,4,0,1.0,15,12,7,5,...,1.99,2.07,1.90,1.99,0.000120,0.681267,4.010417,0.000187,0.634817,2.796817
1,0,5,0.0,0,1,0.0,5,14,3,9,...,2.07,1.98,1.97,1.92,0.448067,0.107200,0.000227,0.254017,0.136867,0.000107
2,1,1,0.5,0,0,0.5,13,8,3,3,...,2.00,1.96,1.96,1.92,0.001230,0.001600,0.018667,0.000427,0.003000,0.016950
3,3,0,1.0,0,0,0.5,10,11,4,3,...,1.90,2.07,1.86,2.02,0.001337,0.002950,0.000817,0.002187,0.003017,0.001667
4,0,0,0.5,0,0,0.5,6,10,2,3,...,2.03,2.08,1.96,1.93,0.006950,0.004400,0.001430,0.002417,0.008417,0.000550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3546,0,1,0.0,0,0,0.5,7,11,1,1,...,2.07,1.96,1.98,1.88,0.010600,0.020280,0.000107,0.062217,0.018590,0.000547
3547,2,1,1.0,0,1,0.0,14,6,4,2,...,2.02,2.02,1.95,1.92,0.000187,0.028467,0.288400,0.001417,0.013000,0.017217
3548,1,2,0.0,1,0,1.0,10,14,4,3,...,1.95,2.06,1.90,1.97,0.205000,0.032067,0.000107,0.508600,0.085350,0.000177
3549,0,0,0.5,0,0,0.5,9,11,3,2,...,1.84,2.23,1.77,2.12,0.009190,0.002017,0.002337,0.002667,0.001417,0.005107


In [166]:
X, y = matches.drop(columns=['FTR']).values, matches['FTR'].values

In [190]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [191]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

### Logistic Regression

In [192]:
pca = PCA(0.95)
lr = LogisticRegression(multi_class='ovr', random_state=1, solver='lbfgs')

In [193]:
X_train_pca = pca.fit_transform(X_train_sc)
X_test_pca = pca.transform(X_test_sc)

In [194]:
X_train_pca.shape

(2812, 20)

In [195]:
X_test_pca.shape

(704, 20)

In [196]:
y_train

array([0.5, 0. , 0. , ..., 1. , 0. , 0.5])

In [197]:
lr.fit(X_train_pca, y_train.astype(str))
lr.predict(X_test_pca)

array(['1.0', '0.0', '1.0', '0.0', '0.5', '0.0', '1.0', '1.0', '1.0',
       '0.5', '0.0', '0.0', '0.5', '1.0', '0.0', '0.5', '0.0', '0.0',
       '0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '1.0', '1.0',
       '1.0', '1.0', '0.5', '0.5', '0.5', '0.0', '0.0', '1.0', '0.0',
       '1.0', '0.0', '1.0', '0.5', '1.0', '0.0', '0.0', '1.0', '1.0',
       '1.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', '1.0', '0.0',
       '1.0', '0.0', '1.0', '1.0', '0.5', '1.0', '0.0', '1.0', '1.0',
       '0.0', '0.0', '1.0', '1.0', '1.0', '0.0', '0.0', '0.5', '0.0',
       '1.0', '1.0', '1.0', '1.0', '0.0', '1.0', '0.0', '0.0', '1.0',
       '1.0', '1.0', '1.0', '1.0', '1.0', '0.0', '0.5', '0.0', '0.0',
       '0.0', '1.0', '1.0', '1.0', '0.5', '0.0', '1.0', '0.5', '0.0',
       '0.5', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0',
       '1.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.5', '1.0',
       '1.0', '0.0', '1.0', '0.0', '0.5', '1.0', '1.0', '0.0', '1.0',
       '0.5', '1.0',

In [198]:
lr.score(X_test_pca, y_test.astype(str))

0.7428977272727273

In [199]:
lr.score(X_train_pca, y_train.astype(str))

0.7546230440967283

In [200]:
pd.DataFrame(y_test).value_counts(normalize=True)

1.0    0.419034
0.0    0.330966
0.5    0.250000
dtype: float64

### Gridsearch on Logistic Regression

In [203]:
logreg = LogisticRegression(max_iter=10000, tol=0.1)

In [204]:
grid={"C":np.logspace(-4, 4, 50), "penalty":['l2']}

In [208]:
logreg_cv=GridSearchCV(logreg,grid,cv=5)
logreg_cv.fit(X_train_pca,y_train.astype(str))

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=10000, tol=0.1),
             param_grid={'C': array([1.00000000e-04, 1.45634848e-04, 2.12095089e-04, 3.08884360e-04,
       4.49843267e-04, 6.55128557e-04, 9.54095476e-04, 1.38949549e-03,
       2.02358965e-03, 2.94705170e-03, 4.29193426e-03, 6.25055193e-03,
       9.10298178e-03, 1.32571137e-02, 1.93069773e-02, 2.81176870e-02,
       4.09491506e-02, 5...
       3.72759372e+00, 5.42867544e+00, 7.90604321e+00, 1.15139540e+01,
       1.67683294e+01, 2.44205309e+01, 3.55648031e+01, 5.17947468e+01,
       7.54312006e+01, 1.09854114e+02, 1.59985872e+02, 2.32995181e+02,
       3.39322177e+02, 4.94171336e+02, 7.19685673e+02, 1.04811313e+03,
       1.52641797e+03, 2.22299648e+03, 3.23745754e+03, 4.71486636e+03,
       6.86648845e+03, 1.00000000e+04]),
                         'penalty': ['l2']})

In [209]:
logreg_cv.best_score_

0.7546329715618543

In [210]:
logreg_cv.best_params_

{'C': 0.040949150623804234, 'penalty': 'l2'}

### Random Forest

In [213]:
rf = RandomForestClassifier(n_estimators=80, max_features=None, max_depth=3, min_samples_split=3, ccp_alpha=0.01)

In [214]:
rf.fit(X_train_pca,y_train.astype(str))

RandomForestClassifier(ccp_alpha=0.01, max_depth=3, max_features=None,
                       min_samples_split=3, n_estimators=80)

In [216]:
rf.score(X_train_pca,y_train.astype(str))

0.6692745376955903

In [217]:
rf.score(X_test_pca, y_test.astype(str))

0.6647727272727273

### Gridsearch on random forest

In [218]:
rf = RandomForestClassifier()

In [219]:
my_params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [18, 20, 25],
    'max_depth': [4, 10, 20],
    'max_features': ['auto', 1.0, 2, 3]
    
}

In [220]:
grid = GridSearchCV(rf, param_grid = my_params, cv = 5)

In [221]:
grid.fit(X_train_pca,y_train.astype(str))

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 10, 20],
                         'max_features': ['auto', 1.0, 2, 3],
                         'n_estimators': [18, 20, 25]})

In [222]:
grid.score(X_train_pca,y_train.astype(str))

0.9971550497866287

In [223]:
grid.score(X_test_pca, y_test.astype(str))

0.7414772727272727

### SVM

In [225]:
from sklearn.svm import SVC
svc = SVC()

In [226]:
svc.fit(X_train_pca,y_train.astype(str))

SVC()

In [227]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_train.astype(str), svc.predict(X_train_pca)))
print(accuracy_score(y_test.astype(str), svc.predict(X_test_pca)))

0.7827169274537695
0.7428977272727273


### Gridsearch on SVM

In [229]:
# Instantiate SVM.
svc = SVC()
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
gs = GridSearchCV(SVC(), tuned_parameters, cv=5)
gs.fit(X_train_pca,y_train.astype(str))

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}])

In [230]:
# Evaluate model.
gs.score(X_test_pca, y_test.astype(str))

0.7613636363636364

In [231]:
gs.score(X_train_pca,y_train.astype(str))

0.835348506401138

In [232]:
gs.best_params_

{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}

### Neural Networks

In [233]:
import os
os.environ['KERAS_BACKEND']='tensorflow'
import keras
import tensorflow as tf
from keras import optimizers
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import Dropout, Embedding, Input, BatchNormalization, Flatten
from keras.regularizers import l1, l2
from keras.callbacks import EarlyStopping

In [236]:
model = Sequential()
model.add(Dense(20, activation='relu', input_shape=(20,)))
# model.add(Input(shape=(20,)))
model.add(BatchNormalization())
model.add(Dense(20,activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dropout(.20))
model.add(Dense(1, activation='sigmoid'))

# compile
model.compile(loss='binary_crossentropy', 
                optimizer='adam', 
                metrics=['accuracy'])

es = EarlyStopping(monitor='val_accuracy', patience=5)

# fit model with early stopping
model.fit(X_train_pca, y_train, 
                         validation_data=(X_test_pca, y_test), 
                         epochs=100,
                         callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


<keras.callbacks.History at 0x150c5e640>

In [237]:
model.evaluate(X_test_pca, y_test)



[0.3629230558872223, 0.703125]