In [1]:
# Setting up
import numpy as np
import pandas as pd
import seaborn as sns # for visualization
from scipy.stats import ttest_ind # t-tests
import statsmodels.formula.api as smf # linear modeling
import statsmodels.api as sm
import matplotlib.pyplot as plt # plotting
from sklearn.preprocessing import MinMaxScaler       # scaling data
from sklearn.model_selection import train_test_split # splitting data
from sklearn.neighbors import KNeighborsRegressor    # regressor
from sklearn.model_selection import GridSearchCV     # for grid search
from sklearn.pipeline import make_pipeline           # for making pipelines
from sklearn.neighbors import KNeighborsClassifier   # for KNeighborsClassifier model
from sklearn.tree import DecisionTreeClassifier      # for Decision Tree model
from sklearn.metrics import accuracy_score           # for accuracy_score
# add Tree-based feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing
# Use cross validation (`cross_val_score`) to test a classifier 
# across 10 different splits of the data
# Use a K value of 3 for your KNN.
# Notice the huge variation in performance across folds!
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PolynomialFeatures


# ignore warning 
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Read one CSV per file under ./data into its own DataFrame.
# Paths are relative to the notebook's working directory.
# NOTE(review): file names look like season codes (1415 = 2014/15, ...) — confirm.
season_paths = [
    './data/1415.csv',
    './data/1516.csv',
    './data/1617.csv',
    './data/1718.csv',
    './data/1819.csv',
]
data_1415, data_1516, data_1617, data_1718, data_1819 = map(pd.read_csv, season_paths)

In [56]:
# Stack the first four loaded frames into a single DataFrame.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported equivalent and builds the result in one pass
# instead of copying the accumulated frame on every append.
# Row labels are kept exactly as the chained `append` calls kept them (each
# input's own index), so labels repeat across the inputs; pass
# `ignore_index=True` if unique row labels are ever needed.
# NOTE(review): data_1819 is loaded but not combined here — confirm it is
# intentionally held out.
data = pd.concat([data_1415, data_1516, data_1617, data_1718])

In [57]:
# Columns to keep: eight prefixes, each with an H / D / A variant
# (home / draw / away, matching how they are averaged afterwards).
# NOTE(review): prefixes are presumably bookmaker codes — confirm against the data source.
odds_columns = [
    'B365H', 'B365D', 'B365A',
    'BWH', 'BWD', 'BWA',
    'IWH', 'IWD', 'IWA',
    'LBH', 'LBD', 'LBA',
    'PSH', 'PSD', 'PSA',
    'SJH', 'SJD', 'SJA',
    'VCH', 'VCD', 'VCA',
    'WHH', 'WHD', 'WHA',
]
bet_data = data.loc[:, odds_columns]

In [58]:
# Group the columns by outcome suffix (H = home, D = draw, A = away).
home_cols = ['B365H', 'BWH', 'IWH', 'LBH', 'PSH', 'SJH', 'VCH', 'WHH']
draw_cols = ['B365D', 'BWD', 'IWD', 'LBD', 'PSD', 'SJD', 'VCD', 'WHD']
away_cols = ['B365A', 'BWA', 'IWA', 'LBA', 'PSA', 'SJA', 'VCA', 'WHA']

# One value per row: the mean across that outcome's columns.
# pandas' default `skipna=True` applies, so missing entries are left out of
# the average rather than turning the whole row into NaN.
bet_home = bet_data[home_cols].mean(axis='columns')
bet_draw = bet_data[draw_cols].mean(axis='columns')
bet_away = bet_data[away_cols].mean(axis='columns')

In [59]:
# Inspect the row-wise mean of the away ('*A') columns; pandas elides the
# middle of a long Series with "..." when printing.
print(bet_away)

0      12.188750
1       2.373750
2       9.450000
3       3.008750
4       4.212500
5       3.366250
6       2.076250
7       9.143750
8       1.655000
9       1.401250
10      2.945000
11     14.650000
12      3.031250
13      2.673750
14      4.387500
15      5.616250
16      3.082500
17      1.762500
18      7.343750
19      4.240000
20      1.610000
21      2.161250
22     16.681250
23      5.197500
24      3.097500
25      4.477500
26      2.698750
27      3.466250
28      1.837500
29      2.700000
         ...    
350     1.517143
351     7.734286
352     1.334286
353    19.428571
354     1.844286
355     3.534286
356     2.597143
357     3.338571
358     2.765714
359     3.181429
360     1.418571
361     6.431429
362     3.265714
363    27.142857
364     2.495714
365    17.810000
366     1.987143
367    22.544286
368    16.285714
369     1.804286
370     3.465714
371     4.708571
372     1.591429
373    16.277143
374     8.782857
375     1.601429
376     1.417143
377     4.1485

In [60]:
# Working frame: team names plus the result and per-match count columns.
# NOTE(review): column meanings (FTHG/FTAG/FTR, HS/AS, ...) are taken from the
# CSV headers — confirm against the data dictionary.
match_columns = [
    'HomeTeam', 'AwayTeam',
    'FTHG', 'FTAG', 'FTR',
    'HS', 'AS', 'HST', 'AST',
    'HY', 'AY', 'HR', 'AR',
]
# `.copy()` makes `df` an independent frame. Without it, columns added to `df`
# afterwards are written into a selection of `data`, which pandas flags with
# SettingWithCopyWarning (hidden in this notebook by the global warnings filter).
df = data[match_columns].copy()
print(df.head())

     HomeTeam        AwayTeam  FTHG  FTAG FTR    HS    AS  HST  AST   HY   AY  \
0     Arsenal  Crystal Palace   2.0   1.0   H  14.0   4.0  6.0  2.0  2.0  2.0   
1   Leicester         Everton   2.0   2.0   D  11.0  13.0  3.0  3.0  1.0  1.0   
2  Man United         Swansea   1.0   2.0   A  14.0   5.0  5.0  4.0  2.0  4.0   
3         QPR            Hull   0.0   1.0   A  19.0  11.0  6.0  4.0  1.0  2.0   
4       Stoke     Aston Villa   0.0   1.0   A  12.0   7.0  2.0  2.0  0.0  3.0   

    HR   AR  
0  0.0  1.0  
1  0.0  0.0  
2  0.0  0.0  
3  0.0  0.0  
4  0.0  0.0  


In [63]:
# Attach the three averaged series as new columns.
# `assign` returns a new frame rather than writing into `df` in place: `df`
# was created as a column selection of `data`, and in-place column writes on
# such a selection trigger SettingWithCopyWarning (silenced here by the global
# warnings filter). Values are aligned on the index exactly as with
# `df[...] = ...`, and the new columns are appended in the same order.
df = df.assign(
    odd_home=bet_home,
    odd_draw=bet_draw,
    odd_away=bet_away,
)

In [66]:
# Data-quality report.
# 1) Count rows of the combined frame that exactly repeat an earlier row.
n_duplicates = data.duplicated().sum()
print(f'number of duplicate values = {n_duplicates}')

# 2) Count missing entries per column of the working frame.
print('number of null values in each variable')
null_counts = df.isnull().sum()
print(null_counts)

number of duplicate values = 0
number of null values in each variable
HomeTeam    1
AwayTeam    1
FTHG        1
FTAG        1
FTR         1
HS          1
AS          1
HST         1
AST         1
HY          1
AY          1
HR          1
AR          1
odd_home    1
odd_draw    1
odd_away    1
dtype: int64


In [69]:
# Remove rows that contain any missing value and keep the result.
# `dropna()` returns a new frame; the bare call only displayed that frame and
# discarded it, so the row with missing values reported by the null check
# stayed in `df`. Re-running this cell is harmless (a clean frame is unchanged).
df = df.dropna()
df

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HY,AY,HR,AR,odd_home,odd_draw,odd_away
0,Arsenal,Crystal Palace,2.0,1.0,H,14.0,4.0,6.0,2.0,2.0,2.0,0.0,1.0,1.257500,5.868750,12.188750
1,Leicester,Everton,2.0,2.0,D,11.0,13.0,3.0,3.0,1.0,1.0,0.0,0.0,3.086250,3.310000,2.373750
2,Man United,Swansea,1.0,2.0,A,14.0,5.0,5.0,4.0,2.0,4.0,0.0,0.0,1.362500,4.943750,9.450000
3,QPR,Hull,0.0,1.0,A,19.0,11.0,6.0,4.0,1.0,2.0,0.0,0.0,2.478750,3.207500,3.008750
4,Stoke,Aston Villa,0.0,1.0,A,12.0,7.0,2.0,2.0,0.0,3.0,0.0,0.0,1.956250,3.358750,4.212500
5,West Brom,Sunderland,2.0,2.0,D,10.0,7.0,5.0,2.0,3.0,1.0,0.0,0.0,2.251250,3.298750,3.366250
6,West Ham,Tottenham,0.0,1.0,A,18.0,10.0,4.0,4.0,1.0,0.0,1.0,1.0,3.663750,3.431250,2.076250
7,Liverpool,Southampton,2.0,1.0,H,12.0,12.0,5.0,6.0,1.0,2.0,0.0,0.0,1.353750,5.035000,9.143750
8,Newcastle,Man City,0.0,2.0,A,12.0,13.0,0.0,5.0,1.0,5.0,0.0,0.0,5.181250,4.020000,1.655000
9,Burnley,Chelsea,1.0,3.0,A,9.0,11.0,2.0,3.0,1.0,1.0,0.0,0.0,8.633750,4.663750,1.401250


In [67]:
# Preview the first five rows of the working frame, including the added odd_* columns.
df.head(n=5)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HY,AY,HR,AR,odd_home,odd_draw,odd_away
0,Arsenal,Crystal Palace,2.0,1.0,H,14.0,4.0,6.0,2.0,2.0,2.0,0.0,1.0,1.2575,5.86875,12.18875
1,Leicester,Everton,2.0,2.0,D,11.0,13.0,3.0,3.0,1.0,1.0,0.0,0.0,3.08625,3.31,2.37375
2,Man United,Swansea,1.0,2.0,A,14.0,5.0,5.0,4.0,2.0,4.0,0.0,0.0,1.3625,4.94375,9.45
3,QPR,Hull,0.0,1.0,A,19.0,11.0,6.0,4.0,1.0,2.0,0.0,0.0,2.47875,3.2075,3.00875
4,Stoke,Aston Villa,0.0,1.0,A,12.0,7.0,2.0,2.0,0.0,3.0,0.0,0.0,1.95625,3.35875,4.2125
