## Import dataset

In [1]:
import pandas as pd

data = pd.read_csv('data/all_seasons.csv')
print(data.shape[0], 'matches')
data.head()

3999 matches


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,BbAvAHA,GBH,GBD,GBA,SBH,SBD,SBA,BSH,BSD,BSA
0,E0,09/08/2019,20:00,Liverpool,Norwich,4,1,H,4,0,...,,,,,,,,,,
1,E0,10/08/2019,12:30,West Ham,Man City,0,5,A,0,1,...,,,,,,,,,,
2,E0,10/08/2019,15:00,Bournemouth,Sheffield United,1,1,D,0,0,...,,,,,,,,,,
3,E0,10/08/2019,15:00,Burnley,Southampton,3,0,H,0,0,...,,,,,,,,,,
4,E0,10/08/2019,15:00,Crystal Palace,Everton,0,0,D,0,0,...,,,,,,,,,,


In [2]:
all_teams = pd.read_csv('data/teams.csv')
display(all_teams)

Unnamed: 0,Teams
0,Liverpool
1,West Ham
2,Bournemouth
3,Burnley
4,Crystal Palace
5,Watford
6,Tottenham
7,Leicester
8,Newcastle
9,Man United


In [3]:
home_teams = all_teams['Teams']

team_list = []
for t in home_teams:
    if t not in team_list:
        team_list.append(t)
        
print('Teams in Season: 2019-2020')
display(team_list)

Teams in Season: 2019-2020


['Liverpool',
 'West Ham',
 'Bournemouth',
 'Burnley',
 'Crystal Palace',
 'Watford',
 'Tottenham',
 'Leicester',
 'Newcastle',
 'Man United',
 'Arsenal',
 'Aston Villa',
 'Brighton',
 'Everton',
 'Norwich',
 'Southampton',
 'Man City',
 'Sheffield United',
 'Chelsea',
 'Wolves']

## Preprocess data

### Keep records of only the teams in the current season

In [4]:
filtered = data[(data['HomeTeam'].isin(team_list))]
data = filtered[(filtered['AwayTeam'].isin(team_list))]

print(data.shape, 'records')

(1933, 139) records


### Keep only required columns

In [5]:
req_cols = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 
            'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 
            'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']
data = data[req_cols]

display(data.head())

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,09/08/2019,Liverpool,Norwich,4,1,H,4,0,H,M Oliver,...,7,5,9,9,11,2,0,2,0,0
1,10/08/2019,West Ham,Man City,0,5,A,0,1,A,M Dean,...,3,9,6,13,1,1,2,2,0,0
2,10/08/2019,Bournemouth,Sheffield United,1,1,D,0,0,D,K Friend,...,3,3,10,19,3,4,2,1,0,0
3,10/08/2019,Burnley,Southampton,3,0,H,0,0,D,G Scott,...,4,3,6,12,2,7,0,0,0,0
4,10/08/2019,Crystal Palace,Everton,0,0,D,0,0,D,J Moss,...,2,3,16,14,6,2,2,1,0,1


### Prepare features and label

In [6]:
X = data.drop(['FTR'], 1)
print(X.shape, 'X features')
y = data['FTR']
print(y.shape, 'labels')
Z = X.drop(['Date', 'HTR', 'Referee'], 1)
print(Z.shape, 'Z features')

(1933, 21) X features
(1933,) labels
(1933, 18) Z features


### Scale and standardise the feature data
* Center to the mean and component wise scale to unit variance

In [7]:
from sklearn.preprocessing import scale

cols = [['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 
        'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']]
for col in cols:
    X[col] = scale(X[col])
    
for col in cols:
    Z[col] = scale(Z[col])

In [8]:
display(Z.tail())
print(Z.shape)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
3993,Man United,Watford,-0.465328,-1.032883,0.319787,-0.731166,-1.263721,-0.916667,-1.406146,-0.547407,-1.292904,-0.001873,0.006454,0.046208,1.94693,-1.429184,-0.250917,-0.292992
3994,Newcastle,Chelsea,1.047673,-1.032883,0.319787,-0.731166,0.347607,-1.128062,0.149354,-0.913165,0.150209,-0.28039,-0.633321,-1.061055,-1.245787,-0.643235,-0.250917,-0.292992
3995,Southampton,Man City,-1.221829,-0.19575,-0.833474,-0.731166,-1.084685,0.351706,-0.783946,-0.913165,-0.715659,-0.28039,-1.592984,2.62982,1.14875,-0.643235,-0.250917,-0.292992
3997,Tottenham,Leicester,2.560675,2.31565,0.319787,1.945621,-0.010466,0.985892,0.149354,1.64714,-0.427036,0.55516,-0.633321,-0.32288,-0.447608,0.142715,-0.250917,-0.292992
3998,West Ham,Everton,1.047673,-0.19575,0.319787,-0.731166,0.16857,0.563101,-0.472846,0.915624,-0.138413,0.55516,0.006454,0.415295,-1.245787,-0.643235,-0.250917,-0.292992


(1933, 18)


### Handle categorical values
* Input data needs to be continous variables that are integers
* Convert to dummy variables

In [9]:
def preprocess(Z):
    df = pd.DataFrame(index=Z.index)
    for col, data in Z.iteritems():
        if data.dtype == object:
            data = pd.get_dummies(data, prefix=col)
        df = df.join(data)
    return df

Z = preprocess(Z)
print ("Processed feature columns ({} total features):\n{}".format(len(Z.columns), list(Z.columns)))

Processed feature columns (56 total features):
['HomeTeam_Arsenal', 'HomeTeam_Aston Villa', 'HomeTeam_Bournemouth', 'HomeTeam_Brighton', 'HomeTeam_Burnley', 'HomeTeam_Chelsea', 'HomeTeam_Crystal Palace', 'HomeTeam_Everton', 'HomeTeam_Leicester', 'HomeTeam_Liverpool', 'HomeTeam_Man City', 'HomeTeam_Man United', 'HomeTeam_Newcastle', 'HomeTeam_Norwich', 'HomeTeam_Sheffield United', 'HomeTeam_Southampton', 'HomeTeam_Tottenham', 'HomeTeam_Watford', 'HomeTeam_West Ham', 'HomeTeam_Wolves', 'AwayTeam_Arsenal', 'AwayTeam_Aston Villa', 'AwayTeam_Bournemouth', 'AwayTeam_Brighton', 'AwayTeam_Burnley', 'AwayTeam_Chelsea', 'AwayTeam_Crystal Palace', 'AwayTeam_Everton', 'AwayTeam_Leicester', 'AwayTeam_Liverpool', 'AwayTeam_Man City', 'AwayTeam_Man United', 'AwayTeam_Newcastle', 'AwayTeam_Norwich', 'AwayTeam_Sheffield United', 'AwayTeam_Southampton', 'AwayTeam_Tottenham', 'AwayTeam_Watford', 'AwayTeam_West Ham', 'AwayTeam_Wolves', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 

In [10]:
print('Feature Values:')
display(Z.head())
Z.info()

Feature Values:


Unnamed: 0,HomeTeam_Arsenal,HomeTeam_Aston Villa,HomeTeam_Bournemouth,HomeTeam_Brighton,HomeTeam_Burnley,HomeTeam_Chelsea,HomeTeam_Crystal Palace,HomeTeam_Everton,HomeTeam_Leicester,HomeTeam_Liverpool,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,0,0,0,0,0,0,0,0,0,1,...,0.460454,0.184109,-0.427036,-0.558907,1.605892,-1.061055,-1.245787,0.142715,-0.250917,-0.292992
1,0,0,0,0,0,0,0,0,0,0,...,-0.783946,1.64714,-1.292904,0.55516,-1.592984,-1.430142,0.350571,0.142715,-0.250917,-0.292992
2,0,0,1,0,0,0,0,0,0,0,...,-0.783946,-0.547407,-0.138413,2.226261,-0.953209,-0.32288,0.350571,-0.643235,-0.250917,-0.292992
3,0,0,0,0,1,0,0,0,0,0,...,-0.472846,-0.547407,-1.292904,0.276644,-1.273097,0.784382,-1.245787,-1.429184,-0.250917,-0.292992
4,0,0,0,0,0,0,1,0,0,0,...,-1.095046,-0.547407,1.593322,0.833677,0.006454,-1.061055,0.350571,-0.643235,-0.250917,3.16038


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1933 entries, 0 to 3998
Data columns (total 56 columns):
HomeTeam_Arsenal             1933 non-null uint8
HomeTeam_Aston Villa         1933 non-null uint8
HomeTeam_Bournemouth         1933 non-null uint8
HomeTeam_Brighton            1933 non-null uint8
HomeTeam_Burnley             1933 non-null uint8
HomeTeam_Chelsea             1933 non-null uint8
HomeTeam_Crystal Palace      1933 non-null uint8
HomeTeam_Everton             1933 non-null uint8
HomeTeam_Leicester           1933 non-null uint8
HomeTeam_Liverpool           1933 non-null uint8
HomeTeam_Man City            1933 non-null uint8
HomeTeam_Man United          1933 non-null uint8
HomeTeam_Newcastle           1933 non-null uint8
HomeTeam_Norwich             1933 non-null uint8
HomeTeam_Sheffield United    1933 non-null uint8
HomeTeam_Southampton         1933 non-null uint8
HomeTeam_Tottenham           1933 non-null uint8
HomeTeam_Watford             1933 non-null uint8
HomeTeam_We

### Split data into training and test sets

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(Z, y, test_size = 50,
                                                    random_state = 2,
                                                    stratify = y)
print('Training data:',len(X_train))
print('Test data:',len(X_test))

Training data: 1883
Test data: 50


## Create models
Classifiers:
* Logistic Regression
* Support Vector Classifier
* K-Nearest Neighbors

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.linear_model import LinearRegression
#from sklearn import tree

clf_lr = LogisticRegression(random_state=42)
clf_svc = SVC(random_state=912, kernel='rbf')
clf_knn = KNeighborsClassifier(n_neighbors=60)

## Train and Evaluate the models

* Train the model
* Test based on the F1 score and Accuarcy
  * F1 score considers both the precision and the recall of the test to compute the score
  * The F1 score can be interpreted as a weighted average of the precision and recall
  * F1 score reaches its best value at 1 and worst at 0.
  * Accuracy is the ratio of correct predictions to the total predictions

In [13]:
from time import time
from sklearn.metrics import f1_score

def train(clf, X_train, y_train):
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    print('Model trained in {:.4f} secs'.format(end-start))

def test(clf, features, labels):
    start = time()
    y_pred = clf.predict(features)
    end = time()
    print('Test predictions made in {:.4f} secs'.format(end-start))
    f1 = f1_score(labels, y_pred, average='macro')
    acc = sum(labels==y_pred)/float(len(y_pred))
    return f1, acc

def train_test(clf, X_train, y_train, X_test, y_test):
    print('Training {}...'.format(clf.__class__.__name__))
    train(clf, X_train, y_train)
    f1, acc = test(clf, X_train, y_train)
    print('For Training set: F1 score= {:.4f}, Accuracy= {:.4f}'.format(f1, acc))
    f1, acc = test(clf, X_test, y_test)
    print('For Test set: F1 score= {:.4f}, Accuracy= {:.4f}'.format(f1, acc))
    
train_test(clf_lr, X_train, y_train, X_test, y_test)
print('------------------------------------------------')
train_test(clf_svc, X_train, y_train, X_test, y_test)
print('------------------------------------------------')
train_test(clf_knn, X_train, y_train, X_test, y_test)
print('------------------------------------------------')

Training LogisticRegression...
Model trained in 0.0631 secs
Test predictions made in 0.0014 secs
For Training set: F1 score= 1.0000, Accuracy= 1.0000
Test predictions made in 0.0009 secs
For Test set: F1 score= 1.0000, Accuracy= 1.0000
------------------------------------------------
Training SVC...
Model trained in 0.1697 secs
Test predictions made in 0.1115 secs
For Training set: F1 score= 1.0000, Accuracy= 1.0000
Test predictions made in 0.0037 secs
For Test set: F1 score= 1.0000, Accuracy= 1.0000
------------------------------------------------
Training KNeighborsClassifier...
Model trained in 0.0067 secs
Test predictions made in 0.3239 secs
For Training set: F1 score= 0.7131, Accuracy= 0.7817
Test predictions made in 0.0103 secs
For Test set: F1 score= 0.6711, Accuracy= 0.7400
------------------------------------------------


## Use the best model for making predictions
* Set the model
* Train the model with training dataset
* Make predictions
* Predict the probability of results (Away team win, draw, Home team win)

In [14]:
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
model.predict(X_test)

array(['A', 'D', 'H', 'H', 'D', 'D', 'H', 'A', 'H', 'H', 'H', 'A', 'H',
       'H', 'D', 'A', 'A', 'A', 'A', 'D', 'H', 'D', 'H', 'H', 'D', 'H',
       'H', 'D', 'D', 'D', 'D', 'A', 'H', 'A', 'H', 'H', 'H', 'A', 'H',
       'A', 'A', 'D', 'A', 'H', 'A', 'H', 'H', 'H', 'A', 'H'],
      dtype=object)

In [16]:
pred_prob = model.predict_proba(X_test)
pred_prob = pd.DataFrame(pred_prob, columns=['Away Team Win', 'Draw', 'Home Team Win'])

display((pred_prob*100).head())

Unnamed: 0,Away Team Win,Draw,Home Team Win
0,99.983507,0.016493,2.496719e-09
1,1.888309,93.921973,4.189718
2,0.000185,3.844102,96.15571
3,0.005124,8.171812,91.82306
4,3.001587,93.755141,3.243272


## Incorporate the result probabilities into the fixture

In [17]:
fixtures = pd.read_csv('data/fixtures/epl-2019-GMT.csv')
fixtures.head()

Unnamed: 0,Round Number,Date,Location,Home Team,Away Team,Result
0,1,09/08/2019 20:00,Anfield,Liverpool,Norwich,4 - 1
1,1,10/08/2019 12:30,London Stadium,West Ham,Man City,0 - 5
2,1,10/08/2019 15:00,Vitality Stadium,Bournemouth,Sheffield Utd,1 - 1
3,1,10/08/2019 15:00,Turf Moor,Burnley,Southampton,3 - 0
4,1,10/08/2019 15:00,Selhurst Park,Crystal Palace,Everton,0 - 0


In [18]:
fixtures['HomeTeam'] = fixtures['Home Team']
fixtures['AwayTeam'] = fixtures['Away Team']
fixtures['HTHG'] = 0
fixtures['HTAG'] = 0
fixtures['FTHG'] = 0
fixtures['FTAG'] = 0
fixtures['HS'] = 0
fixtures['AS'] = 0
fixtures['HST'] = 0
fixtures['AST'] = 0
fixtures['HC'] = 0
fixtures['AC'] = 0
fixtures['HF'] = 0
fixtures['AF'] = 0
fixtures['HY'] = 0
fixtures['AY'] = 0
fixtures['HR'] = 0
fixtures['AR'] = 0

fixtures = fixtures.drop(['Round Number', 'Date', 'Location', 'Home Team', 'Away Team', 'Result'], 1)

print(fixtures.shape, 'features')
display(fixtures.head())

(380, 18) features


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,Liverpool,Norwich,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,West Ham,Man City,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Bournemouth,Sheffield Utd,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Burnley,Southampton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Crystal Palace,Everton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
pp_fixtures = preprocess(fixtures)
print(pp_fixtures.shape, 'features')

(380, 56) features


In [20]:
model.predict(pp_fixtures)

array(['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D

In [21]:
fixtures['Result Predicted'] = model.predict(pp_fixtures)

In [23]:
Result = fixtures.drop(['FTHG','FTAG','HTHG','HTAG','HS','AS',
                        'HST','AST','HF','AF','HC','AC',
                        'HY','AY','HR','AR'],1)
Result.head(10)

Unnamed: 0,HomeTeam,AwayTeam,Result Predicted
0,Liverpool,Norwich,D
1,West Ham,Man City,D
2,Bournemouth,Sheffield Utd,D
3,Burnley,Southampton,D
4,Crystal Palace,Everton,D
5,Watford,Brighton,D
6,Spurs,Aston Villa,D
7,Leicester,Wolves,D
8,Newcastle,Arsenal,D
9,Man Utd,Chelsea,D


In [24]:
fixture_pred_prob = model.predict_proba(pp_fixtures) *100
fixture_pred_prob = pd.DataFrame(fixture_pred_prob, columns=['Away win %','Draw %','Home win %'])

display(fixture_pred_prob)

Unnamed: 0,Away win %,Draw %,Home win %
0,0.214990,72.948080,26.836930
1,0.420715,68.716700,30.862585
2,0.281879,79.120598,20.597523
3,0.402556,58.959524,40.637920
4,0.348315,65.527898,34.123786
...,...,...,...
375,0.373874,68.984759,30.641367
376,0.254909,60.075348,39.669742
377,0.422568,69.467163,30.110269
378,0.302957,80.180821,19.516223
