## Import dataset

In [1]:
import pandas as pd

data = pd.read_csv('data/all_seasons.csv')
print(data.shape[0], 'matches')
data.head(5)

7430 matches


Unnamed: 0,Date,HomeTeam,AwayTeam,HTHG,HTAG,HTR,FTHG,FTAG,FTR,HS,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,14/08/04,Aston Villa,Southampton,2.0,0.0,H,2.0,0.0,H,14.0,...,5.0,2.0,12.0,6.0,14.0,9.0,0.0,0.0,0.0,0.0
1,14/08/04,Blackburn,West Brom,0.0,1.0,A,1.0,1.0,D,12.0,...,4.0,2.0,4.0,5.0,15.0,17.0,1.0,0.0,0.0,0.0
2,14/08/04,Bolton,Charlton,2.0,0.0,H,4.0,1.0,H,21.0,...,11.0,5.0,9.0,5.0,10.0,12.0,1.0,1.0,0.0,0.0
3,14/08/04,Man City,Fulham,1.0,0.0,H,1.0,1.0,D,12.0,...,5.0,2.0,9.0,4.0,14.0,12.0,0.0,2.0,0.0,0.0
4,14/08/04,Middlesbrough,Newcastle,0.0,1.0,A,2.0,2.0,D,15.0,...,8.0,4.0,6.0,7.0,16.0,13.0,3.0,1.0,0.0,0.0


In [2]:
all_teams = pd.read_csv('data/teams.csv')

home_teams = all_teams['Teams']

team_list = []
for t in home_teams:
    if t not in team_list:
        team_list.append(t)
        
print(len(team_list), 'Teams in Season')
i=1
for team in team_list:
    print(i, team)
    i+=1

20 Teams in Season
1 Liverpool
2 West Ham
3 Bournemouth
4 Burnley
5 Crystal Palace
6 Watford
7 Tottenham
8 Leicester
9 Newcastle
10 Man United
11 Arsenal
12 Aston Villa
13 Brighton
14 Everton
15 Norwich
16 Southampton
17 Man City
18 Sheffield United
19 Chelsea
20 Wolves


## Preprocess data

### Keep records of only the teams in the current season

In [3]:
filtered = data[(data['HomeTeam'].isin(team_list))]
data = filtered[(filtered['AwayTeam'].isin(team_list))]

print(data.shape, 'records')

(2961, 21) records


### Prepare features and label

In [4]:
X = data.drop(['FTR'], axis=1)
Z = X.drop(['Date', 'HTR'], axis=1)
print(Z.shape[1], 'Features')
for c in Z.columns:
    print(c)
y = data['FTR']

18 Features
HomeTeam
AwayTeam
HTHG
HTAG
FTHG
FTAG
HS
AS
HST
AST
HC
AC
HF
AF
HY
AY
HR
AR


### Scale and standardise the feature data
* Center to the mean and component wise scale to unit variance

In [5]:
from sklearn.preprocessing import scale

cols = [['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 
        'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']]
for col in cols:
    X[col] = scale(X[col])
    Z[col] = scale(Z[col])
    
print(Z.shape)
Z.tail()

(2961, 18)


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
7419,Man United,Arsenal,-0.836251,-0.712426,-1.208834,-0.1576,-0.866365,-1.255492,-0.894901,-0.971452,-0.02133,0.067751,2.603266,1.09753,2.050946,0.152883,-0.250641,-0.310832
7420,Arsenal,Everton,1.519733,2.009502,1.836779,1.554894,0.628678,0.690849,0.010911,0.469304,-0.66253,-0.671148,-1.098057,-1.728594,-1.20353,-1.41622,-0.250641,-0.310832
7422,Chelsea,Aston Villa,-0.836251,0.648538,-0.44743,1.554894,0.254917,0.907109,0.010911,2.270248,0.29927,0.437201,0.752604,1.611371,-1.20353,-0.631668,-0.250641,-0.310832
7424,Leicester,Tottenham,-0.836251,-0.712426,0.313973,-0.1576,-0.679484,0.474589,0.010911,0.469304,-0.02133,1.545549,-1.626818,-1.471674,-1.20353,-1.41622,-0.250641,-0.310832
7427,Southampton,Newcastle,1.519733,-0.712426,1.075376,-0.1576,-1.240125,1.123369,-0.894901,1.189682,-0.98313,0.437201,1.016984,-0.187072,-1.20353,-0.631668,3.49758,-0.310832


### Handle categorical values
* Input data needs to be continous variables that are integers
* Convert to dummy variables

In [6]:
def preprocess(Z):
    df = pd.DataFrame(index=Z.index)
    for col, data in Z.iteritems():
        if data.dtype == object:
            data = pd.get_dummies(data, prefix=col)
        df = df.join(data)
    return df

Z = preprocess(Z)
print ("Processed feature columns ({} total features):\n{}".format(len(Z.columns), list(Z.columns)))

Processed feature columns (56 total features):
['HomeTeam_Arsenal', 'HomeTeam_Aston Villa', 'HomeTeam_Bournemouth', 'HomeTeam_Brighton', 'HomeTeam_Burnley', 'HomeTeam_Chelsea', 'HomeTeam_Crystal Palace', 'HomeTeam_Everton', 'HomeTeam_Leicester', 'HomeTeam_Liverpool', 'HomeTeam_Man City', 'HomeTeam_Man United', 'HomeTeam_Newcastle', 'HomeTeam_Norwich', 'HomeTeam_Sheffield United', 'HomeTeam_Southampton', 'HomeTeam_Tottenham', 'HomeTeam_Watford', 'HomeTeam_West Ham', 'HomeTeam_Wolves', 'AwayTeam_Arsenal', 'AwayTeam_Aston Villa', 'AwayTeam_Bournemouth', 'AwayTeam_Brighton', 'AwayTeam_Burnley', 'AwayTeam_Chelsea', 'AwayTeam_Crystal Palace', 'AwayTeam_Everton', 'AwayTeam_Leicester', 'AwayTeam_Liverpool', 'AwayTeam_Man City', 'AwayTeam_Man United', 'AwayTeam_Newcastle', 'AwayTeam_Norwich', 'AwayTeam_Sheffield United', 'AwayTeam_Southampton', 'AwayTeam_Tottenham', 'AwayTeam_Watford', 'AwayTeam_West Ham', 'AwayTeam_Wolves', 'HTHG', 'HTAG', 'FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 

### Feature information

In [7]:
print('\nFeature values:')
Z.head()


Feature values:


Unnamed: 0,HomeTeam_Arsenal,HomeTeam_Aston Villa,HomeTeam_Bournemouth,HomeTeam_Brighton,HomeTeam_Burnley,HomeTeam_Chelsea,HomeTeam_Crystal Palace,HomeTeam_Everton,HomeTeam_Leicester,HomeTeam_Liverpool,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,0,1,0,0,0,0,0,0,0,0,...,-0.291027,-0.971452,1.90227,0.437201,0.752604,-0.700913,-1.20353,-1.41622,-0.250641,-0.310832
5,0,0,0,0,0,0,0,0,0,0,...,1.218661,1.189682,-0.02133,2.284449,0.752604,1.09753,-1.20353,-0.631668,-0.250641,-0.310832
7,0,0,0,0,0,0,0,0,0,0,...,0.312848,1.189682,-0.98313,1.1761,1.545745,-0.187072,1.237327,-0.631668,-0.250641,-0.310832
8,0,0,0,0,0,1,0,0,0,0,...,-0.291027,-0.611263,-1.30373,-0.671148,1.545745,-0.957833,-1.20353,-1.41622,-0.250641,-0.310832
9,0,0,0,0,0,0,0,1,0,0,...,-0.291027,3.350815,-1.94493,0.80665,0.752604,1.868291,0.423708,-0.631668,-0.250641,-0.310832


### Split data into training and test sets

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(Z, y, test_size = 50,
                                                    random_state = 2,
                                                    stratify = y)
print('Training data:',len(X_train))
print('Test data:',len(X_test))

Training data: 2911
Test data: 50


## Create models
Classifiers:
* Logistic Regression
* Support Vector Classifier
* K-Nearest Neighbors

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

clf_lr = LogisticRegression(random_state=42)
clf_svc = SVC(kernel='poly', random_state=42)
clf_knn = KNeighborsClassifier(n_neighbors=10)

## Train and Evaluate the models

* Train the model
* Test based on the F1 score and Accuarcy
  * F1 score considers both the precision and the recall of the test to compute the score
  * The F1 score can be interpreted as a weighted average of the precision and recall
  * F1 score reaches its best value at 1 and worst at 0.
  * Accuracy is the ratio of correct predictions to the total predictions

In [10]:
from time import time
from sklearn.metrics import f1_score

def train(clf, X_train, y_train):
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    print('Model trained in {:.4f} secs'.format(end-start))

def test(clf, features, labels):
    start = time()
    y_pred = clf.predict(features)
    end = time()
    print('Test predictions made in {:.4f} secs'.format(end-start))
    f1 = f1_score(labels, y_pred, average='macro')
    acc = sum(labels==y_pred)/float(len(y_pred))
    return f1, acc

def train_test(clf, X_train, y_train, X_test, y_test):
    print('Training {}...'.format(clf.__class__.__name__))
    train(clf, X_train, y_train)
    f1, acc = test(clf, X_train, y_train)
    print('For Training set: F1 score= {:.4f}, Accuracy= {:.4f}'.format(f1, acc))
    f1, acc = test(clf, X_test, y_test)
    print('For Test set: F1 score= {:.4f}, Accuracy= {:.4f}'.format(f1, acc))
    
train_test(clf_lr, X_train, y_train, X_test, y_test)
print('------------------------------------------------')
train_test(clf_svc, X_train, y_train, X_test, y_test)
print('------------------------------------------------')
train_test(clf_knn, X_train, y_train, X_test, y_test)
print('------------------------------------------------')

Training LogisticRegression...
Model trained in 0.0827 secs
Test predictions made in 0.0017 secs
For Training set: F1 score= 1.0000, Accuracy= 1.0000
Test predictions made in 0.0012 secs
For Test set: F1 score= 1.0000, Accuracy= 1.0000
------------------------------------------------
Training SVC...
Model trained in 0.3545 secs
Test predictions made in 0.2335 secs
For Training set: F1 score= 0.9535, Accuracy= 0.9626
Test predictions made in 0.0048 secs
For Test set: F1 score= 0.8800, Accuracy= 0.9000
------------------------------------------------
Training KNeighborsClassifier...
Model trained in 0.0123 secs
Test predictions made in 0.6155 secs
For Training set: F1 score= 0.8196, Accuracy= 0.8416
Test predictions made in 0.0124 secs
For Test set: F1 score= 0.7006, Accuracy= 0.7200
------------------------------------------------


## Use the best model for making predictions
* Set the model
* Train the model with training dataset
* Make predictions
* Predict the probability of results (Away team win, draw, Home team win)

In [11]:
model = SVC(kernel='poly', random_state=42)
model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [12]:
pred = model.predict(X_test)
pred_df = pd.DataFrame(pred, columns=['Prediction'])

if(model.probability):
    pred_prob = model.predict_proba(X_test) 
    pred_prob_df = pd.DataFrame(pred_prob, columns=['Away Win %', 'Draw %', 'Home Win %']) 
    pred_prob_df = pred_prob_df.round(6)*100
    prediction_df = pd.concat([pred_df, pred_prob_df], axis=1) 
else:
    prediction_df = pred_df
    
prediction_df.head()

Unnamed: 0,Prediction
0,H
1,H
2,D
3,A
4,H


## Incorporate the result probabilities into the fixture

In [13]:
fixtures = pd.read_csv('data/fixtures/epl-2019-GMT.csv')

print(fixtures.shape[0], 'matches')
fixtures.head()

380 matches


Unnamed: 0,Round Number,Date,Location,Home Team,Away Team,Result
0,1,09/08/2019 20:00,Anfield,Liverpool,Norwich,4 - 1
1,1,10/08/2019 12:30,London Stadium,West Ham,Man City,0 - 5
2,1,10/08/2019 15:00,Vitality Stadium,Bournemouth,Sheffield Utd,1 - 1
3,1,10/08/2019 15:00,Turf Moor,Burnley,Southampton,3 - 0
4,1,10/08/2019 15:00,Selhurst Park,Crystal Palace,Everton,0 - 0


In [14]:
fixtures = fixtures.drop(['Round Number','Date','Location','Result'], 1)
fixtures.columns = ['HomeTeam', 'AwayTeam']
fixtures['HTHG'] = 0
fixtures['HTAG'] = 0
fixtures['FTHG'] = 0
fixtures['FTAG'] = 0
fixtures['HS'] = 0
fixtures['AS'] = 0
fixtures['HST'] = 0
fixtures['AST'] = 0
fixtures['HC'] = 0
fixtures['AC'] = 0
fixtures['HF'] = 0
fixtures['AF'] = 0
fixtures['HY'] = 0
fixtures['AY'] = 0
fixtures['HR'] = 0
fixtures['AR'] = 0

print(fixtures.shape, 'features')
fixtures.head()

(380, 18) features


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,Liverpool,Norwich,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,West Ham,Man City,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Bournemouth,Sheffield Utd,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Burnley,Southampton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Crystal Palace,Everton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
pp_fixtures = preprocess(fixtures)
print ("Processed feature columns ({} total features):\n{}".format(
    len(pp_fixtures.columns), 
    list(pp_fixtures.columns)
))

Processed feature columns (56 total features):
['HomeTeam_Arsenal', 'HomeTeam_Aston Villa', 'HomeTeam_Bournemouth', 'HomeTeam_Brighton', 'HomeTeam_Burnley', 'HomeTeam_Chelsea', 'HomeTeam_Crystal Palace', 'HomeTeam_Everton', 'HomeTeam_Leicester', 'HomeTeam_Liverpool', 'HomeTeam_Man City', 'HomeTeam_Man Utd', 'HomeTeam_Newcastle', 'HomeTeam_Norwich', 'HomeTeam_Sheffield Utd', 'HomeTeam_Southampton', 'HomeTeam_Spurs', 'HomeTeam_Watford', 'HomeTeam_West Ham', 'HomeTeam_Wolves', 'AwayTeam_Arsenal', 'AwayTeam_Aston Villa', 'AwayTeam_Bournemouth', 'AwayTeam_Brighton', 'AwayTeam_Burnley', 'AwayTeam_Chelsea', 'AwayTeam_Crystal Palace', 'AwayTeam_Everton', 'AwayTeam_Leicester', 'AwayTeam_Liverpool', 'AwayTeam_Man City', 'AwayTeam_Man Utd', 'AwayTeam_Newcastle', 'AwayTeam_Norwich', 'AwayTeam_Sheffield Utd', 'AwayTeam_Southampton', 'AwayTeam_Spurs', 'AwayTeam_Watford', 'AwayTeam_West Ham', 'AwayTeam_Wolves', 'HTHG', 'HTAG', 'FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'A

In [16]:
model.predict(pp_fixtures)

array(['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H

In [17]:
fixtures['Prediction'] = model.predict(pp_fixtures)

In [18]:
Result = fixtures.drop(['FTHG','FTAG','HTHG','HTAG','HS','AS',
                        'HST','AST','HF','AF','HC','AC',
                        'HY','AY','HR','AR'],1)


Result.tail(10)

Unnamed: 0,HomeTeam,AwayTeam,Prediction
370,Arsenal,Watford,H
371,Burnley,Brighton,H
372,Chelsea,Wolves,H
373,Crystal Palace,Spurs,H
374,Everton,Bournemouth,H
375,Leicester,Man Utd,H
376,Man City,Norwich,H
377,Newcastle,Liverpool,H
378,Southampton,Sheffield Utd,H
379,West Ham,Aston Villa,H


In [19]:
if(model.probability):
    fixture_pred_prob = model.predict_proba(pp_fixtures) *100
    fixture_pred_prob = pd.DataFrame(fixture_pred_prob, columns=['Away win %','Draw %','Home win %'])
    display(fixture_pred_prob)