## Import dataset

In [1]:
import pandas as pd

data = pd.read_csv('../data/combined/combined_data.csv')
print(data.shape[0], 'matches')
data.head(5)

3630 matches


Unnamed: 0,Date,HomeTeam,AwayTeam,HTHG,HTAG,HTR,FTHG,FTAG,FTR,HS,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,17/08/13,Arsenal,Aston Villa,1.0,1.0,D,1.0,3.0,A,16.0,...,4.0,4.0,4.0,3.0,15.0,18.0,4.0,5.0,1.0,0.0
1,17/08/13,Liverpool,Stoke,1.0,0.0,H,1.0,0.0,H,26.0,...,11.0,4.0,12.0,6.0,11.0,11.0,1.0,1.0,0.0,0.0
2,17/08/13,Norwich,Everton,0.0,0.0,D,2.0,2.0,D,8.0,...,2.0,6.0,6.0,8.0,13.0,10.0,2.0,0.0,0.0,0.0
3,17/08/13,Sunderland,Fulham,0.0,0.0,D,0.0,1.0,A,20.0,...,3.0,1.0,6.0,1.0,14.0,14.0,0.0,3.0,0.0,0.0
4,17/08/13,Swansea,Man United,0.0,2.0,A,1.0,4.0,A,17.0,...,6.0,7.0,7.0,4.0,13.0,10.0,1.0,3.0,0.0,0.0


In [2]:
curr_df = pd.read_csv('../data/season-2018-2019.csv')

home_teams = curr_df['HomeTeam']

team_list = []
for t in home_teams:
    if t not in team_list:
        team_list.append(t)
        
print(len(team_list), 'Teams in Season')
i=1
for team in team_list:
    print(i, team)
    i+=1

20 Teams in Season
1 Man United
2 Bournemouth
3 Fulham
4 Huddersfield
5 Newcastle
6 Watford
7 Wolves
8 Arsenal
9 Liverpool
10 Southampton
11 Cardiff
12 Chelsea
13 Everton
14 Leicester
15 Tottenham
16 West Ham
17 Brighton
18 Burnley
19 Man City
20 Crystal Palace


## Preprocess data

### Keep records of only the teams in the current season

In [3]:
filtered = data[(data['HomeTeam'].isin(team_list))]
data = filtered[(filtered['AwayTeam'].isin(team_list))]

print(data.shape, 'records')

(1788, 21) records


### Prepare features and label

In [4]:
X = data.drop(['FTR'], axis=1)
Z = X.drop(['Date', 'HTR'], axis=1)
print(Z.shape[1], 'Features')
for c in Z.columns:
    print(c)
y = data['FTR']

18 Features
HomeTeam
AwayTeam
HTHG
HTAG
FTHG
FTAG
HS
AS
HST
AST
HC
AC
HF
AF
HY
AY
HR
AR


### Scale and standardise the feature data
* Center to the mean and component wise scale to unit variance

In [5]:
from sklearn.preprocessing import scale

cols = [['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 
        'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']]
for col in cols:
    X[col] = scale(X[col])
    Z[col] = scale(Z[col])
    
print(Z.shape)
Z.tail()

(1788, 18)


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
3625,Liverpool,Wolves,0.328622,-0.732737,0.308402,-1.031586,-0.204476,-0.917444,-0.119938,-0.907005,-0.627103,-1.430888,-2.190239,0.036045,-1.244514,0.17262,-0.248904,-0.285377
3626,Man United,Cardiff,-0.830305,0.58398,-1.225459,0.627862,2.086049,0.336836,1.473291,-0.144558,1.656522,-1.053093,-0.422626,-1.371144,1.15074,0.974293,-0.248904,-0.285377
3627,Southampton,Huddersfield,0.328622,-0.732737,-0.458528,-0.201862,-0.733059,-0.290304,-0.757229,-0.525782,-0.627103,-0.675298,-0.717228,-1.371144,-1.244514,-0.629053,-0.248904,-0.285377
3628,Tottenham,Everton,0.328622,-0.732737,0.308402,0.627862,-0.556864,1.173023,-0.757229,1.761561,0.351593,-0.297503,-0.128023,0.598921,-1.244514,0.17262,-0.248904,-0.285377
3629,Watford,West Ham,-0.830305,1.900697,-0.458528,2.287309,0.500301,0.963976,0.835999,1.761561,0.351593,-1.053093,-0.128023,-0.245392,-0.446096,-1.430727,3.796924,-0.285377


### Handle categorical values
* Input data needs to be continous variables that are integers
* Convert to dummy variables

In [6]:
def preprocess(Z):
    df = pd.DataFrame(index=Z.index)
    for col, data in Z.iteritems():
        if data.dtype == object:
            data = pd.get_dummies(data, prefix=col)
        df = df.join(data)
    return df

Z = preprocess(Z)

### Feature information

In [7]:
print('\nFeature values:')
Z.head()


Feature values:


Unnamed: 0,HomeTeam_Arsenal,HomeTeam_Bournemouth,HomeTeam_Brighton,HomeTeam_Burnley,HomeTeam_Cardiff,HomeTeam_Chelsea,HomeTeam_Crystal Palace,HomeTeam_Everton,HomeTeam_Fulham,HomeTeam_Huddersfield,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
6,0,0,0,0,0,0,0,0,0,0,...,-0.438583,-1.288229,-0.627103,-0.675298,-0.128023,-1.089706,-1.244514,-0.629053,-0.248904,-0.285377
8,0,0,0,0,0,0,1,0,0,0,...,-0.757229,-0.907005,-0.953335,0.835882,-1.306432,-0.52683,-0.446096,-1.430727,-0.248904,-0.285377
9,0,0,0,0,0,0,0,0,0,0,...,1.791936,-1.288229,0.677826,-1.430888,-0.422626,-1.089706,0.352322,0.974293,-0.248904,3.258059
13,0,0,0,0,0,0,0,0,1,0,...,0.517354,0.999113,-1.605799,1.213677,-0.128023,-0.808268,0.352322,0.17262,-0.248904,-0.285377
15,0,0,0,0,0,0,0,0,0,0,...,-1.713166,-1.288229,-0.300871,-0.297503,0.461181,0.880359,-1.244514,-0.629053,-0.248904,-0.285377


### Split data into training and test sets

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(Z, y, test_size = 50,
                                                    random_state = 2,
                                                    stratify = y)
print('Training data:',len(X_train))
print('Test data:',len(X_test))

Training data: 1738
Test data: 50


## Create models
Classifiers:
* Logistic Regression
* Support Vector Classifier
* K-Nearest Neighbors

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

clf_lr = LogisticRegression(random_state=42)
clf_svc = SVC(kernel='poly', random_state=42)
clf_knn = KNeighborsClassifier(n_neighbors=10)

## Train and Evaluate the models

* Train the model
* Test based on the F1 score and Accuarcy
  * F1 score considers both the precision and the recall of the test to compute the score
  * The F1 score can be interpreted as a weighted average of the precision and recall
  * F1 score reaches its best value at 1 and worst at 0.
  * Accuracy is the ratio of correct predictions to the total predictions

In [10]:
from time import time
from sklearn.metrics import f1_score

def train(clf, X_train, y_train):
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    print('Model trained in {:.4f} secs'.format(end-start))

def test(clf, features, labels):
    start = time()
    y_pred = clf.predict(features)
    end = time()
    print('Test predictions made in {:.4f} secs'.format(end-start))
    f1 = f1_score(labels, y_pred, average='macro')
    acc = sum(labels==y_pred)/float(len(y_pred))
    return f1, acc

def train_test(clf, X_train, y_train, X_test, y_test):
    print('Training {}...'.format(clf.__class__.__name__))
    train(clf, X_train, y_train)
    f1, acc = test(clf, X_train, y_train)
    print('For Training set: F1 score= {:.4f}, Accuracy= {:.4f}'.format(f1, acc))
    f1, acc = test(clf, X_test, y_test)
    print('For Test set: F1 score= {:.4f}, Accuracy= {:.4f}'.format(f1, acc))
    
train_test(clf_lr, X_train, y_train, X_test, y_test)
print('------------------------------------------------')
train_test(clf_svc, X_train, y_train, X_test, y_test)
print('------------------------------------------------')
train_test(clf_knn, X_train, y_train, X_test, y_test)
print('------------------------------------------------')

Training LogisticRegression...
Model trained in 0.0655 secs
Test predictions made in 0.0013 secs
For Training set: F1 score= 1.0000, Accuracy= 1.0000
Test predictions made in 0.0009 secs
For Test set: F1 score= 1.0000, Accuracy= 1.0000
------------------------------------------------
Training SVC...
Model trained in 0.1406 secs
Test predictions made in 0.0956 secs
For Training set: F1 score= 0.9176, Accuracy= 0.9310
Test predictions made in 0.0034 secs
For Test set: F1 score= 0.8772, Accuracy= 0.9000
------------------------------------------------
Training KNeighborsClassifier...
Model trained in 0.0064 secs
Test predictions made in 0.2349 secs
For Training set: F1 score= 0.7964, Accuracy= 0.8314
Test predictions made in 0.0084 secs
For Test set: F1 score= 0.7083, Accuracy= 0.8000
------------------------------------------------


## Use the best model for making predictions
* Set the model
* Train the model with training dataset
* Make predictions
* Predict the probability of results (Away team win, draw, Home team win)

In [11]:
model = SVC(kernel='poly', random_state=42)
model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [12]:
pred = model.predict(X_test)
pred_df = pd.DataFrame(pred, columns=['Prediction'])

if(model.probability):
    pred_prob = model.predict_proba(X_test) 
    pred_prob_df = pd.DataFrame(pred_prob, columns=['Away Win %', 'Draw %', 'Home Win %']) 
    pred_prob_df = pred_prob_df.round(6)*100
    prediction_df = pd.concat([pred_df, pred_prob_df], axis=1) 
else:
    prediction_df = pred_df
    
prediction_df.head()

Unnamed: 0,Prediction
0,H
1,H
2,H
3,H
4,H


## Incorporate the result probabilities into the fixture

In [13]:
fixtures = pd.read_csv('../data/fixtures/epl-2018-GMT.csv')

print(fixtures.shape[0], 'matches')
fixtures.head()

380 matches


Unnamed: 0,Round Number,Date,Location,Home Team,Away Team,Result
0,1,10/08/2018 20:00,Old Trafford,Man Utd,Leicester,2 - 1
1,1,11/08/2018 12:30,St. James' Park,Newcastle,Spurs,1 - 2
2,1,11/08/2018 15:00,Vitality Stadium,Bournemouth,Cardiff,2 - 0
3,1,11/08/2018 15:00,Craven Cottage,Fulham,Crystal Palace,0 - 2
4,1,11/08/2018 15:00,John Smith's Stadium,Huddersfield,Chelsea,0 - 3


In [14]:
fixtures = fixtures.drop(['Round Number','Date','Location','Result'], 1)
fixtures.columns = ['HomeTeam', 'AwayTeam']
fixtures['HTHG'] = 0
fixtures['HTAG'] = 0
fixtures['FTHG'] = 0
fixtures['FTAG'] = 0
fixtures['HS'] = 0
fixtures['AS'] = 0
fixtures['HST'] = 0
fixtures['AST'] = 0
fixtures['HC'] = 0
fixtures['AC'] = 0
fixtures['HF'] = 0
fixtures['AF'] = 0
fixtures['HY'] = 0
fixtures['AY'] = 0
fixtures['HR'] = 0
fixtures['AR'] = 0

print(fixtures.shape, 'features')
fixtures.head()

(380, 18) features


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,FTHG,FTAG,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,Man Utd,Leicester,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Newcastle,Spurs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Bournemouth,Cardiff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fulham,Crystal Palace,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Huddersfield,Chelsea,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
pp_fixtures = preprocess(fixtures)

In [16]:
model.predict(pp_fixtures)

array(['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H

In [17]:
fixtures['Prediction'] = model.predict(pp_fixtures)

In [18]:
Result = fixtures.drop(['FTHG','FTAG','HTHG','HTAG','HS','AS',
                        'HST','AST','HF','AF','HC','AC',
                        'HY','AY','HR','AR'],1)


Result.tail(10)

Unnamed: 0,HomeTeam,AwayTeam,Prediction
370,Brighton,Man City,H
371,Burnley,Arsenal,H
372,Crystal Palace,Bournemouth,H
373,Fulham,Newcastle,H
374,Leicester,Chelsea,H
375,Liverpool,Wolves,H
376,Man Utd,Cardiff,H
377,Southampton,Huddersfield,H
378,Spurs,Everton,H
379,Watford,West Ham,H


In [19]:
if(model.probability):
    fixture_pred_prob = model.predict_proba(pp_fixtures) *100
    fixture_pred_prob = pd.DataFrame(fixture_pred_prob, columns=['Away win %','Draw %','Home win %'])
    display(fixture_pred_prob)