# Football Game Result Prediction in the English Premier League

## Jong-Won Lee

## Dataset

- Datasets for the respective season from http://www.football-data.co.uk/englandm.php
- Combined dataset contains all 380 games of each season from 2014/2015 through 2018/2019
- The test set is the season 2019/2020 with just 288 games due to the virus outbreak
- Training data: 1900 rows and 68 columns
- Test data: 288 rows and 106 columns

In [None]:
#import
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np

In [None]:
# show every column whenever a frame is displayed
pd.set_option('display.max_columns', None)

games = pd.read_csv('epl_season_2014-2019.csv')

# discard the columns at positions 23..67 (the betting attributes)
betting_columns = games.columns[23:68]
games = games.drop(columns=betting_columns)

# the league column carries no information (the original note says all rows share one division)
games = games.drop(columns='Div')

games.head()

* Date

* HomeTeam = Home Team
* AwayTeam = Away Team


* FTHG = Full Time Home Team Goals
* FTAG = Full Time Away Team Goals
* **FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win)**


* HTHG = Half Time Home Team Goals
* HTAG = Half Time Away Team Goals
* HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)


* Referee = Match Referee


* HS = Home Team Shots
* AS = Away Team Shots


* HST = Home Team Shots on Target
* AST = Away Team Shots on Target


* HC = Home Team Corners
* AC = Away Team Corners


* HF = Home Team Fouls Committed
* AF = Away Team Fouls Committed


* HY = Home Team Yellow Cards
* AY = Away Team Yellow Cards


* HR = Home Team Red Cards
* AR = Away Team Red Cards

## Preprocess for Data Analysis

In [None]:
# Remove the NaN row at position 380 from the raw frame and derive the working
# copy (games_data) that gets label-encoded for the analysis below.
empty_row = games.index[380]
games_data = games.drop(empty_row)
games = games.drop(empty_row)

# numeric target array built from the full-time result (FTR);
# the test set target is encoded the same way later on
label_encoder = preprocessing.LabelEncoder()
games_target = label_encoder.fit_transform(games_data['FTR'])
print(games_target)

# Label-encode the categorical columns for the analysis:
#   HTR     - encoded here for analysis, one-hot encoded later for classification
#   FTR     - encoded here for analysis, dropped from the features later
#   Referee - encoded here for analysis, dropped later
# A fresh encoder is fitted per column, so label_encoder ends up holding the
# encoder fitted on the last column of the tuple.
for column in ('HTR', 'FTR', 'Referee'):
    label_encoder = preprocessing.LabelEncoder()
    games_data[column] = label_encoder.fit_transform(games_data[column])
    print(games_data[column])

# ordinal codes for the Date column, for analysis only
# NOTE(review): OrdinalEncoder numbers the sorted category values; if Date is
# still a string column the codes follow string order, not calendar order - confirm
ordinal_encoder = preprocessing.OrdinalEncoder()
date_column = games_data['Date'].values.reshape(-1, 1)
games_data['Date'] = ordinal_encoder.fit_transform(date_column)
print(games_data['Date'])

games_data.head(20)

## Plotting

In [None]:
# scatter matrix of home team shots (HS) against home team fouls committed (HF)

pd.plotting.scatter_matrix(games_data[['HS', 'HF']], figsize=(8, 8))

In [None]:
# scatter plot of home team corners (HC, x-axis) against home team shots (HS, y-axis)
plt.scatter(games_data['HC'], games_data['HS'])

plt.show()

In [None]:
# plotting shots and corners
pd.plotting.scatter_matrix(games[['HS','AS','HST','AST','HC','AC']], figsize=(10, 10))

## Data Statistics and Feature Selection

In [None]:
# encoded target values: 0 = away team win, 1 = draw, 2 = home team win

# histogram of the target variable FTR
plt.hist(games_target)

matches = len(games)

# count each full-time outcome on the raw (string) FTR column
full_time_result = games['FTR']
homewins = int((full_time_result == 'H').sum())
awaywins = int((full_time_result == 'A').sum())
draws = int((full_time_result == 'D').sum())

# share of each outcome in percent of all matches
winrate_home = (homewins / matches) * 100
winrate_away = (awaywins / matches) * 100
winrate_draw = (draws / matches) * 100

print("Number of matches: {}".format(matches))
print()
print("Number of homewins: {}".format(homewins))
print("Winrate of hometeams: {:.2f}%".format(winrate_home))
print()
print("Number of awaywins: {}".format(awaywins))
print("Winrate of awayteams: {:.2f}%".format(winrate_away))
print()
print("Number of draws: {}".format(draws))
print("Winrate of draws: {:.2f}%".format(winrate_draw))

In [None]:
# relevance of half time result (HTR) for full time result (FTR)
#
# Both columns of games_data were label-encoded above, so they are compared
# row by row on the same frame. This keeps every game paired with its own
# half-time result: games_target is a positional array, while games_data keeps
# the original index with a gap at 380, so indexing both with the same integer
# pairs different games after the gap.
# Assumes HTR and FTR contain the same three classes, giving the shared coding
# 0 = away win, 1 = draw, 2 = home win.
half_time = games_data['HTR']
full_time = games_data['FTR']
unchanged = half_time == full_time

sum_homewin = int((unchanged & (full_time == 2)).sum())
sum_awaywin = int((unchanged & (full_time == 0)).sum())
sum_draw = int((unchanged & (full_time == 1)).sum())
sum_change = int((~unchanged).sum())

# each percentage is relative to the number of full-time results of that kind
# (homewins / awaywins / draws from the statistics cell above)
print("Half time homewin leads to full time homewin: #{} = {:.2f}%".format(sum_homewin, 100*(sum_homewin/homewins)))
print("Half time awaywin leads to full time awaywin: #{} = {:.2f}%".format(sum_awaywin, 100*(sum_awaywin/awaywins)))
print("Half time draw leads to full time draw: #{} = {:.2f}%".format(sum_draw, 100*(sum_draw/draws)))
print("HTR != FTR: #{}".format(sum_change))

print("check sum of matches (1900): #{}".format(sum_change+sum_homewin+sum_awaywin+sum_draw))

In [None]:
games_pp = games_data.copy()

games_pp.head()

In [None]:
# check for null values in training data
print(games_pp.isnull().values.sum())

In [None]:
# looking at home team attribute
print(games_pp['HomeTeam'].value_counts())
print(games_pp['HomeTeam'].value_counts().count())

## Preprocessing

In [None]:
# Map the categorical HomeTeam / AwayTeam values to integers with a hand-made
# dictionary so that the test set receives exactly the same codes.
# The list position is the code; per the original note, teams that played
# more seasons are placed later (higher code, might get a higher weight).
# 'Sheffield United' is included for the test set.
List_A = ['Sheffield United','QPR', 'Fulham', 'Norwich', 'Cardiff', 'Wolves', 'Middlesbrough', 'Huddersfield', 'Hull', 'Brighton', 'Aston Villa', 'Sunderland', 'Bournemouth', 'Watford', 'Newcastle', 'West Brom', 'Swansea', 'Stoke', 'Burnley', 'Tottenham', 'Leicester', 'Southampton', 'Man City', 'Crystal Palace', 'Everton', 'Man United', 'Liverpool','West Ham', 'Arsenal', 'Chelsea']
List_B = list(range(len(List_A)))
d = {team: code for code, team in enumerate(List_A)}
print(d)

# teams are not one-hot encoded, to avoid the curse of dimensionality;
# a team missing from the dictionary keeps its original name
for side in ('HomeTeam', 'AwayTeam'):
    games_pp[side] = games_pp[side].map(d).fillna(games_pp[side])

games_pp.head(50)

## Profiling Report

In [None]:
%pip install -q -U pandas-profiling[notebook,html]
from pandas_profiling import ProfileReport

profile = ProfileReport(games_pp, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

## Preprocessing

In [None]:
# remove full-time related attributes: yellow/red cards, fouls, shots, corners
games_pp = games_pp.drop(columns=['HY', 'AY', 'HR', 'AR'])
games_pp = games_pp.drop(columns=['HF', 'AF'])
games_pp = games_pp.drop(columns=['HS', 'AS', 'HST', 'AST', 'HC', 'AC'])

# remove referee and date since no correlation
games_pp = games_pp.drop(columns=['Referee', 'Date'])

# remove full time goals
games_pp = games_pp.drop(columns=['FTHG', 'FTAG'])

# drop target
games_pp = games_pp.drop(columns='FTR')

# encode half time result HTR with one hot
encoder = preprocessing.OneHotEncoder()
htr_onehot = encoder.fit_transform(games['HTR'].values.reshape(-1, 1)).toarray()

# games and games_pp both lost the row at position 380, so their index has a
# gap there. The one-hot frame must carry that same index: with a fresh
# 0..n-1 index the join below would attach every row after the gap to the
# wrong game and leave the last game without one-hot values.
games_onehot = pd.DataFrame(htr_onehot,
                            columns=encoder.get_feature_names(['HTR']),
                            index=games.index)

print(games_onehot)

games_pp = games_pp.drop(columns=['HTR'])
games_pp = games_pp.join(games_onehot)
games_pp.head()

In [None]:
# scale all numeric values to same range with MinMaxScaler
scaler = preprocessing.MinMaxScaler()

games_pp[['HTHG', 'HTAG']] = scaler.fit_transform(games_pp[['HTHG', 'HTAG']])

# Remove every row that still contains a NaN, together with its target entry.
# The rows are located with a mask instead of a fixed position, so the cell
# removes nothing when there is no incomplete row and can be re-run without
# discarding further valid samples. games_pp and games_target must have the
# same length here (one target entry per feature row).
complete_rows = games_pp.notnull().all(axis=1).values
games_pp = games_pp[complete_rows]
print(games_pp.isnull().values.sum())
print(np.isnan(games_target).any())
games_target = games_target[complete_rows]

games_pp

## Preprocess Test Set

In [None]:
games_test = pd.read_csv('test_season_2019-2020.csv')

# remove betting attributes (columns at positions 24..105)
games_test = games_test.drop(columns=games_test.columns[24:106])

# remove league and date and time
games_test = games_test.drop(columns=['Div', 'Date', 'Time'])

# separate the target from the features
games_target_test = games_test['FTR']
games_test = games_test.drop(columns='FTR')

# transform FTR into numeric labels (2=H, 1=D, 0=A, as for the training target).
# A new LabelEncoder is fitted here because label_encoder was last fitted on
# the Referee column; the later cells read label_encoder.classes_ from this one.
label_encoder = preprocessing.LabelEncoder()
games_target_test = label_encoder.fit_transform(games_target_test)
print(games_target_test)

# removed yellow and red cards
# removed fouls and referee
# removed shots and corners
games_test = games_test.drop(columns=['HF', 'AF'])
games_test = games_test.drop(columns=['HY', 'AY', 'HR', 'AR'])
games_test = games_test.drop(columns=['Referee'])
games_test = games_test.drop(columns=['HS', 'AS', 'HST', 'AST', 'HC', 'AC'])

# remove full time goals
games_test = games_test.drop(columns=['FTHG', 'FTAG'])

# one-hot encode HTR with the encoder fitted on the training data
# (transform only, so the test columns match the training columns)
htr_onehot = encoder.transform(games_test['HTR'].values.reshape(-1, 1)).toarray()
games_onehot = pd.DataFrame(htr_onehot,
                            columns=encoder.get_feature_names(['HTR']),
                            index=games_test.index)
games_test = games_test.drop(columns=['HTR'])
games_test = games_test.join(games_onehot)

# normalise with the scaler fitted on the training data; re-fitting it on the
# test data would map the same goal count to a different value than in training
games_test[['HTHG', 'HTAG']] = scaler.transform(games_test[['HTHG', 'HTAG']])

# transforming home and awayteam with mapping from training set
# higher values for more seasons, sheffield team for test set
# transform teams not with one hot to avoid curse of dimensionality
games_test['HomeTeam'] = games_test['HomeTeam'].map(d).fillna(games_test['HomeTeam'])

games_test['AwayTeam'] = games_test['AwayTeam'].map(d).fillna(games_test['AwayTeam'])

games_test.head(20)

## Test Data Statistics / Profiling Report

In [None]:
# test data set just for analysis
games_test_c = pd.read_csv('test_season_2019-2020.csv')
games_test_c.head(20)

In [None]:
# analyze and check team values
print(games_test['HomeTeam'].value_counts())
print(games_test['HomeTeam'].value_counts().count())


#compute panda profilling report
%pip install -q -U pandas-profiling[notebook,html]
from pandas_profiling import ProfileReport

profile = ProfileReport(games_test, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

In [None]:
# 0 = away team win
# 1 = draw
# 2 = home team win

# looking at distribution of the class label
plt.hist(games_target_test)

matches = games_test_c.shape[0]

homewins = len(games_test_c[games_test_c.FTR == 'H'])
winrate_home = (homewins / matches) * 100

awaywins = len(games_test_c[games_test_c.FTR == 'A'])
winrate_away = (awaywins / matches) * 100

draws = len(games_test_c[games_test_c.FTR == 'D'])
winrate_draw = (draws / matches) * 100

print("Number of matches: {}".format(matches))
print()
print("Number of homewins: {}".format(homewins))
print("Winrate of hometeams: {:.2f}%".format(winrate_home))
print()
print("Number of awaywins: {}".format(awaywins))
print("Winrate of awayteams: {:.2f}%".format(winrate_away))
print()
print("Number of draws: {}".format(draws))
print("Winrate of draws: {:.2f}%".format(winrate_draw))

In [None]:
#checking for NaN values
games_test.head(10)
print(games_pp.isnull().values.sum())
check_for_nan2 = games_pp['HomeTeam'].isnull().sum()
check_for_nan3 = games_pp['AwayTeam'].isnull().sum()
nan_rows = games[games['HomeTeam'].isnull()]
#print(nan_rows)
nan_rows = games[games['AwayTeam'].isnull()]
#print(nan_rows)

games_pp

## Model Training and Evaluation

* Decision Tree
* KNN
* GaussianNB, MultinomialNB
* SVC
* RandomForest

## Baseline Classification with


### Decision Tree, KNN, GaussianNB, SVC, MultinomialNB and Random Forest

In [None]:
#plot confusion matrix
import itertools
import matplotlib.pyplot as plt
import numpy as np
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current axes.

    Parameters:
        cm: square confusion matrix (rows = true labels, columns = predicted
            labels), e.g. the result of sklearn's confusion_matrix.
        classes: tick labels, one per row/column of cm.
        normalize: if True, every row is divided by its sum so that each cell
            shows the fraction of that true class. A row without any samples
            stays all zero instead of turning into NaN.
        title: title drawn above the plot.
        cmap: matplotlib colormap used for the cells.

    The figure is not shown here; the caller decides when to call plt.show().
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # an empty row would give 0/0; dividing by 1 keeps it at zero
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # counts are printed as integers, normalised values with two decimals
    fmt = '.2f' if normalize else 'd'
    # cells darker than half of the maximum get white text for readability
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Baseline: every classifier with its default configuration, evaluated with
# the same stratified 10-fold cross-validation on the training data.
pipeline = Pipeline([('estimator', DecisionTreeClassifier())])

estimators = [
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    SVC(gamma='auto'),
    MultinomialNB(),
    RandomForestClassifier(),
]

# stratified 10 fold cross-validation (also reused by later cells as `cv`)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# per estimator: confusion matrix plot, mean fold accuracy, classification report
for position, estimator in enumerate(estimators, start=1):
    model_name = type(estimator).__name__

    # each estimator gets its own figure; the plot sits in grid cell `position`
    plt.figure(figsize=(12, 12))
    plt.subplot(3, 3, position)

    # swap the estimator into the single-step pipeline
    pipeline.set_params(estimator=estimator)

    prediction = cross_val_predict(pipeline, games_pp, games_target, cv=cv, n_jobs=-1)
    cm = confusion_matrix(games_target, prediction)
    plot_confusion_matrix(cm, classes=unique_labels(games_target), title=model_name)
    accuracy = cross_val_score(pipeline, games_pp, games_target, cv=cv, scoring='accuracy')

    print(model_name)
    print("Accuracy = {}%".format(accuracy.mean() * 100.0))
    print(classification_report(games_target, prediction))
    plt.show()

### Decision Tree

In [None]:
#fit model
dt = DecisionTreeClassifier()
dt.fit(games_pp, games_target)

In [None]:
%pip install -q -U graphviz

#visualize tree

import os
os.environ['PATH'] += ';C:\\Users\\joey_\\Desktop\\Jupyter Notebook\\graphviz-2.38\\release\\bin'

import graphviz
from sklearn import tree

dot_data = tree.export_graphviz(dt, out_file=None, 
                                feature_names=games_pp.columns, 
                                class_names=label_encoder.classes_,
                               filled=True, rounded=True, special_characters=True) 

graph = graphviz.Source(dot_data) 

display(graph)

In [None]:
#cross-validate with stratifiedkfold for mean accuracy
cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracy_dt= cross_val_score(dt, games_pp, games_target, cv=cross_val, scoring='accuracy')
accuracy_dt.mean()

print(accuracy_dt.mean())

In [None]:
# cross-val prediction for error analysis
predicted = cross_val_predict(dt, games_pp, games_target, cv=10)

#classification report with cross-validated prediction
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(games_target, predicted, target_names=label_encoder.classes_))

### Decision Tree Hyperparameter Tuning with GridSearchCV

In [None]:
#parameters
parameters = {
    'criterion':['gini', 'entropy'], 
    'max_depth':[ 2, 3, 4, 5, None],
    'min_samples_split' :[2,3,4,5]
}

# specify the cross validation
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# create the grid search instance
grid_search_estimator = GridSearchCV(dt, parameters, scoring='accuracy', cv=stratified_10_fold_cv, return_train_score=False)

# run the grid search
grid_search_estimator.fit(games_pp, games_target)

# print the results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)
    
# print the best parameter setting
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

# cross-validate
prediction = cross_val_predict(grid_search_estimator, games_pp, games_target, cv=cross_val, n_jobs=-1)

# calculate
cm = confusion_matrix(games_target, prediction)
acc = accuracy_score(games_target, prediction)

# print classification report and confusion matrix
print("Optimised Decision Tree with accuracy of {}".format(acc))
plot_confusion_matrix(cm, classes=label_encoder.classes_, title='Decision Tree Classifier')
plt.show()
print(classification_report(games_target, prediction, target_names=label_encoder.classes_))

# fit the grid search (= determine the optimal parameters)
grid_search_estimator.fit(games_pp, games_target)
print("Optimised Parameters: {}".format(grid_search_estimator.best_params_))

In [None]:
#prediction with test set and tuned parameters
dt = DecisionTreeClassifier(criterion= 'gini', max_depth= 5, min_samples_split= 3)
dt.fit(games_pp, games_target)
dt_prediction = dt.predict(games_test)

cnf_matrix = confusion_matrix(games_target_test, dt_prediction)
np.set_printoptions(precision=2)
plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_)

#evaluation with confusion matrix and accuracy
print("Confusion Matrix:")
print(confusion_matrix(games_target_test, dt_prediction))
print()

print("Accuracy: {}".format(accuracy_score(games_target_test, dt_prediction))) #before: 52,08%

%pip install -q -U graphviz

#visualize new tree

import os
os.environ['PATH'] += ';C:\\Users\\joey_\\Desktop\\Jupyter Notebook\\graphviz-2.38\\release\\bin'

import graphviz
from sklearn import tree

dot_data = tree.export_graphviz(dt, out_file=None, 
                                feature_names=games_pp.columns, 
                                class_names=label_encoder.classes_,
                               filled=True, rounded=True, special_characters=True) 

graph = graphviz.Source(dot_data) 

display(graph)



In [None]:
#classification report
print("Classification Report:")
print(classification_report(games_target_test, dt_prediction, target_names=label_encoder.classes_))

### K-Nearest-Neighbor

In [None]:
#fit model
knn = KNeighborsClassifier()
accuracy_knn= cross_val_score(knn, games_pp, games_target, cv=cross_val, scoring='accuracy')
accuracy_knn.mean()
print(accuracy_knn.mean())
print(knn)

In [None]:
# cross-val prediction
predicted = cross_val_predict(knn, games_pp, games_target, cv=10)

# cross-val prediction and classification report
print("Classification Report:")
print(classification_report(games_target, predicted, target_names=label_encoder.classes_))

### K-NN Hyperparameter Tuning with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# create an estimator
knn_estimator = KNeighborsClassifier()

# specify the parameter grid
parameters = {
    'n_neighbors': range(2, 9)
}

# specify the cross validation
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# create the grid search instance
grid_search_estimator = GridSearchCV(knn_estimator, parameters, scoring='accuracy', cv=stratified_10_fold_cv, return_train_score=False)

# run the grid search
grid_search_estimator.fit(games_pp, games_target)

# print the results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)
    
# print the best parameter setting
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

# cross-validate
prediction = cross_val_predict(grid_search_estimator, games_pp, games_target, cv=cross_val, n_jobs=-1)

# calculate
cm = confusion_matrix(games_target, prediction)
acc = accuracy_score(games_target, prediction)

# print classification report and confusion matrix
print("Optimised KNN with accuracy of {}".format(acc))
plot_confusion_matrix(cm, classes=label_encoder.classes_, title='KNN')
plt.show()
print(classification_report(games_target, prediction, target_names=label_encoder.classes_))

# fit the grid search (= determine the optimal parameters)
grid_search_estimator.fit(games_pp, games_target)
print("Optimised Parameters: {}".format(grid_search_estimator.best_params_))

In [None]:
#prediction with test data and tuned parameters
knn = KNeighborsClassifier(7)
knn.fit(games_pp, games_target)
knn_prediction = knn.predict(games_test)

cnf_matrix = confusion_matrix(games_target_test, knn_prediction)
np.set_printoptions(precision=2)
plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_)

print("Confusion Matrix:")
print(confusion_matrix(games_target_test, knn_prediction))
print()


print("Accuracy: {}".format(accuracy_score(games_target_test, knn_prediction))) #before: 51,04%

In [None]:
#classification report
print("Classification Report:")
print(classification_report(games_target_test, knn_prediction, target_names=label_encoder.classes_))

### Gaussian Naive Bayes

In [None]:
# create estimator
gnb = GaussianNB()
gnb.fit(games_pp, games_target)
print(gnb)
print('----------------CV-------------------')

# evaluation with training data
cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracy_gnb= cross_val_score(gnb, games_pp, games_target, cv=cross_val, scoring='accuracy')
accuracy_gnb.mean()

print('Accuracy:')
print(accuracy_gnb.mean())

predicted = cross_val_predict(gnb, games_pp, games_target, cv=10)

from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(games_target, predicted, target_names=label_encoder.classes_))

print('--------------TEST---------------------')
#with test data and no hyperparameters
gnb_prediction = gnb.predict(games_test)

cnf_matrix = confusion_matrix(games_target_test, gnb_prediction)
np.set_printoptions(precision=2)
plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_)

print("Confusion Matrix:")
print(confusion_matrix(games_target_test, gnb_prediction))
print()

print("Accuracy: {}".format(accuracy_score(games_target_test, gnb_prediction)))
#before optimisation 57,99%

print("Classification Report:")
print(classification_report(games_target_test, gnb_prediction, target_names=label_encoder.classes_))



### Multinomial Naive Bayes

In [None]:
# create estimator
mnb = MultinomialNB()
mnb.fit(games_pp, games_target)
print(mnb)
print('----------------CV-------------------')

# evaluation with training data
cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracy_mnb= cross_val_score(mnb, games_pp, games_target, cv=cross_val, scoring='accuracy')
accuracy_mnb.mean()

print('Accuracy:')
print(accuracy_mnb.mean())

predicted = cross_val_predict(mnb, games_pp, games_target, cv=10)

from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(games_target, predicted, target_names=label_encoder.classes_))

print('---------------TEST--------------------')
#with test and tuned parameters
mnb = MultinomialNB(alpha=0)
mnb.fit(games_pp, games_target)
mnb_prediction = mnb.predict(games_test)

cnf_matrix = confusion_matrix(games_target_test, mnb_prediction)
np.set_printoptions(precision=2)
plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_)

print("Confusion Matrix:")
print(confusion_matrix(games_target_test, mnb_prediction))
print()

print("Accuracy: {}".format(accuracy_score(games_target_test, mnb_prediction)))
#before optimisiation 50%

print("Classification Report:")
print(classification_report(games_target_test, mnb_prediction, target_names=label_encoder.classes_))


### MultinomialNB Hyperparameter Change

In [None]:
# MultinomialNB without laplace smoothing

mnb1 = MultinomialNB(alpha=0)
mnb1.fit(games_pp, games_target)
print(mnb1)
print('---------------CV--------------------')

#with training
cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracy_mnb1= cross_val_score(mnb1, games_pp, games_target, cv=cross_val, scoring='accuracy')
accuracy_mnb1.mean()

print('Accuracy:')
print(accuracy_mnb1.mean())

### Support Vector Machines

In [None]:
# create estimator
svc = SVC(gamma='auto')
svc.fit(games_pp, games_target)
print(svc)
print('---------------CV------------------')

# evaluation with training data
cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracy_svc= cross_val_score(svc, games_pp, games_target, cv=cross_val, scoring='accuracy')
accuracy_svc.mean()

print('Accuracy:')
print(accuracy_svc.mean())

predicted = cross_val_predict(svc, games_pp, games_target, cv=10)

from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(games_target, predicted, target_names=label_encoder.classes_))

print('--------------TEST (tuned)---------------------')
#with test and tuned parameters

svc = SVC(C=1, gamma=0.1, kernel='linear')
svc.fit(games_pp, games_target)
svc_prediction = svc.predict(games_test)

cnf_matrix = confusion_matrix(games_target_test, svc_prediction)
np.set_printoptions(precision=2)
plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_)

print("Confusion Matrix:")
print(confusion_matrix(games_target_test, svc_prediction))
print()

print("Accuracy: {}".format(accuracy_score(games_target_test, svc_prediction)))
#before optimisiation 50,69%

print("Classification Report:")
print(classification_report(games_target_test, svc_prediction, target_names=label_encoder.classes_))



### SVC Hyperparameter Tuning with GridSearchCV

In [None]:
# Grid search over SVC hyper-parameters, followed by a cross-validated
# evaluation of the whole search procedure on the training data.
parameters = { # grid kept small: more values need too much computing power
    'kernel':['linear'], # not searched: 'rbf', 'poly'
    'gamma':[0.1, 1], # not searched: 10, 100
    'C' :[0.1, 1] # not searched: 10, 100, 1000
}
# NOTE(review): the scikit-learn SVC documentation describes gamma as the
# coefficient of the 'rbf', 'poly' and 'sigmoid' kernels; with only 'linear'
# in the grid the two gamma values presumably give identical scores - confirm

# specify the cross validation
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# create the grid search instance (svc is the estimator created in the SVC cell above)
grid_search_estimator = GridSearchCV(svc, parameters, scoring='accuracy', cv=stratified_10_fold_cv, return_train_score=False)

# run the grid search
grid_search_estimator.fit(games_pp, games_target)

# print the results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)
    
# print the best parameter setting
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

# cross-validate the grid search itself; `cv` is the splitter defined in the baseline cell
prediction = cross_val_predict(grid_search_estimator, games_pp, games_target, cv=cv, n_jobs=-1)

# calculate
cm = confusion_matrix(games_target, prediction)
acc = accuracy_score(games_target, prediction)

# print classification matrix and confusion matrix
print("Optimised SVC with accuracy of {}".format(acc))
plot_confusion_matrix(cm, classes=label_encoder.classes_, title='SVC')
plt.show()
print(classification_report(games_target, prediction, target_names=label_encoder.classes_))

# fit the grid search (= determine the optimal parameters)
grid_search_estimator.fit(games_pp, games_target)
print("Optimised Parameters: {}".format(grid_search_estimator.best_params_))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: default model evaluated with cross-validation on the training
# data, then a model with fixed hyper-parameters evaluated on the test set.

# create estimator with default settings
rf = RandomForestClassifier()
rf.fit(games_pp, games_target)
#print(rf)
print('---------------CV-------------------------')

# evaluation with training data: shuffled stratified 10-fold accuracy
cross_val = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracy_rf= cross_val_score(rf, games_pp, games_target, cv=cross_val, scoring='accuracy')
accuracy_rf.mean()

print('Accuracy:')
print(accuracy_rf.mean())

# cross-validated predictions for the classification report
# (cv=10 here, not the shuffled cross_val splitter used for the accuracy above)
predicted = cross_val_predict(rf, games_pp, games_target, cv=10)

from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(games_target, predicted, target_names=label_encoder.classes_))

print('-----------------TEST------------------')
# model with the tuned hyper-parameters, trained on the training data and
# evaluated on the test set

rf = RandomForestClassifier(bootstrap = True, max_depth = 100, max_features = 'sqrt', min_samples_leaf = 4, min_samples_split=5,n_estimators=100)
rf.fit(games_pp, games_target)
rf_prediction = rf.predict(games_test)
print(rf)

cnf_matrix = confusion_matrix(games_target_test, rf_prediction)
np.set_printoptions(precision=2)
plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_)

print("Confusion Matrix:")
print(confusion_matrix(games_target_test, rf_prediction))
print()

print("Accuracy: {}".format(accuracy_score(games_target_test, rf_prediction)))
# NOTE(review): the call below cross-validates on the test set only, i.e. it
# trains and scores clones of rf on folds of the test data; the printed
# "Mean Accuracy" is therefore not the accuracy of the model trained on the
# training data - confirm this is intended
accuracy_rf= cross_val_score(rf, games_test, games_target_test, cv=cross_val, scoring='accuracy')
accuracy_rf.mean()

print('Mean Accuracy:')
print(accuracy_rf.mean())

# test accuracy before tuning: 57,64%

print("Classification Report:")
print(classification_report(games_target_test, rf_prediction, target_names=label_encoder.classes_))


### Random Forest Hyperparameter Tuning with GridSearchCV

In [None]:
# Hyper-parameter grid for the random forest. The grid is kept small on
# purpose: every combination is cross-validated, and a larger grid needs too
# much computing power.
parameters = {
    'bootstrap': [True, False],
    'max_depth': [50, 100, None],
    # 'auto' means the same as 'sqrt' for RandomForestClassifier (and is no
    # longer accepted by recent scikit-learn releases), so listing both only
    # evaluated every combination twice; 'log2' is searched instead.
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 4],
    'min_samples_split': [2, 5],
    'n_estimators': [100, 200],
}

# specify the cross validation
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# create the grid search instance
grid_search_estimator = GridSearchCV(rf, parameters, scoring='accuracy', cv=stratified_10_fold_cv, return_train_score=False)

# run the grid search (= determine the optimal parameters); this is the only
# fit of the search itself, all values printed below come from this run
grid_search_estimator.fit(games_pp, games_target)

# print the results of all hyper-parameter combinations
results = pd.DataFrame(grid_search_estimator.cv_results_)
display(results)

# print the best parameter setting
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

# cross-validate the whole search: cross_val_predict fits clones of the grid
# search on each outer fold, the instance fitted above is left untouched
prediction = cross_val_predict(grid_search_estimator, games_pp, games_target, cv=cv, n_jobs=-1)

# calculate
cm = confusion_matrix(games_target, prediction)
acc = accuracy_score(games_target, prediction)

# print classification report and confusion matrix
print("Optimised Random Forest with accuracy of {}".format(acc))
plot_confusion_matrix(cm, classes=label_encoder.classes_, title='RF')
plt.show()
print(classification_report(games_target, prediction, target_names=label_encoder.classes_))

# report the optimal parameters of the search fitted above (no second fit:
# it would repeat the complete search and, with an unseeded forest, could
# disagree with the "best score" line printed earlier)
print("Optimised Parameters: {}".format(grid_search_estimator.best_params_))

## Result Evaluation

* random guessing = 33.33%

* assigning every game to the majority class (home win) = 44.79%

* all classifiers predict home wins and away wins better than the majority-class baseline, but draws remain difficult to predict



# Data set with New Attribute Ranking

In [None]:
# Training data (seasons 2014/15 - 2018/19) extended by the ranking attribute
games_ranking = pd.read_csv('epl_season_2014-2019_ranking.csv')

# the columns at positions 23..67 are the betting attributes - not used
games_ranking = games_ranking.drop(columns=games_ranking.columns[23:68])

# give the new ranking columns their final names
games_ranking = games_ranking.rename(columns={
    'HomeTeamPerviousRanking': 'HomePreviousRanking',
    'HomeAwayPerviousRanking': 'AwayPreviousRanking',
})

# league and date are not used as features
games_ranking = games_ranking.drop(columns=['Div', 'Date'])

# target: full time result, encoded as 0=A, 1=D, 2=H (same as in the test set)
label_encoder = preprocessing.LabelEncoder()
games_target_ranking = label_encoder.fit_transform(games_ranking['FTR'])

# everything that is only known after half time is removed:
# the target itself, fouls, yellow/red cards, referee, shots, corners
# and the full time goals
games_ranking = games_ranking.drop(columns=[
    'FTR',
    'HF', 'AF',
    'HY', 'AY', 'HR', 'AR',
    'Referee',
    'HS', 'AS', 'HST', 'AST', 'HC', 'AC',
    'FTHG', 'FTAG',
])

# one-hot encode the half time result and swap it in for the HTR column
htr_onehot = encoder.fit_transform(games_ranking['HTR'].values.reshape(-1,1)).toarray()
games_onehot_ranking = pd.DataFrame(htr_onehot, columns=encoder.get_feature_names(['HTR']))
games_ranking = games_ranking.drop(columns=['HTR']).join(games_onehot_ranking)

# normalise the half time goals and the new ranking attributes
ranking_numeric_cols = ['HTHG', 'HTAG','HomePreviousRanking', 'AwayPreviousRanking']
games_ranking[ranking_numeric_cols] = scaler.fit_transform(games_ranking[ranking_numeric_cols])

# Teams are replaced by the numeric mapping d built from the training set
# (higher values for more seasons); a team without an entry in d keeps its
# name. No one-hot encoding here to avoid the curse of dimensionality.
home_codes = games_ranking['HomeTeam'].map(d)
games_ranking['HomeTeam'] = home_codes.fillna(games_ranking['HomeTeam'])

away_codes = games_ranking['AwayTeam'].map(d)
games_ranking['AwayTeam'] = away_codes.fillna(games_ranking['AwayTeam'])

games_ranking

In [None]:
# Load the 2019/2020 test season with the new ranking attribute.
#
# NOTE: label_encoder, encoder and scaler are used exactly as they were
# fitted on the ranking training data (run the training cell first). The test
# season is only transformed and never used for fitting; otherwise its
# features would be scaled with different statistics than the features the
# models were trained on.
games_test_ranking = pd.read_csv('test_season_2019-2020_ranking.csv')
games_test_ranking.rename(columns={'HomeTeamRankings': 'HomePreviousRanking', 'AwayTeamRankings':'AwayPreviousRanking'}, inplace=True)

# remove betting attributes
games_test_ranking.drop(games_test_ranking.iloc[:, 24:106], inplace = True, axis = 1)

# remove league and date and time
games_test_ranking.drop(columns=['Div', 'Date', 'Time'], inplace=True)

# split off the target and encode it with the mapping learned on the
# training labels (0=A, 1=D, 2=H)
games_target_test_ranking = label_encoder.transform(games_test_ranking['FTR'])
games_test_ranking = games_test_ranking.drop(columns='FTR')

# removed yellow and red cards
# removed fouls and referee
# removed shots and corners
games_test_ranking = games_test_ranking.drop(columns=['HF','AF'])
games_test_ranking = games_test_ranking.drop(columns=['HY','AY', 'HR', 'AR'])
games_test_ranking = games_test_ranking.drop(columns=['Referee'])
games_test_ranking = games_test_ranking.drop(columns=['HS','AS', 'HST', 'AST', 'HC', 'AC'])

# remove full time goals
games_test_ranking = games_test_ranking.drop(columns=['FTHG', 'FTAG'])

# onehot HTR with the encoder fitted on the training data; the frame gets the
# same index as the test data because join() aligns rows on index labels
games_onehot_ranking = pd.DataFrame(
    encoder.transform(games_test_ranking['HTR'].values.reshape(-1,1)).toarray(),
    columns=encoder.get_feature_names(['HTR']),
    index=games_test_ranking.index,
)
games_test_ranking = games_test_ranking.drop(columns=['HTR'])
games_test_ranking = games_test_ranking.join(games_onehot_ranking)

# normalising + new attribute, using the scaling learned on the training data
ranking_scaled_columns = ['HTHG', 'HTAG','HomePreviousRanking', 'AwayPreviousRanking']
games_test_ranking[ranking_scaled_columns] = scaler.transform(games_test_ranking[ranking_scaled_columns])

# transforming home and awayteam with mapping from training set
# higher values for more seasons, sheffield team for test set
# transform teams not with one hot to avoid curse of dimensionality
games_test_ranking['HomeTeam'] = games_test_ranking['HomeTeam'].map(d).fillna(games_test_ranking['HomeTeam'])

games_test_ranking['AwayTeam'] = games_test_ranking['AwayTeam'].map(d).fillna(games_test_ranking['AwayTeam'])

games_test_ranking.head(20)

In [None]:
# Compare all classifiers on the ranking data set.
# The estimators below use their default hyper-parameters (only SVC sets
# gamma='auto'). Tuned configurations, kept for reference:
# DecisionTreeClassifier(criterion= 'gini', max_depth= 5, min_samples_split= 3), KNeighborsClassifier(n_neighbors = 7), GaussianNB(), SVC(gamma='auto'), MultinomialNB(alpha=0), RandomForestClassifier(bootstrap = True, max_depth = 100, max_features = 'sqrt', min_samples_leaf = 4, min_samples_split=5,n_estimators=100)
# accuracy without tuning: 49,31%, 55,56%, 57,99%, 56,25%, 57,63%, 58,68%

# single-step pipeline whose estimator is swapped for every model
pipeline = Pipeline([('estimator', None)])
estimators = [
    DecisionTreeClassifier(), KNeighborsClassifier(), GaussianNB(),
    SVC(gamma='auto'), MultinomialNB(), RandomForestClassifier(),
]

# evaluate accuracy for training and test set
for estimator in estimators:

    pipeline.set_params(estimator=estimator)

    # cross-validated accuracy on the training seasons
    accuracy = cross_val_score(pipeline, games_ranking, games_target_ranking, cv=cross_val, scoring='accuracy')
    print(type(estimator).__name__)
    print("Accuracy = {}%".format(accuracy.mean() * 100.0))

    # hold-out accuracy: fit on all training seasons, score on the test season
    pipeline.fit(games_ranking, games_target_ranking)
    test_prediction = pipeline.predict(games_test_ranking)
    print("Test Accuracy: {}%".format(accuracy_score(games_target_test_ranking, test_prediction)* 100.0))
    print()

classes = label_encoder.classes_

# Plot confusion matrix and classification report for every estimator.
# Each estimator gets its own figure inside the loop, so no figure is opened
# up front (it would only be displayed as an empty plot).
for i, estimator in enumerate(estimators):
    plt.figure(figsize=(12,12))
    plt.subplot(3,2,i+1)
    pipeline.set_params(estimator=estimator)

    prediction = cross_val_predict(pipeline, games_ranking, games_target_ranking, cv=cv)
    cnf_matrix = confusion_matrix(games_target_ranking, prediction)
    np.set_printoptions(precision=2)
    plot_confusion_matrix(cnf_matrix, classes=classes, title = type(estimator).__name__)
    rep = classification_report(games_target_ranking, prediction)

    print('Classification report: {}'.format(type(estimator).__name__))
    print(rep)

    plt.show()

    


# Data set with New Attribute Shots and Corners converted to halftime

In [None]:
# Load the training data (seasons 2014/15 - 2018/19) extended by the
# half-time attributes (HTHS, HTAS, HTHST, HTAST, HTHC, HTAC, ...).
games_shots = pd.read_csv('epl_season_2014-2019_shots.csv')

# remove betting attributes
games_shots.drop(games_shots.iloc[:, 31:76], inplace = True, axis = 1)

# Drop row 380 (presumably the same NaN row as in the base data set) and
# renumber the rows. Without the reset the index has a gap at 380, and the
# index-aligned join with the one-hot frame further down would attach the
# half time result of the *following* match to every row after 380 and leave
# the last row with NaN.
games_shots = games_shots.drop(games_shots.index[380]).reset_index(drop=True)

# remove league and date
games_shots.drop(columns='Div', axis=1, inplace=True)
games_shots.drop(columns='Date', axis=1, inplace=True)

# drop target
games_target_shots = games_shots['FTR']
games_shots = games_shots.drop(columns='FTR')

# transform FTR #2=H,1=D,0=A but same as in test set
label_encoder = preprocessing.LabelEncoder()
games_target_shots = label_encoder.fit_transform(games_target_shots)

# removed yellow and red cards
# removed fouls (HF, AF, HTHF, HTAF) and referee
# removed the HS/AS/HST/AST/HC/AC shot and corner columns; their HT*
# counterparts stay in as features
games_shots = games_shots.drop(columns=['HF','AF', 'HTHF', 'HTAF'])
games_shots = games_shots.drop(columns=['HY','AY', 'HR', 'AR'])
games_shots = games_shots.drop(columns=['Referee'])
games_shots = games_shots.drop(columns=['HS','AS', 'HST', 'AST', 'HC', 'AC'])

# remove full time goals
games_shots = games_shots.drop(columns=['FTHG', 'FTAG'])

# onehot HTR; the frame is built on the index of games_shots so that join()
# pairs every match with its own half time result
games_onehot_shots = pd.DataFrame(
    encoder.fit_transform(games_shots['HTR'].values.reshape(-1,1)).toarray(),
    columns=encoder.get_feature_names(['HTR']),
    index=games_shots.index,
)
games_shots = games_shots.drop(columns=['HTR'])
games_shots = games_shots.join(games_onehot_shots)

# normalising + new attribute
shots_scaled_columns = ['HTHG', 'HTAG','HTHST','HTAST', 'HTHS','HTAS', 'HTHC', 'HTAC']
games_shots[shots_scaled_columns] = scaler.fit_transform(games_shots[shots_scaled_columns])

# transforming home and awayteam with mapping from training set
# higher values for more seasons, sheffield team for test set
# transform teams not with one hot to avoid curse of dimensionality
games_shots['HomeTeam'] = games_shots['HomeTeam'].map(d).fillna(games_shots['HomeTeam'])

games_shots['AwayTeam'] = games_shots['AwayTeam'].map(d).fillna(games_shots['AwayTeam'])

# Safety net: remove every row that still contains a missing value together
# with its target entry, so features and targets stay in sync. With the
# aligned join above this is expected to remove nothing.
complete_rows = games_shots.notnull().all(axis=1).values
games_shots = games_shots[complete_rows]
games_target_shots = games_target_shots[complete_rows]

# check for null values
print(games_shots.isnull().values.sum())
print(np.isnan(games_target_shots).any())

games_shots



In [None]:
# Load the 2019/2020 test season extended by the half-time attributes.
#
# NOTE: label_encoder, encoder and scaler are used exactly as they were
# fitted on the shots training data (run the training cell first). The test
# season is only transformed and never used for fitting; otherwise its
# features would be scaled with different statistics than the features the
# models were trained on.
games_test_shots = pd.read_csv('test_season_2019-2020_shots.csv')

# remove betting attributes
games_test_shots.drop(games_test_shots.iloc[:, 32:114], inplace = True, axis = 1)

# remove league and date and time
games_test_shots.drop(columns=['Div', 'Date', 'Time'], inplace=True)

# split off the target and encode it with the mapping learned on the
# training labels (0=A, 1=D, 2=H)
games_target_test_shots = label_encoder.transform(games_test_shots['FTR'])
games_test_shots = games_test_shots.drop(columns='FTR')

# removed yellow and red cards
# removed fouls (HF, AF, HTHF, HTAF) and referee
# removed the HS/AS/HST/AST/HC/AC shot and corner columns; their HT*
# counterparts stay in as features
games_test_shots = games_test_shots.drop(columns=['HF','AF','HTHF', 'HTAF'])
games_test_shots = games_test_shots.drop(columns=['HY','AY', 'HR', 'AR'])
games_test_shots = games_test_shots.drop(columns=['Referee'])
games_test_shots = games_test_shots.drop(columns=['HS','AS', 'HST', 'AST', 'HC', 'AC'])

# remove full time goals
games_test_shots = games_test_shots.drop(columns=['FTHG', 'FTAG'])

# onehot HTR with the encoder fitted on the training data; the frame gets the
# same index as the test data because join() aligns rows on index labels
games_onehot_shots = pd.DataFrame(
    encoder.transform(games_test_shots['HTR'].values.reshape(-1,1)).toarray(),
    columns=encoder.get_feature_names(['HTR']),
    index=games_test_shots.index,
)
games_test_shots = games_test_shots.drop(columns=['HTR'])
# join the one-hot frame computed from THIS data set (not the one left over
# from the ranking test cell)
games_test_shots = games_test_shots.join(games_onehot_shots)

# normalising + new attribute, using the scaling learned on the training data
shots_scaled_columns = ['HTHG', 'HTAG','HTHST','HTAST', 'HTHS','HTAS','HTHC', 'HTAC']
games_test_shots[shots_scaled_columns] = scaler.transform(games_test_shots[shots_scaled_columns])

# transforming home and awayteam with mapping from training set
# higher values for more seasons, sheffield team for test set
# transform teams not with one hot to avoid curse of dimensionality
games_test_shots['HomeTeam'] = games_test_shots['HomeTeam'].map(d).fillna(games_test_shots['HomeTeam'])

games_test_shots['AwayTeam'] = games_test_shots['AwayTeam'].map(d).fillna(games_test_shots['AwayTeam'])

games_test_shots

In [None]:
# Model comparison on the data set with the half-time attributes.
# Tuned configurations, for reference:
# DecisionTreeClassifier(criterion= 'gini', max_depth= 5, min_samples_split= 3), KNeighborsClassifier(n_neighbors = 7), GaussianNB(), SVC(C = 1, gamma=0.1, kernel ='linear'), MultinomialNB(alpha=0), RandomForestClassifier(bootstrap = True, max_depth = 100, max_features = 'sqrt', min_samples_leaf = 4, min_samples_split=5,n_estimators=100)
# accuracy without tuning: 45,83%, 44,79%, 44,44%, 51,04%, 48,26%, 60,10%

# one-step pipeline; the estimator slot is filled in turn with each model
pipeline = Pipeline([('estimator', None)])
estimators = [
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    SVC(gamma='auto'),
    MultinomialNB(),
    RandomForestClassifier(),
]

# Part 1: cross-validated training accuracy and hold-out test accuracy
for estimator in estimators:
    pipeline.set_params(estimator=estimator)

    accuracy = cross_val_score(pipeline, games_shots, games_target_shots, cv=cross_val, scoring='accuracy')
    print(type(estimator).__name__)
    print("Accuracy = {}%".format(accuracy.mean() * 100.0))

    # fit on the complete training data, then score the test season
    pipeline.fit(games_shots, games_target_shots)
    test_prediction = pipeline.predict(games_test_shots)
    test_accuracy = accuracy_score(games_target_test_shots, test_prediction)
    print("Test Accuracy: {}%".format(test_accuracy * 100.0))
    print()

classes = label_encoder.classes_
np.set_printoptions(precision=2)

# Part 2: confusion matrix plot and classification report per model,
# both based on cross-validated predictions for the training data
for position, estimator in enumerate(estimators, start=1):
    model_name = type(estimator).__name__

    plt.figure(figsize=(12,12))
    plt.subplot(3, 2, position)
    pipeline.set_params(estimator=estimator)

    prediction = cross_val_predict(pipeline, games_shots, games_target_shots, cv=cv)

    cnf_matrix = confusion_matrix(games_target_shots, prediction)
    plot_confusion_matrix(cnf_matrix, classes=classes, title=model_name)

    rep = classification_report(games_target_shots, prediction)
    print('Classification report: {}'.format(model_name))
    print(rep)

    plt.show()
     

# Result

- baseline: random guessing: 33.33% or major class: 44.79%

- best model: RandomForest optimized: 62.15% on basic training data with just halftime attributes
