In [1]:
# IMPORT LIBRARIES
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.neural_network import MLPClassifier 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.compose import ColumnTransformer
import import_ipynb

# Import related ipynb files
# import dataPrep as dp

In [2]:
# Reading the dataset through a Pandas function
data = pd.read_csv("ovr_data.csv")

data.isnull().sum()

Div         0
Date        0
HomeTeam    0
AwayTeam    0
FTHG        0
FTAG        0
FTR         0
HTHG        3
HTAG        3
HTR         3
HS          2
AS          2
HST         2
AST         2
HF          4
AF          4
HC          2
AC          2
HY          3
AY          2
HR          2
AR          2
B365H       3
B365D       3
B365A       3
HTGS        0
ATGS        0
HTGC        0
ATGC        0
HTP         0
ATP         0
HM1         0
HM2         0
HM3         0
HM4         0
HM5         0
AM1         0
AM2         0
AM3         0
AM4         0
AM5         0
MW          0
FTR.1       0
dtype: int64

In [3]:
# Data Cleaning extra steps

# Parse data to int for old dataset
data = data.dropna()

# Parse all the columns to integer for convenience
data['FTHG'] = data['FTHG'].astype(int)
data['FTAG'] = data['FTAG'].astype(int)
data['HS'] = data['HS'].astype(int)
data['AS'] = data['AS'].astype(int)
data['HST'] = data['HST'].astype(int)
data['AST'] = data['AST'].astype(int)
data['HF'] = data['HF'].astype(int)
data['AF'] = data['AF'].astype(int)
data['HC'] = data['HC'].astype(int)
data['AC'] = data['AC'].astype(int)
data['HY'] = data['HY'].astype(int)
data['AY'] = data['AY'].astype(int)
data['HR'] = data['HR'].astype(int)
data['AR'] = data['AR'].astype(int)

# Filter by Division
def Div(Div):
    
    return data[data['Div'] == Div][['Div','Date','HomeTeam','AwayTeam','FTHG','FTAG','HTHG','HTAG','HTR',
                                     'HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','B365H','B365D','B365A',
                                     'HTGS','ATGS','HTGC','ATGC','HTP','ATP',
                                     'HM1','HM2','HM3','HM4','HM5','AM1','AM2','AM3','AM4','AM5','MW','FTR']]
# data = Div("E0")

# Replace D and A in 'FTR' to NH
data['FTR'] = data['FTR'].replace(['D', 'A'], 'NH')

# Remove first match week
# data = data[data['MW']>10]

# Calculate Goal Scored Difference
data["TGSDiff"] = (data["HTGS"] - data["ATGS"])

# Calculate Goal Scored Difference
data["TGCDiff"] = (data["HTGC"] - data["ATGC"])

# Calculate Point Difference
data["PDiff"] = (data["HTP"] - data["ATP"])

# Calculate Form Points
# HM1
data['HM1'] = data['HM1'].replace(['M'], '0')
data['HM1'] = data['HM1'].replace(['W'], '3')
data['HM1'] = data['HM1'].replace(['D'], '1')
data['HM1'] = data['HM1'].replace(['L'], '0')

# HM2
data['HM2'] = data['HM2'].replace(['M'], '0')
data['HM2'] = data['HM2'].replace(['W'], '3')
data['HM2'] = data['HM2'].replace(['D'], '1')
data['HM2'] = data['HM2'].replace(['L'], '0')

# HM3
data['HM3'] = data['HM3'].replace(['M'], '0')
data['HM3'] = data['HM3'].replace(['W'], '3')
data['HM3'] = data['HM3'].replace(['D'], '1')
data['HM3'] = data['HM3'].replace(['L'], '0')

# HM4
data['HM4'] = data['HM4'].replace(['M'], '0')
data['HM4'] = data['HM4'].replace(['W'], '3')
data['HM4'] = data['HM4'].replace(['D'], '1')
data['HM4'] = data['HM4'].replace(['L'], '0')

# HM5
data['HM5'] = data['HM5'].replace(['M'], '0')
data['HM5'] = data['HM5'].replace(['W'], '3')
data['HM5'] = data['HM5'].replace(['D'], '1')
data['HM5'] = data['HM5'].replace(['L'], '0')

# AM1
data['AM1'] = data['AM1'].replace(['M'], '0')
data['AM1'] = data['AM1'].replace(['W'], '3')
data['AM1'] = data['AM1'].replace(['D'], '1')
data['AM1'] = data['AM1'].replace(['L'], '0')

# AM2
data['AM2'] = data['AM2'].replace(['M'], '0')
data['AM2'] = data['AM2'].replace(['W'], '3')
data['AM2'] = data['AM2'].replace(['D'], '1')
data['AM2'] = data['AM2'].replace(['L'], '0')

# AM3
data['AM3'] = data['AM3'].replace(['M'], '0')
data['AM3'] = data['AM3'].replace(['W'], '3')
data['AM3'] = data['AM3'].replace(['D'], '1')
data['AM3'] = data['AM3'].replace(['L'], '0')

# AM4
data['AM4'] = data['AM4'].replace(['M'], '0')
data['AM4'] = data['AM4'].replace(['W'], '3')
data['AM4'] = data['AM4'].replace(['D'], '1')
data['AM4'] = data['AM4'].replace(['L'], '0')

# AM5
data['AM5'] = data['AM5'].replace(['M'], '0')
data['AM5'] = data['AM5'].replace(['W'], '3')
data['AM5'] = data['AM5'].replace(['D'], '1')
data['AM5'] = data['AM5'].replace(['L'], '0')

# Parse value to integer
data[["HM1","HM2","HM3","HM4","HM5","AM1","AM2","AM3","AM4","AM5"]] = data[["HM1","HM2","HM3","HM4","HM5",
                                                                            "AM1","AM2","AM3","AM4","AM5"]].astype(int)

# Home
data["HFP3"] = (data["HM1"] + data["HM2"] + data["HM3"])
data["HFP5"] = (data["HM1"] + data["HM2"] + data["HM3"] + data["HM4"] + data["HM5"])

# Away
data["AFP3"] = (data["AM1"] + data["AM2"] + data["AM3"])
data["AFP5"] = (data["AM1"] + data["AM2"] + data["AM3"] + data["AM4"] + data["AM5"])

# Difference
data["FPDiff3"] = (data["HFP3"] - data["AFP3"])
data["FPDiff5"] = (data["HFP5"] - data["AFP5"])

data.dtypes

Div          object
Date         object
HomeTeam     object
AwayTeam     object
FTHG          int32
FTAG          int32
FTR          object
HTHG        float64
HTAG        float64
HTR          object
HS            int32
AS            int32
HST           int32
AST           int32
HF            int32
AF            int32
HC            int32
AC            int32
HY            int32
AY            int32
HR            int32
AR            int32
B365H       float64
B365D       float64
B365A       float64
HTGS        float64
ATGS        float64
HTGC        float64
ATGC        float64
HTP           int64
ATP           int64
HM1           int32
HM2           int32
HM3           int32
HM4           int32
HM5           int32
AM1           int32
AM2           int32
AM3           int32
AM4           int32
AM5           int32
MW            int64
FTR.1        object
TGSDiff     float64
TGCDiff     float64
PDiff         int64
HFP3          int32
HFP5          int32
AFP3          int32
AFP5          int32


In [4]:
# Use lable encoder for data transformation
labelencoder = LabelEncoder()
data['Date'] = labelencoder.fit_transform(data['Date'])
data['Div'] = labelencoder.fit_transform(data['Div'])
data['HomeTeam'] = labelencoder.fit_transform(data['HomeTeam'])
data['AwayTeam'] = labelencoder.fit_transform(data['AwayTeam'])

# Filter unwanted columns
cols = ['HTR','HTGS','ATGS','TGSDiff','HTGC','ATGC','TGCDiff','HTP','ATP','PDiff',
        'HM1','HM2','HM3','HM4','HM5','AM1','AM2','AM3','AM4','AM5',
        'HFP3','HFP5','AFP3','AFP5','FPDiff3','FPDiff5','FTR']
data = data[cols]

data['HTR'] = labelencoder.fit_transform(data['HTR'])
data['HM1'] = labelencoder.fit_transform(data['HM1'])
data['HM2'] = labelencoder.fit_transform(data['HM2'])
data['HM3'] = labelencoder.fit_transform(data['HM3'])
data['HM4'] = labelencoder.fit_transform(data['HM4'])
data['HM5'] = labelencoder.fit_transform(data['HM5'])
data['AM1'] = labelencoder.fit_transform(data['AM1'])
data['AM2'] = labelencoder.fit_transform(data['AM2'])
data['AM3'] = labelencoder.fit_transform(data['AM3'])
data['AM4'] = labelencoder.fit_transform(data['AM4'])
data['AM5'] = labelencoder.fit_transform(data['AM5'])

data.head()

Unnamed: 0,HTR,HTGS,ATGS,TGSDiff,HTGC,ATGC,TGCDiff,HTP,ATP,PDiff,...,AM3,AM4,AM5,HFP3,HFP5,AFP3,AFP5,FPDiff3,FPDiff5,FTR
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,NH
3,2,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,H
4,2,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,NH


In [5]:
# Takes feature columns and assign them to variable "X"
# For old data
X = data.iloc[:, 0:26]

# Takes target column and assign them to variable "Y"
y = data[['FTR']].copy()

In [6]:
X.head()

Unnamed: 0,HTR,HTGS,ATGS,TGSDiff,HTGC,ATGC,TGCDiff,HTP,ATP,PDiff,...,AM2,AM3,AM4,AM5,HFP3,HFP5,AFP3,AFP5,FPDiff3,FPDiff5
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
y.head()

Unnamed: 0,FTR
0,H
1,H
2,NH
3,H
4,NH


In [8]:
# y actually contains all categories or classes:
y.FTR.unique()

array(['H', 'NH'], dtype=object)

In [9]:
# Now transforming categorial into numerical values
le = preprocessing.LabelEncoder()
y = y.apply(le.fit_transform)

In [10]:
y.head()

Unnamed: 0,FTR
0,0
1,0
2,1
3,0
4,1


In [11]:
# Now for train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [12]:
# Feature scaling
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)  
# X_test = scaler.transform(X_test)

In [13]:
# Multilayer Perceptron
mlp = MLPClassifier(alpha = 0.01, hidden_layer_sizes=(15), max_iter=20000)

mlp.fit(X_train, y_train.values.ravel())

MLPClassifier(alpha=0.01, hidden_layer_sizes=15, max_iter=20000)

In [14]:
predictions = mlp.predict(X_test)

In [15]:
# print(predictions)

In [16]:
# Last thing: evaluation of algorithm performance in classifying flowers
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[1575  445]
 [ 471 1953]]
              precision    recall  f1-score   support

           0       0.77      0.78      0.77      2020
           1       0.81      0.81      0.81      2424

    accuracy                           0.79      4444
   macro avg       0.79      0.79      0.79      4444
weighted avg       0.79      0.79      0.79      4444



In [17]:
print(accuracy_score(y_test, predictions))

0.7938793879387939
