In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
from glob import glob

req_cols = ['HomeTeam', 'AwayTeam', 
            'HTHG', 'HTAG', 'HTR', 'FTHG', 'FTAG', 'FTR',
            'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 
            'HF', 'AF', 'HY', 'AY', 'HR', 'AR']

def concat(outfile = 'combined_data.csv'): #outfile = "name of the output file"
    filenames = glob('../data/*.csv') #many filenames have similar pattern 
    dataframes = []
    for files in filenames:
        dataframes.append(pd.read_csv(files))
    concatDF = pd.concat(dataframes, axis=0, sort=False) #axis=0 to concat vertically 
    concatDF = concatDF[req_cols]
    concatDF.to_csv('../data/combined/'+ outfile, index = None) #index=None for no indexing
    
concat() #concat function call
    
data = pd.read_csv('../data/combined/combined_data.csv')
data.head(5)

Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,HTR,FTHG,FTAG,FTR,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,Arsenal,Aston Villa,1.0,1.0,D,1.0,3.0,A,16.0,9.0,4.0,4.0,4.0,3.0,15.0,18.0,4.0,5.0,1.0,0.0
1,Liverpool,Stoke,1.0,0.0,H,1.0,0.0,H,26.0,10.0,11.0,4.0,12.0,6.0,11.0,11.0,1.0,1.0,0.0,0.0
2,Norwich,Everton,0.0,0.0,D,2.0,2.0,D,8.0,19.0,2.0,6.0,6.0,8.0,13.0,10.0,2.0,0.0,0.0,0.0
3,Sunderland,Fulham,0.0,0.0,D,0.0,1.0,A,20.0,5.0,3.0,1.0,6.0,1.0,14.0,14.0,0.0,3.0,0.0,0.0
4,Swansea,Man United,0.0,2.0,A,1.0,4.0,A,17.0,15.0,6.0,7.0,7.0,4.0,13.0,10.0,1.0,3.0,0.0,0.0


In [3]:
read_team_names = pd.read_csv('../data/season-2018-2019.csv')
team_name_list = read_team_names['HomeTeam']
team_name=[]
for teams in team_name_list:
    if teams not in team_name:
        team_name.append(teams)
i=1
print("\nTeams in Season:")
for t in team_name:
    print(i, t)
    i+=1


Teams in Season:
1 Man United
2 Bournemouth
3 Fulham
4 Huddersfield
5 Newcastle
6 Watford
7 Wolves
8 Arsenal
9 Liverpool
10 Southampton
11 Cardiff
12 Chelsea
13 Everton
14 Leicester
15 Tottenham
16 West Ham
17 Brighton
18 Burnley
19 Man City
20 Crystal Palace


In [4]:
filteredData = data[(data.HomeTeam.isin(team_name))]
data = filteredData[(filteredData.AwayTeam.isin(team_name))]

In [5]:
# Separate into feature set and target variable
X_all = data.drop(['FTR'],1)
y_all = data['FTR']
Z_all = X_all.drop(['HTR'],1)

In [6]:
# Standardising the data.
from sklearn.preprocessing import scale
cols = [['FTHG','FTAG','HTHG','HTAG','HC','AC','HS','AS','HST','AST','HF','AF','HY','AY','HR','AR']]
for col in cols:
    X_all[col] = scale(X_all[col])

In [7]:
# converts catagorical variables into dummy variables
def preprocess_features(Z):
    # Initialize new output DataFrame
    output = pd.DataFrame(index = Z.index)
    # Investigate each feature column for the data
    for col, col_data in Z.iteritems():
        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)                
        # Collect the revisedd columns
        output = output.join(col_data)
    return output

Z_all = preprocess_features(Z_all)

In [8]:
from sklearn.model_selection import train_test_split

# Shuffle and split the dataset into training and testing set.
X_train, X_test, y_train, y_test = train_test_split(Z_all, y_all, 
                                                    test_size = 50,
                                                    random_state = 2,
                                                    stratify = y_all)

In [9]:
from time import time 
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    print ("Trained model in {:.4f} seconds".format(end - start))

def predict_labels(clf, features, target):
    start = time()
    y_pred = clf.predict(features)
    end = time()
    print ("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target, y_pred, average='macro'), sum(target == y_pred) / float(len(y_pred))

def train_predict(clf, X_train, y_train, X_test, y_test):
    # Indicate the classifier and the training set size
    print ("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print (f1, acc)
    print ("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print ("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

In [10]:
clf_A = LogisticRegression(random_state = 42)
clf_B = SVC(random_state = 912, kernel='rbf')
clf_C = KNeighborsClassifier(n_neighbors = 60)

train_predict(clf_A, X_train, y_train, X_test, y_test)
print ('')
train_predict(clf_B, X_train, y_train, X_test, y_test)
print ('')
train_predict(clf_C, X_train, y_train, X_test, y_test)
print ('')

Training a LogisticRegression using a training set size of 1738. . .


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Trained model in 1.0223 seconds
Made predictions in 0.0022 seconds.
1.0 1.0
F1 score and accuracy score for training set: 1.0000 , 1.0000.
Made predictions in 0.0016 seconds.
F1 score and accuracy score for test set: 1.0000 , 1.0000.

Training a SVC using a training set size of 1738. . .
Trained model in 0.2298 seconds
Made predictions in 0.1417 seconds.
0.8005066690649271 0.832566168009206
F1 score and accuracy score for training set: 0.8005 , 0.8326.
Made predictions in 0.0053 seconds.
F1 score and accuracy score for test set: 0.8183 , 0.8600.

Training a KNeighborsClassifier using a training set size of 1738. . .
Trained model in 0.0055 seconds
Made predictions in 0.2549 seconds.
0.5159285534994548 0.6622554660529344
F1 score and accuracy score for training set: 0.5159 , 0.6623.
Made predictions in 0.0097 seconds.
F1 score and accuracy score for test set: 0.5675 , 0.7000.



In [11]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
model.predict(X_test)

array(['H', 'H', 'D', 'H', 'H', 'A', 'H', 'A', 'A', 'A', 'D', 'A', 'H',
       'H', 'A', 'H', 'A', 'H', 'A', 'A', 'H', 'D', 'H', 'H', 'A', 'D',
       'A', 'H', 'D', 'D', 'D', 'A', 'H', 'D', 'D', 'D', 'A', 'A', 'H',
       'A', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'D', 'H'],
      dtype=object)

In [13]:
predictedProbability = model.predict_proba(X_test)
predictedProbability = pd.DataFrame(predictedProbability, columns=['Away Team','Draw','Home Team'])

display((predictedProbability *100).head(10))

Unnamed: 0,Away Team,Draw,Home Team
0,1.453694e-21,4.176818e-09,100.0
1,6.030539e-05,4.152721,95.84722
2,3.713242,92.2904,3.996362
3,7.55901e-05,2.101736,97.89819
4,0.000134341,2.683065,97.3168
5,99.99309,0.006905084,1.097078e-09
6,1.4041429999999999e-21,2.45843e-09,100.0
7,99.9949,0.00509908,2.143421e-10
8,99.99254,0.007458939,2.853099e-10
9,99.99997,3.151288e-05,6.981288e-15


In [14]:
fixtures = pd.read_csv('../data/fixtures/epl-2018-GMT.csv')
fixtures = fixtures.drop(['Round Number','Date','Location','Result'],1)
fixtures.columns = ['HomeTeam','AwayTeam']
fixtures['FTHG']= 0
fixtures['FTAG'] =0 
fixtures['HTHG'] = 0
fixtures['HTAG'] = 0
fixtures['HS'] = 0
fixtures['AS'] = 0
fixtures['HST'] = 0
fixtures['AST'] = 0
fixtures['HF'] = 0
fixtures['AF'] = 0
fixtures['HC'] = 0
fixtures['AC'] = 0
fixtures['HY'] = 0
fixtures['AY'] = 0
fixtures['HR'] = 0
fixtures['AR'] = 0

In [15]:
display(fixtures.head(10))

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,Man Utd,Leicester,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Newcastle,Spurs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Bournemouth,Cardiff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fulham,Crystal Palace,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Huddersfield,Chelsea,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,Watford,Brighton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Wolves,Everton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,Liverpool,West Ham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Southampton,Burnley,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Arsenal,Man City,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
preprocessedFixtures = preprocess_features(fixtures)

In [17]:
model.predict(preprocessedFixtures)

array(['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D

In [18]:
fixtures['Result Predicted'] = model.predict(preprocessedFixtures)

In [19]:
Result = fixtures.drop(['FTHG','FTAG','HTHG','HTAG','HS','AS','HST','HF','AF','HY',
                        'AY','HR','AR','HC','AC','AST','HC','AC'],1)
Result.head(10)

Unnamed: 0,HomeTeam,AwayTeam,Result Predicted
0,Man Utd,Leicester,D
1,Newcastle,Spurs,D
2,Bournemouth,Cardiff,D
3,Fulham,Crystal Palace,D
4,Huddersfield,Chelsea,D
5,Watford,Brighton,D
6,Wolves,Everton,D
7,Liverpool,West Ham,D
8,Southampton,Burnley,D
9,Arsenal,Man City,D


In [20]:
fixturePredictedProbability = model.predict_proba(preprocessedFixtures) *100
fixturePredictedProbability = pd.DataFrame(fixturePredictedProbability, columns=['Away win %','Draw %','Home win %'])

display(fixturePredictedProbability)

Unnamed: 0,Away win %,Draw %,Home win %
0,18.098375,57.319355,24.582270
1,24.413897,53.344694,22.241408
2,21.864893,57.133817,21.001290
3,26.253689,48.734915,25.011396
4,23.676771,58.812849,17.510380
...,...,...,...
375,13.800696,72.203199,13.996105
376,19.898009,49.777588,30.324403
377,15.141875,69.768625,15.089500
378,18.417653,55.163959,26.418389


In [21]:
final = pd.concat([Result, fixturePredictedProbability], axis = 1)

In [22]:
final.head(20)

Unnamed: 0,HomeTeam,AwayTeam,Result Predicted,Away win %,Draw %,Home win %
0,Man Utd,Leicester,D,18.098375,57.319355,24.58227
1,Newcastle,Spurs,D,24.413897,53.344694,22.241408
2,Bournemouth,Cardiff,D,21.864893,57.133817,21.00129
3,Fulham,Crystal Palace,D,26.253689,48.734915,25.011396
4,Huddersfield,Chelsea,D,23.676771,58.812849,17.51038
5,Watford,Brighton,D,15.039447,65.081786,19.878767
6,Wolves,Everton,D,12.90479,66.02857,21.066641
7,Liverpool,West Ham,D,15.528088,67.325498,17.146414
8,Southampton,Burnley,D,17.858173,65.291585,16.850242
9,Arsenal,Man City,D,16.370841,61.093636,22.535523


In [23]:
readFixtures = pd.read_csv('../data/fixtures/epl-2018-GMT.csv')
exportToFixtures = final.drop(['HomeTeam','AwayTeam'],1)

PredictedResultWithFixtureData = pd.concat([readFixtures,exportToFixtures], axis = 1)
PredictedResultWithFixtureData.to_csv('../data/predictions/predictions.csv')

display(PredictedResultWithFixtureData.tail(25))

Unnamed: 0,Round Number,Date,Location,Home Team,Away Team,Result,Result Predicted,Away win %,Draw %,Home win %
355,36,27/04/2019 15:00,Vicarage Road,Watford,Wolves,1 - 2,D,15.136156,68.421595,16.442249
356,36,27/04/2019 17:30,Amex Stadium,Brighton,Newcastle,1 - 1,D,13.432419,64.245435,22.322146
357,36,28/04/2019 12:00,King Power Stadium,Leicester,Arsenal,3 - 0,D,17.007369,65.469043,17.523588
358,36,28/04/2019 14:05,Turf Moor,Burnley,Man City,0 - 1,D,23.723953,54.666322,21.609725
359,36,28/04/2019 16:30,Old Trafford,Man Utd,Chelsea,1 - 1,D,19.562111,57.897122,22.540767
360,37,03/05/2019 20:00,Goodison Park,Everton,Burnley,2 - 0,D,15.053182,62.02287,22.923948
361,37,04/05/2019 12:30,Vitality Stadium,Bournemouth,Spurs,1 - 0,D,21.264463,60.119487,18.61605
362,37,04/05/2019 15:00,London Stadium,West Ham,Southampton,3 - 0,D,16.994485,61.65814,21.347375
363,37,04/05/2019 15:00,Molineux Stadium,Wolves,Fulham,1 - 0,D,13.677184,64.366126,21.95669
364,37,04/05/2019 17:30,Cardiff City Stadium,Cardiff,Crystal Palace,2 - 3,D,29.31681,45.178432,25.504758
