In [0]:
#data preprocessing
import pandas as pd
#produces a prediction model in the form of an ensemble of weak prediction models, typically decision tree
import xgboost as xgb
#the outcome (dependent variable) has only a limited number of possible values. 
#Logistic Regression is used when response variable is categorical in nature.
from sklearn.linear_model import LogisticRegression
#A random forest is a meta estimator that fits a number of decision tree classifiers 
#on various sub-samples of the dataset and use averaging to improve the predictive 
#accuracy and control over-fitting.
from sklearn.ensemble import RandomForestClassifier
#a discriminative classifier formally defined by a separating hyperplane.
from sklearn.svm import SVC 
#displayd data
from IPython.display import display
%matplotlib inline

In [0]:
import os
import csv
DATA_PATH = '/content/drive/My Drive'

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
%cd /content/drive/My Drive

/content/drive/My Drive


In [0]:
data = pd.read_csv(os.path.join(DATA_PATH, 'final_dataset.csv'))
# Remove first 3 matchweeks
data = data[data.MW > 3]
#Drop all irrelevant statistics
data.drop(['Unnamed: 0','HomeTeam', 'AwayTeam', 'Date', 'MW', 'HTFormPtsStr', 'ATFormPtsStr', 'FTHG', 'FTAG',
           'HTGS', 'ATGS', 'HTGC', 'ATGC','HomeTeamLP', 'AwayTeamLP','DiffPts','HTFormPts','ATFormPts',
           'HM4','HM5','AM4','AM5','HTLossStreak5','ATLossStreak5','HTWinStreak5','ATWinStreak5',
           'HTWinStreak3','HTLossStreak3','ATWinStreak3','ATLossStreak3'],1, inplace=True)


#Full Time Result (H=Home Win, D=Draw, A=Away Win)
#HTGD - Home team goal difference
#ATGD - away team goal difference
#HTP - Home team points
#ATP - Away team points
#DiffFormPts Diff in points
#DiffLP - Differnece in last years prediction

#Input - 12 features
#Output - Full Time Result (H=Home Win, D=Draw, A=Away Win)

# Preview data.
display(data.head())


Unnamed: 0,FTR,HTP,ATP,HM1,HM2,HM3,AM1,AM2,AM3,HTGD,ATGD,DiffFormPts,DiffLP
30,H,1.25,1.0,D,D,W,D,W,L,0.5,0.25,0.25,-16.0
31,D,0.75,0.25,L,L,W,D,L,L,-0.5,-0.75,0.5,-2.0
32,H,1.0,1.0,L,D,W,D,W,L,0.0,0.25,0.0,-3.0
33,D,0.75,0.5,L,L,W,D,L,D,-0.25,-0.25,0.25,3.0
34,D,1.0,1.5,D,L,W,W,W,L,0.0,0.75,-0.5,3.0


In [0]:
# Total number of matches.
n_matches = data.shape[0]

# Calculate number of features.
n_features = data.shape[1] - 1

# Calculate matches won by home team.
n_homewins = len(data[data.FTR == 'H'])

# Calculate win rate for home team.
win_rate = (float(n_homewins) / (n_matches)) * 100

# Print the results
print ('Total number of matches: {}'.format(n_matches))
print ('Number of features: {}'.format(n_features))
print ('Number of matches won by home team: {}'.format(n_homewins))
print ('Win rate of home team: {:.2f}%'.format(win_rate))

Total number of matches: 4900
Number of features: 12
Number of matches won by home team: 2289
Win rate of home team: 46.71%


In [0]:
# from pandas.tools.plotting import scatter_matrix
# #this doesn't work because colab doesn't support this library
# scatter_matrix(data[['HTGD','ATGD','HTP','ATP','DiffFormPts','DiffLP']], figsize=(10,10))

In [0]:
# Separate into feature set and target variable
#FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win)
X_all = data.drop(['FTR'],1)
y_all = data['FTR']

# Standardising the data.
from sklearn.preprocessing import scale

cols = [['HTGD','ATGD','HTP','ATP','DiffLP']]
for col in cols:
    X_all[col] = scale(X_all[col])

In [0]:
X_all.HM1 = X_all.HM1.astype('str')
X_all.HM2 = X_all.HM2.astype('str')
X_all.HM3 = X_all.HM3.astype('str')
X_all.AM1 = X_all.AM1.astype('str')
X_all.AM2 = X_all.AM2.astype('str')
X_all.AM3 = X_all.AM3.astype('str')

def preprocess_features(X):
    ''' Preprocesses the football data and converts categorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
                    
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)
print ('Processed feature columns ({} total features):\n{}'.format(len(X_all.columns), list(X_all.columns)))


Processed feature columns (24 total features):
['HTP', 'ATP', 'HM1_D', 'HM1_L', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_W', 'AM1_D', 'AM1_L', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_W', 'HTGD', 'ATGD', 'DiffFormPts', 'DiffLP']


In [0]:
# Show the feature information by printing the first five rows
print ("\nFeature values:")
display(X_all.head())


Feature values:


Unnamed: 0,HTP,ATP,HM1_D,HM1_L,HM1_W,HM2_D,HM2_L,HM2_W,HM3_D,HM3_L,HM3_W,AM1_D,AM1_L,AM1_W,AM2_D,AM2_L,AM2_W,AM3_D,AM3_L,AM3_W,HTGD,ATGD,DiffFormPts,DiffLP
30,-0.046121,-0.617418,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0.752387,0.35651,0.25,-1.85867
31,-1.1288,-2.252347,0,1,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,-0.733431,-1.133322,0.5,-0.232147
32,-0.58746,-0.617418,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0.009478,0.35651,0.0,-0.348327
33,-1.1288,-1.707371,0,1,0,0,1,0,0,0,1,1,0,0,0,1,0,1,0,0,-0.361976,-0.388406,0.25,0.348754
34,-0.58746,0.472535,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0.009478,1.101426,-0.5,0.348754


In [0]:
from sklearn.model_selection import train_test_split

# Shuffle and split the dataset into training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, 
                                                    test_size = 50,
                                                    random_state = 2,
                                                    stratify = y_all)

In [0]:
#for measuring training time
from time import time 
# F1 score (also F-score or F-measure) is a measure of a test's accuracy. 
#It considers both the precision p and the recall r of the test to compute 
#the score: p is the number of correct positive results divided by the number of 
#all positive results, and r is the number of correct positive results divided by 
#the number of positive results that should have been returned. The F1 score can be 
#interpreted as a weighted average of the precision and recall, where an F1 score 
#reaches its best value at 1 and worst at 0.
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print ("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    
    end = time()
    # Print and return results
    print ("Made predictions in {:.4f} seconds.".format(end - start))
    
    return f1_score(target, y_pred, pos_label='positive',average='weighted'), sum(target == y_pred) / float(len(y_pred))
    # return f1_score(target, y_pred, pos_label='H'), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print ("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print (f1, acc)
    print ("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print ("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

In [0]:
# Initialize the three models (XGBoost is initialized later)
clf_A = LogisticRegression(random_state = 42)
clf_B = SVC(random_state = 912, kernel='rbf')
#Boosting refers to this general problem of producing a very accurate prediction rule 
#by combining rough and moderately inaccurate rules-of-thumb
clf_C = xgb.XGBClassifier(seed = 82)

train_predict(clf_A, X_train, y_train, X_test, y_test)
print ('')
train_predict(clf_B, X_train, y_train, X_test, y_test)
print ('')
train_predict(clf_C, X_train, y_train, X_test, y_test)
print ('')

Training a LogisticRegression using a training set size of 4850. . .
Trained model in 0.2269 seconds
Made predictions in 0.0026 seconds.
0.49686179219317383 0.5463917525773195
F1 score and accuracy score for training set: 0.4969 , 0.5464.
Made predictions in 0.0015 seconds.
F1 score and accuracy score for test set: 0.4654 , 0.5200.

Training a SVC using a training set size of 4850. . .




Trained model in 2.2530 seconds
Made predictions in 1.1585 seconds.
0.5335986937661085 0.5872164948453609
F1 score and accuracy score for training set: 0.5336 , 0.5872.
Made predictions in 0.0132 seconds.
F1 score and accuracy score for test set: 0.4120 , 0.4800.

Training a XGBClassifier using a training set size of 4850. . .




Trained model in 1.1723 seconds
Made predictions in 0.0602 seconds.
0.5477846117295458 0.5907216494845361
F1 score and accuracy score for training set: 0.5478 , 0.5907.
Made predictions in 0.0021 seconds.
F1 score and accuracy score for test set: 0.4602 , 0.5200.



**Clearly XGBoost seems like the best model as it has the highest F1 score and accuracy score on the test set.**



In [0]:
# Import 'GridSearchCV' and 'make_scorer'
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer


# Create the parameters list you wish to tune
parameters = { 'learning_rate' : [0.1],
               'n_estimators' : [40],
               'max_depth': [3],
               'min_child_weight': [3],
               'gamma':[0.4],
               'subsample' : [0.8],
               'colsample_bytree' : [0.8],
               'scale_pos_weight' : [1],
               'reg_alpha':[1e-5]
             }  

# Initialize the classifier
clf = xgb.XGBClassifier(seed=2)

# Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score,pos_label='positive',average='weighted')
# f1_scorer = make_scorer(f1_score,pos_label='H')


# Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train,y_train)

# Get the estimator
clf = grid_obj.best_estimator_
print (clf)

# Report the final F1 score for training and testing after parameter tuning
f1, acc = predict_labels(clf, X_train, y_train)
print ("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
f1, acc = predict_labels(clf, X_test, y_test)
print ("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.4,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=3, missing=None, n_estimators=40, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=1e-05, reg_lambda=1, scale_pos_weight=1, seed=2,
              silent=None, subsample=0.8, verbosity=1)
Made predictions in 0.0250 seconds.
F1 score and accuracy score for training set: 0.5192 , 0.5722.
Made predictions in 0.0015 seconds.
F1 score and accuracy score for test set: 0.4602 , 0.5200.


In [0]:
# Import 'GridSearchCV' and 'make_scorer'
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from pprint import pprint


# Create the parameters list you wish to tune
parameters = { 'learning_rate' : [0.03],
               'n_estimators' : [20],
               'max_depth': [5],
               'min_child_weight': [5],
               'gamma':[0.2],
               'subsample':[0.8],
               'colsample_bytree':[0.8],
               'scale_pos_weight' : [1],
               'reg_alpha':[1e-2]
             }  

# Initialize the classifier
clf = xgb.XGBClassifier(seed=2)

# Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score,pos_label='positive', average='weighted')
# f1_scorer = make_scorer(f1_score,pos_label='H')


# Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_all,y_all)

# Get the estimator
clf = grid_obj.best_estimator_
print (clf)

# Report the final F1 score for training and testing after parameter tuning
f1, acc = predict_labels(clf, X_train, y_train)
print ("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.2,
              learning_rate=0.03, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=None, n_estimators=20, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0.01, reg_lambda=1, scale_pos_weight=1, seed=2,
              silent=None, subsample=0.8, verbosity=1)
Made predictions in 0.0213 seconds.
F1 score and accuracy score for training set: 0.5470 , 0.5955.


In [0]:
import numpy as np

In [0]:
# X_test
test_labels = clf.predict(X_test[:])
print(test_labels)
# print(X_test[0][2])

['H' 'H' 'A' 'A' 'H' 'A' 'H' 'H' 'A' 'H' 'A' 'H' 'A' 'H' 'D' 'H' 'H' 'H'
 'H' 'H' 'H' 'A' 'H' 'A' 'H' 'H' 'H' 'H' 'H' 'H' 'A' 'H' 'H' 'H' 'H' 'H'
 'H' 'D' 'A' 'A' 'H' 'A' 'H' 'H' 'D' 'H' 'H' 'H' 'H' 'H']


In [0]:
true_labels = y_test.to_numpy()
print(true_labels[:])
print(type(true_labels))

['A' 'D' 'H' 'A' 'H' 'A' 'H' 'H' 'A' 'H' 'H' 'D' 'H' 'D' 'A' 'D' 'A' 'H'
 'H' 'A' 'H' 'A' 'D' 'A' 'D' 'H' 'H' 'A' 'D' 'H' 'D' 'H' 'H' 'H' 'H' 'H'
 'H' 'D' 'A' 'A' 'H' 'H' 'D' 'D' 'D' 'H' 'A' 'D' 'A' 'H']
<class 'numpy.ndarray'>


In [0]:
def compare(test_labels, true_labels):
  coincidencelist = []
  for i in range(len(test_labels)):
    if test_labels[i] == true_labels[i]:
      coincidencelist.append(test_labels[i])

  accuracy = len(coincidencelist)*100/len(test_labels)
  return len(coincidencelist), len(test_labels), accuracy 
    # for j in range(len(true_labels)):
    #   if true_labels[]


In [0]:
compare(test_labels, true_labels)

(28, 50, 56.0)

**FINISHED**