In [1]:
# Import the necessary libraries.
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Current working directory; the dataset path and the model output path are built from it.
cwd = os.getcwd()

In [3]:
# Load the dataset CSV, located at ../data/final/dataset.csv relative to the working directory.
data = pd.read_csv(cwd + "/../data/final/dataset.csv")

In [4]:
# Number of rows in the dataset, reported below as matches.
n_matches = len(data.index)

# All columns but one count as features (the FTR column is the target).
n_features = len(data.columns) - 1

# Report both counts.
print("Total number of matches: {}".format(n_matches))
print("Number of features: {}".format(n_features))

Total number of matches: 6360
Number of features: 31


In [5]:
# Separate into feature set and target variable.
# The column is named with `columns=` rather than a positional axis argument:
# pandas 2.0 removed positional `axis` from DataFrame.drop, so
# `data.drop(['FTR'], 1)` raises a TypeError there.
X_all = data.drop(columns=['FTR'])
y_all = data['FTR']

# # Standardising the data.
# from sklearn.preprocessing import scale


# cols = [['HTGD','ATGD','HTP','ATP','DiffLP']]
# for col in cols:
#     X_all[col] = scale(X_all[col])

In [6]:
# Cast HM1-HM3 and AM1-AM3 to strings (presumably so that preprocess_features
# dummy-encodes them like the other categorical columns — verify against the data).
for form_col in ('HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3'):
    X_all[form_col] = X_all[form_col].astype('str')

def preprocess_features(X):
    ''' Preprocesses the football data and converts categorical variables into dummy variables.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns with ``object`` dtype are treated as categorical.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing ``X``'s index. Non-object columns are copied through
        unchanged; each object column is replaced by its dummy columns, named
        ``<column>_<value>``. Column order follows the order of ``X``.
    '''

    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data.
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent
    # spelling and is also available in older pandas versions.
    for col, col_data in X.items():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)

        # Collect the revised columns
        output = output.join(col_data)

    return output

# Dummy-encode the categorical columns, then list the resulting feature names.
X_all = preprocess_features(X_all)
feature_names = list(X_all.columns)
print("Processed feature columns ({} total features):\n{}".format(len(feature_names), feature_names))

Processed feature columns (61 total features):
['HTGS', 'ATGS', 'HTGC', 'ATGC', 'HTP', 'ATP', 'HM1_D', 'HM1_L', 'HM1_M', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_M', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_M', 'HM3_W', 'HM4_D', 'HM4_L', 'HM4_M', 'HM4_W', 'HM5_D', 'HM5_L', 'HM5_M', 'HM5_W', 'AM1_D', 'AM1_L', 'AM1_M', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_M', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_M', 'AM3_W', 'AM4_D', 'AM4_L', 'AM4_M', 'AM4_W', 'AM5_D', 'AM5_L', 'AM5_M', 'AM5_W', 'EstimatedHomeGoals', 'EstimatedAwayGoals', 'HTWinStreak3', 'HTWinStreak5', 'HTLossStreak3', 'HTLossStreak5', 'ATWinStreak3', 'ATWinStreak5', 'ATLossStreak3', 'ATLossStreak5', 'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP']


In [7]:
# Preview the processed features: a caption followed by the first five rows.
print("Feature values:")
display(X_all.head(n=5))

Feature values:


Unnamed: 0,HTGS,ATGS,HTGC,ATGC,HTP,ATP,HM1_D,HM1_L,HM1_M,HM1_W,...,HTLossStreak5,ATWinStreak3,ATWinStreak5,ATLossStreak3,ATLossStreak5,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,0.107843,0.142857,0.164706,0.134146,1.090909,1.636364,0,1,0,0,...,0,0,0,0,0,-0.272727,0.363636,-0.545455,-0.181818,8.0
1,0.088235,0.066667,0.2,0.146341,0.454545,0.636364,0,1,0,0,...,0,0,0,0,0,-0.727273,-0.454545,-0.181818,0.181818,5.0
2,0.117647,0.12381,0.070588,0.085366,1.727273,1.545455,1,0,0,0,...,0,0,0,0,0,0.545455,0.545455,0.181818,0.272727,14.0
3,0.107843,0.095238,0.058824,0.170732,2.0,0.818182,0,0,0,1,...,0,0,0,1,0,0.545455,-0.363636,1.181818,1.0,9.0
4,0.088235,0.07619,0.211765,0.121951,0.727273,1.090909,0,1,0,0,...,0,0,0,0,0,-0.818182,-0.181818,-0.363636,-0.545455,13.0


In [8]:
from sklearn.model_selection import train_test_split

# Shuffle and split the dataset into training and testing set.
# NOTE(review): an integer test_size is an absolute number of samples, so only
# 30 rows are held out (the training log reports 6330 training rows). Scores
# measured on so few rows are very noisy — confirm whether a fraction such as
# 0.3 was intended.
# random_state fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, 
                                                    test_size = 30,
                                                    random_state = 2)

In [9]:
from time import time 
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fit `clf` on the training data and print how long fitting took.

    The classifier is fitted in place; nothing is returned.
    '''

    # Time only the fit call itself.
    t0 = time()
    clf.fit(X_train, y_train)
    elapsed = time() - t0

    # Report the wall-clock duration.
    print("Trained model in {:.4f} seconds.".format(elapsed))

    
def predict_labels(clf, features, target):
    ''' Predict with a fitted classifier and score the predictions against `target`.

    Prints the prediction time and returns a tuple
    (micro-averaged F1 score, accuracy).
    '''

    # Time only the predict call itself.
    t0 = time()
    y_pred = clf.predict(features)
    elapsed = time() - t0

    # Report the wall-clock duration.
    print("Made predictions in {:.4f} seconds.".format(elapsed))

    # Accuracy: share of predictions that equal the true label.
    hits = target == y_pred
    acc = sum(hits) / float(len(y_pred))

    f1 = f1_score(target, y_pred, average='micro')
    return f1, acc

def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Fit `clf`, then print its F1 and accuracy scores on the training and test sets. '''

    # Announce which model is being trained and on how many rows.
    model_name = type(clf).__name__
    print("Training a {} using a training set size of {}".format(model_name, len(X_train)))

    # Fit the model.
    train_classifier(clf, X_train, y_train)

    # Score the training split first, then the test split.
    reports = (
        ("F1 score and accuracy score for training set: {:.4f} , {:.4f}.", X_train, y_train),
        ("F1 score and accuracy score for test set: {:.4f} , {:.4f}.", X_test, y_test),
    )
    for template, features, target in reports:
        f1, acc = predict_labels(clf, features, target)
        print(template.format(f1 , acc))

In [10]:
# Initialize the four candidate models. Each estimator is given a fixed seed so
# that restarting the kernel and re-running the notebook reproduces the scores.
clf_A = LogisticRegression(random_state = 42)
clf_B = SVC(random_state = 912, kernel='rbf')
clf_C = xgb.XGBClassifier(seed = 82)
# Previously unseeded: a random forest draws bootstrap samples and feature
# subsets at random, so its scores changed from run to run.
clf_D = RandomForestClassifier(random_state = 42)

# Train and evaluate every model in turn, with a blank line after each report.
for clf in (clf_A, clf_B, clf_C, clf_D):
    train_predict(clf, X_train, y_train, X_test, y_test)
    print()

Training a LogisticRegression using a training set size of 6330
Trained model in 0.3122 seconds.
Made predictions in 0.0045 seconds.
F1 score and accuracy score for training set: 0.5431 , 0.5431.
Made predictions in 0.0025 seconds.
F1 score and accuracy score for test set: 0.6667 , 0.6667.

Training a SVC using a training set size of 6330
Trained model in 3.6839 seconds.
Made predictions in 2.4337 seconds.
F1 score and accuracy score for training set: 0.5485 , 0.5485.
Made predictions in 0.0123 seconds.
F1 score and accuracy score for test set: 0.6667 , 0.6667.

Training a XGBClassifier using a training set size of 6330
Trained model in 2.8049 seconds.
Made predictions in 0.0097 seconds.
F1 score and accuracy score for training set: 0.9801 , 0.9801.
Made predictions in 0.0035 seconds.
F1 score and accuracy score for test set: 0.4667 , 0.4667.

Training a RandomForestClassifier using a training set size of 6330
Trained model in 1.0288 seconds.
Made predictions in 0.1110 seconds.
F1 scor

In [11]:
import pickle
# Import only strftime. A plain `import time` would rebind the name `time` from
# the function imported earlier (`from time import time`) to the module, so
# re-running train_classifier / predict_labels after this cell would fail with
# "TypeError: 'module' object is not callable".
from time import strftime

timestr = strftime("%Y%m%d-%H%M%S")

# Save model: the fitted clf_A is pickled under a timestamped file name.
model_path = cwd + f'/../model/model-{timestr}.pickle'
# Create the output directory if it is missing so that open() does not raise
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(clf_A, f)