In [10]:

#data preprocessing
import pandas as pd
#NOTE: the next sentence describes gradient boosting (e.g. XGBoost), which is not imported in this notebook:
#it produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees.
#Logistic regression is used when the response (dependent) variable is categorical in nature,
#i.e. the outcome has only a limited number of possible values.
from sklearn.linear_model import LogisticRegression
#A random forest is a meta estimator that fits a number of decision tree classifiers
#on various sub-samples of the dataset and uses averaging to improve the predictive
#accuracy and control over-fitting.
from sklearn.ensemble import RandomForestClassifier
#-----------------------------------------------------------------------------------------------------------
from sklearn.naive_bayes import GaussianNB
#-------------------------------------------------------------------------------------------------------------
from sklearn.svm import SVC
#displays data (rich rendering of DataFrames)
from IPython.display import display
%matplotlib inline

In [2]:
# Load the match dataset; the first CSV column is used as the row index
# rather than being kept as a redundant data column.
data = pd.read_csv('final_dataset_1.csv', index_col=0)

# Preview the last five rows.
display(data.tail(5))


#FTR - Full Time Result (in this dataset: H=Home Win, NH=Not a Home Win, i.e. draw or away win)
#HTGD - Home team goal difference
#ATGD - Away team goal difference
#HTP - Home team points
#ATP - Away team points
#DiffFormPts - Difference in form points
#DiffLP - Difference in last year's prediction (presumably league position - TODO confirm)

#Input - 12 other features (HTP, ATP, HM1-HM3, AM1-AM3, HTGD, ATGD, DiffFormPts, DiffLP)
#Output - Full Time Result (H=Home Win, NH=Not a Home Win)

Unnamed: 0,FTR,HTP,ATP,HM1,HM2,HM3,AM1,AM2,AM3,HTGD,ATGD,DiffFormPts,DiffLP
6075,H,1.263158,1.631579,L,D,L,W,L,W,-0.394737,0.394737,-0.236842,-3.0
6076,NH,1.210526,1.710526,W,W,L,D,L,W,-0.263158,0.789474,0.026316,6.0
6077,NH,1.157895,1.0,L,L,W,W,W,D,-0.263158,-0.368421,-0.131579,2.0
6078,NH,1.105263,1.552632,D,L,D,D,W,L,-0.368421,0.342105,-0.157895,7.0
6079,H,1.657895,1.105263,L,W,D,D,L,L,0.315789,-0.526316,0.157895,-14.0


In [3]:
# What is the win rate for the home team?

# Total number of matches (one row per match).
n_matches = len(data)

# Number of features: every column except the target variable (FTR).
n_features = len(data.columns) - 1

# Number of matches won by the home team (FTR equal to 'H').
n_homewins = int((data.FTR == 'H').sum())

# Home-team win rate as a percentage.
win_rate = (n_homewins / n_matches) * 100

# Print the results.
print("Total number of matches:{}".format(n_matches))
print("Number of features: {}".format(n_features))
print("Number of matches won by home team: {}".format(n_homewins))
print("Win rate of home team: {}%".format(win_rate))

Total number of matches:6080
Number of features: 12
Number of matches won by home team: 2816
Win rate of home team: 46.31578947368421%


In [4]:
# Separate into feature set and target variable.
# FTR = Full Time Result; the previewed data holds 'H' (home win) and 'NH' (not a home win).
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (`drop(['FTR'], 1)`) raises a TypeError on pandas >= 2.0.
X_all = data.drop(columns=['FTR'])
y_all = data['FTR']

# Standardising the data.
from sklearn.preprocessing import scale

# Center the listed columns to zero mean and scale them to unit variance.
# All of them are scaled in a single call; the other feature columns
# (including DiffFormPts) are left unscaled, as before.
cols = ['HTGD', 'ATGD', 'HTP', 'ATP', 'DiffLP']
X_all[cols] = scale(X_all[cols])

In [5]:
# Recent-result columns for both sides (HM1-HM3 for the home team, AM1-AM3 for
# the away team): cast each one to string before the dummy-variable encoding.
for form_col in ['HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3']:
    X_all[form_col] = X_all[form_col].astype('str')

# The models need purely numeric input, so categorical (string) columns are
# replaced by dummy/indicator columns; numeric columns are passed through.
def preprocess_features(X):
    ''' Preprocesses the football data and converts categorical variables into dummy variables.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns with object or string dtype are treated as
        categorical; every other column is copied through unchanged.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as ``X``. Each categorical column ``c``
        is replaced, in its original position, by the indicator columns
        produced by ``pd.get_dummies`` and named ``c_<value>``. The dtype of
        the indicator columns is whatever ``pd.get_dummies`` yields by default.
    '''
    pieces = []

    # Investigate each feature column for the data.
    # `items()` replaces `iteritems()`, which was removed in pandas 2.0.
    for col, col_data in X.items():

        # Strings are stored as `object` on older pandas and may use a
        # dedicated string dtype on newer versions, so both are accepted.
        if pd.api.types.is_object_dtype(col_data) or pd.api.types.is_string_dtype(col_data):
            col_data = pd.get_dummies(col_data, prefix=col)

        # Collect the revised columns.
        pieces.append(col_data)

    # A frame without columns yields an empty frame that keeps X's index
    # (pd.concat raises on an empty list).
    if not pieces:
        return pd.DataFrame(index=X.index)

    # Assemble everything in one step instead of joining column by column.
    return pd.concat(pieces, axis=1)

# Replace the categorical columns of X_all with dummy variables and list the
# resulting feature columns.
X_all = preprocess_features(X_all)
print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))

Processed feature columns (30 total features):
['HTP', 'ATP', 'HM1_D', 'HM1_L', 'HM1_M', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_M', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_M', 'HM3_W', 'AM1_D', 'AM1_L', 'AM1_M', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_M', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_M', 'AM3_W', 'HTGD', 'ATGD', 'DiffFormPts', 'DiffLP']


In [6]:
# Preview the processed feature matrix: a heading, then its first five rows.
print("\nFeature values:")
display(X_all.head(5))


Feature values:


Unnamed: 0,HTP,ATP,HM1_D,HM1_L,HM1_M,HM1_W,HM2_D,HM2_L,HM2_M,HM2_W,...,AM2_M,AM2_W,AM3_D,AM3_L,AM3_M,AM3_W,HTGD,ATGD,DiffFormPts,DiffLP
0,-2.303259,-2.373125,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,0.014963,-0.022914,0.0,0.0
1,-2.303259,-2.373125,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,0.014963,-0.022914,0.0,-0.496101
2,-2.303259,-2.373125,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,0.014963,-0.022914,0.0,0.248051
3,-2.303259,-2.373125,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,0.014963,-0.022914,0.0,0.124025
4,-2.303259,-2.373125,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,0.014963,-0.022914,0.0,-1.240253


In [7]:
from sklearn.model_selection import train_test_split

# Shuffle and split the dataset into training and testing sets.
# An integer test_size is an absolute number of rows, so 50 matches are held
# out for testing; stratifying on y_all keeps the class proportions similar
# in both parts, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=50,
    stratify=y_all,
    random_state=2,
)

In [11]:
#for measuring training time
from time import time 
# F1 score (also F-score or F-measure) is a measure of a test's accuracy.
#It considers both the precision p and the recall r of the test to compute
#the score: p is the number of correct positive results divided by the number of
#all results predicted as positive, and r is the number of correct positive results divided by
#the number of positive results that should have been returned. The F1 score is
#the harmonic mean of the precision and recall, where an F1 score
#reaches its best value at 1 and worst at 0.
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data and reports the fitting time.

    clf     -- estimator exposing a ``fit(X, y)`` method; it is fitted in place
    X_train -- training features passed to ``clf.fit``
    y_train -- training labels passed to ``clf.fit``

    Prints the elapsed wall-clock time and returns nothing.
    '''
    # Time only the call to fit().
    t0 = time()
    clf.fit(X_train, y_train)
    elapsed = time() - t0

    # Report how long training took.
    print("Trained model in %.4f seconds" % elapsed)

    
def predict_labels(clf, features, target):
    ''' Predicts labels with a fitted classifier and scores them.

    clf      -- fitted estimator exposing ``predict``
    features -- feature rows to predict on
    target   -- true labels, compared element-wise with the predictions

    Prints the prediction time and returns ``(f1, accuracy)``, where the F1
    score treats 'H' as the positive label and accuracy is the fraction of
    predictions equal to ``target``.
    '''
    # Time only the call to predict().
    t0 = time()
    y_pred = clf.predict(features)
    elapsed = time() - t0

    print("Made predictions in %.4f seconds." % elapsed)

    # Score the predictions against the true labels.
    f1 = f1_score(target, y_pred, pos_label='H')
    n_correct = sum(target == y_pred)
    accuracy = n_correct / float(len(y_pred))
    return f1, accuracy


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Trains a classifier, then reports F1 and accuracy on both data sets.

    The classifier is fitted on (X_train, y_train) and evaluated first on the
    training data and then on (X_test, y_test). Everything is printed;
    nothing is returned.
    '''
    # Announce which classifier is being trained and on how many rows.
    clf_name = clf.__class__.__name__
    print("Training a {} using a training set size of {}. . .".format(clf_name, len(X_train)))

    # Fit the classifier (prints the training time).
    train_classifier(clf, X_train, y_train)

    # Scores on the training data: raw values first, then the rounded summary.
    train_f1, train_acc = predict_labels(clf, X_train, y_train)
    print(train_f1, train_acc)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(train_f1, train_acc))

    # Scores on the held-out test data.
    test_f1, test_acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(test_f1, test_acc))

In [12]:
# Initialize the two classifiers compared below: logistic regression and an
# RBF-kernel support vector machine.
clf_A = LogisticRegression(random_state=42)
clf_B = SVC(kernel='rbf', random_state=912)

# Gaussian Naive Bayes baseline: fit on the training data and keep its
# test-set predictions in y_pred so they can be scored with `metrics` later.
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Train and evaluate each classifier, printing a blank line after each report.
for clf in (clf_A, clf_B):
    train_predict(clf, X_train, y_train, X_test, y_test)
    print('')

Gaussian Naive Bayes model accuracy(in %): 70.0
Training a LogisticRegression using a training set size of 6030. . .
Trained model in 0.0309 seconds
Made predictions in 0.0020 seconds.
0.6142533936651584 0.6606965174129353
F1 score and accuracy score for training set: 0.6143 , 0.6607.
Made predictions in 0.0010 seconds.
F1 score and accuracy score for test set: 0.6818 , 0.7200.

Training a SVC using a training set size of 6030. . .




Trained model in 2.2999 seconds
Made predictions in 1.3105 seconds.
0.6134387351778657 0.6756218905472637
F1 score and accuracy score for training set: 0.6134 , 0.6756.
Made predictions in 0.0110 seconds.
F1 score and accuracy score for test set: 0.6818 , 0.7200.



In [13]:
# Accuracy of the Gaussian Naive Bayes predictions (y_pred) on the test set,
# expressed as a percentage.
gnb_accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print("Gaussian Naive Bayes model accuracy(in %):", gnb_accuracy_pct)

Gaussian Naive Bayes model accuracy(in %): 70.0
