In [115]:
# ========== Packages ==========

import pandas as pd
import numpy as np
import datetime as dt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [116]:
# ========== Functions ==========

def preprocess(data):
    """Turn a raw data frame into model inputs and the target column.

    The "Date" and "Name" columns are removed (they are not used as
    predictors), every remaining categorical column is expanded into
    indicator columns, and the "Brownlow Votes" column is separated out
    as the target.

    Input: data frame containing "Date", "Name" and "Brownlow Votes" columns
    Output: (data frame of input features, Series of Brownlow votes)
    """
    target_col = "Brownlow Votes"

    # Remove the identifying columns, then convert categorical variables to indicators
    encoded = pd.get_dummies(data.drop(columns=["Date", "Name"]))

    # Split input and target
    features = encoded.drop(columns=target_col)
    votes = encoded[target_col]
    return features, votes

def get_data_years(data, year_range):
    """Select the rows whose 'Date' falls in one of the given years.

    Inputs: data frame with a datetime 'Date' column, iterable of years (e.g. a range)
    Output: data frame holding only the matching rows
    """
    row_years = data['Date'].dt.year
    in_range = row_years.isin(year_range)
    return data.loc[in_range]

def eval_model(name, target, predicted):
    """Score a set of predictions against the target and print the scores.

    Three metrics are computed:
      * accuracy
      * weighted f_score (the data is imbalanced: there are more 0 Brownlow
        votes than any other value, so f_score can be a better metric)
      * mean absolute error (gives a sense of how close the predictions are
        to the target)

    Inputs: String (model name), Numpy array (target), Numpy array (predicted)
    Output: float, float, float (accuracy, f_score, mean absolute error)
    """
    scores = (
        metrics.accuracy_score(target, predicted),
        metrics.f1_score(target, predicted, average='weighted'),
        metrics.mean_absolute_error(target, predicted),
    )
    print_res(name, *scores)  # print the results
    return scores
    
    
def print_res(name, accuracy, f_score, abs_error):
    """Print the evaluation results of one model.

    Inputs:
        name: String, label printed as the heading
        accuracy: float, printed as a percentage (multiplied by 100)
        f_score: float, printed as a percentage (multiplied by 100)
        abs_error: float, printed as-is
    Output: None
    """
    print(name+ ":\n")
    print("This model is {:.2f}% accurate".format(accuracy*100.0))
    print("The f_score is {:.2f}%".format(f_score*100.0))
    # abs_error is not scaled to a percentage, so it is printed without a '%' sign
    print("The absolute error is {:.2f}\n".format(abs_error))
    

In [117]:
# ========== Pre-processing ==========

raw_data = pd.read_csv("RawData.csv")
raw_data['Date'] = pd.to_datetime(raw_data['Date']) # Convert the 'Date' column to 'datetime' format to be able to select years

# Split the sets by year: 2006 to 2015 for training, 2003 to 2005 for cross validation, 2016 to 2019 for testing
training_set = get_data_years(raw_data, range(2006, 2016))
cross_val_set = get_data_years(raw_data, range(2003, 2006))
test_set =  get_data_years(raw_data, range(2016, 2020))

# Preprocess the data
x_train, y_train = preprocess(training_set)
x_cross_val, y_cross_val = preprocess(cross_val_set)
x_test, y_test = preprocess(test_set)


# I ran into an issue that certain teams only appeared in one set and not the others. To keep the number of features the same
# for all the data sets, only the features present in ALL three sets are kept. Comparing against a single set is not enough:
# a column found only in the cross validation set, or missing from the test set, would still leave the sets misaligned.
shared_features = [
    col for col in x_train.columns
    if col in x_cross_val.columns and col in x_test.columns
]

# Select the same columns, in the same (training) order, for every set
x_train = x_train[shared_features]
x_cross_val = x_cross_val[shared_features]
x_test = x_test[shared_features]
 


In [118]:
# ========== Modelling ==========
# I experimented with 3 different models and tested their efficiencies

# Logistic Regression
# The 'sag' solver shuffles the data while fitting, so random_state is pinned (as for the
# other two models) to make the fitted model reproducible from run to run
log_reg = LogisticRegression(solver='sag', max_iter = 5000, random_state=0)
log_reg.fit(x_train, y_train) # fit the model to the training data

# Neural networks: MLP classifier
mlp = MLPClassifier(random_state=1, max_iter=5000)
mlp.fit(x_train, y_train)

# Decision trees: Random Forest classifier
r_forest = RandomForestClassifier(max_depth=2, random_state=0)
r_forest.fit(x_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [119]:
# ========== Test ==========

# Predict on the cross validation set with each fitted model
y_pred_reg, y_pred_mlp, y_pred_forest = (
    fitted.predict(x_cross_val) for fitted in (log_reg, mlp, r_forest)
)

# Evaluate each model's predictions against the cross validation target
eval_model("Logistic regression", target=y_cross_val, predicted=y_pred_reg)
eval_model("MLP", target=y_cross_val, predicted=y_pred_mlp)
eval_model("Random forest", target=y_cross_val, predicted=y_pred_forest)

Logistic regression:

This model is 93.81% accurate
The f_score is 91.36%
The absolute error is 0.12%

MLP:

This model is 93.21% accurate
The f_score is 91.36%
The absolute error is 0.12%

Random forest:

This model is 93.51% accurate
The f_score is 90.38%
The absolute error is 0.13%



(0.935132478807486, 0.9037859293845576, 0.12973504238502806)

In [120]:
# ========== Example of prediction using logistic regression ==========

# Predict on the test set first: the accuracy line below needs y_pred_test,
# so it must be assigned in this cell before it is used
y_pred_test = log_reg.predict(x_test)

# Test the accuracy of the model
print("This model is {:.2f}% accurate".format(metrics.accuracy_score(y_test, y_pred_test)*100.0))

# Example: predict the winner of Brownlow medal for 2019 season
# Re-select the test rows so that Name and Date are included; work on an explicit copy so adding a column stays local to this frame
test_set = get_data_years(raw_data, range(2016, 2020)).copy()
test_set.insert(1,'Predicted Brownlow', y_pred_test) # add the predictions to the data
test_2019 = test_set[test_set['Date'].dt.year == 2019] # extract 2019 data
# group by player name to sum the predicted votes over the whole season
res = test_2019.groupby('Name')['Predicted Brownlow'].sum().to_frame(name = 'Total Brownlow').reset_index()

print("The predicted winners of the 2019 season are:")
res.nlargest(3, 'Total Brownlow') # Show the 3 largest total Brownlow votes

This model is 94.21% accurate
The predicted winners of the 2019 season are:


Unnamed: 0,Name,Total Brownlow
82,Brodie Grundy,24.0
493,Patrick Dangerfield,24.0
178,Dustin Martin,21.0
