In [None]:
# ========== Packages ==========
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [37]:
# ========== Functions ==========
def preprocess(data):
    """Prepare a data frame for modelling and separate the inputs from the target.

    The "Date" and "Name" columns are removed (they are irrelevant to the Brownlow
    votes and slowed down the performance), the remaining categorical columns are
    converted to indicator columns, and "Brownlow Votes" is split off as the target.

    Input: data frame
    Output: data frame (input parameters), Series (target parameter)
    """
    encoded = pd.get_dummies(data.drop(columns=["Date", "Name"]))

    target_name = "Brownlow Votes"
    features = encoded.drop(columns=[target_name])
    target = encoded[target_name]
    return features, target

def get_data_years(data, year_range):
    """Return only the rows of `data` whose 'Date' falls in one of the given years.

    Inputs: dataframe (its 'Date' column must be in datetime format), range of years
    Outputs: data frame
    """
    row_years = data['Date'].dt.year
    in_range = row_years.isin(year_range)
    return data[in_range]

In [46]:
# ========== Pre-process ==========
raw_data = pd.read_csv("RawData.csv")
raw_data['Date'] = pd.to_datetime(raw_data['Date']) # Convert the 'Date' column to 'datetime' format to be able to select years

# Split the sets: 2006 to 2015 for the training set, 2003 to 2005 for cross validation, 2016 to 2019 for the test set
training_set = get_data_years(raw_data, range(2006, 2016))
cross_val_set = get_data_years(raw_data, range(2003, 2006))
test_set =  get_data_years(raw_data, range(2016, 2020))

x_train, y_train = preprocess(training_set)
x_cross_val, y_cross_val = preprocess(cross_val_set)
x_test, y_test = preprocess(test_set)

# Certain teams only appear in one set and not the others, so the indicator columns created by
# preprocess() can differ between the sets. To keep the features identical (same columns, same order)
# for all the data sets, only the columns present in every set are kept. The comparison is done in
# all directions, so a column that exists only in the cross validation set is removed as well.
shared_columns = [
    col for col in x_cross_val.columns
    if col in x_train.columns and col in x_test.columns
]
x_train = x_train[shared_columns]
x_cross_val = x_cross_val[shared_columns]
x_test = x_test[shared_columns]
 


100


In [None]:
# ========== Model ==========

# Logistic Regression
# The 'sag' solver shuffles the data while fitting, so random_state is fixed to make the
# fitted model reproducible between runs.
log_reg = LogisticRegression(solver='sag', max_iter = 5000, random_state = 0)
log_reg.fit(x_train, y_train) # fit the parameters into the model



In [62]:
# ========== Test ==========

# Cross validation
y_pred = log_reg.predict(x_cross_val)
y_pred_test = log_reg.predict(x_test)  # predictions for the test set
accuracy = metrics.accuracy_score(y_cross_val, y_pred)
f_score = metrics.f1_score(y_cross_val, y_pred, average = 'weighted')  # Since our data is imbalanced (we have more 0 Brownlow votes than the other ones), the F-score can be a better metric
abs_error = metrics.mean_absolute_error(y_cross_val, y_pred) # Mean absolute error: gives a sense of how close the predicted votes are to the target
print("This model is {:.2f}% accurate".format(accuracy*100.0))
# Each metric is labelled with its own name: the F-score is not an accuracy,
# and the mean absolute error is measured in votes, not in percent.
print("The weighted F1 score is {:.2f}%".format(f_score*100.0))
print("The mean absolute error is {:.2f} votes".format(abs_error))


This model is 93.81% accurate
This model is 91.36% accurate
The absolute error is 0.12%


In [54]:
# ========== Example of prediction ==========

# Accuracy of the model on the test set
test_accuracy = metrics.accuracy_score(y_test, y_pred_test)
print("This model is {:.2f}% accurate".format(test_accuracy*100.0))

# Example: predict the winner of the Brownlow medal for the 2019 season
# Rebuild the test set from the raw data so that Name and Date are available again
test_set = get_data_years(raw_data, range(2016, 2020))
test_set.insert(1,'Predicted Brownlow', y_pred_test)  # attach the predictions to their rows
is_2019 = test_set['Date'].dt.year == 2019
test_2019 = test_set[is_2019]  # keep the 2019 rows only
# Sum the predicted votes of each player over the whole season
season_votes = test_2019.groupby('Name')['Predicted Brownlow'].sum()
res = season_votes.to_frame(name = 'Total Brownlow').reset_index()

print("The predicted winners of the 2019 season are:")
res.nlargest(3, 'Total Brownlow')  # the 3 players with the largest total predicted Brownlow votes

This model is 94.21% accurate
The predicted winners of the 2019 season are:


Unnamed: 0,Name,Total Brownlow
82,Brodie Grundy,24.0
493,Patrick Dangerfield,24.0
178,Dustin Martin,21.0
