# Preprocessing and Model Implementation

This notebook will cover the preprocessing, model implementation, parameter tuning and model evaluation steps of the machine learning capstone project. 

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn as sk

In [3]:
# import dataset 
data = pd.read_csv('capstone data final.csv', index_col=[0,1,2])
data.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,prev rd score,three rd av,five rd av,season av,prev against opp,prev at venue,three rd av team for,three rd av opp against,last team opp,last team venue,score
year,round,player,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017,20,Jackson Trengove,52.0,58.666667,58.2,65.111111,57.0,52.0,1577.0,1576.0,1497.0,1491.0,32
2017,20,Aaron Young,68.0,51.333333,55.4,57.5,23.0,68.0,1577.0,1576.0,1497.0,1491.0,25
2017,20,Tom Clurey,47.0,42.333333,43.8,51.611111,31.0,47.0,1577.0,1576.0,1497.0,1491.0,24
2017,20,Angus Monfries,,,,,92.0,83.0,1577.0,1576.0,1497.0,1491.0,22
2017,20,Sam Powell-Pepper,54.0,79.0,78.8,72.111111,68.0,54.0,1577.0,1576.0,1497.0,1491.0,16


## Preprocessing

Here I perform the necessary preprocessing on my dataset to prepare it for use in the machine learning models.

In [4]:
# Step 1: remove all observations with null values

data = data.dropna()
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,prev rd score,three rd av,five rd av,season av,prev against opp,prev at venue,three rd av team for,three rd av opp against,last team opp,last team venue,score
year,round,player,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2014,14,Matt Rosa,89.0,77.333333,84.0,89.818182,77.0,68.0,1625.666667,1844.666667,1503.0,1454.0,109
2014,14,Matt Priddis,108.0,117.0,117.2,107.333333,102.0,111.0,1625.666667,1844.666667,1503.0,1454.0,101
2014,14,Jack Newnes,54.0,74.333333,79.6,78.75,91.0,88.0,1307.0,1598.666667,1444.0,1464.0,101
2014,14,Lenny Hayes,81.0,98.333333,104.2,100.454545,79.0,109.0,1307.0,1598.666667,1444.0,1464.0,95
2014,14,Jack Darling,64.0,73.666667,80.2,80.25,102.0,52.0,1625.666667,1844.666667,1503.0,1454.0,90


In [5]:
# Step 2: Cross validation
# Splits are based on rounds rather than observations so that 1 we don't get look ahead bias, 2 to keep inline with
# the end goal of the analysis which is to select highest predicted players each round

test = data.loc[(2017, 6):]
validation = data.loc[(2016, 19):(2017, 5)]
training = data.loc[:(2016, 18)]
print(len(training), len(validation), len(test))

11721 2985 4305


In [6]:
# Step 3: extract target variables from dataset

train_features = np.array(training.drop('score', axis=1))
train_targets = np.array(training['score'])
val_features = np.array(validation.drop('score', axis=1))
val_targets = np.array(validation['score'])
test_features = np.array(test.drop('score', axis=1))
test_targets = np.array(test['score'])

In [7]:
# Step 4: Normalize features

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_features)

train_features = scaler.transform(train_features)
val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)

print(train_features[:10])

[[ 0.53209599  0.12473422  0.48371329  0.80217904  0.12508374 -0.24162979
   0.57939626  2.94512659 -0.35467602 -0.74511171]
 [ 1.242367    2.03579787  2.20489739  1.7106359   1.02772527  1.30663849
   0.57939626  2.94512659 -0.35467602 -0.74511171]
 [-0.77629796 -0.01980001  0.25560455  0.22810661  0.630563    0.47849499
  -2.63734196  0.3535313  -0.731877   -0.68179208]
 [ 0.23303452  1.1364738   1.53093976  1.3538544   0.19729506  1.23462601
  -2.63734196  0.3535313  -0.731877   -0.68179208]
 [-0.40247112 -0.05191873  0.28671029  0.30590698  1.02772527 -0.81772961
   0.57939626  2.94512659 -0.35467602 -0.74511171]
 [ 0.15826915  1.36130482  1.06435371  0.89805425  0.91940829 -0.02559235
   0.57939626  2.94512659 -0.35467602 -0.74511171]
 [-1.71086508 -0.93518344 -1.29968228 -1.34086752 -0.38039552 -0.63769842
  -2.63734196  0.3535313  -0.731877   -0.68179208]
 [-1.15012481 -1.04759895 -0.61535607  0.20217316  0.37782337 -0.7097109
  -2.63734196  0.3535313  -0.731877   -0.68179208]
 

## Model Implementation and Refinement

Below are the implementations of the machine learning models to be evaluated for the project.

In [8]:
# import models
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

In [9]:
#1 Gradient Boosting

# create model
gb = GradientBoostingRegressor()

# set up gridsearch
params = {'learning_rate': [0.01, 0.03, 0.1, 0.3], 'n_estimators': [10, 100, 200, 500, 1000]}
gb_reg = GridSearchCV(gb, params, scoring='r2')

# conduct training/tuning
gb_reg.fit(train_features, train_targets)
pd.DataFrame(gb_reg.cv_results_)[['mean_test_score', 'mean_train_score', 'params', 'rank_test_score']].sort('rank_test_score')



Unnamed: 0,mean_test_score,mean_train_score,params,rank_test_score
3,0.310706,0.353674,"{'learning_rate': 0.01, 'n_estimators': 500}",1
7,0.310021,0.36039,"{'learning_rate': 0.03, 'n_estimators': 200}",2
6,0.309554,0.337666,"{'learning_rate': 0.03, 'n_estimators': 100}",3
4,0.307694,0.379726,"{'learning_rate': 0.01, 'n_estimators': 1000}",4
15,0.307186,0.3383,"{'learning_rate': 0.3, 'n_estimators': 10}",5
11,0.305903,0.380774,"{'learning_rate': 0.1, 'n_estimators': 100}",6
8,0.30339,0.401472,"{'learning_rate': 0.03, 'n_estimators': 500}",7
2,0.30215,0.321793,"{'learning_rate': 0.01, 'n_estimators': 200}",8
12,0.295556,0.422489,"{'learning_rate': 0.1, 'n_estimators': 200}",9
9,0.289469,0.453183,"{'learning_rate': 0.03, 'n_estimators': 1000}",10


In [10]:
# create tuned model and evaluate on validation dataset
gb = GradientBoostingRegressor(learning_rate=0.01, n_estimators=500)
gb.fit(train_features, train_targets)
print(gb.score(val_features, val_targets))

0.323852255436


In [11]:
#2 SVM

# create model
svm = SVR()

# set up gridsearch
params2 = {'C': [0.1, 1, 10], 'epsilon': [0.01, 0.1, 1], 'kernel': ['linear', 'rbf']}
sv_reg = GridSearchCV(svm, params2)

# conduct training/tuning
sv_reg.fit(train_features, train_targets)
pd.DataFrame(sv_reg.cv_results_)[['mean_test_score', 'mean_train_score', 'params', 'rank_test_score']]

Unnamed: 0,mean_test_score,mean_train_score,params,rank_test_score
0,0.31123,0.314891,"{'epsilon': 0.01, 'kernel': 'linear', 'C': 0.1}",7
1,0.264118,0.268168,"{'epsilon': 0.01, 'kernel': 'rbf', 'C': 0.1}",17
2,0.311297,0.314897,"{'epsilon': 0.1, 'kernel': 'linear', 'C': 0.1}",5
3,0.264098,0.268143,"{'epsilon': 0.1, 'kernel': 'rbf', 'C': 0.1}",18
4,0.311519,0.314863,"{'epsilon': 1, 'kernel': 'linear', 'C': 0.1}",1
5,0.264223,0.268106,"{'epsilon': 1, 'kernel': 'rbf', 'C': 0.1}",16
6,0.31128,0.314944,"{'epsilon': 0.01, 'kernel': 'linear', 'C': 1}",6
7,0.302918,0.317497,"{'epsilon': 0.01, 'kernel': 'rbf', 'C': 1}",12
8,0.311331,0.314966,"{'epsilon': 0.1, 'kernel': 'linear', 'C': 1}",2
9,0.302901,0.317456,"{'epsilon': 0.1, 'kernel': 'rbf', 'C': 1}",13


In [12]:
# create tuned model and evaluate on validation dataset
svm = SVR(epsilon=1, kernel='linear', C=0.1)
svm.fit(train_features, train_targets)
print(svm.score(val_features, val_targets))

0.331944041386


In [16]:
#3 Neural Network

# create model
nn = MLPRegressor(activation='relu', solver='sgd')

# set up gridsearch
params3 = {'hidden_layer_sizes': [(5,), (20,), (20, 5), (5, 3)], 'alpha': [0.003, 0.0001, 0.0003], 
           'batch_size': [16, 32], 'max_iter': [1000, 10000]}
nn_reg = GridSearchCV(nn, params3)

# conduct training/tuning
nn_reg.fit(train_features, train_targets)
pd.DataFrame(nn_reg.cv_results_)[['mean_test_score', 'mean_train_score', 'params', 'rank_test_score']]

Unnamed: 0,mean_test_score,mean_train_score,params,rank_test_score
0,0.281932,0.288368,"{'alpha': 0.003, 'max_iter': 1000, 'hidden_lay...",4
1,0.259892,0.26837,"{'alpha': 0.003, 'max_iter': 10000, 'hidden_la...",16
2,0.21561,0.221245,"{'alpha': 0.003, 'max_iter': 1000, 'hidden_lay...",31
3,0.183659,0.193162,"{'alpha': 0.003, 'max_iter': 10000, 'hidden_la...",36
4,0.167493,0.176651,"{'alpha': 0.003, 'max_iter': 1000, 'hidden_lay...",40
5,0.262369,0.27391,"{'alpha': 0.003, 'max_iter': 10000, 'hidden_la...",13
6,0.116619,0.120518,"{'alpha': 0.003, 'max_iter': 1000, 'hidden_lay...",46
7,0.213396,0.215131,"{'alpha': 0.003, 'max_iter': 10000, 'hidden_la...",32
8,0.199157,0.221598,"{'alpha': 0.003, 'max_iter': 1000, 'hidden_lay...",34
9,0.233993,0.24735,"{'alpha': 0.003, 'max_iter': 10000, 'hidden_la...",28


In [17]:
nn_reg.best_params_

{'alpha': 0.0001,
 'batch_size': 16,
 'hidden_layer_sizes': (5,),
 'max_iter': 10000}

In [18]:
# create tuned model and evaluate on validation dataset
nn = MLPRegressor(activation='relu', solver='sgd', alpha=0.0003, batch_size=32, hidden_layer_sizes=(5,), max_iter=1000)
nn.fit(train_features, train_targets)
print(nn.score(val_features, val_targets))

0.267154430907


### Optimal Model: SVM

SVM is chosen as the model to be used to generate predictions on testing dataset as a result of best performance on validation dataset. Below I use the model to create predictions on the testing set. These predictions are evaluated against the true test set target values and are also used to rank players for the purpose of constructing the captain choice model. Lastly I generate the evaluation metric data from the ranked data for comparison with the benchmark models.

In [19]:
# generate predictions on testset
pred = svm.predict(test_features)

# evaluate predictions 
svm.score(test_features, test_targets)

0.30518696075410268

In [20]:
# add model predictions to test dataframe 
test['predictions'] = pred
test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,prev rd score,three rd av,five rd av,season av,prev against opp,prev at venue,three rd av team for,three rd av opp against,last team opp,last team venue,score,predictions
year,round,player,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017,6,Zac Williams,92.0,87.0,87.2,87.2,113.0,81.0,1688.666667,1637.666667,1895.0,1798.0,107,86.010879
2017,6,Toby Greene,77.0,85.0,85.8,85.8,108.0,91.0,1688.666667,1637.666667,1895.0,1798.0,97,85.067435
2017,6,Heath Shaw,81.0,78.0,78.4,78.4,176.0,75.0,1688.666667,1637.666667,1895.0,1798.0,96,84.751595
2017,6,Devon Smith,87.0,83.0,82.8,82.8,52.0,67.0,1688.666667,1637.666667,1895.0,1798.0,96,77.013092
2017,6,Josh Kelly,82.0,109.333333,104.0,104.0,76.0,127.0,1688.666667,1637.666667,1895.0,1798.0,91,97.34203


In [21]:
# construct captain choice dataset and evaluation metric using test predictions
yr = 2017
rd = list(range(6,21))
player = []
score = []
rd_col = []

for r in rd:
    top5 = test.loc[(yr,r)].sort_values('predictions', ascending=False).iloc[0:5]
    for index, row in top5.iterrows():
        player.append(index)
        score.append(row['score'])

for r in rd:
    for i in range(5):
        rd_col.append(r)

mp = pd.DataFrame()
mp['Round'] = rd_col
mp['Player'] = player
mp['Score'] = score
mp.head()

Unnamed: 0,Round,Player,Score
0,6,Zach Merrett,108.0
1,6,Marc Murphy,100.0
2,6,Dayne Zorko,76.0
3,6,Tom Rockliff,100.0
4,6,Adam Treloar,82.0


In [22]:
# write prediction captain choice data to file
mp.to_csv('Model Prediction Top 5s.csv', index=False)

In [23]:
# calculate evaluation metric and add data to benchmark evaluation metrics 
model_av = mp.groupby('Round')['Score'].mean()

benchmarks = pd.read_csv('Benchmarks.csv', index_col=0)
benchmarks.head()

Unnamed: 0_level_0,Calvins Captains,Season Average,3 Week Average
Round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,98.2,99.8,110.6
7,114.4,121.8,110.6
8,113.0,118.0,122.2
9,131.6,113.4,117.4
10,101.0,106.8,98.6


In [24]:
benchmarks['Model Average'] = model_av
benchmarks.head()

Unnamed: 0_level_0,Calvins Captains,Season Average,3 Week Average,Model Average
Round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,98.2,99.8,110.6,93.2
7,114.4,121.8,110.6,109.2
8,113.0,118.0,122.2,120.4
9,131.6,113.4,117.4,129.0
10,101.0,106.8,98.6,112.2


In [25]:
benchmarks.mean()

Calvins Captains    109.653333
Season Average      109.346667
3 Week Average      109.733333
Model Average       111.666667
dtype: float64

In [26]:
# write out evaluation data
benchmarks.to_csv('Evaluation Data.csv')

## Sensitivity Analysis 

Here I perform sensitivity analysis to investigate how the model performs under abnormal conditions

In [28]:
# first, transform testing data by reducing magnitude 
small_features = test_features / 10
small_targets = test_targets / 10
print(gb.score(small_features , small_targets))

-611.84094064


In [29]:
large_features = test_features * 10
large_targets = test_targets * 10
print(gb.score(large_features , large_targets))

-6.38694419867


In [30]:
negative_features = test_features * -1
negative_targets = test_targets * -1
print(gb.score(negative_features , negative_targets))

-30.6284559413
