In [106]:
import pandas as pd
import numpy as np

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (mean_squared_error, accuracy_score, recall_score, 
                             precision_score, confusion_matrix, ConfusionMatrixDisplay)

# Visuals
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [123]:
import os
os.environ["PATH"] += os.pathsep + 'C:/Users/keith/AppData/Roaming/Python/Python311/site-packages/graphviz'

In [10]:
df = pd.read_csv('data/survivor_boxscores_clean.csv')

In [11]:
df

Unnamed: 0,season,episode,player_name,voted_off_next,total_days,exile_days,votes_for_bootee,votes_against_player,total_votes,tribal_council_appearances,...,cumsum_tribal_council_appearances,cumsum_challenge_wins,cumsum_challenge_appearances,cumsum_sit_outs,cumsum_reward_chl_win,cumsum_reward_chl_teammates,cumsum_reward_chl_win_perc,cumsum_immun_chl_win,cumsum_immun_chl_teammates,cumsum_immun_chl_win_perc
0,1,1,B.B.,1,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125000,0.125000,0.0,1.0,8.0,0.125000,0.0,0.0,0.000000
1,1,1,Colleen,0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125000,0.125000,0.0,1.0,8.0,0.125000,0.0,0.0,0.000000
2,1,1,Dirk,0,3.0,0.0,1.0,0.0,8.0,1.0,...,1.0,0.000000,0.125000,0.0,0.0,8.0,0.000000,0.0,0.0,0.000000
3,1,1,Gervase,0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125000,0.125000,0.0,1.0,8.0,0.125000,0.0,0.0,0.000000
4,1,1,Greg,0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125000,0.125000,0.0,1.0,8.0,0.125000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6599,43,16,Laetitia le Roux,0,1.0,0.0,1.0,0.0,6.0,1.0,...,9.0,0.966667,11.230952,3.0,3.0,35.0,0.485714,3.0,51.0,0.480952
6600,43,16,Mike Venter,0,1.0,1.0,0.0,4.0,6.0,1.0,...,9.0,2.283333,11.230952,1.0,3.0,35.0,0.452381,5.0,51.0,1.830952
6601,43,16,Nicole Capper,0,1.0,1.0,1.0,1.0,6.0,1.0,...,11.0,2.534524,11.564286,1.0,1.5,37.0,0.700000,5.5,54.0,1.834524
6602,43,16,Rob Bentele,0,1.0,0.0,1.0,0.0,6.0,1.0,...,10.0,4.495238,11.397619,2.0,2.5,35.0,0.785714,7.0,57.0,3.709524


## Set-up

In [14]:
X = df.drop(['season','episode','player_name','voted_off_next'], axis = 1)
y = df['voted_off_next']
features = list(X.columns)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

## Baseline 1: Dummy Classifier

In [93]:
dummy_reg = DummyClassifier(strategy = 'most_frequent')
dummy_reg.fit(X_train, y_train)

In [94]:
y_pred = dummy_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)

In [95]:
print('Baseline RMSE: {:0.4f}'.format(rmse),
      '\nBaseline Accuracy: {:0.2f}%'.format(accuracy*100))

Baseline RMSE: 0.2667
 Baseline Accuracy: 92.89%


## Baseline 2: RF with undefined hyperparameters

In [119]:
rf = RandomForestClassifier(n_estimators = 1000)
rf.fit(X_train, y_train)

In [85]:
y_pred = rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)

In [90]:
print('Baseline RMSE: {:0.4f}'.format(rmse),
      '\nBaseline Accuracy: {:0.2f}%'.format(accuracy*100))

Baseline RMSE: 0.2858
 Baseline Accuracy: 91.83%


## Hyperparameter Tuning

#### Random Search with Cross Validation

In [52]:
n_estimators = [int(x) for x in np.linspace(200, 2000, 19)]
max_features = ['auto','sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, 10)]
max_depth.append(None)
min_samples_split = [3,5,7,9]
min_samples_leaf = [1,2,3,4]
bootstrap = [True,False]

In [55]:
random_params = {'n_estimators': n_estimators,
                 'max_features': max_features,
                 'max_depth': max_depth,
                 'min_samples_split': min_samples_split,
                 'min_samples_leaf': min_samples_leaf,
                 'bootstrap': bootstrap}

In [96]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_params,
                               n_iter = 100, cv = 3, verbose = 3, n_jobs = -1)

In [97]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  warn(


In [98]:
rf_random.best_params_

{'n_estimators': 1300,
 'min_samples_split': 7,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

#### Evaluate accuracy with above parameters

In [103]:
best_random = rf_random.best_estimator_

In [104]:
y_pred = best_random.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print('Baseline RMSE: {:0.4f}'.format(rmse),
      '\nBaseline Accuracy: {:0.2f}%'.format(accuracy*100))

Baseline RMSE: 0.2667 
Baseline Accuracy: 92.89%


## Grid Search with Cross Validation

## Final Model Evaluation

#### Visualization

In [124]:
for i in range(3):
    tree = rf.estimators_[i]
    dot_data = export_graphviz(tree, feature_names = X_train.columns,
                               filled = True, max_depth = 2, 
                               impurity = False, proportion = True)
    graph = graphviz.Source(dot_data)
    display(graph)

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x18e29869110>

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x18e181cd810>

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x18e2992ca90>