In [1]:
import pandas as pd
import numpy as np

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (mean_squared_error, accuracy_score, recall_score, 
                             precision_score, confusion_matrix, ConfusionMatrixDisplay)

# Visuals
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [2]:
# import os
# os.environ["PATH"] += os.pathsep + 'C:/Users/keith/AppData/Roaming/Python/Python311/site-packages/graphviz'

In [3]:
df = pd.read_csv('data/survivor_boxscores_clean.csv')

In [4]:
df

Unnamed: 0,season,episode,player_name,voted_off_next,total_days,exile_days,votes_for_bootee,votes_against_player,total_votes,tribal_council_appearances,...,cumsum_tribal_council_appearances,cumsum_challenge_wins,cumsum_challenge_appearances,cumsum_sit_outs,cumsum_reward_chl_win,cumsum_reward_chl_teammates,cumsum_reward_chl_win_perc,cumsum_immun_chl_win,cumsum_immun_chl_teammates,cumsum_immun_chl_win_perc
0,1,1,B.B.,1,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125000,0.125000,0.0,1.0,8.0,0.125000,0.0,0.0,0.000000
1,1,1,Colleen,0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125000,0.125000,0.0,1.0,8.0,0.125000,0.0,0.0,0.000000
2,1,1,Dirk,0,3.0,0.0,1.0,0.0,8.0,1.0,...,1.0,0.000000,0.125000,0.0,0.0,8.0,0.000000,0.0,0.0,0.000000
3,1,1,Gervase,0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125000,0.125000,0.0,1.0,8.0,0.125000,0.0,0.0,0.000000
4,1,1,Greg,0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125000,0.125000,0.0,1.0,8.0,0.125000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6599,43,16,Laetitia le Roux,0,1.0,0.0,1.0,0.0,6.0,1.0,...,9.0,0.966667,11.230952,3.0,3.0,35.0,0.485714,3.0,51.0,0.480952
6600,43,16,Mike Venter,0,1.0,1.0,0.0,4.0,6.0,1.0,...,9.0,2.283333,11.230952,1.0,3.0,35.0,0.452381,5.0,51.0,1.830952
6601,43,16,Nicole Capper,0,1.0,1.0,1.0,1.0,6.0,1.0,...,11.0,2.534524,11.564286,1.0,1.5,37.0,0.700000,5.5,54.0,1.834524
6602,43,16,Rob Bentele,0,1.0,0.0,1.0,0.0,6.0,1.0,...,10.0,4.495238,11.397619,2.0,2.5,35.0,0.785714,7.0,57.0,3.709524


## Set-up

In [5]:
X = df.drop(['season','episode','player_name','voted_off_next'], axis = 1)
y = df['voted_off_next']
features = list(X.columns)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

## Baseline 1: Dummy Classifier

In [7]:
dummy_reg = DummyClassifier(strategy = 'most_frequent')
dummy_reg.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [8]:
y_pred = dummy_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
print('Baseline RMSE: {:0.4f}'.format(rmse),
      '\nBaseline Accuracy: {:0.2f}%'.format(accuracy*100),
      '\nBaseline Precision: {:0.2f}%'.format(precision*100))

Baseline RMSE: 0.2610 
Baseline Accuracy: 93.19% 
Baseline Precision: 0.00%


## Baseline 2: RF with undefined hyperparameters

In [10]:
rf = RandomForestClassifier(n_estimators = 1000)
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000)

In [11]:
y_pred = rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

In [12]:
print('Baseline 2 RMSE: {:0.4f}'.format(rmse),
      '\nBaseline 2 Accuracy: {:0.2f}%'.format(accuracy*100),
      '\nBaseline 2 Precision: {:0.2f}%'.format(precision*100))

Baseline 2 RMSE: 0.2751 
Baseline 2 Accuracy: 92.43% 
Baseline 2 Precision: 0.00%


In [13]:
precision

0.0

## Hyperparameter Tuning

#### Random Search with Cross Validation

In [14]:
n_estimators = [int(x) for x in np.linspace(200, 2000, 19)]
max_features = ['auto','sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, 10)]
max_depth.append(None)
min_samples_split = [3,5,7,9]
min_samples_leaf = [1,2,3,4]
bootstrap = [True,False]

In [15]:
random_params = {'n_estimators': n_estimators,
                 'max_features': max_features,
                 'max_depth': max_depth,
                 'min_samples_split': min_samples_split,
                 'min_samples_leaf': min_samples_leaf,
                 'bootstrap': bootstrap}

In [16]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_params,
                               n_iter = 100, cv = 3, verbose = 3, n_jobs = -1)

In [17]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(n_estimators=1000),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4],
                                        'min_samples_split': [3, 5, 7, 9],
                                        'n_estimators': [200, 300, 400, 500,
                                                         600, 700, 800, 900,
                                                         1000, 1100, 1200, 1300,
                                                         1400, 1500, 1600, 1700,
                                                         1800, 1900, 2000]},
                   verbose=3)

In [18]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 9,
 'min_samples_leaf': 3,
 'max_features': 'sqrt',
 'max_depth': 40,
 'bootstrap': True}

#### Evaluate accuracy with above parameters

In [19]:
best_random = rf_random.best_estimator_

In [20]:
y_pred = best_random.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print('RandomCV RMSE: {:0.4f}'.format(rmse),
      '\nRandomCV Accuracy: {:0.2f}%'.format(accuracy*100),
      '\nRandomCV Precision: {:0.2f}%'.format(precision*100))

RandomCV RMSE: 0.2610 
RandomCV Accuracy: 93.19% 
RandomCV Precision: 0.00%


## Grid Search with Cross Validation

In [21]:
grid_params = {'n_estimators': [1600,1700,1800],
                 'max_features': ['sqrt','auto'],
                 'max_depth': [10,20,30,None],
                 'min_samples_split': [8,9,10],
                 'min_samples_leaf': [2,3,4],
                 'bootstrap': [True]}

In [22]:
rf_grid = GridSearchCV(estimator = rf, param_grid = grid_params,
                       scoring = 'precision', cv = 6, verbose = 3, n_jobs = -1)

In [23]:
rf_grid.fit(X_train, y_train)

Fitting 6 folds for each of 216 candidates, totalling 1296 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

[CV 1/3] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=9, n_estimators=1800;, score=0.909 total time=  20.9s
[CV 1/3] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=9, n_estimators=600;, score=0.915 total time=   4.7s
[CV 3/3] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=9, n_estimators=600;, score=0.915 total time=   4.6s
[CV 1/3] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=3, min_samples_split=7, n_estimators=1700;, score=0.916 total time=  12.6s
[CV 3/3] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=7, n_estimators=900;, score=0.915 total time=   8.0s
[CV 3/3] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=400;, score=0.913 total time=   3.5s
[CV 1/3] END bootstrap=False, max_depth=40, max_features=auto, min_sample

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 1/3] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=3, min_samples_split=9, n_estimators=200;, score=0.916 total time=   1.5s
[CV 2/3] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=3, min_samples_split=9, n_estimators=200;, score=0.916 total time=   1.7s
[CV 3/3] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=3, min_samples_split=9, n_estimators=200;, score=0.916 total time=   1.6s
[CV 1/3] END bootstrap=False, max_depth=60, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.914 total time=   4.6s
[CV 2/3] END bootstrap=False, max_depth=60, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.916 total time=   4.4s
[CV 3/3] END bootstrap=False, max_depth=60, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.915 total time=   4.2s
[CV 1/3] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 3/3] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=9, n_estimators=1800;, score=0.910 total time=  20.7s
[CV 3/3] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=1000;, score=0.916 total time=   6.6s
[CV 2/3] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=3, min_samples_split=7, n_estimators=500;, score=0.915 total time=   4.5s
[CV 3/3] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=3, min_samples_split=7, n_estimators=1700;, score=0.916 total time=  12.3s
[CV 2/3] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=500;, score=0.916 total time=   3.6s
[CV 3/3] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=500;, score=0.916 total time=   3.6s
[CV 1/3] END bootstrap=False, max_depth=100, max_features=sqrt, min_sampl

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 2/3] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=9, n_estimators=1800;, score=0.910 total time=  20.6s
[CV 2/3] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=3, n_estimators=1000;, score=0.916 total time=   6.5s
[CV 1/3] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=3, min_samples_split=7, n_estimators=500;, score=0.912 total time=   4.6s
[CV 2/3] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=3, min_samples_split=7, n_estimators=1700;, score=0.916 total time=  12.4s
[CV 1/3] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=500;, score=0.915 total time=   3.7s
[CV 1/3] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=400;, score=0.912 total time=   3.7s
[CV 2/3] END bootstrap=False, max_depth=100, max_features=sqrt, min_samp

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

[CV 5/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=1600;, score=0.000 total time=  11.4s
[CV 3/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=1700;, score=0.000 total time=  12.0s
[CV 1/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=1800;, score=0.000 total time=  12.8s
[CV 5/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=1800;, score=0.000 total time=  12.8s
[CV 3/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1600;, score=0.000 total time=  11.4s
[CV 1/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1700;, score=0.000 total time=  12.0s
[CV 5/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_le

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 2/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=1800;, score=0.000 total time=  12.8s
[CV 6/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=1800;, score=0.000 total time=  12.8s
[CV 4/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1600;, score=0.000 total time=  11.5s
[CV 2/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1700;, score=0.000 total time=  12.4s
[CV 2/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1800;, score=0.000 total time=  12.8s
[CV 6/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1800;, score=0.000 total time=  12.7s
[CV 3/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_le

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 2/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1600;, score=0.000 total time=  11.4s
[CV 6/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1600;, score=0.000 total time=  11.5s
[CV 3/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1700;, score=0.000 total time=  12.1s
[CV 6/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1700;, score=0.000 total time=  12.1s
[CV 4/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1800;, score=0.000 total time=  13.0s
[CV 2/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1600;, score=0.000 total time=  11.4s
[CV 6/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_l

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 5/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=9, n_estimators=1800;, score=0.000 total time=  12.9s
[CV 4/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1600;, score=0.000 total time=  11.4s
[CV 2/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1700;, score=0.000 total time=  12.0s
[CV 5/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1700;, score=0.000 total time=  12.0s
[CV 3/6] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1800;, score=0.000 total time=  12.9s
[CV 1/6] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=8, n_estimators=1600;, score=0.000 total time=  11.6s
[CV 5/6] END bootstrap=True, max_depth=10, max_features=auto, min_sample

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

[CV 4/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1800;, score=0.000 total time=  15.2s
[CV 2/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=8, n_estimators=1600;, score=0.000 total time=  13.2s
[CV 6/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=8, n_estimators=1600;, score=0.000 total time=  13.2s
[CV 4/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=8, n_estimators=1700;, score=0.000 total time=  14.1s
[CV 2/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=8, n_estimators=1800;, score=0.000 total time=  14.8s
[CV 6/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=8, n_estimators=1800;, score=0.000 total time=  14.9s
[CV 4/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_l

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 1/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=8, n_estimators=1700;, score=0.000 total time=  14.2s
[CV 5/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=8, n_estimators=1700;, score=0.000 total time=  14.3s
[CV 3/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=8, n_estimators=1800;, score=0.000 total time=  14.7s
[CV 1/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=9, n_estimators=1600;, score=0.000 total time=  13.3s
[CV 5/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=9, n_estimators=1600;, score=0.000 total time=  13.2s
[CV 3/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=9, n_estimators=1700;, score=0.000 total time=  13.9s
[CV 1/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_le

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 6/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=8, n_estimators=1700;, score=0.000 total time=  14.2s
[CV 4/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=8, n_estimators=1800;, score=0.000 total time=  15.0s
[CV 2/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=9, n_estimators=1600;, score=0.000 total time=  13.1s
[CV 6/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=9, n_estimators=1600;, score=0.000 total time=  13.0s
[CV 4/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=9, n_estimators=1700;, score=0.000 total time=  14.0s
[CV 2/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=9, n_estimators=1800;, score=0.000 total time=  15.1s
[CV 6/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_le

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 1/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=9, n_estimators=1700;, score=0.000 total time=  14.0s
[CV 5/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=9, n_estimators=1700;, score=0.000 total time=  14.0s
[CV 3/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=9, n_estimators=1800;, score=0.000 total time=  14.6s
[CV 1/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=1600;, score=0.000 total time=  13.2s
[CV 5/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=1600;, score=0.000 total time=  13.3s
[CV 3/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=1700;, score=0.000 total time=  13.9s
[CV 1/6] END bootstrap=True, max_depth=20, max_features=auto, min_samples

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

GridSearchCV(cv=6, estimator=RandomForestClassifier(n_estimators=1000),
             n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [10, 20, 30, None],
                         'max_features': ['sqrt', 'auto'],
                         'min_samples_leaf': [2, 3, 4],
                         'min_samples_split': [8, 9, 10],
                         'n_estimators': [1600, 1700, 1800]},
             scoring='precision', verbose=3)

## Final Model Evaluation

#### Visualization

In [24]:
# for i in range(3):
#     tree = rf.estimators_[i]
#     dot_data = export_graphviz(tree, feature_names = X_train.columns,
#                                filled = True, max_depth = 2, 
#                                impurity = False, proportion = True)
#     graph = graphviz.Source(dot_data)
#     display(graph)

[CV 5/6] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=8, n_estimators=1800;, score=0.000 total time=  14.8s
[CV 3/6] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=9, n_estimators=1600;, score=0.000 total time=  13.0s
[CV 1/6] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=9, n_estimators=1700;, score=0.000 total time=  14.0s
[CV 5/6] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=9, n_estimators=1700;, score=0.000 total time=  13.9s
[CV 3/6] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=9, n_estimators=1800;, score=0.000 total time=  14.6s
[CV 1/6] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1600;, score=0.000 total time=  13.2s
[CV 5/6] END bootstrap=True, max_depth=None, max_features=sqrt,

In [27]:
rf_grid.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 8,
 'n_estimators': 1600}

In [29]:
best_grid = rf_grid.best_estimator_

In [30]:
y_pred = best_grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print('GridCV RMSE: {:0.4f}'.format(rmse),
      '\nGridCV Accuracy: {:0.2f}%'.format(accuracy*100),
      '\nGridCV Precision: {:0.2f}%'.format(precision*100))

GridCV RMSE: 0.2610 
GridCV Accuracy: 93.19% 
GridCV Precision: 0.00%
