In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
# using post-clean data from titanic-logistic-regression
df = pd.read_csv('/content/drive/MyDrive/GitHub/titanic/titanic-post.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,Fare,family
0,0,0,3,0,22.0,7.25,1
1,1,1,1,1,38.0,71.2833,1
2,2,1,3,1,26.0,7.925,0
3,3,1,1,1,35.0,53.1,1
4,4,0,3,0,35.0,8.05,0


In [3]:
# dropping 1st column
df = df.drop(columns=['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,family
0,0,3,0,22.0,7.25,1
1,1,1,1,38.0,71.2833,1
2,1,3,1,26.0,7.925,0
3,1,1,1,35.0,53.1,1
4,0,3,0,35.0,8.05,0


In [4]:
# splitting data into train/val/test sets
features = df.drop(['Survived'], axis=1)
target = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=10)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=10)
print(f'{len(X_train)}, {len(X_val)}, {len(X_test)}, {len(y_train)}, {len(y_val)}, {len(y_test)}')

534, 178, 179, 534, 178, 179


In [5]:
# perform grid search to find best model
model = RandomForestClassifier()
hyper_params = {
    'max_depth' : [4, 8, 16, 32],
    'n_estimators': [50, 100, 150]
}
grid_search = GridSearchCV(model, hyper_params, cv=5)
grid_search.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [4, 8, 16, 32],
                         'n_estimators': [50, 100, 150]})

In [6]:
# print grid search results
for par, mean, std in zip(grid_search.cv_results_['params'], 
                        grid_search.cv_results_['mean_test_score'], 
                        grid_search.cv_results_['std_test_score']):
  print(f'{par}, mean={round(mean,4)}, std={round(std,4)}')
print(f'best hyper-parameter= {grid_search.best_params_}')
model = grid_search.best_estimator_

{'max_depth': 4, 'n_estimators': 50}, mean=0.8016, std=0.0357
{'max_depth': 4, 'n_estimators': 100}, mean=0.7979, std=0.0391
{'max_depth': 4, 'n_estimators': 150}, mean=0.8072, std=0.0412
{'max_depth': 8, 'n_estimators': 50}, mean=0.841, std=0.0536
{'max_depth': 8, 'n_estimators': 100}, mean=0.8259, std=0.0373
{'max_depth': 8, 'n_estimators': 150}, mean=0.8241, std=0.0517
{'max_depth': 16, 'n_estimators': 50}, mean=0.8203, std=0.0524
{'max_depth': 16, 'n_estimators': 100}, mean=0.8185, std=0.0676
{'max_depth': 16, 'n_estimators': 150}, mean=0.8185, std=0.0641
{'max_depth': 32, 'n_estimators': 50}, mean=0.8203, std=0.0524
{'max_depth': 32, 'n_estimators': 100}, mean=0.8166, std=0.0605
{'max_depth': 32, 'n_estimators': 150}, mean=0.8279, std=0.0515
best hyper-parameter= {'max_depth': 8, 'n_estimators': 50}


In [7]:
# results on training set
predict = model.predict(X_train)
accuracy = round(accuracy_score(y_train, predict), 4)
precision = round(precision_score(y_train, predict), 4)
recall = round(recall_score(y_train, predict), 4)
print(f'accuracy={accuracy}, precision={precision}, recall={recall}')

accuracy=0.9307, precision=0.9683, recall=0.8551


In [8]:
# results on validation set
predict = model.predict(X_val)
accuracy = round(accuracy_score(y_val, predict), 4)
precision = round(precision_score(y_val, predict), 4)
recall = round(recall_score(y_val, predict), 4)
print(f'accuracy={accuracy}, precision={precision}, recall={recall}')
# Note that the model was overfitting

accuracy=0.8315, precision=0.8333, recall=0.6818


In [9]:
# If this model is picked, the test score will be
predict = model.predict(X_test)
accuracy = round(accuracy_score(y_test, predict), 4)
precision = round(precision_score(y_test, predict), 4)
recall = round(recall_score(y_test, predict), 4)
print(f'accuracy={accuracy}, precision={precision}, recall={recall}')

accuracy=0.8771, precision=0.8571, recall=0.7742


In [None]:
# The model was overfitting, but the test score is still high compared others