In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# using post-clean data from titanic-logistic-regression
df = pd.read_csv('/content/drive/MyDrive/GitHub/titanic/titanic-post.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,Fare,family
0,0,0,3,0,22.0,7.25,1
1,1,1,1,1,38.0,71.2833,1
2,2,1,3,1,26.0,7.925,0
3,3,1,1,1,35.0,53.1,1
4,4,0,3,0,35.0,8.05,0


In [None]:
# dropping 1st column
df = df.drop(columns=['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,family
0,0,3,0,22.0,7.25,1
1,1,1,1,38.0,71.2833,1
2,1,3,1,26.0,7.925,0
3,1,1,1,35.0,53.1,1
4,0,3,0,35.0,8.05,0


In [None]:
# splitting data into train/val/test sets
features = df.drop(['Survived'], axis=1)
target = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=10)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=10)
print(f'{len(X_train)}, {len(X_val)}, {len(X_test)}, {len(y_train)}, {len(y_val)}, {len(y_test)}')

534, 178, 179, 534, 178, 179


In [None]:
# perform grid search to find best model
model = SVC()
hyper_params = {
    'C': [.01, .1, 1., 10., 100.]
}
grid_search = GridSearchCV(model, hyper_params, cv=5)
grid_search.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.01, 0.1, 1.0, 10.0, 100.0]})

In [None]:
# print grid search results
for C, mean, std in zip(grid_search.cv_results_['params'], 
                        grid_search.cv_results_['mean_test_score'], 
                        grid_search.cv_results_['std_test_score']):
  print(f'{C}, mean={round(mean,4)}, std={round(std,4)}')
print(f'best hyper-parameter= {grid_search.best_params_}')
model = grid_search.best_estimator_

{'C': 0.01}, mean=0.5993, std=0.0023
{'C': 0.1}, mean=0.6424, std=0.0239
{'C': 1.0}, mean=0.6555, std=0.0279
{'C': 10.0}, mean=0.6986, std=0.0231
{'C': 100.0}, mean=0.7865, std=0.0229
best hyper-parameter= {'C': 100.0}


In [None]:
# results on training set
predict = model.predict(X_train)
accuracy = round(accuracy_score(y_train, predict), 4)
precision = round(precision_score(y_train, predict), 4)
recall = round(recall_score(y_train, predict), 4)
print(f'accuracy={accuracy}, precision={precision}, recall={recall}')

accuracy=0.8015, precision=0.7547, recall=0.7477


In [None]:
# results on validation set
predict = model.predict(X_val)
accuracy = round(accuracy_score(y_val, predict), 4)
precision = round(precision_score(y_val, predict), 4)
recall = round(recall_score(y_val, predict), 4)
print(f'accuracy={accuracy}, precision={precision}, recall={recall}')

accuracy=0.7528, precision=0.6618, recall=0.6818


In [None]:
# If this SVM model were picked, the test score would have been
predict = model.predict(X_test)
accuracy = round(accuracy_score(y_test, predict), 4)
precision = round(precision_score(y_test, predict), 4)
recall = round(recall_score(y_test, predict), 4)
print(f'accuracy={accuracy}, precision={precision}, recall={recall}')

accuracy=0.8212, precision=0.7419, recall=0.7419


In [None]:
# Overall, very similar train/validation/test results between logistic regression and SVC