In [20]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [21]:
titanic_df = pd.read_csv('data/titanic_processed.csv')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,1,15.0,1,1,7.2292,0,0
1,0,3,1,22.0,0,0,7.25,0,1
2,0,2,1,18.0,0,0,13.0,0,1
3,1,2,0,18.0,0,1,23.0,0,1
4,0,3,1,20.0,0,0,7.8542,0,1


In [22]:
x = titanic_df.drop('Survived', axis = 1)
y = titanic_df['Survived']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [23]:
def summarize_classification(y_test,y_pred):
    acc = accuracy_score(y_test, y_pred, normalize = True)
    num_acc = accuracy_score(y_test, y_pred, normalize = False)
    
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print('Test data count: ', len(y_test))
    print('accuracy count: ', num_acc)
    print('accuracy score: ', acc)
    print('precision score: ', prec)
    print('recall score: ', recall)

In [24]:
parameters = {'max_depth' : [2, 4, 6, 8, 10]}
grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv = 3, return_train_score = True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 4}

In [25]:
for i in range(5):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank Test Score: ', grid_search.cv_results_['rank_test_score'][i])
    print()

Parameters:  {'max_depth': 2}
Mean Test Score:  0.7943376960920822
Rank Test Score:  2

Parameters:  {'max_depth': 4}
Mean Test Score:  0.8013830873479996
Rank Test Score:  1

Parameters:  {'max_depth': 6}
Mean Test Score:  0.7890931031281908
Rank Test Score:  3

Parameters:  {'max_depth': 8}
Mean Test Score:  0.7785296574770258
Rank Test Score:  5

Parameters:  {'max_depth': 10}
Mean Test Score:  0.7873294346978557
Rank Test Score:  4



In [26]:
decision_tree_model = DecisionTreeClassifier( \
                                         max_depth = grid_search.best_params_['max_depth']  
                                            ).fit(x_train, y_train)

In [27]:
y_pred = decision_tree_model.predict(x_test)

In [28]:
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy count:  110
accuracy score:  0.7692307692307693
precision score:  0.9230769230769231
recall score:  0.5454545454545454


In [30]:
parameters = {'penalty' : ['l1', 'l2'],
              'C': [0.1, 0.4, 0.8, 1, 2, 5]
             }
grid_search = GridSearchCV(LogisticRegression(solver = 'liblinear'), parameters, cv =3, return_train_score = True)
grid_search.fit(x_train, y_train)
grid_search.best_params_

{'C': 2, 'penalty': 'l2'}

In [32]:
for i in range(12):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank Test Score: ', grid_search.cv_results_['rank_test_score'][i])
    print()

Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7821126891302329
Rank Test Score:  12

Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7856307435254802
Rank Test Score:  9

Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.7838392276988767
Rank Test Score:  11

Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.7926668523159751
Rank Test Score:  2

Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7908567715585259
Rank Test Score:  5

Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.7926482873851294
Rank Test Score:  3

Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.789083820662768
Rank Test Score:  7

Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.7926390049197067
Rank Test Score:  4

Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.7855657662675206
Rank Test Score:  10

Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.7961292119186855
Rank Test Score:  1

Parameters:  {'C

In [35]:
logistic_regression = LogisticRegression(solver = 'liblinear', \
                                        C = grid_search.best_params_['C'], \
                                        penalty = grid_search.best_params_['penalty']).fit(x_train, y_train)

y_pred = logistic_regression.predict(x_test)

In [37]:
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy count:  114
accuracy score:  0.7972027972027972
precision score:  0.8363636363636363
recall score:  0.696969696969697
