In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read the preprocessed Titanic data from disk and preview its first five
# rows (five is the default row count of DataFrame.head).
titanic = pd.read_csv('titanic_processed1.csv')
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,2.0,3,2,27.9,0,0,1
1,1,2,0,13.0,0,1,19.5,0,0,1
2,0,3,1,30.0,0,0,7.225,1,0,0
3,0,3,1,25.0,0,0,7.225,1,0,0
4,0,3,0,18.0,1,0,17.8,0,0,1


In [4]:
# Separate the features from the target.
# 'Unnamed: 0' is the row index that was written into the CSV (it shows up as
# the first column of the preview); it describes the file layout, not the
# passenger, so it must not be handed to the models as a feature.
# errors='ignore' keeps this cell working if that column is absent.
X = titanic.drop(columns=['Survived', 'Unnamed: 0'], errors='ignore')
Y = titanic['Survived']

# Hold out 20% of the rows for the final evaluation. A fixed random_state
# makes the split (and therefore every score below) reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [6]:
def summarize_classification(y_test, y_pred):
    """Print a short evaluation report for a set of predictions.

    The report lists, in order: the number of test samples, the number of
    correctly predicted labels, the accuracy as a fraction, the precision
    and the recall, followed by one blank line.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The report is written to standard output.

    Note:
        Precision and recall are computed with scikit-learn's default
        settings -- presumably the labels are binary; confirm before
        reusing this with other targets.
    """
    # normalize=True gives the fraction of correct labels,
    # normalize=False gives the number of correct labels.
    correct_fraction = accuracy_score(y_test, y_pred, normalize=True)
    correct_count = accuracy_score(y_test, y_pred, normalize=False)

    precision = precision_score(y_test, y_pred)
    sensitivity = recall_score(y_test, y_pred)

    report_rows = (
        ('Test Data count : ', len(y_test)),
        ('Accuracy count : ', correct_count),
        ('Accuracy Score : ', correct_fraction),
        ('Precision Score : ', precision),
        ('Recall Score : ', sensitivity),
    )
    for label, value in report_rows:
        print(label, value)
    print()

In [7]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths to evaluate.
parameters = {'max_depth' : [2, 4, 5, 7, 9, 10]}

# cv=3: use 3-fold cross-validation. The training data is split into three
# parts; each candidate is trained on two of them and evaluated on the third.
# Candidates are compared with the estimator's default scoring, i.e. accuracy.
# random_state fixes the tree's internal randomness so that re-running the
# cell reproduces the same scores and the same best depth.
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), parameters, cv = 3, return_train_score = True)
grid_search.fit(x_train, y_train)

grid_search.best_params_



{'max_depth': 4}

In [8]:
# Compare the performance of every candidate in the grid, not just the best.
# Iterating over the result arrays themselves keeps the loop in step with the
# grid, so the number of candidates does not have to be hard-coded.
cv_results = grid_search.cv_results_

for params, mean_score, rank in zip(cv_results['params'],
                                    cv_results['mean_test_score'],
                                    cv_results['rank_test_score']):
    print('Parameters : ', params)

    print('Mean Test Score : ', mean_score)

    print('Rank : ', rank)

Parameters :  {'max_depth': 2}
Mean Test Score :  0.7978650329527524
Rank :  2
Parameters :  {'max_depth': 4}
Mean Test Score :  0.7996194189176645
Rank :  1
Parameters :  {'max_depth': 5}
Mean Test Score :  0.7855379188712522
Rank :  3
Parameters :  {'max_depth': 7}
Mean Test Score :  0.7714749837556854
Rank :  4
Parameters :  {'max_depth': 9}
Mean Test Score :  0.7451684767474241
Rank :  5
Parameters :  {'max_depth': 10}
Mean Test Score :  0.7328506451313469
Rank :  6


In [9]:


# Regularisation settings to evaluate: penalty type and the C value.
parameters = {'penalty' : ['l1', 'l2'],
              'C' : [0.1, 0.4, 0.8, 1, 2, 5]}

# NOTE: this rebinds `grid_search`, replacing the decision-tree search from
# the earlier cell; everything below refers to the logistic-regression search.
# random_state fixes the solver's internal randomness so that re-running the
# cell reproduces the same scores.
grid_search = GridSearchCV(LogisticRegression(solver='liblinear', random_state=42), parameters, cv = 3, return_train_score = True)
grid_search.fit(x_train, y_train)

grid_search.best_params_


{'C': 1, 'penalty': 'l2'}

In [15]:
# Compare the performance of every candidate in the grid, not just the best.
# Iterating over the result arrays themselves keeps the loop in step with the
# grid, so the number of candidates does not have to be hard-coded.
cv_results = grid_search.cv_results_

for params, mean_score, rank in zip(cv_results['params'],
                                    cv_results['mean_test_score'],
                                    cv_results['rank_test_score']):
    print('Parameters : ', params)

    print('Mean Test Score : ', mean_score)

    print('Rank : ', rank)

Parameters :  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score :  0.7575141557597697
Rank :  12
Parameters :  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score :  0.775085862805161
Rank :  11
Parameters :  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score :  0.7803583031653206
Rank :  8
Parameters :  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score :  0.780339738234475
Rank :  10
Parameters :  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score :  0.782112689130233
Rank :  5
Parameters :  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score :  0.7838577926297225
Rank :  2
Parameters :  {'C': 1, 'penalty': 'l1'}
Mean Test Score :  0.780349020699898
Rank :  9
Parameters :  {'C': 1, 'penalty': 'l2'}
Mean Test Score :  0.787366564559547
Rank :  1
Parameters :  {'C': 2, 'penalty': 'l1'}
Mean Test Score :  0.7838485101642996
Rank :  3
Parameters :  {'C': 2, 'penalty': 'l2'}
Mean Test Score :  0.7821034066648102
Rank :  6
Parameters :  {'C': 5, 'penalty': 'l1'}
Mean Test Score :  0.7838485101642996
Rank :  3
Parameters :  {'C': 5,

In [17]:
# Fit a logistic regression on the training split using the winning
# grid-search configuration. Unpacking best_params_ forwards every tuned
# hyperparameter, so this cell stays correct if the grid gains or loses keys;
# tuned values are unpacked last and therefore override the defaults given here.
# random_state fixes the solver's internal randomness for reproducibility.
logistic_model = LogisticRegression(
    **{'solver': 'liblinear', 'random_state': 42, **grid_search.best_params_}
).fit(x_train, y_train)

In [19]:
# Predict labels for the held-out test features with the fitted logistic model.
y_pred = logistic_model.predict(x_test)

In [21]:
# Print sample count, accuracy, precision and recall for the test predictions.
summarize_classification(y_test, y_pred)

Test Data count :  143
Accuracy count :  117
Accuracy Score :  0.8181818181818182
Precision Score :  0.8076923076923077
Recall Score :  0.7241379310344828

