## Predicting Titanic Survival

In [6]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [7]:
dfML = pd.read_csv('cleandf_titanic.csv', index_col = 'PassengerId')
dfML.head(5)

Unnamed: 0_level_0,Survived,Pclass,Alone,AgeGroup,Sex_Coded,Embarked_Coded
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,3,0,2,0,3
2,1,1,0,3,1,1
3,1,3,1,2,1,3
4,1,1,0,3,1,3
5,0,3,1,3,0,3


In [8]:
dfML.columns

Index(['Survived', 'Pclass', 'Alone', 'AgeGroup', 'Sex_Coded',
       'Embarked_Coded'],
      dtype='object')

### Split data and train model

In [9]:
Y = dfML['Survived'].values
X = dfML.drop('Survived', axis =1).values

In [83]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 4)

In [84]:
print('Train set:', X_train.shape, Y_train.shape)
print('Test test:', X_test.shape, Y_test.shape)

Train set: (499, 5) (499,)
Test test: (215, 5) (215,)


### Decision Tree Classifier

In [154]:
from sklearn.tree import DecisionTreeClassifier
#train model
DTC = DecisionTreeClassifier(criterion="entropy", max_depth = 5).fit(X_train,Y_train)
#make predictions
DTC_pred = DTC.predict(X_test)

In [155]:
from sklearn import metrics
print("DecisionTrees's Accuracy: ", accuracy_score(Y_test, DTC_pred))

DecisionTrees's Accuracy:  0.7906976744186046


### Random Forest Classifier

In [166]:
from sklearn.ensemble import RandomForestClassifier

# train model
RFC= RandomForestClassifier(n_estimators=20).fit(X_train, Y_train)
# predict on test set
RFC_pred = RFC.predict(X_test)
#check accuracy
print("Random Forest Classifier's Accuracy: ", accuracy_score(Y_test, RFC_pred))

Random Forest Classifier's Accuracy:  0.7906976744186046


### Log Regression

In [88]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,Y_train)

In [89]:
LR_pred = LR.predict(X_test)

In [90]:
LR_pred_prob = LR.predict_proba(X_test)

In [91]:
from sklearn.metrics import jaccard_similarity_score
jaccard_similarity_score(Y_test, LR_pred)

0.6883720930232559

In [94]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools
import matplotlib as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
print(confusion_matrix(Y_test, LR_pred, labels=[1,0]))

AttributeError: module 'matplotlib' has no attribute 'cm'

In [95]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(Y_test, LR_pred, labels=[1,0])
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['churn=1','churn=0'],normalize= False,  title='Confusion matrix')

AttributeError: module 'matplotlib' has no attribute 'figure'

In [96]:
print (classification_report(Y_test, LR_pred))


              precision    recall  f1-score   support

           0       0.65      1.00      0.79       124
           1       1.00      0.26      0.42        91

   micro avg       0.69      0.69      0.69       215
   macro avg       0.82      0.63      0.60       215
weighted avg       0.80      0.69      0.63       215



In [97]:
from sklearn.metrics import log_loss
log_loss(Y_test, LR_pred_prob)

0.5961526579404546

In [102]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, log_loss, jaccard_similarity_score

k = 30
KNC = KNeighborsClassifier(n_neighbors = k).fit(X_train,Y_train)

# Predict on test set
KNC_pred = KNC.predict(X_test)

#Check the accuracy of test set
print("K-Nearest Neighbor's Accuracy: ", accuracy_score(Y_test, KNC_pred))
print("K-Nearest Neighbor's f1_score: ",f1_score(Y_test, KNC_pred, average = 'weighted'))
print("K-Nearest Neighbor's recall_score: ",recall_score(Y_test, KNC_pred, average = 'weighted'))
print (classification_report(Y_test, KNC_pred))

K-Nearest Neighbor's Accuracy:  0.7906976744186046
K-Nearest Neighbor's f1_score:  0.7808409055906361
K-Nearest Neighbor's recall_score:  0.7906976744186046
              precision    recall  f1-score   support

           0       0.75      0.94      0.84       124
           1       0.88      0.58      0.70        91

   micro avg       0.79      0.79      0.79       215
   macro avg       0.82      0.76      0.77       215
weighted avg       0.81      0.79      0.78       215



In [101]:
#Check for best k
ks = [1, 3, 5, 7, 10, 20, 30]
mean_accuracy = {}

for k in ks:
    acc_score= np.zeros(3)
    
    for n in range(3): # to generate 3 random train/test split for each k
        X1_train, X1_test, Y1_train, Y1_test = train_test_split( X_train, Y_train, test_size=0.2)
        # train  and predict
        KNC = KNeighborsClassifier(n_neighbors = k).fit(X1_train,Y1_train)
        KNC_pred = KNC.predict(X1_test)
        # evaluate accuracy
        acc_score[n] = accuracy_score(Y1_test, KNC_pred)
    
    mean_accuracy[k] = round(acc_score.mean(),3)
    
    
print('Mean accuracy score for each k \n', mean_accuracy)

import re
highest = max(mean_accuracy.values())
k = {k for k, acc in mean_accuracy.items() if acc == highest}
k =re.sub("{|}","" , str(k))
print( "The best accuracy score was ",highest, "with k =",k) 

Mean accuracy score for each k 
 {1: 0.75, 3: 0.777, 5: 0.78, 7: 0.79, 10: 0.75, 20: 0.773, 30: 0.807}
The best accuracy score was  0.807 with k = 30
