In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics

In [None]:
# if you are interested in building a model and predicting
# who survived for a holdout sample, please take a look at:
# https://www.kaggle.com/c/titanic/overview

df = pd.read_csv('~/Desktop/titanic_train.csv')
df = df.set_index('PassengerId')

# One-hot encode the two categorical columns. drop_first=True removes the
# first level of each so the remaining indicator columns are not redundant.
# dtype=int is deliberate: pandas >= 2.0 returns boolean dummies by default,
# and a frame mixing bool with numeric columns converts to an object-dtype
# array, which the statsmodels logistic regression fitted later rejects.
dummies = pd.get_dummies(df['Sex'], drop_first=True, dtype=int)
df = pd.concat([df, dummies], axis = 1)
dummies = pd.get_dummies(df['Embarked'], drop_first=True, dtype=int)
df = pd.concat([df, dummies], axis = 1)

# drop the free-text columns and the two originals that were just encoded
df = df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1)

In [None]:
# add an 'outcome' column that labels each passenger with one of
# two classes - died or survived

def Assess(row):
    """Translate a survival flag into its text label.

    Parameters
    ----------
    row : scalar
        The value of the Survived field for one passenger.

    Returns
    -------
    str
        'survived' when the value compares equal to 1, otherwise 'died'
        (this includes 0, missing values and any other value).
    """
    return 'survived' if row == 1 else 'died'

# Series.map feeds each Survived value straight to Assess; this replaces a
# row-by-row DataFrame.apply(axis=1) that built a full row object per
# passenger only to read one field from it.
df['outcome'] = df['Survived'].map(Assess)

# discard every passenger that still has a missing value in any column
df = df.dropna()
df.head()

In [None]:
# hold out 20% of the passengers for testing and train on the other 80%;
# stratifying on 'outcome' keeps the died/survived mix the same in both
# parts, and the fixed random_state makes the split repeatable

train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['outcome'],
    random_state=42,
)

In [None]:
# fn: names of the feature (independent variable) columns
# cn: names of the two outcome classes (dependent variable)

fn = [
    'Pclass',
    'Age',
    'Fare',
    'male',
]
cn = [
    'died',
    'survived',
]

In [None]:
# features and targets for the two models, taken from each split:
#   X_*    - the feature columns listed in fn
#   Yvar_* - the numeric Survived column (used by the logistic regression)
#   y_*    - the text outcome label (used by the decision-tree classifier)

X_train, X_test = train[fn], test[fn]
Yvar_train, Yvar_test = train['Survived'], test['Survived']
y_train, y_test = train['outcome'], test['outcome']

In [None]:
# logistic regression - we have seen this before

LogisticModel = sm.Logit(Yvar_train, X_train).fit()
print(LogisticModel.summary())

In [None]:
# we now use this model to help examine our testing set

YPred = LogisticModel.predict(X_test)
compare = pd.concat([Yvar_test, YPred], axis=1)
compare = compare.rename(columns={0: 'prediction'})
compare['difference'] = compare['Survived'] - compare['prediction']
compare.sort_values(by=['difference'])

In [None]:
# go through the residuals to generate a confusion matrix

def rating(difference):
    """Map a residual (actual 0/1 outcome minus predicted probability)
    to a confusion-matrix cell name.

    Bands, checked from the top down:
        difference >  0.5          -> "false negative"
        0    <= difference <= 0.5  -> "true positive"
        -0.5 <= difference <  0    -> "true negative"
        anything else              -> "false positive"

    A residual of exactly 0 is reported as "true positive", and a value that
    fails every comparison (such as NaN) falls through to "false positive".
    """
    if difference > 0.5:
        return "false negative"
    # remaining bands are closed on their lower edge; first match wins
    for lower_edge, label in ((0, "true positive"), (-0.5, "true negative")):
        if difference >= lower_edge:
            return label
    return "false positive"

# label every test passenger from its residual; Series.map works on the one
# column that is needed instead of applying a lambda to every whole row
compare['type'] = compare['difference'].map(rating)

# summing a boolean mask counts the rows that carry each label
true_positives = (compare['type'] == "true positive").sum()
true_negatives = (compare['type'] == "true negative").sum()
false_positives = (compare['type'] == "false positive").sum()
false_negatives = (compare['type'] == "false negative").sum()

print("True positives: ", true_positives)
print("True negatives: ", true_negatives)
print("False positives: ", false_positives)
print("False negatives: ", false_negatives)


In [None]:
# new here: the area under the Receiver Operating Characteristic (ROC)
# curve, computed from the true 0/1 outcomes and the predicted probabilities
# https://en.wikipedia.org/wiki/Receiver_operating_characteristic

roc_auc_score(y_true=Yvar_test, y_score=YPred)

In [None]:
# plot the ROC curve itself: true positive rate against false positive rate
# across all probability thresholds

fpr, tpr, threshold = roc_curve(Yvar_test, YPred)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
# dashed diagonal: reference line where both rates are equal
ax.plot([0, 1], [0, 1], 'r--')
ax.set_title('Receiver Operating Characteristic')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc = 'lower right')
plt.show()

In [None]:
# a second approach: a shallow (depth 2) decision-tree classifier trained
# on the text outcome labels, scored by accuracy on the test split

mod_dt = DecisionTreeClassifier(max_depth=2, random_state=1)
mod_dt.fit(X_train, y_train)

prediction = mod_dt.predict(X_test)
tree_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=prediction)
print("The accuracy of the Decision Tree is", "{:.3f}".format(tree_accuracy))

In [None]:
# importance of each feature in the fitted tree, one value per column of
# X_train and in the same order (Pclass, Age, Fare, male)
mod_dt.feature_importances_

In [None]:
# draw the fitted tree; nodes are labelled with the feature and class names
# and filled with colour
tree_figure_size = (10, 8)
plt.figure(figsize=tree_figure_size)
plot_tree(
    mod_dt,
    feature_names=fn,
    class_names=cn,
    filled=True,
)