# Titanic survival classifier
- http://adataanalyst.com/scikit-learn/decision-trees-scikit-learn/
- will be using 3 features (pclass, sex and age)

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


#### titanic.csv will be shared separately through Slack
- 'survived' 필드가 해당 분류기가 예측해야하는 값을 갖고 있다

In [3]:
import csv
import numpy as np
# Location of the Titanic CSV: one header row, then one passenger per row.
# NOTE(review): absolute, machine-specific path -- point it at your own copy
# of titanic.csv before running.
TITANIC_CSV_PATH = '/Users/khan/Documents/deeplearning/titanic.csv'

# newline='' is what the csv module asks for: it lets the reader handle line
# endings itself so quoted fields with embedded newlines parse correctly.
with open(TITANIC_CSV_PATH, newline='') as csvfile:
    titanic_reader = csv.reader(csvfile, delimiter=',', quotechar='"')

    # Header contains feature names
    feature_names = np.array(next(titanic_reader))

    # Read the remaining records while the file is still open
    records = list(titanic_reader)

# Every column is kept for now (including the target in column 0);
# the model features are selected from this table in a later cell.
titanic_X = np.array(records)
# The target value is "survived", the first field of each record
titanic_y = np.array([record[0] for record in records])

In [4]:
# Sanity check: show the column names plus the first raw record and its label
# to confirm the CSV was parsed as expected.
print (feature_names, titanic_X[0], titanic_y[0])

['survived' 'pclass' 'name' 'sex' 'age' 'sibsp' 'parch' 'ticket' 'fare'
 'cabin' 'embarked'] ['0' '3' 'Braund, Mr. Owen Harris' 'male' '22' '1' '0' 'A/5 21171' '7.25'
 '' 'S'] 0


#### feature를 pclass, age, sex로 구성

In [5]:
# Column positions of pclass, age and sex in the raw table, in the order the
# model will see them.
# NOTE: this cell narrows titanic_X in place, so run it only once per load.
keep_columns = [1, 4, 3]
titanic_X = titanic_X[:, keep_columns]
feature_names = feature_names[keep_columns]
print (feature_names)
# Spot-check one record against its label
print (titanic_X[12],titanic_y[12])

['pclass' 'age' 'sex']
['3' '20' 'male'] 0


#### age 필드의 경우 비어있는 레코드들이 존재. Default 값을 어떻게 할지 결정 필요 -> 평균값 사용

In [6]:
# We have missing values for age (empty strings in the raw data);
# assign the mean of the ages that are present.
ages = titanic_X[:, 1]
print(ages)
# Compute the mask once, before any value is overwritten.
missing_age = ages == ''
# Use the builtin float: the np.float alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
mean_age = np.mean(ages[~missing_age].astype(float))
# titanic_X still holds strings at this point, so the mean is stored in its
# string form and becomes numeric in the later astype(float) conversion.
titanic_X[missing_age, 1] = mean_age
print(mean_age)

['22' '38' '26' '35' '35' '' '54' '2' '27' '14' '4' '58' '20' '39' '14'
 '55' '2' '' '31' '' '35' '34' '15' '28' '8' '38' '' '19' '' '' '40' '' ''
 '66' '28' '42' '' '21' '18' '14' '40' '27' '' '3' '19' '' '' '' '' '18'
 '7' '21' '49' '29' '65' '' '21' '28.5' '5' '11' '22' '38' '45' '4' '' ''
 '29' '19' '17' '26' '32' '16' '21' '26' '32' '25' '' '' '0.83' '30' '22'
 '29' '' '28' '17' '33' '16' '' '23' '24' '29' '20' '46' '26' '59' '' '71'
 '23' '34' '34' '28' '' '21' '33' '37' '28' '21' '' '38' '' '47' '14.5'
 '22' '20' '17' '21' '70.5' '29' '24' '2' '21' '' '32.5' '32.5' '54' '12'
 '' '24' '' '45' '33' '20' '47' '29' '25' '23' '19' '37' '16' '24' '' '22'
 '24' '19' '18' '19' '27' '9' '36.5' '42' '51' '22' '55.5' '40.5' '' '51'
 '16' '30' '' '' '44' '40' '26' '17' '1' '9' '' '45' '' '28' '61' '4' '1'
 '21' '56' '18' '' '50' '30' '36' '' '' '9' '1' '4' '' '' '45' '40' '36'
 '32' '19' '19' '3' '44' '58' '' '42' '' '24' '28' '' '34' '45.5' '18' '2'
 '32' '26' '16' '40' '24' '35' '22' '30'

#### Gender의 값을 스트링에서 숫자로 바꿔준다

In [7]:
# Encode sex: replace the string categories with integer codes
from sklearn.preprocessing import LabelEncoder

sex_column = titanic_X[:, 2]
enc = LabelEncoder()
# fit() returns the encoder itself, so both names refer to the same object
label_encoder = enc.fit(sex_column)
print ("Categorical classes:", label_encoder.classes_)
# Show which integer each category maps to
integer_classes = label_encoder.transform(label_encoder.classes_)
print ("Integer classes:", integer_classes)
# Overwrite the sex column with its integer codes
titanic_X[:, 2] = label_encoder.transform(sex_column)

Categorical classes: ['female' 'male']
Integer classes: [0 1]


In [8]:
# Feature names as a plain list, in the same order as the columns of titanic_X
feature_names = ['pclass', 'age', 'sex']
# Cast features and labels to float in one step; by this point both arrays
# are expected to hold only numeric strings.
titanic_X, titanic_y = titanic_X.astype(float), titanic_y.astype(float)

In [9]:
# Re-check record 12 (also printed before the conversion) to confirm the
# features and label are now floats.
print (feature_names)
print (titanic_X[12],titanic_y[12])

['pclass', 'age', 'sex']
[  3.  20.   1.] 0.0


In [10]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(titanic_X, titanic_y, test_size=0.25, random_state=33)



In [11]:
from sklearn import tree
# Shallow entropy-based tree: at most 3 levels and at least 5 samples per leaf.
# fit() returns the estimator, so construction and training are chained.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=5,
).fit(X_train, y_train)

In [12]:
from sklearn import metrics
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    """Print evaluation metrics for ``clf``'s predictions on ``X`` against ``y``.

    Parameters:
        X: samples passed to ``clf.predict``.
        y: true labels compared against the predictions.
        clf: fitted classifier exposing ``predict``.
        show_accuracy: print the accuracy score when true.
        show_classification_report: print the classification report when true.
        show_confusion_matrix: print the confusion matrix when true.

    Returns:
        None; the selected metrics are written to stdout.
    """
    # Predict once and reuse the result for every requested metric
    predictions = clf.predict(X)

    if show_accuracy:
        accuracy = metrics.accuracy_score(y, predictions)
        print("Accuracy:{0:.3f}".format(accuracy), "\n")

    if show_classification_report:
        print("Classification report")
        report = metrics.classification_report(y, predictions)
        print(report, "\n")

    if show_confusion_matrix:
        print("Confusion matrix")
        matrix = metrics.confusion_matrix(y, predictions)
        print(matrix, "\n")
        
# Accuracy on the training data only (report and confusion matrix switched off).
# This evaluates on X_train / y_train rather than held-out data.
measure_performance(X_train,y_train,clf, show_classification_report=False, show_confusion_matrix=False)

Accuracy:0.802 



In [14]:
# sklearn.cross_validation was removed in scikit-learn 0.20; the same tools
# now live in sklearn.model_selection.
from sklearn.base import clone
from sklearn.model_selection import cross_val_score, LeaveOneOut
from scipy.stats import sem

def loo_cv(X_train,y_train,clf):
    """Run leave-one-out cross-validation and print the mean accuracy.

    Each sample is held out in turn; a fresh copy of ``clf`` is trained on
    the remaining samples and scored on the held-out one.

    Parameters:
        X_train: 2-D array of samples, indexable by integer index arrays.
        y_train: array of labels aligned with ``X_train``.
        clf: scikit-learn estimator used as the template for each fold.
            It is cloned per fold, so the caller's instance is not refitted.

    Returns:
        None; prints the mean score and its standard error.
    """
    # One fit/predict round per sample in X_train
    n_samples = X_train.shape[0]
    scores = np.zeros(n_samples)
    # LeaveOneOut takes no constructor arguments; split() yields the
    # (train indices, test indices) pair for each fold.
    for train_index, test_index in LeaveOneOut().split(X_train):
        X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        # clone() gives an unfitted estimator with the same parameters, so the
        # classifier passed in keeps whatever fit the caller gave it.
        fold_clf = clone(clf).fit(X_train_cv, y_train_cv)
        y_pred = fold_clf.predict(X_test_cv)
        # With a single held-out sample the accuracy is either 0 or 1
        scores[test_index] = metrics.accuracy_score(y_test_cv.astype(int), y_pred.astype(int))
    print (("Mean score: {0:.3f} (+/-{1:.3f})").format(np.mean(scores), sem(scores)))

In [15]:
# Leave-one-out estimate on the training set for the classifier currently
# bound to clf.
loo_cv(X_train, y_train,clf)

Mean score: 0.802 (+/-0.015)


In [16]:
from sklearn.ensemble import RandomForestClassifier
# Rebinds clf to a 10-tree random forest with a fixed seed.
# fit() returns the estimator, so construction and training are chained.
clf = RandomForestClassifier(
    n_estimators=10,
    random_state=33,
).fit(X_train, y_train)
# Same leave-one-out procedure as used for the decision tree
loo_cv(X_train, y_train, clf)

Mean score: 0.796 (+/-0.016)


In [17]:
# Train a decision tree under its own name (clf_dt) and evaluate it on the
# held-out test split with every metric enabled.
clf_dt = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=5,
).fit(X_train, y_train)
measure_performance(X_test, y_test, clf_dt)

Accuracy:0.834 

Classification report
             precision    recall  f1-score   support

        0.0       0.85      0.88      0.86       134
        1.0       0.81      0.76      0.79        89

avg / total       0.83      0.83      0.83       223
 

Confusion matrix
[[118  16]
 [ 21  68]] 



In [21]:
print(X_train[0])
print(feature_names)
# Predict for one hand-written sample given in feature order [pclass, age, sex].
# The input is 2-D: an outer list holding a single sample row.
clf.predict([[1, 5, 0]])

[  1.  47.   1.]
['pclass', 'age', 'sex']


array([ 1.])

In [20]:
# predict() expects a 2-D input of shape (n_samples, n_features); a bare 1-D
# list is rejected by current scikit-learn, so wrap the single sample
# [pclass, age, sex] in an outer list.
clf.predict([[3, 39, 1]])



array([ 0.])