In [1]:
from mlsettings.settings import load_app_config, get_datafolder_path
from mltools.mlcommon import load_data, print_dataset_info, split_dataset, auto_scatter_simple
import os
import math
import datetime

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
% matplotlib inline 


In [2]:
load_app_config()
DIRECTORY = "titanic"
FILENAME ='titanic_full.csv'
input_path = get_datafolder_path()

{'REG': {'ML_DATASOURCE': 'F:\\DataSource', 'ML_PATH': 'F:\\MachineLearning'}, 'UAT': {'ML_DATASOURCE': 'F:\\DataSource', 'ML_PATH': 'F:\\MachineLearning'}, 'PRD': {'ML_DATASOURCE': 'F:\\DataSource', 'ML_PATH': 'F:\\MachineLearning'}, 'DEV': {'ML_DATASOURCE': 'F:\\DataSource', 'ML_PATH': 'F:\\MachineLearning'}}
Adding F:\DataSource  to system path
Adding F:\MachineLearning  to system path


In [3]:
input_file = os.path.join(input_path, DIRECTORY, FILENAME)
input_dataset = load_data(input_file)
print(" input file is :{0} loaded.".format(input_file))
print(input_dataset.info())    

 input file is :F:\DataSource\titanic\titanic_full.csv loaded.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names    1313 non-null int64
pclass       1313 non-null object
survived     1313 non-null int64
name         1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 112.9+ KB
None


## Feature Extraction 

In [4]:
from sklearn import feature_extraction
def one_hot_dataframe(data,columns,replace=False):
    fe_vec= feature_extraction.DictVectorizer()
    make_dict = lambda row :dict((column,row[column]) for column in  columns)
    vector_data=pd.DataFrame(fe_vec.fit_transform( data[columns].apply(make_dict, axis=1)).toarray())
    vector_data.columns = fe_vec.get_feature_names()
    vector_data.index= data.index
    if replace:
        data = data.drop(columns, axis=1)
        data = data.join(vector_data)
    return data,vector_data
        

In [5]:
titanic,titanic_n = one_hot_dataframe(input_dataset, ['pclass','embarked', 'sex'], replace=True)

In [6]:
titanic, titanic_n = one_hot_dataframe(titanic, ['home.dest', 'room', 'ticket', 'boat'], replace=True)
print(titanic.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Columns: 581 entries, row.names to ticket=L15 1s
dtypes: float64(578), int64(2), object(1)
memory usage: 5.8+ MB
None


In [7]:
mean_age = math.ceil(titanic["age"].mean())
titanic['age'].fillna(mean_age, inplace=True)
titanic.fillna(0, inplace=True)
titanic.head()

Unnamed: 0,row.names,survived,name,age,embarked,embarked=Cherbourg,embarked=Queenstown,embarked=Southampton,pclass=1st,pclass=2nd,...,ticket=248744 L13,ticket=248749 L13,ticket=250647,ticket=27849,ticket=28220 L32 10s,ticket=34218 L10 10s,ticket=36973 L83 9s 6d,ticket=392091,ticket=7076,ticket=L15 1s
0,1,1,"Allen, Miss Elisabeth Walton",29.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,"Allison, Miss Helen Loraine",2.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0,"Allison, Mr Hudson Joshua Creighton",30.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,1,"Allison, Master Hudson Trevor",0.9167,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from sklearn.model_selection import train_test_split
titanic_target = titanic['survived']
titanic_data = titanic.drop(['name', 'row.names', 'survived'],axis=1)
X_train, X_test, y_train, y_test =  train_test_split(titanic_data, titanic_target, test_size=0.25,random_state=33)

In [9]:
from sklearn import tree
dec_tree=tree.DecisionTreeClassifier(criterion='entropy')
dec_tree = dec_tree.fit(X_train, y_train)
from sklearn import metrics
y_pred =dec_tree.predict(X_test)
print ( "Accuracy:{0:.3f}".format(metrics.accuracy_score(y_test,y_pred)), "\n")

Accuracy:0.833 



In [10]:
from sklearn import metrics
con_mat = metrics.confusion_matrix(y_test, y_pred)
print(con_mat)


[[187  15]
 [ 40  87]]


In [11]:
# Let's define our true posititves, false positives, true negatives, and false negatives
def performance_metrics(con_mat):
    true_neg = con_mat[0][0]
    false_neg = con_mat[1][0]
    true_pos = con_mat[1][1]
    false_pos = con_mat[0][1]

# Sensitivity: percent of correct predictions when reference value is 'survived'
    sensitivity = float(true_pos)/(false_neg + true_pos)
    print(sensitivity)
    print(metrics.recall_score(y_test, y_pred))

# Specificity: percent of correct predictions when reference value is 'not survived'
    specificity = float(true_neg) / (true_neg + false_pos)
    print (specificity)
performance_metrics(con_mat)

0.685039370079
0.685039370079
0.925742574257


In [12]:
from sklearn.model_selection import *
def train_and_evaluate(model, X_train, y_train):
    model.fit(X_train, y_train)
    print ("Coefficient of determination on training set:",model.score(X_train, y_train))
    # create a k-fold cross validation iterator of k=5 folds
    cv = KFold(n_splits= 5,shuffle=True, random_state=33)
    scores = cross_val_score(model, X_train, y_train, cv=cv)
    print(scores)
    print ("Average coefficient of determination using 5-fold crossvalidation:",np.mean(scores))
    

In [13]:
dec_tree=tree.DecisionTreeClassifier(criterion='entropy')
train_and_evaluate(dec_tree,X_train, y_train)

Coefficient of determination on training set: 0.935975609756
[ 0.8680203   0.87309645  0.90862944  0.88324873  0.87755102]
Average coefficient of determination using 5-fold crossvalidation: 0.882109188853


In [14]:
from sklearn import metrics
def measure_performance(X, y, clf, show_accuracy=True,show_classification_report=True,
                        show_confusion_matrix=True, show_r2_score=False):
    y_pred = clf.predict(X) 
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format( metrics.accuracy_score(y, y_pred)),"\n" )
    if show_classification_report:
        print ("Classification report")
        print (metrics.classification_report(y, y_pred),"\n")
    if show_confusion_matrix:
        print("Confusion matrix") 
        print(metrics.confusion_matrix(y, y_pred),"\n")
    if show_r2_score:
        print ("Coefficient of determination:{0:.3f}"
               .format( metrics.r2_score(y, y_pred)),"\n")
    return y_pred

In [15]:
y_pred=measure_performance(X_test,y_test,dec_tree, show_accuracy=False, 
                    show_classification_report=True,
                    show_confusion_matrix=True, show_r2_score=False)

Classification report
             precision    recall  f1-score   support

          0       0.83      0.93      0.87       202
          1       0.85      0.69      0.77       127

avg / total       0.84      0.84      0.83       329
 

Confusion matrix
[[187  15]
 [ 39  88]] 



In [16]:
con_mat = metrics.confusion_matrix(y_test, y_pred)
performance_metrics(con_mat)

0.692913385827
0.692913385827
0.925742574257


## Feature Selection using sklearn

In [19]:
from sklearn import feature_selection
fs =feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)

X_train_fs = fs.fit_transform(X_train, y_train)
print(X_train_fs.shape)

dec_tree.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
 
y_pred_fs = measure_performance(X_test_fs,y_test,dec_tree, show_accuracy=True, 
                    show_classification_report=True,
                    show_confusion_matrix=True, show_r2_score=False)

(984, 115)
Accuracy:0.848 

Classification report
             precision    recall  f1-score   support

          0       0.84      0.93      0.88       202
          1       0.87      0.72      0.78       127

avg / total       0.85      0.85      0.84       329
 

Confusion matrix
[[188  14]
 [ 36  91]] 

