# Predicting Survival in the Titanic Data Set
We will be using a decision tree to make predictions about the Titanic data set from Kaggle. This data set provides information on the Titanic passengers and can be used to predict whether a passenger survived or not.
Loading Data and modules

In [15]:

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
url= "https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"
titanic = pd.read_csv(url)
titanic.columns = ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
titanic.head()



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
print(titanic.shape)

(891, 12)


In [29]:
titanic_clean = titanic.dropna()
print(titanic_clean.dtypes)
print("-"*80)

print(titanic_clean.info())
print("-"*80)


print(titanic_clean.columns)
print("-"*80)

titanic_clean.describe()

PassengerId       int64
Survived          int64
Pclass            int64
Name             object
Sex              object
Age             float64
SibSp             int64
Parch             int64
Ticket           object
Fare            float64
Cabin            object
Embarked         object
Sex_factored      int64
dtype: object
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 1 to 889
Data columns (total 13 columns):
PassengerId     183 non-null int64
Survived        183 non-null int64
Pclass          183 non-null int64
Name            183 non-null object
Sex             183 non-null object
Age             183 non-null float64
SibSp           183 non-null int64
Parch           183 non-null int64
Ticket          183 non-null object
Fare            183 non-null float64
Cabin           183 non-null object
Embarked        183 non-null object
Sex_factored    183 non-null int64
dtypes: float64(2), int6

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_factored
count,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0
mean,455.36612,0.672131,1.191257,35.674426,0.464481,0.47541,78.682469,0.480874
std,247.052476,0.470725,0.515187,15.643866,0.644159,0.754617,76.347843,0.501005
min,2.0,0.0,1.0,0.92,0.0,0.0,0.0,0.0
25%,263.5,0.0,1.0,24.0,0.0,0.0,29.7,0.0
50%,457.0,1.0,1.0,36.0,0.0,0.0,57.0,0.0
75%,676.0,1.0,1.0,47.5,1.0,1.0,90.0,1.0
max,890.0,1.0,3.0,80.0,3.0,4.0,512.3292,1.0


# Using only Pclass, Sex, Age, SibSp (Siblings aboard), Parch (Parents/children aboard), and Fare to predict whether a passenger survived.

In [63]:
#factorizing Sex column

titanic_clean['Sex_factored'], _ = pd.factorize(titanic_clean['Sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
# features and target
predictors = titanic_clean[['Pclass', 'Sex_factored', 'Age', 'SibSp','Parch', 'Fare']]


targets =titanic_clean.Survived

predictors.head()

Unnamed: 0,Pclass,Sex_factored,Age,SibSp,Parch,Fare
1,1,0,38.0,1,0,71.2833
3,1,0,35.0,1,0,53.1
6,1,1,54.0,0,0,51.8625
10,3,0,4.0,1,1,16.7
11,1,0,58.0,0,0,26.55


In [38]:
#%%Split into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics

pred_train, pred_test, tar_train, tar_test  = train_test_split(predictors, targets, test_size=.3,random_state=12)

In [39]:
print(pred_train.shape)
print(pred_test.shape)
print(tar_train.shape)
print(tar_test.shape)

(128, 6)
(55, 6)
(128,)
(55,)


In [40]:
 #%% Build a decision tree model

#Build model on training data
classifier=DecisionTreeClassifier()
classifier=classifier.fit(pred_train,tar_train)

In [61]:
 # apply decision tree model to get prediction for test data
    
test_pred=classifier.predict(pred_test)

# actual vs prediction table

tar_testl=list(tar_test)
test_predl=list(test_pred)

ac_vs_pd= pd.DataFrame(
    {'Actual': tar_testl,
     'Predicted': test_predl,
    })

print(ac_vs_pd)

    Actual  Predicted
0        1          1
1        0          0
2        1          1
3        0          0
4        1          1
5        0          1
6        1          1
7        0          0
8        0          0
9        1          1
10       1          0
11       1          1
12       0          0
13       0          0
14       0          0
15       1          0
16       1          1
17       0          0
18       0          0
19       1          1
20       1          0
21       1          0
22       1          1
23       1          1
24       0          0
25       1          1
26       0          1
27       0          1
28       0          0
29       1          1
30       1          1
31       0          0
32       1          1
33       1          1
34       1          1
35       1          1
36       1          0
37       0          0
38       1          1
39       1          0
40       1          0
41       1          1
42       1          1
43       1          1
44       1

## Evaluate the model's performance

Including the tree's axis-parallel decision boundaries and how the tree splits

In [49]:
 sklearn.metrics.confusion_matrix(tar_test,test_pred)

array([[15,  4],
       [ 8, 28]], dtype=int64)

In [50]:

# how did our model perform?

count_misclassified = (tar_test!=test_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))


accuracy = metrics.accuracy_score(tar_test, test_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 12
Accuracy: 0.78


In [62]:
print("End of assignment")

End of assignment
