# Who survived the titanic? How do we increase our odds?!


In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

#data set
titanic_csv = "titanic.csv"
titanic = pd.read_csv(titanic_csv, index_col = "PassengerId")

print(titanic.shape)

(891, 11)


In [2]:
#check for those missing values
titanic.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

I could drop the missing, but it cuts into my data quite a bit!

In [3]:
titanic.dropna().shape

(183, 11)

In [4]:
#let's look at the data
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Sometimes a better strategy is to impute some values. 

In [5]:
#Let's check mean, median, and mode? 
print(titanic.Age.mean())
print(titanic.Age.median())
print(titanic.Age.mode())


29.69911764705882
28.0
0    24.0
dtype: float64


In [6]:
titanic.Age.fillna(titanic.Age.median())

PassengerId
1      22.0
2      38.0
3      26.0
4      35.0
5      35.0
6      28.0
7      54.0
8       2.0
9      27.0
10     14.0
11      4.0
12     58.0
13     20.0
14     39.0
15     14.0
16     55.0
17      2.0
18     28.0
19     31.0
20     28.0
21     35.0
22     34.0
23     15.0
24     28.0
25      8.0
26     38.0
27     28.0
28     19.0
29     28.0
30     28.0
       ... 
862    21.0
863    48.0
864    28.0
865    24.0
866    42.0
867    27.0
868    31.0
869    28.0
870     4.0
871    26.0
872    47.0
873    33.0
874    47.0
875    28.0
876    15.0
877    20.0
878    19.0
879    28.0
880    56.0
881    25.0
882    33.0
883    22.0
884    28.0
885    25.0
886    39.0
887    27.0
888    19.0
889    28.0
890    26.0
891    32.0
Name: Age, Length: 891, dtype: float64

## Dealing with categorical data (review)

How do we include a categorical feature in our model? 
- **Ordered Categories** transform them to sensible numeric values (small = 1, medium = 2, large = 3)
- **Undordered Categories** use dummy encoding (0/1)


In [7]:
#Create dummies for sex
titanic["Sex_Female"] = titanic.Sex.map({"male":0, "female":1})
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_Female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [8]:
#what's going on in embarked?
titanic["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [9]:
#create a dataframe of dummy variables for Embarked
embarked_dummies = pd.get_dummies(titanic.Embarked, prefix = "Embarked")
embarked_dummies.drop(embarked_dummies.columns[0], axis = 1, inplace = True)

In [10]:
#concatenate to the original DF
titanic = pd.concat([titanic, embarked_dummies], axis = 1)
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_Female,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,1
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1


## Another method!
If values are missing from a categorical feature, we could treat the missing values as **another category**. I'll impute the "age" value with a K-Nearest Neighbors regressor that shows us what the age of similar passengers was. This gives us more accurate granularity, rather than the overall mean of the age.

In [11]:
#Let's start by looking at what categories have the strongest relationship
titanic.corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_Female,Embarked_Q,Embarked_S
Survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,0.543351,0.00365,-0.15566
Pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,-0.1319,0.221009,0.08172
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,-0.093254,-0.022405,-0.032523
SibSp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,0.114631,-0.026354,0.070941
Parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,0.245489,-0.081228,0.063036
Fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,0.182333,-0.117216,-0.166603
Sex_Female,0.543351,-0.1319,-0.093254,0.114631,0.245489,0.182333,1.0,0.074115,-0.125722
Embarked_Q,0.00365,0.221009,-0.022405,-0.026354,-0.081228,-0.117216,0.074115,1.0,-0.496624
Embarked_S,-0.15566,0.08172,-0.032523,0.070941,0.063036,-0.166603,-0.125722,-0.496624,1.0


In [12]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 14 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Name          891 non-null object
Sex           891 non-null object
Age           714 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Ticket        891 non-null object
Fare          891 non-null float64
Cabin         204 non-null object
Embarked      889 non-null object
Sex_Female    891 non-null int64
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
dtypes: float64(2), int64(5), object(5), uint8(2)
memory usage: 132.2+ KB


In [14]:
#Import
from sklearn.neighbors import KNeighborsRegressor as knn

#Instantiate - Remember, X is our things that we think are stable predictors based on above, Y is what we want to predict
train_x = titanic[~titanic.Age.isnull()][["Survived","Pclass","SibSp","Fare","Sex_Female"]]
train_y = titanic[~titanic.Age.isnull()]["Age"]
test_x = titanic[titanic.Age.isnull()][["Survived","Pclass","SibSp","Fare","Sex_Female"]]

knn = knn(n_neighbors = 5)

#fit
knn.fit(train_x, train_y)
age_predictions = knn.predict(test_x)



In [15]:
#update the age column with the prediction
titanic["age_knn"] = np.where(titanic.Age.isnull(), knn.predict(titanic[["Survived", "Pclass", "SibSp", "Fare", "Sex_Female"]]),
                             titanic.Age)

In [16]:
titanic.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_Female,Embarked_Q,Embarked_S,age_knn
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1,22.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0,38.0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,1,26.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,1,35.0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1,35.0
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,0,1,0,29.2
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,0,0,1,54.0
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,0,0,1,2.0
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,1,0,1,27.0
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,1,0,0,14.0


## Logistic regression
Linear regression is well suited for estimating values, but it isn’t the best tool for predicting the class of an observation. Logistic regression is similar to linear regression, with the only difference being the y data, which should contain integer values indicating the class relative to the observation. 

In [19]:
#define X and y
feature_cols = ["Pclass", "Parch", "age_knn", "Sex_Female", "Embarked_Q", "Embarked_S"]
X = titanic[feature_cols]
y = titanic.Survived

#train, test, split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

#import, instantiate, firt
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train, y_train)

#make predictions for testing set
preds = logreg.predict(X_test)

In [28]:
#Inspect the coefficients
pd.Series(list(zip(feature_cols, logreg.coef_[0])))

#convert log-odds to odds by using exponential funct
pd.Series(list(zip(feature_cols, np.exp(logreg.coef_[0]))))

0        (Pclass, 0.271215216877)
1          (Parch, 0.89648450848)
2       (age_knn, 0.952393603869)
3      (Sex_Female, 13.414980474)
4    (Embarked_Q, 0.823372997395)
5    (Embarked_S, 0.568939093506)
dtype: object

In [36]:
#What's our accuracy? 
from sklearn import metrics
print(metrics.accuracy_score(y_test, preds))


0.811659192825


In [37]:
#How accurate are we? Let's look at a confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, preds)

array([[113,  15],
       [ 27,  68]], dtype=int64)

In [47]:
#Save confusion matrix into a variable called confusion
#Store each of the categories in variables
confusion = metrics.confusion_matrix(y_test, preds)
TP = confusion[1][1]
TN = confusion[0][0]
FP = confusion[0][1]
FN = confusion[1][0]
print(TP)
print(TN)
print(FP)
print(FN)

68
113
15
27


In [43]:
#sensitivity
TP / (TP+FN)

0.71578947368421053

In [44]:
#specificity
FP / (FP+TN)

0.1171875