
# Titanic Analysis

This analysis will test the acuracy of a few deep learning models.

## Import dependencies

In [1]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd

## Load whole dataset (it will be later splitted into train, test and validation)

In [2]:
df = pd.read_csv("https://www.48hours.ai/files/titanic/titanic-all.csv")
# displays data
df.head(n=2000)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:
# describe data
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.377387,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.484918,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [4]:
# make a copy of the dataframe in case we need it later
titanic_unprocessed = df.copy()

## Prepare data

In [5]:
#dropping columns that will not be used for predictions
# process_df = df.drop(["PassengerId","Name","Ticket","Cabin"], axis=1)

# passenger id will not be dropped yet 
process_df = df.drop([ "Name","Ticket","Cabin"], axis=1)

#print number of collums with null values
print (process_df.isnull().sum() )
#processed_df


PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            263
SibSp            0
Parch            0
Fare             1
Embarked         2
dtype: int64


In [6]:
# As can be seen from the above table there are many null values.  
# We are going to fill null values with values

##identify mean ages
male_mean_age = process_df[process_df["Sex"]=="male"]["Age"].mean()
female_mean_age = process_df[process_df["Sex"]=="female"]["Age"].mean()
#print ("female mean age: %1.0f" %female_mean_age )
#print ("male mean age: %1.0f" %male_mean_age )
process_df.loc[ (process_df["Sex"]=="male") & (process_df["Age"].isnull()), "Age"] = male_mean_age
process_df.loc[ (process_df["Sex"]=="female") & (process_df["Age"].isnull()), "Age"] = female_mean_age

# fill fare with mean fare
mean_fare = process_df["Fare"].mean()

# cabin and embarked

process_df["Embarked"] = process_df["Embarked"].fillna("U") # U for unknown
process_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [7]:
# Check again for null values
print (process_df.head().isnull().sum() )
#process_df

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


### Convert categorical variables to numeric values

If a variable contains male / female one hot encoding will encode it to 0 / 1

In [8]:
# the following categorical variables will be converted  sex & embarked

categorical_feature_mask = process_df.dtypes==object
# filter categorical columns using mask and turn it into a list
categorical_cols = process_df.columns[categorical_feature_mask].tolist()

In [9]:
categorical_cols.append('Pclass') # manually add Pclass it is also a categorical variable
categorical_cols


['Sex', 'Embarked', 'Pclass']

In [10]:
process_df = pd.get_dummies(process_df, columns=categorical_cols, drop_first=True)

In [11]:
values_ds = process_df.iloc[:,2:]
values_ds

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Embarked_U,Pclass_2,Pclass_3
0,22.000000,1,0,7.2500,1,0,1,0,0,1
1,38.000000,1,0,71.2833,0,0,0,0,0,0
2,26.000000,0,0,7.9250,0,0,1,0,0,1
3,35.000000,1,0,53.1000,0,0,1,0,0,0
4,35.000000,0,0,8.0500,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1304,30.585228,0,0,8.0500,1,0,1,0,0,1
1305,39.000000,0,0,108.9000,0,0,0,0,0,0
1306,38.500000,0,0,7.2500,1,0,1,0,0,1
1307,30.585228,0,0,8.0500,1,0,1,0,0,1


In [18]:
target_ds  = process_df.iloc[:,1:2]
target_ds

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
1304,0
1305,1
1306,0
1307,0


# Split data in train, test and validation batches

In [21]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(values_ds, target_ds, test_size=1 - train_ratio)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio)) 

print(x_train, x_val, x_test)
print(y_train, y_val, y_test)

           Age  SibSp  Parch     Fare  Sex_male  Embarked_Q  Embarked_S  \
549   8.000000      1      1  36.7500         1           0           1   
797  31.000000      0      0   8.6833         0           0           1   
131  20.000000      0      0   7.0500         1           0           1   
869   4.000000      1      1  11.1333         1           0           1   
640  20.000000      0      0   7.8542         1           0           1   
..         ...    ...    ...      ...       ...         ...         ...   
842  30.000000      0      0  31.0000         0           0           0   
922  24.000000      2      0  31.5000         1           0           1   
638  41.000000      0      5  39.6875         0           0           1   
485  28.687088      3      1  25.4667         0           0           1   
952  32.000000      0      0  13.5000         1           0           1   

     Embarked_U  Pclass_2  Pclass_3  
549           0         1         0  
797           0        

# Deep learning on data

## First model will be a shallow model