# Titanic Survival Prediction
This project aims to predict which passengers survived the sinking of the Titanic in 1912 based on their age and class. The dataset is taken from Kaggle: https://www.kaggle.com/c/titanic.

In [1]:
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Load the passenger data

In [2]:
passengers = pd.read_csv("passengers.csv")
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The following features are provided in the dataset:
- *survival*: Survival (0 = No, 1 = Yes)
- *pclass*: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
- *sex*: Sex	
- *Age*:Age in years	
- *sibsp*: # of siblings / spouses aboard the Titanic	
- *parch*: # of parents / children aboard the Titanic	
- *ticket*:	Ticket number	
- *fare*: Passenger fare	
- *cabin*: Cabin number	
- *embarked*: Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [3]:
passengers.shape

(891, 12)

### Clean the Data

In [4]:
#Update sex column to numerical value

passengers["Sex"] = passengers["Sex"].map({'female': 1,'male': 0})
passengers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0000,C148,C


In [6]:
#Check for missing values in the age column
passengers["Age"].values

array([22.  , 38.  , 26.  , 35.  , 35.  ,   nan, 54.  ,  2.  , 27.  ,
       14.  ,  4.  , 58.  , 20.  , 39.  , 14.  , 55.  ,  2.  ,   nan,
       31.  ,   nan, 35.  , 34.  , 15.  , 28.  ,  8.  , 38.  ,   nan,
       19.  ,   nan,   nan, 40.  ,   nan,   nan, 66.  , 28.  , 42.  ,
         nan, 21.  , 18.  , 14.  , 40.  , 27.  ,   nan,  3.  , 19.  ,
         nan,   nan,   nan,   nan, 18.  ,  7.  , 21.  , 49.  , 29.  ,
       65.  ,   nan, 21.  , 28.5 ,  5.  , 11.  , 22.  , 38.  , 45.  ,
        4.  ,   nan,   nan, 29.  , 19.  , 17.  , 26.  , 32.  , 16.  ,
       21.  , 26.  , 32.  , 25.  ,   nan,   nan,  0.83, 30.  , 22.  ,
       29.  ,   nan, 28.  , 17.  , 33.  , 16.  ,   nan, 23.  , 24.  ,
       29.  , 20.  , 46.  , 26.  , 59.  ,   nan, 71.  , 23.  , 34.  ,
       34.  , 28.  ,   nan, 21.  , 33.  , 37.  , 28.  , 21.  ,   nan,
       38.  ,   nan, 47.  , 14.5 , 22.  , 20.  , 17.  , 21.  , 70.5 ,
       29.  , 24.  ,  2.  , 21.  ,   nan, 32.5 , 32.5 , 54.  , 12.  ,
         nan, 24.  ,

In [10]:
passengers["Age"].fillna(value = passengers["Age"].mean(),inplace=True)
passengers["Age"].values

array([22.        , 38.        , 26.        , 35.        , 35.        ,
       29.69911765, 54.        ,  2.        , 27.        , 14.        ,
        4.        , 58.        , 20.        , 39.        , 14.        ,
       55.        ,  2.        , 29.69911765, 31.        , 29.69911765,
       35.        , 34.        , 15.        , 28.        ,  8.        ,
       38.        , 29.69911765, 19.        , 29.69911765, 29.69911765,
       40.        , 29.69911765, 29.69911765, 66.        , 28.        ,
       42.        , 29.69911765, 21.        , 18.        , 14.        ,
       40.        , 27.        , 29.69911765,  3.        , 19.        ,
       29.69911765, 29.69911765, 29.69911765, 29.69911765, 18.        ,
        7.        , 21.        , 49.        , 29.        , 65.        ,
       29.69911765, 21.        , 28.5       ,  5.        , 11.        ,
       22.        , 38.        , 45.        ,  4.        , 29.69911765,
       29.69911765, 29.        , 19.        , 17.        , 26.  

In [11]:
#Create a first class column
passengers["First Class"] = passengers["Pclass"].apply(lambda x: 1 if x ==1 else 0)
passengers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,First Class
0,1,0,3,"Braund, Mr. Owen Harris",0,22.000000,1,0,A/5 21171,7.2500,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.000000,0,0,STON/O2. 3101282,7.9250,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,113803,53.1000,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",0,35.000000,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.000000,0,0,211536,13.0000,,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.000000,0,0,112053,30.0000,B42,S,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,29.699118,1,2,W./C. 6607,23.4500,,S,0
889,890,1,1,"Behr, Mr. Karl Howell",0,26.000000,0,0,111369,30.0000,C148,C,1


In [12]:
#Create a second class column
passengers["Second Class"] = passengers["Pclass"].apply(lambda x: 1 if x == 2 else 0)
passengers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,First Class,Second Class
0,1,0,3,"Braund, Mr. Owen Harris",0,22.000000,1,0,A/5 21171,7.2500,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.000000,0,0,STON/O2. 3101282,7.9250,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,113803,53.1000,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.000000,0,0,373450,8.0500,,S,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.000000,0,0,211536,13.0000,,S,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.000000,0,0,112053,30.0000,B42,S,1,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,29.699118,1,2,W./C. 6607,23.4500,,S,0,0
889,890,1,1,"Behr, Mr. Karl Howell",0,26.000000,0,0,111369,30.0000,C148,C,1,0


### Select and Split the Data

In [16]:
features = passengers[["Sex", "Age", "First Class","Second Class"]]
survival = passengers["Survived"]

In [18]:
train_features, test_features, train_labels, test_tabels = train_test_split(features,survival, test_size=0.2)

### Normalize the Data

In [19]:
scaler = StandardScaler()
scaler.fit_transform(train_features)
scaler.transform(test_features)

array([[ 1.32653305e+00, -9.83579717e-01, -5.51388524e-01,
        -5.18808452e-01],
       [-7.53844771e-01, -1.60119769e+00, -5.51388524e-01,
        -5.18808452e-01],
       [ 1.32653305e+00, -9.06377470e-01,  1.81360322e+00,
        -5.18808452e-01],
       [ 1.32653305e+00, -1.13798421e+00, -5.51388524e-01,
        -5.18808452e-01],
       [-7.53844771e-01, -3.65961740e-01, -5.51388524e-01,
        -5.18808452e-01],
       [-7.53844771e-01, -3.17929875e-03, -5.51388524e-01,
        -5.18808452e-01],
       [-7.53844771e-01,  9.72517421e-02, -5.51388524e-01,
        -5.18808452e-01],
       [-7.53844771e-01, -3.17929875e-03, -5.51388524e-01,
        -5.18808452e-01],
       [-7.53844771e-01, -6.74770729e-01, -5.51388524e-01,
        -5.18808452e-01],
       [-7.53844771e-01, -3.17929875e-03, -5.51388524e-01,
         1.92749365e+00],
       [-7.53844771e-01,  2.10451017e+00, -5.51388524e-01,
         1.92749365e+00],
       [-7.53844771e-01, -1.36959095e+00, -5.51388524e-01,
      

### Create and Evaluate the Model

In [20]:
model = LogisticRegression()
model.fit(train_features, train_labels)

LogisticRegression()

In [21]:
model.score(train_features, train_labels)

0.7991573033707865

In [23]:
model.score(test_features,test_tabels)

0.7821229050279329

In [24]:
model.coef_

array([[ 2.60218748, -0.02774531,  2.12516555,  1.2619758 ]])

### Predict with the Model

In [27]:
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
You = np.array([1.0,27.0,1.0,0.0])

sample_passengers = np.array([Jack, Rose, You])
sample_passengers = scaler.transform(sample_passengers)
sample_passengers

array([[-0.75384477, -0.75197298, -0.55138852, -0.51880845],
       [ 1.32653305, -0.98357972,  1.81360322, -0.51880845],
       [ 1.32653305, -0.21155725,  1.81360322, -0.51880845]])

In [28]:
model.predict(sample_passengers)

array([0, 1, 1], dtype=int64)

In [29]:
model.predict_proba(sample_passengers)

array([[0.99496512, 0.00503488],
       [0.00571096, 0.99428904],
       [0.00583389, 0.99416611]])