# This Notebook explores the Titanic dataset and tries to predict if a passenger died.

In [1]:
# Load libraries for EDA.
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [3]:
# Load required data
path = "../../../Data/titanic/"
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
gender_submission = pd.read_csv(path + "gender_submission.csv")

Let's peek at the three datasets to see their features.

In [15]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The attributes have the following meaning:

- PassengerId: a unique identifier for each passenger
- Survived: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
- Pclass: passenger class.
- Name, Sex, Age: self-explanatory
- SibSp: how many siblings & spouses of the passenger aboard the Titanic.
- Parch: how many children & parents of the passenger aboard the Titanic.
- Ticket: ticket id
- Fare: price paid (in pounds)
- Cabin: passenger's cabin number
- Embarked: where the passenger embarked the Titanic

In [9]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [13]:
# Explicity make the passenger id the index.
train = train.set_index("PassengerId")
test = test.set_index("PassengerId")

### Let's do some light Exploratory Data Analysis.

In [11]:
# Check data type and nulls.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [51]:
def percent_null(data):
    """
    This function outputs columns with missing values with their percentages.
    """
    cols = data.columns
    for col in cols:
        null_count = data[col].isnull().value_counts()
        if len(null_count) == 2:
            null_count_percent = np.round((null_count[True]/len(data))*100, 2)
            print(f"Column {col} has {null_count[True]} missing values which is {null_count_percent}%")
print("Train set has the following information missing")
percent_null(train)
print(f"{'-'*50}")
print("Test set has the following information missing")
percent_null(test)

Train set has the following information missing
Column Age has 177 missing values which is 19.87%
Column Cabin has 687 missing values which is 77.1%
Column Embarked has 2 missing values which is 0.22%
--------------------------------------------------
Test set has the following information missing
Column Age has 86 missing values which is 20.57%
Column Fare has 1 missing values which is 0.24%
Column Cabin has 327 missing values which is 78.23%


In [13]:
train["Survived"].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [14]:
train["Sex"].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [15]:
train["Age"].value_counts()

Age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64

In [16]:
train["Pclass"].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

### Data Cleaning.

Noticed that filling missing values in cabin with mode values increases the performance compared to deleteing the entire cabin column

In [19]:
# Drop column with alot of missing values.
#train.drop("Cabin", axis=1, inplace=True)
#test.drop("Cabin", axis=1, inplace=True)

In [20]:
# Reduces the number of features.
train["Related"] = train["SibSp"] + train["Parch"]
test["Related"] = test["SibSp"] + test["Parch"]

train.drop(["SibSp", "Parch"], axis=1, inplace=True)
test.drop(["SibSp", "Parch"], axis=1, inplace=True)

In [21]:
# Fill missing values.
mean_age_train = np.mean(train["Age"])
mode_embarked_train = train["Embarked"].mode()
mean_age_test = np.mean(test["Age"])
mean_fare_test = np.mean(test["Fare"])
mode_cabin_train = train["Cabin"].mode()
mode_cabin_test = test["Cabin"].mode()

train["Age"] = train["Age"].fillna(mean_age_train)
train["Embarked"] = train["Embarked"].fillna(mode_embarked_train[0])
test["Age"] = test["Age"].fillna(mean_age_test)
test["Fare"] = test["Fare"].fillna(mean_fare_test)
train["Cabin"] = train["Cabin"].fillna(mode_cabin_train)
test["Cabin"] = test["Cabin"].fillna(mode_cabin_test)

### Preprocessing

In [23]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [24]:
cat_attri = ["Sex", "Embarked", "Ticket", "Cabin"]
attri_to_use = ["Pclass", "Sex", "Age", "Ticket", "Cabin", "Fare", "Embarked", "Related"]

In [25]:
# Change categorical features that would lead to many features using onehot using label encoder.
label_encoder = LabelEncoder()

for col in cat_attri:
    train[col] = label_encoder.fit_transform(train[col])
    test[col] = label_encoder.fit_transform(test[col])

In [26]:
X_train = train[attri_to_use].to_numpy()
y_train = train["Survived"].to_numpy()
X_test = test[attri_to_use].to_numpy()

In [27]:
print(X_train.shape)
print(X_test.shape)

(891, 8)
(418, 8)


In [28]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

### Build some ML models.

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [31]:
rand_clf = RandomForestClassifier()
rand_clf.fit(X_train_scaled, y_train)
rand_pred = rand_clf.predict(X_test_scaled)

In [32]:
gender_submission["Survived"] = rand_pred
rand_gender_submission = gender_submission

In [33]:
rand_gender_submission.to_csv("rand_gender_submission.csv", index=False)

In [34]:
score = cross_val_score(rand_clf, X_train_scaled, y_train, cv=10, scoring="accuracy")

In [35]:
np.mean(score)

0.83167290886392