# This notebook explores the Titanic dataset and tries to predict whether a passenger survived.

In [2]:
# Load libraries for EDA.
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [3]:
# Read the three Titanic CSV files from the shared data directory.
path = "../../../Data/titanic/"
train, test, gender_submission = (
    pd.read_csv(path + file_name)
    for file_name in ("train.csv", "test.csv", "gender_submission.csv")
)

Let's peek at the three datasets to see their features.

In [5]:
# Preview the first rows of the training set (includes the Survived target).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The attributes have the following meaning:

- PassengerId: a unique identifier for each passenger
- Survived: the target; 0 means the passenger did not survive, 1 means the passenger survived.
- Pclass: passenger class.
- Name, Sex, Age: self-explanatory
- SibSp: how many siblings & spouses of the passenger aboard the Titanic.
- Parch: how many children & parents of the passenger aboard the Titanic.
- Ticket: ticket id
- Fare: price paid (in pounds)
- Cabin: passenger's cabin number
- Embarked: where the passenger embarked the Titanic

In [7]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# Preview the first rows of the sample submission file.
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [9]:
# Explicitly use the passenger id as the row index of both feature frames.
train, test = (frame.set_index("PassengerId") for frame in (train, test))

### Let's do some light Exploratory Data Analysis.

In [11]:
# Check the dtype and non-null count of every column in the training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [12]:
# Check the dtype and non-null count of every column in the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [13]:
def percent_null(data):
    """
    Report the columns of ``data`` that contain missing values.

    For every column with at least one null entry, prints the number of
    missing values and the percentage of rows they represent (rounded to two
    decimals).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Names of the columns that contain at least one missing value, in
        column order.
    """
    cols_with_null = list()
    for col in data.columns:
        # Count nulls directly: a True/False value_counts has a single entry when
        # a column is entirely null, so testing for two entries would skip it.
        null_count = data[col].isnull().sum()
        if null_count > 0:
            null_count_percent = np.round((null_count/len(data))*100, 2)
            cols_with_null.append(col)
            print(f"Column {col} has {null_count} missing values which is {null_count_percent}%")
    return cols_with_null
# Report missing values for the training set and keep the affected column names.
print("Train set has the following information missing")
cols_with_null_train = percent_null(train)
# Visual separator between the two reports.
print(f"\n {'-'*50} \n")
# Same report for the test set.
print("Test set has the following information missing")
cols_with_null_test = percent_null(test)

Train set has the following information missing
Column Age has 177 missing values which is 19.87%
Column Cabin has 687 missing values which is 77.1%
Column Embarked has 2 missing values which is 0.22%

 -------------------------------------------------- 

Test set has the following information missing
Column Age has 86 missing values which is 20.57%
Column Fare has 1 missing values which is 0.24%
Column Cabin has 327 missing values which is 78.23%


In [14]:
# Count passengers per outcome (label 1 = survived, label 0 = died) and report the rate.
survival = train["Survived"].value_counts()
n_survived, n_died = survival[1], survival[0]
survival_rate = (n_survived/len(train))*100
print(f"{n_survived} people survived whereas {n_died} people died representing a {survival_rate:.2f}% survival rate")

342 people survived whereas 549 people died representing a 38.38% survival rate


In [15]:
# Number of training passengers per sex.
train["Sex"].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [16]:
# Frequency of each distinct age value in the training set (nulls are not counted).
train["Age"].value_counts()

Age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64

In [17]:
# Number of training passengers per passenger class.
train["Pclass"].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

### Data Cleaning.

In [19]:
# Reduce the number of features: merge the two family-size columns into a single
# "Related" count, then remove the originals. Both frames are modified in place.
for frame in (train, test):
    frame["Related"] = frame["SibSp"] + frame["Parch"]
    frame.drop(["SibSp", "Parch"], axis=1, inplace=True)

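Missing values are imputed per passenger class: numeric columns (e.g. Age) are filled with the mean of the passenger's class, and text columns (e.g. Cabin, Embarked) with the most frequent value of the passenger's class.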

In [21]:
def _column_fill_value(values, use_mode):
    """Return the mode (``use_mode``) or mean of the observed values, or None if there are none."""
    observed = values.dropna()
    if observed.empty:
        return None
    return observed.mode().iloc[0] if use_mode else observed.mean()


def fill_missing(data, null_cols):
    """
    Impute missing values per passenger class.

    For each column in ``null_cols`` the missing entries of every ``Pclass``
    group are replaced with a statistic computed from that group's observed
    values: the most frequent value for object/string/categorical columns and
    the mean for every other dtype. When a group has no observed value at all,
    the column-wide statistic is used instead; if the whole column is empty it
    is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. Must contain a ``Pclass`` column. It is modified in
        place.
    null_cols : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after imputation.
    """
    for col in null_cols:
        dtype = data[col].dtype
        # Text-like columns cannot be averaged; use their most frequent value.
        use_mode = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        # Column-wide fallback, computed before any group is filled.
        overall_val = _column_fill_value(data[col], use_mode)
        for pclass in data["Pclass"].unique():
            in_class = data["Pclass"] == pclass
            replace_val = _column_fill_value(data.loc[in_class, col], use_mode)
            if replace_val is None:
                # No observed value in this class (mode() would be empty).
                replace_val = overall_val
            if replace_val is None:
                continue
            data.loc[in_class, col] = data.loc[in_class, col].fillna(replace_val)
    return data
# Impute each data set using statistics computed from that same data set,
# restricted to the columns found to contain nulls earlier.
train = fill_missing(train, cols_with_null_train)
test = fill_missing(test, cols_with_null_test)

### Preprocessing

In [23]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [24]:
def extract_salutation(name):
    """
    Return the title found between the first comma and the following period.

    The text is returned exactly as it appears in ``name``, so the space that
    follows the comma is kept (e.g. ``"Braund, Mr. Owen Harris"`` gives
    ``" Mr"``). Raises ``IndexError`` when ``name`` contains no comma.
    """
    # Only the segment right after the first comma matters; cap the splits.
    after_comma = name.split(",", 2)[1]
    return after_comma.split(".", 1)[0]
# Derive the Salutation feature from the passenger name in both data sets.
for frame in (train, test):
    frame["Salutation"] = frame["Name"].apply(extract_salutation)

In [25]:
# Bin ages into 15-year buckets labelled by their lower bound (0, 15, 30, ...).
for frame in (train, test):
    frame["AgeBucket"] = frame["Age"] // 15 * 15

In [26]:
# Frequency of each extracted salutation in the training set.
train["Salutation"].value_counts()

Salutation
 Mr              517
 Miss            182
 Mrs             125
 Master           40
 Dr                7
 Rev               6
 Mlle              2
 Major             2
 Col               2
 the Countess      1
 Capt              1
 Ms                1
 Sir               1
 Lady              1
 Mme               1
 Don               1
 Jonkheer          1
Name: count, dtype: int64

In [27]:
# Inspect the passenger titled "the Countess". The extracted salutation keeps the
# space that follows the comma in the name, hence the leading space in the literal.
train.loc[train["Salutation"] == " the Countess"]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,Related,Salutation,AgeBucket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,110152,86.5,B77,S,0,the Countess,30.0


In [28]:
# Categorical columns that are converted to integer codes before modelling.
cat_attri = ["Sex", "Embarked", "Salutation", "Ticket"]
# Feature columns used to build the model inputs, in this order.
attri_to_use = ["Pclass", "Sex", "Age", "AgeBucket", "Ticket", "Fare", "Salutation","Embarked", "Related"]

In [29]:
# Integer-encode the categorical features with a label encoder, since one-hot
# encoding them would create a very large number of columns.
# The encoder is fitted once per column on the values of BOTH data sets and then
# applied to each, so a given category gets the same integer code in train and
# test. Fitting it separately on each set would give the same category unrelated codes.
label_encoder = LabelEncoder()

for col in cat_attri:
    label_encoder.fit(pd.concat([train[col], test[col]]))
    train[col] = label_encoder.transform(train[col])
    test[col] = label_encoder.transform(test[col])

In [30]:
# Convert the selected features and the target into NumPy arrays for scikit-learn.
y_train = train["Survived"].to_numpy()
X_train, X_test = (frame[attri_to_use].to_numpy() for frame in (train, test))

In [31]:
# Sanity check: both matrices must have the same number of feature columns.
print(X_train.shape, X_test.shape, sep="\n")

(891, 9)
(418, 9)


In [32]:
# Standardise the features. The scaler learns its mean and variance from the
# training set only; the test set is then transformed with those same
# parameters so both sets live on the same scale the model was trained on.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Build some ML models.

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [35]:
# Train a random forest on the scaled training data and predict the test set.
# A fixed random_state makes the fitted model, its predictions and the
# cross-validation score reproducible across kernel restarts.
rand_clf = RandomForestClassifier(random_state=42)
rand_clf.fit(X_train_scaled, y_train)
rand_pred = rand_clf.predict(X_test_scaled)

In [36]:
# Build the submission from the ids of the rows that were actually scored
# (test is indexed by PassengerId), rather than overwriting the loaded sample
# submission through an alias and relying on its row order matching the test set.
rand_gender_submission = pd.DataFrame(
    {"PassengerId": test.index, "Survived": rand_pred}
)

In [37]:
# Save the predictions as a CSV file in the working directory, without the row index.
rand_gender_submission.to_csv("rand_gender_submission.csv", index=False)

In [38]:
# Estimate the classifier's accuracy with 10-fold cross-validation on the scaled training data.
score = cross_val_score(rand_clf, X_train_scaled, y_train, cv=10, scoring="accuracy")

In [39]:
# Average accuracy across the cross-validation folds.
score.mean()

0.8384519350811486