# This notebook explores the Titanic dataset and tries to predict whether a passenger survived.

We start by loading the datasets. 

In [3]:
# Load libraries for EDA.
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [4]:
# Read the three Titanic CSV files from the shared data directory.
path = "../../../Data/titanic/"
train, test, gender_submission = (
    pd.read_csv(path + file_name)
    for file_name in ("train.csv", "test.csv", "gender_submission.csv")
)

Let's peek at the three datasets and observe their features.

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The attributes have the following meaning:

- PassengerId: a unique identifier for each passenger
- Survived: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
- Pclass: passenger class.
- Name, Sex, Age: self-explanatory
- SibSp: how many siblings & spouses of the passenger aboard the Titanic.
- Parch: how many children & parents of the passenger aboard the Titanic.
- Ticket: ticket id
- Fare: price paid (in pounds)
- Cabin: passenger's cabin number
- Embarked: port where the passenger boarded the Titanic

In [8]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [9]:
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [10]:
# Explicitly use the passenger id as the row index of both frames.
train, test = (frame.set_index("PassengerId") for frame in (train, test))

### Data Cleaning.

In [12]:
# Check data type and nulls.
train.info()

In [None]:
test.info()

In [None]:
def percent_null(data):
    """
    Report the columns of ``data`` that contain missing values.

    For every column with at least one null, prints the number of missing
    values and the share of rows they represent (percentage, rounded to two
    decimals).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Names of the columns holding at least one missing value, in column order.
    """
    cols_with_null = []
    # Count nulls directly rather than checking that value_counts() has both a
    # True and a False entry: a column that is entirely null has only a True
    # entry and would otherwise be skipped.
    for col, null_count in data.isnull().sum().items():
        if null_count == 0:
            continue
        null_count_percent = np.round((null_count / len(data)) * 100, 2)
        cols_with_null.append(col)
        print(f"Column {col} has {null_count} missing values which is {null_count_percent}%")
    return cols_with_null
# Print the missing-value report for each split and keep the names of the
# affected columns; the imputation step (fill_missing) reads these two lists.
print("Train set has the following information missing:")
cols_with_null_train = percent_null(train)
# Dashed rule separating the two reports.
print(f"\n {'-'*50} \n")
print("Test set has the following information missing:")
cols_with_null_test = percent_null(test)

We will fill missing values per passenger class: numeric columns (e.g. Age) are filled with the mean of the passenger's class, and text columns with the most frequent value of that class.

In [14]:
def fill_missing(data, null_cols):
    """
    Fill missing values column by column, using per-passenger-class statistics.

    For each column in ``null_cols`` and each distinct value of ``Pclass``,
    nulls are replaced by the mean of that class (numeric columns) or by the
    most frequent value of that class (all other columns). When a class has no
    observed value for a column, no statistic exists and the nulls of that
    class are left as they are instead of raising.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``Pclass`` column and every column in ``null_cols``.
    null_cols : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        A filled copy; the frame passed in is not modified.
    """
    # Work on a copy so re-running the calling cell always starts from the
    # same input instead of an already-imputed frame.
    filled = data.copy()
    for col in null_cols:
        use_mean = pd.api.types.is_numeric_dtype(filled[col])
        for pclass in filled["Pclass"].unique():
            in_class = filled["Pclass"] == pclass
            values = filled.loc[in_class, col]
            if use_mean:
                replace_val = values.mean()
            else:
                # mode() is empty when every value in the class is null.
                modes = values.mode()
                replace_val = modes.iloc[0] if not modes.empty else None
            if pd.isna(replace_val):
                continue
            filled.loc[in_class, col] = values.fillna(replace_val)
    return filled
# Impute both splits. cols_with_null_train / cols_with_null_test are created by
# the percent_null report cell, which must run before this one; otherwise this
# cell fails with a NameError.
train = fill_missing(train, cols_with_null_train)
test = fill_missing(test, cols_with_null_test)

NameError: name 'cols_with_null_train' is not defined

In [None]:
train.info()

In [None]:
test.info()

### Let's do some light Exploratory Data Analysis.

In [None]:
# Merge the two family-size columns into a single "Related" feature, then drop them.
train = train.assign(Related=train["SibSp"] + train["Parch"]).drop(columns=["SibSp", "Parch"])
test = test.assign(Related=test["SibSp"] + test["Parch"]).drop(columns=["SibSp", "Parch"])

In [None]:
# Head-count per outcome (1 = survived, 0 = died) and the overall survival rate.
survival = train["Survived"].value_counts()
survived_count, died_count = survival[1], survival[0]
survival_rate = (survived_count / len(train)) * 100
print(f"{survived_count} people survived whereas {died_count} people died representing a {survival_rate:.2f}% survival rate")

In [None]:
train["Sex"].value_counts()

In [None]:
np.mean(train["Age"])

In [None]:
np.mean(train["Fare"])

In [None]:
# Per passenger class: head-count, mean age, mean fare and number of survivors.
pclass_group = train.groupby("Pclass").agg(
    pclass_count=("Pclass", "count"),
    pclass_age=("Age", "mean"),
    pclass_fare=("Fare", "mean"),
    pclass_survived=("Survived", "sum"),
)
# Survivors as a whole-number percentage of the passengers in each class.
pclass_group["pclass_survival_rate"] = (
    (pclass_group["pclass_survived"] / pclass_group["pclass_count"]) * 100
).round(0)
pclass_group

In [None]:
# Per port of embarkation: head-count, mean age, mean fare and number of survivors.
embarked_group = train.groupby("Embarked").agg(
    embarked_count=("Embarked", "count"),
    embarked_age=("Age", "mean"),
    embarked_fare=("Fare", "mean"),
    embarked_survived=("Survived", "sum"),
)
# Survivors as a whole-number percentage of the passengers from each port.
embarked_group["embarked_survival_rate"] = (
    (embarked_group["embarked_survived"] / embarked_group["embarked_count"]) * 100
).round(0)
embarked_group

Based on the EDA, the average fare was about 32 pounds and the average age about 29 years. As expected, first-class passengers paid more than passengers in the other classes, and the gap is very large. Average age also rises with passenger class: older passengers were more likely to travel in the better classes.

People in higher passenger classes seemed to have a better survival rate.

### Preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [None]:
def extract_salutation(name):
    """
    Extract the salutation (title) from a passenger name.

    Names are expected in the form ``"Surname, Title. Given names"``; the text
    between the first comma and the following period is returned with the
    surrounding whitespace removed, e.g. ``"Braund, Mr. Owen Harris"`` gives
    ``"Mr"``. A name without a comma is treated as starting directly with the
    title instead of raising an IndexError.

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str
        The salutation without the trailing period.
    """
    parts = name.split(",")
    # With no comma there is no surname prefix to skip: use the whole name.
    after_surname = parts[1] if len(parts) > 1 else parts[0]
    return after_surname.split(".")[0].strip()
# Derive the salutation of every passenger from the Name column.
train["Salutation"] = train["Name"].map(extract_salutation)
test["Salutation"] = test["Name"].map(extract_salutation)

In [None]:
# Checks whether the new Salutation column carries valuable information about survival:
# mean survival and mean age per salutation, next to the number of passengers.
group = train[["Salutation", "Survived", "Age"]].groupby("Salutation").mean().sort_values(by="Survived")
# Name the counts explicitly and join on the index: how value_counts() names
# its result depends on the pandas version, so merging with on="Salutation"
# can pick the counts themselves as the join key.
count = train["Salutation"].value_counts().rename("count")
salutation_survival_count_merge = group.join(count, how="left")

In [None]:
salutation_survival_count_merge

In [None]:
# Bin ages into 15-year-wide buckets labelled by their lower bound (0, 15, 30, ...).
train["AgeBucket"] = train["Age"].floordiv(15).mul(15)
test["AgeBucket"] = test["Age"].floordiv(15).mul(15)

In [None]:
# Checks whether the new AgeBucket column carries valuable information about survival:
# mean survival per age bucket, next to the number of passengers in the bucket.
group = train[["AgeBucket", "Survived"]].groupby("AgeBucket").mean().sort_values(by="Survived")
# Name the counts explicitly and join on the index: how value_counts() names
# its result depends on the pandas version, so merging with on="AgeBucket"
# can match bucket values against the counts themselves.
count = train["AgeBucket"].value_counts().rename("count")
agebucket_survival_count_merge = group.join(count, how="left")

In [None]:
agebucket_survival_count_merge

In [None]:
# Categorical columns expanded into one indicator column per category.
one_hot_cat_attributes = ["Sex", "Embarked"]
# Categorical columns converted to integer codes with LabelEncoder.
label_cat_attributes = ["Salutation", "Ticket", "Cabin"]
# Columns taken from the frames as they are; the one-hot columns for Sex and
# Embarked are appended separately when the feature matrices are assembled.
attributes_to_use = ["Pclass", "AgeBucket", "Ticket", "Fare", "Salutation", "Related", "Cabin"]

In [None]:
# Categorical features with many distinct values are integer-coded with
# LabelEncoder instead of one-hot encoded, to keep the number of features small.
label_encoder = LabelEncoder()

for col in label_cat_attributes:
    # Fit once on the train and test values together, then transform each set
    # with that single mapping. Fitting separately on each set would give the
    # same label different integer codes in train and test.
    label_encoder.fit(pd.concat([train[col], test[col]]))
    train[col] = label_encoder.transform(train[col])
    test[col] = label_encoder.transform(test[col])
    

In [None]:
# Learn the category -> column layout from the training set only and reuse it
# for the test set, so both matrices have the same columns in the same order.
# Categories that appear only in the test set are encoded as all zeros.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
one_hot_train = one_hot_encoder.fit_transform(train[one_hot_cat_attributes])
one_hot_test = one_hot_encoder.transform(test[one_hot_cat_attributes])
# The encoder returns sparse matrices; densify them for stacking with numpy.
one_hot_train = one_hot_train.toarray()
one_hot_test = one_hot_test.toarray()

In [None]:
# Target vector first: after the next line `train` and `test` are plain numpy
# arrays of the selected feature columns, no longer DataFrames.
y_train = train["Survived"].to_numpy()
train, test = (frame[attributes_to_use].to_numpy() for frame in (train, test))

In [None]:
# Append the one-hot columns to the right of the selected feature columns.
X_train = np.concatenate((train, one_hot_train), axis=1)
X_test = np.concatenate((test, one_hot_test), axis=1)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
# Learn the per-feature mean and standard deviation from the training set only
# and apply the same transformation to the test set. Re-fitting on the test
# set would standardise it with different statistics than the models saw.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Build some ML models.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Candidate classifiers, all with default hyper-parameters. The tree-based
# models are randomised, so their seed is fixed to make predictions and
# cross-validation scores reproducible across runs.
forest_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()
neighbor_clf = KNeighborsClassifier()
logistic_clf = LogisticRegression()
tree_clf = DecisionTreeClassifier(random_state=42)

In [None]:
tree_clf.fit(X_train_scaled, y_train)

In [None]:
logistic_clf.fit(X_train_scaled, y_train)

In [None]:
neighbor_clf.fit(X_train_scaled, y_train)

In [None]:
forest_clf.fit(X_train_scaled, y_train)

In [None]:
svm_clf.fit(X_train_scaled, y_train)

In [None]:
# Predict survival on the scaled test features with every fitted classifier.
forest_pred, svm_pred, neighbor_pred, logistic_pred, tree_pred = (
    clf.predict(X_test_scaled)
    for clf in (forest_clf, svm_clf, neighbor_clf, logistic_clf, tree_clf)
)

In [None]:
# Build one independent submission frame per model. assign() returns a new
# DataFrame, so each submission keeps its own predictions; assigning into
# gender_submission and aliasing it would leave all five names pointing at one
# frame holding only the last model's predictions. gender_submission itself is
# used only as the PassengerId template and is left unchanged.
random_forest_submission = gender_submission.assign(Survived=forest_pred)
svm_submission = gender_submission.assign(Survived=svm_pred)
neighbor_submission = gender_submission.assign(Survived=neighbor_pred)
logistic_submission = gender_submission.assign(Survived=logistic_pred)
tree_submission = gender_submission.assign(Survived=tree_pred)

In [None]:
# Write every submission frame to its own CSV file, without the row index.
submission_files = {
    "random_forest_submission.csv": random_forest_submission,
    "svm_submission.csv": svm_submission,
    "neighbor_submission.csv": neighbor_submission,
    "logistic_submission.csv": logistic_submission,
    "tree_submission.csv": tree_submission,
}
for file_name, submission in submission_files.items():
    submission.to_csv(file_name, index=False)

In [None]:
# Mean 10-fold cross-validated accuracy of the random forest on the training set.
forest_score = cross_val_score(forest_clf, X_train_scaled, y_train, scoring="accuracy", cv=10)
forest_score.mean()

In [None]:
# Mean 10-fold cross-validated accuracy of the SVM on the training set.
svm_score = cross_val_score(svm_clf, X_train_scaled, y_train, scoring="accuracy", cv=10)
svm_score.mean()

In [None]:
# Mean 10-fold cross-validated accuracy of k-nearest neighbours on the training set.
neighbor_score = cross_val_score(neighbor_clf, X_train_scaled, y_train, scoring="accuracy", cv=10)
neighbor_score.mean()

In [None]:
# Mean 10-fold cross-validated accuracy of logistic regression on the training set.
logistic_score = cross_val_score(logistic_clf, X_train_scaled, y_train, scoring="accuracy", cv=10)
logistic_score.mean()

In [None]:
# Mean 10-fold cross-validated accuracy of the decision tree on the training set.
tree_score = cross_val_score(tree_clf, X_train_scaled, y_train, scoring="accuracy", cv=10)
tree_score.mean()