# This notebook explores the Titanic dataset and tries to predict whether a passenger survived.

In [None]:
# Load libraries for EDA.
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [None]:
# Read the three competition CSV files from the shared data folder.
path = "../../../Data/titanic/"
train, test, gender_submission = (
    pd.read_csv(path + csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

Let's peek at the three datasets to see their features.

In [None]:
# Preview the first rows of the training set.
train.head()

The attributes have the following meaning:

- PassengerId: a unique identifier for each passenger
- Survived: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
- Pclass: passenger class.
- Name, Sex, Age: self-explanatory
- SibSp: how many siblings & spouses of the passenger were aboard the Titanic.
- Parch: how many children & parents of the passenger were aboard the Titanic.
- Ticket: ticket id
- Fare: price paid (in pounds)
- Cabin: passenger's cabin number
- Embarked: the port where the passenger boarded the Titanic

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# Preview the first rows of the gender_submission file.
gender_submission.head()

In [None]:
# Explicitly make the passenger id the index.
# Guarded so that re-running this cell is a no-op: once "PassengerId" is the
# index it is no longer a column, and calling set_index again raises KeyError.
if train.index.name != "PassengerId":
    train = train.set_index("PassengerId")
if test.index.name != "PassengerId":
    test = test.set_index("PassengerId")

### Let's do some light Exploratory Data Analysis.

In [None]:
# Check column dtypes and non-null counts of the training set.
train.info()

In [None]:
# Same dtype / non-null summary for the test set.
test.info()

In [None]:
def percent_null(data):
    """
    Report the columns of ``data`` that contain missing values.

    For every column with at least one null, prints the number of nulls and
    their share of all rows (in percent, rounded to two decimals).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Names of the columns with at least one missing value, in column order.
    """
    cols_with_null = list()
    n_rows = len(data)
    # Count nulls directly instead of checking that value_counts() holds both a
    # True and a False entry: that check skipped columns that are entirely null.
    for col, null_count in data.isnull().sum().items():
        if null_count == 0:
            continue
        null_count_percent = np.round((null_count / n_rows) * 100, 2)
        cols_with_null.append(col)
        print(f"Column {col} has {null_count} missing values which is {null_count_percent}%")
    return cols_with_null
# Print the missing-value report for each set and keep the names of the
# affected columns; they are passed to the imputation step later on.
print("Train set has the following information missing")
cols_with_null_train = percent_null(train)
print(f"\n {'-'*50} \n")
print("Test set has the following information missing")
cols_with_null_test = percent_null(test)

In [None]:
# Count passengers per value of the target (1 = survived, 0 = died) and
# report the share of survivors among all training rows.
survival = train["Survived"].value_counts()
print(f"{survival[1]} people survived whereas {survival[0]} people died representing a {(survival[1]/len(train))*100:.2f}% survival rate")

In [None]:
# Number of training passengers per sex.
train["Sex"].value_counts()

In [None]:
# Frequency of each distinct age value (missing ages are not counted).
train["Age"].value_counts()

In [None]:
# Number of training passengers per passenger class.
train["Pclass"].value_counts()

### Data Cleaning.

In [None]:
# Collapse the two family-size columns into a single "Related" count, then
# remove the originals so the frames carry one feature instead of two.
for frame in (train, test):
    frame["Related"] = frame["SibSp"] + frame["Parch"]
    frame.drop(["SibSp", "Parch"], axis=1, inplace=True)

Missing values are imputed per passenger class: numeric columns (e.g. Age) are filled with the mean of the passenger's class, and text columns with the most frequent value of that class.

In [None]:
def fill_missing(data, null_cols):
    """
    Fill missing values in ``null_cols`` using per-passenger-class statistics.

    Within each ``Pclass`` group, numeric columns are filled with the group
    mean and all other columns with the group mode (its first value when
    there are ties). A group that has no observed value for a column is left
    untouched, because there is nothing to impute from.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Pclass`` column; it is modified in place.
    null_cols : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after imputation.
    """
    for col in null_cols:
        # Decide by "is it numeric" rather than "is it object": text stored
        # under another dtype (category, pandas string dtype) must not be
        # sent to .mean().
        numeric = pd.api.types.is_numeric_dtype(data[col])
        for pclass in data["Pclass"].unique():
            in_class = data["Pclass"] == pclass
            values = data.loc[in_class, col]
            if numeric:
                replace_val = values.mean()
            else:
                modes = values.mode()
                # mode() is empty when every value in the group is null;
                # indexing it would raise, so skip the group instead.
                if modes.empty:
                    continue
                replace_val = modes.iloc[0]
            if pd.isna(replace_val):
                # All-null numeric group: the mean is NaN, nothing to fill with.
                continue
            data.loc[in_class, col] = values.fillna(replace_val)
    return data
# Impute the columns reported as having nulls. Each set is filled from its
# own per-class statistics, since fill_missing only sees the frame it is given.
train = fill_missing(train, cols_with_null_train)
test = fill_missing(test, cols_with_null_test)

### Preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [None]:
def extract_salutation(name):
    """
    Return the title part of a name formatted as "Surname, Title. Given names".

    The result is the text between the first comma and the next period,
    returned verbatim, so it keeps the space that follows the comma
    (e.g. " Mr").
    """
    after_surname = name.split(",")[1]
    title, _, _ = after_surname.partition(".")
    return title
# Derive the "Salutation" feature from the passenger names in both sets.
for frame in (train, test):
    frame["Salutation"] = frame["Name"].apply(extract_salutation)

In [None]:
# Bucket ages into 15-year bins, labelled by the lower edge (0, 15, 30, ...).
for frame in (train, test):
    frame["AgeBucket"] = frame["Age"].floordiv(15).mul(15)

In [None]:
# Frequency of each extracted salutation in the training set.
train["Salutation"].value_counts()

In [None]:
# Look up the passenger(s) titled "the Countess". The leading space in the
# literal is intentional: extract_salutation keeps the space after the comma.
train.loc[train["Salutation"] == " the Countess"]

In [None]:
# Categorical columns that need to be turned into integer codes.
cat_attri = [
    "Sex",
    "Embarked",
    "Salutation",
    "Ticket",
]
# Columns fed to the model, in the order they appear in the feature matrix.
attri_to_use = [
    "Pclass",
    "Sex",
    "Age",
    "AgeBucket",
    "Ticket",
    "Fare",
    "Salutation",
    "Embarked",
    "Related",
]

In [None]:
# Integer-encode the categorical features with a label encoder; one-hot
# encoding would create too many columns for the high-cardinality ones.
# The encoder is fitted once per column on the union of train and test values
# so that a given category maps to the same integer in both sets. Fitting it
# separately on each set assigns unrelated codes to the same category.
label_encoder = LabelEncoder()

for col in cat_attri:
    label_encoder.fit(pd.concat([train[col], test[col]]))
    train[col] = label_encoder.transform(train[col])
    test[col] = label_encoder.transform(test[col])

In [None]:
# Pull the model inputs and the target out of the frames as NumPy arrays.
X_train, X_test = (frame[attri_to_use].to_numpy() for frame in (train, test))
y_train = train["Survived"].to_numpy()

In [None]:
# Sanity check: both matrices must have the same number of feature columns.
for matrix in (X_train, X_test):
    print(matrix.shape)

In [None]:
# Standardise the features. The scaler learns its mean/std from the training
# set only; the test set is transformed with those same statistics so both
# sets live on the same scale (re-fitting on the test set would shift it).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Build some ML models.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Fit a random forest on the scaled training data and predict the test set.
# random_state is fixed so the predictions and the cross-validation score are
# reproducible across kernel restarts.
rand_clf = RandomForestClassifier(random_state=42)
rand_clf.fit(X_train_scaled, y_train)
rand_pred = rand_clf.predict(X_test_scaled)

In [None]:
# Build the submission from the test index so every prediction is paired with
# the PassengerId of the row it was computed from. This avoids overwriting the
# loaded gender_submission frame and relying on its row order matching `test`.
rand_gender_submission = pd.DataFrame(
    {"PassengerId": test.index, "Survived": rand_pred}
)

In [None]:
# Write the submission to the working directory, without the row index.
rand_gender_submission.to_csv("rand_gender_submission.csv", index=False)

In [None]:
# 10-fold cross-validated accuracy of the random forest on the scaled training set.
score = cross_val_score(rand_clf, X_train_scaled, y_train, cv=10, scoring="accuracy")

In [None]:
# Average accuracy over the folds.
np.mean(score)