# This notebook explores the Titanic dataset and tries to predict whether a passenger survived.

In [None]:
# Load libraries for EDA.
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [None]:
# Read the three competition CSV files from the shared data directory.
path = "../../../Data/titanic/"
train, test, gender_submission = (
    pd.read_csv(path + file_name)
    for file_name in ("train.csv", "test.csv", "gender_submission.csv")
)

Let's peek at the three datasets to see their features.

In [None]:
# Show the first five rows of the training set.
train.head(n=5)

The attributes have the following meaning:

- PassengerId: a unique identifier for each passenger
- Survived: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
- Pclass: passenger class.
- Name, Sex, Age: self-explanatory
- SibSp: how many siblings & spouses of the passenger were aboard the Titanic.
- Parch: how many children & parents of the passenger were aboard the Titanic.
- Ticket: ticket id
- Fare: price paid (in pounds)
- Cabin: passenger's cabin number
- Embarked: where the passenger boarded the Titanic

In [None]:
# Show the first five rows of the test set.
test.head(n=5)

In [None]:
# Show the first five rows of the sample submission file.
gender_submission.head(n=5)

In [None]:
# Promote PassengerId to the row index of both frames. set_index removes the
# column from the data in the same step, so no separate drop is needed.
train.set_index("PassengerId", inplace=True)
test.set_index("PassengerId", inplace=True)

### Let's do some light Exploratory Data Analysis.

In [None]:
# Print a summary of the training frame: column dtypes and non-null counts,
# which shows which columns contain missing values.
train.info()

In [None]:
# Same dtype / non-null summary for the test frame.
test.info()

In [None]:
# Class balance of the target column (missing values are not counted).
train["Survived"].value_counts(dropna=True)

In [None]:
# Number of passengers per value of Sex (missing values are not counted).
train["Sex"].value_counts(dropna=True)

In [None]:
# Frequency of each distinct age; rows with a missing Age are left out of the counts.
train["Age"].value_counts(dropna=True)

In [None]:
# Number of passengers in each passenger class (missing values are not counted).
train["Pclass"].value_counts(dropna=True)

### Data Cleaning.

In [None]:
# Remove the Cabin column from both frames because it has a lot of missing values.
train.drop(columns="Cabin", inplace=True)
test.drop(columns="Cabin", inplace=True)

In [None]:
# Fill missing values.
# Every fill statistic is computed on the training set only and then reused for
# the test set. Imputing the test set with its own mean would give the two sets
# different fill values and would let test-set statistics leak into
# preprocessing.
mean_age_train = np.mean(train["Age"])
mode_embarked_train = train["Embarked"].mode()
mean_fare_train = np.mean(train["Fare"])

train["Age"] = train["Age"].fillna(mean_age_train)
# mode() returns a Series (there may be ties); take its first entry.
train["Embarked"] = train["Embarked"].fillna(mode_embarked_train[0])

test["Age"] = test["Age"].fillna(mean_age_train)
test["Fare"] = test["Fare"].fillna(mean_fare_train)
# No-op when the test set has no missing Embarked values; otherwise it keeps the
# later one-hot encoding from seeing a NaN category.
test["Embarked"] = test["Embarked"].fillna(mode_embarked_train[0])

### Preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [None]:
# Column groups used to build the feature matrix.
# Categorical columns:
cat_attri = [
    "Sex",
    "Embarked",
]
# Numeric columns:
num_attri = [
    "Pclass",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]

In [None]:
# Change categorical to numeric.
# The encoder learns its categories from the training set only. The test set is
# then encoded with that same fitted encoder (transform, not fit_transform), so
# both matrices share one column layout and column i means the same category
# in train and test.
hot_encoder = OneHotEncoder()
train_cat_one_hot = hot_encoder.fit_transform(train[cat_attri]).toarray()
test_cat_one_hot = hot_encoder.transform(test[cat_attri]).toarray()

In [None]:
# Encode high-cardinality categorical features (Ticket, Name) as integer labels;
# one-hot encoding them would create a very large number of columns.
#
# The encoder is fitted on the combined train + test values of each column and
# then applied to both sets. Fitting it separately on each set would assign the
# same integer to unrelated strings in train and test. The combined fit is
# needed because LabelEncoder.transform raises on labels it was not fitted on.
label_encoder = LabelEncoder()

label_encoder.fit(pd.concat([train["Ticket"], test["Ticket"]]))
train_cat_label_ticket = label_encoder.transform(train["Ticket"])
test_cat_label_ticket = label_encoder.transform(test["Ticket"])

# Refit the same encoder object for the Name column.
label_encoder.fit(pd.concat([train["Name"], test["Name"]]))
train_cat_label_name = label_encoder.transform(train["Name"])
test_cat_label_name = label_encoder.transform(test["Name"])

In [None]:
# Keep only the numeric feature columns of each frame.
num_train = train.loc[:, num_attri]
num_test = test.loc[:, num_attri]

In [None]:
# Stack the feature groups side by side: one-hot columns, the two label-encoded
# columns (1-D arrays become single columns), then the numeric columns.
X_train = np.column_stack(
    (train_cat_one_hot, train_cat_label_ticket, train_cat_label_name, num_train)
)
X_test = np.column_stack(
    (test_cat_one_hot, test_cat_label_ticket, test_cat_label_name, num_test)
)

# Target vector for the training set.
y_train = train["Survived"].to_numpy()

In [None]:
# Print the shapes of both matrices, one per line, to check that they have the
# same number of columns.
print(X_train.shape, X_test.shape, sep="\n")

In [None]:
# Standardize the features.
# The scaler's mean and standard deviation are learned from the training matrix
# only. The test matrix is scaled with those same parameters (transform, not
# fit_transform), so the model sees test features on the scale it was trained on.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Build some ML models.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Fit a random forest with default hyper-parameters on the scaled training
# data, then predict the test set. No random_state is passed, so results may
# differ from run to run.
rand_clf = RandomForestClassifier().fit(X_train_scaled, y_train)
rand_pred = rand_clf.predict(X_test_scaled)

In [None]:
# Reuse the sample submission frame as the output table. Both names refer to
# the same DataFrame object, so the assignment below also overwrites
# gender_submission["Survived"].
rand_gender_submission = gender_submission
rand_gender_submission["Survived"] = rand_pred

In [None]:
# Write the predictions to disk without the row index column.
rand_gender_submission.to_csv(path_or_buf="rand_gender_submission.csv", index=False)

In [None]:
# 10-fold cross-validated accuracy of the random forest on the training data.
# NOTE(review): X_train_scaled was standardized on the full training set before
# splitting, so each fold's scaling has seen its validation rows.
score = cross_val_score(
    estimator=rand_clf,
    X=X_train_scaled,
    y=y_train,
    scoring="accuracy",
    cv=10,
)

In [None]:
# Average accuracy across the cross-validation folds.
score.mean()