# This Notebook explores the Titanic dataset and tries to predict if a passenger died.

### Loading and Viewing the datasets. 

In [3]:
# Load libraries for EDA.
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [4]:
# Read the Titanic train and test splits from the shared data directory.
path = "../../../Data/titanic/"
train, test = (pd.read_csv(path + filename) for filename in ("train.csv", "test.csv"))

Let's peek at the two datasets (train and test) and observe their features.

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [8]:
train["Ticket"].value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64

The attributes have the following meaning:

- PassengerId: a unique identifier for each passenger
- Survived: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
- Pclass: passenger class.
- Name, Sex, Age: self-explanatory
- SibSp: how many siblings & spouses of the passenger aboard the Titanic.
- Parch: how many children & parents of the passenger aboard the Titanic.
- Ticket: ticket id
- Fare: price paid (in pounds)
- Cabin: passenger's cabin number
- Embarked: where the passenger embarked the Titanic

In [10]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
# Explicitly make the passenger id the index of both frames.
train, test = (frame.set_index("PassengerId") for frame in (train, test))

### Data Cleaning.

In [13]:
# Check data type and nulls.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [15]:
def percent_null(data):
    """
    Report every column of ``data`` that contains missing values.

    For each such column, prints the number of missing entries and the share
    of rows they represent (a percentage rounded to two decimals).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Names of the columns with at least one missing value, in column order.
    """
    cols_with_null = list()
    for col in data.columns:
        # Count nulls directly. Checking that value_counts() has both a True
        # and a False entry would skip a column that is entirely null.
        null_count = data[col].isnull().sum()
        if null_count > 0:
            null_count_percent = np.round((null_count / len(data)) * 100, 2)
            cols_with_null.append(col)
            print(f"Column {col} has {null_count} missing values which is {null_count_percent}%")
    return cols_with_null
print("Train set has the following information missing:")
cols_with_null_train = percent_null(train)
print(f"\n {'-'*50} \n")
print("Test set has the following information missing:")
cols_with_null_test = percent_null(test)

Train set has the following information missing:
Column Age has 177 missing values which is 19.87%
Column Cabin has 687 missing values which is 77.1%
Column Embarked has 2 missing values which is 0.22%

 -------------------------------------------------- 

Test set has the following information missing:
Column Age has 86 missing values which is 20.57%
Column Fare has 1 missing values which is 0.24%
Column Cabin has 327 missing values which is 78.23%


We will replace missing values based on passenger class and sex.

In [17]:
def _first_mode(values):
    """Return the first modal value of ``values``, or None if nothing is observed."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else None


def fill_missing(data, null_cols):
    """
    Fill missing values per (Pclass, Sex) group.

    Numeric columns are filled with the group mean, all other columns with the
    group mode. When a group has no observed value for a column, the statistic
    of the whole column is used instead; if the whole column is empty the
    values are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing "Pclass", "Sex" and every column in ``null_cols``.
        It is modified in place.
    null_cols : iterable of str
        Names of the columns to fill.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the requested columns filled.
    """
    group_keys = ["Pclass", "Sex"]
    for col in null_cols:
        # Decide by "is numeric" rather than "is object" so that string and
        # categorical dtypes are never sent to mean().
        if pd.api.types.is_numeric_dtype(data[col]):
            statistic = pd.Series.mean
        else:
            statistic = _first_mode
        overall_value = statistic(data[col])

        def _fill(group):
            fill_value = statistic(group)
            if pd.isna(fill_value):
                # Nothing observed in this group: fall back to the column-wide value.
                fill_value = overall_value
            if pd.isna(fill_value):
                return group
            return group.fillna(fill_value)

        data[col] = data.groupby(group_keys)[col].transform(_fill)
    return data
train = fill_missing(train, cols_with_null_train)
test = fill_missing(test, cols_with_null_test)

In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     891 non-null    object 
 10  Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [19]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       418 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      418 non-null    float64
 8   Cabin     418 non-null    object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


### Let's do some light Exploratory Data Analysis.

In [21]:
# Collapse the two family-count columns into a single "Related" feature.
for frame in (train, test):
    frame["Related"] = frame["SibSp"] + frame["Parch"]
    frame.drop(columns=["SibSp", "Parch"], inplace=True)

In [22]:
# Class balance of the target: 1 = survived, 0 = died.
survival = train["Survived"].value_counts()
survived_count, died_count = survival[1], survival[0]
survival_rate = (survived_count / len(train)) * 100
print(f"{survived_count} people survived whereas {died_count} people died representing a {survival_rate:.2f}% survival rate")

342 people survived whereas 549 people died representing a 38.38% survival rate


In [23]:
train["Sex"].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [24]:
np.mean(train["Age"])

29.318642716644145

In [25]:
np.mean(train["Fare"])

32.204207968574636

In [26]:
# Per passenger class: head count, mean age, mean fare and number of survivors.
pclass_group = train.groupby("Pclass").agg(
    pclass_count=("Pclass", "count"),
    pclass_age=("Age", "mean"),
    pclass_fare=("Fare", "mean"),
    pclass_survived=("Survived", "sum"),
)
# Survival rate per class as a whole-number percentage.
pclass_survivor_share = pclass_group["pclass_survived"] / pclass_group["pclass_count"]
pclass_group["pclass_survival_rate"] = (pclass_survivor_share * 100).round(0)
pclass_group

Unnamed: 0_level_0,pclass_count,pclass_age,pclass_fare,pclass_survived,pclass_survival_rate
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,216,38.378866,84.154687,136,63.0
2,184,29.907295,20.662183,87,47.0
3,491,25.112288,13.67555,119,24.0


In [27]:
# Per port of embarkation: head count, mean age, mean fare and number of survivors.
embarked_group = train.groupby("Embarked").agg(
    embarked_count=("Embarked", "count"),
    embarked_age=("Age", "mean"),
    embarked_fare=("Fare", "mean"),
    embarked_survived=("Survived", "sum"),
)
# Survival rate per port as a whole-number percentage.
embarked_survivor_share = embarked_group["embarked_survived"] / embarked_group["embarked_count"]
embarked_group["embarked_survival_rate"] = (embarked_survivor_share * 100).round(0)
embarked_group

Unnamed: 0_level_0,embarked_count,embarked_age,embarked_fare,embarked_survived,embarked_survival_rate
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C,168,30.461519,59.954144,93,55.0
Q,77,25.690425,13.27603,30,39.0
S,646,29.45389,27.243651,219,34.0


Based on the EDA, the average fare was about 32 pounds and the average age about 29 years. As expected, 1st-class passengers paid more than passengers in the other classes, but the margin is very large (about 84 pounds on average versus roughly 21 and 14). Age is also associated with class: the average age rises from about 25 in 3rd class to about 38 in 1st class, which suggests that older passengers could more often afford a better class.

People in higher passenger classes seemed to have a better survival rate.

### Preprocessing

In [30]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [31]:
def extract_salutation(name):
    """
    Return the salutation (e.g. "Mr", "Miss") embedded in a passenger name.

    Takes the segment that follows the first comma of ``name`` and keeps the
    part before that segment's first period, with surrounding whitespace removed.
    """
    after_surname = name.split(",")[1]
    title, _, _ = after_surname.partition(".")
    return title.strip()
# Create the Salutation feature here, while "Name" is still present: the
# salutation survival summary later in the notebook groups by this column.
train["Salutation"] = train["Name"].apply(extract_salutation)
test["Salutation"] = test["Name"].apply(extract_salutation)

In [32]:
def name_ticket(data):
    """
    Build a "<surname>_<ticket>" key for one passenger record.

    ``data`` is a single row indexable by "Name" and "Ticket"; the surname is
    the part of "Name" that precedes the first comma.
    """
    surname, _, _ = data["Name"].partition(",")
    return "_".join((surname, data["Ticket"]))
train["Name_Ticket"] = train.apply(name_ticket, axis=1)
test["Name_Ticket"] = test.apply(name_ticket, axis=1)

TypeError: name_ticket() missing 1 required positional argument: 'ticket'

In [None]:
# The raw Name and Ticket columns are no longer needed once Name_Ticket exists.
for frame in (train, test):
    frame.drop(columns=["Name", "Ticket"], inplace=True)

In [None]:
categorical_cols = ["Sex", "Name_Ticket", "Cabin", "Embarked"]
numeric_cols = ["Pclass", "Age", "Related", "Fare"]

In [None]:
# Checks whether the Salutation column adds valuable information about survival:
# per salutation, head count, mean age, mean fare and number of survivors.
salutation_group = train.groupby("Salutation").agg(
    salutation_count=("Salutation", "count"),
    salutation_age=("Age", "mean"),
    salutation_fare=("Fare", "mean"),
    salutation_survived=("Survived", "sum"),
)
# Survival rate per salutation as a whole-number percentage.
salutation_survivor_share = salutation_group["salutation_survived"] / salutation_group["salutation_count"]
salutation_group["salutation_survival_rate"] = (salutation_survivor_share * 100).round(0)
salutation_group

In [None]:
# Bin ages into 15-year buckets labelled by their lower bound (0, 15, 30, ...).
for frame in (train, test):
    frame["AgeBucket"] = frame["Age"] // 15 * 15

In [None]:
# Checks whether AgeBucket adds valuable information about survival:
# mean survival per bucket (ascending), joined with the size of each bucket.
group = (
    train.groupby("AgeBucket")[["Survived"]]
    .mean()
    .sort_values(by="Survived")
)
count = train["AgeBucket"].value_counts()
agebucket_survival_count_merge = group.merge(count, how="left", on="AgeBucket")

In [None]:
agebucket_survival_count_merge

In [None]:
train

In [None]:
# Fit the encoder on the training categories only, then encode both sets
# with it and densify the sparse results.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore').fit(train[categorical_cols])
one_hot_train = one_hot_encoder.transform(train[categorical_cols]).toarray()
one_hot_test = one_hot_encoder.transform(test[categorical_cols]).toarray()

In [None]:
y_train = train["Survived"].to_numpy()

In [None]:
X_train = np.c_[train[numeric_cols], one_hot_train]
X_test = np.c_[test[numeric_cols], one_hot_test]

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
# Learn the scaling statistics from the training features only and reuse them
# for the test features, so both sets are expressed on the same scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Build some ML models.

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV

Let's use grid search to search for hyperparameters.

In [None]:
svm_clf = SVC()

In [None]:
# `degree` only affects the polynomial kernel, so it is searched only for
# kernel="poly"; the RBF kernel (SVC's default) is searched over C alone
# instead of refitting identical models once per degree value.
params = [
    {"kernel": ["rbf"], "C": [1, 2, 3, 5]},
    {"kernel": ["poly"], "C": [1, 2, 3, 5], "degree": [3, 4, 5, 6]},
]

In [None]:
grid_search = GridSearchCV(svm_clf, params, scoring="accuracy", verbose=3)
grid_search.fit(X_train_scaled, y_train)

In [None]:
print(grid_search.best_params_)

In [None]:
cvres = grid_search.cv_results_
# Use a dedicated loop variable so the `params` search grid is not overwritten,
# which would break a re-run of the grid-search cell.
for mean_test_score, candidate_params in zip(cvres["mean_test_score"], cvres["params"]):
    print(f"mean_test_score: {mean_test_score} and params: {candidate_params}")

In [None]:
print(cvres["mean_test_score"].max())

In [None]:
predictions = grid_search.predict(X_test_scaled)

In [None]:
# Take the ids from the test frame's own PassengerId index rather than a
# hard-coded range, so each prediction stays paired with the row it came from.
df = pd.DataFrame({"PassengerId": test.index, "Survived": predictions})
df.to_csv("titanic_svm.csv", index=False)