# Titanic Survivor Predictor
**Author:** *Kamau Wa Wainaina*

## Loading Datasets.

In [None]:
# Library for loading datasets.
import pandas as pd
# Library for linear algebra.
import numpy as np

In [None]:
# Read the train and test splits from the shared Titanic data directory.
path = "../../../Data/titanic/"
train, test = (pd.read_csv(f"{path}{split}.csv") for split in ("train", "test"))

In [None]:
# Show the full content of every cell when a frame is displayed (no truncation).
pd.options.display.max_colwidth = None

Let's peek at the first five rows of both train and test.

In [None]:
# Display the first five rows of the training set.
train.head()

In [None]:
# Display the first five rows of the test set.
test.head()

We observe the following about the data:
- PassengerId seems to identify each observation (should make it the index).
- Survived column in train is the target we're trying to predict.
- Next we should perform EDA to know more about the data. 

In [None]:
# First, let's make passenger id the index in both datasets.
# The guards keep this cell re-runnable: once PassengerId has become the index
# it is no longer a column, so calling set_index again would raise a KeyError.
if "PassengerId" in train.columns:
    train = train.set_index("PassengerId")
if "PassengerId" in test.columns:
    test = test.set_index("PassengerId")

## Exploratory Data Analysis.

In this section I want to investigate the following (will focus on train to avoid data snooping):
1. Data types and columns with missing values.
2. Number of those who survived.
    - Categorized by Sex, Age, and Pclass.
3. How expensive was the trip.
4. How name can be used to predict survivors.
5. Similarly, how is ticket related to survivors.

1. **Data types and columns with missing values.**

The info function shows both data types and missing values.

In [None]:
# Print each column's dtype and non-null count for the training set.
train.info()

Let's create a function to calculate percentage of missing information.

In [None]:
def missing_percent(data):
    """Print the percentage of missing values for every column that has any.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        One line is printed per incomplete column, in column order; columns
        without missing values produce no output.
    """
    null_flags = data.isnull().any()  # One boolean per column: does it hold any null?
    incomplete_cols = [name for name, flag in null_flags.items() if flag]

    for name in incomplete_cols:
        n_missing = data[name].isnull().sum()  # True counts as 1, so the sum is a count.
        n_rows = len(data[name])
        pct_missing = np.round((n_missing / n_rows) * 100, 2)
        print(f"{name} has {pct_missing}% of the values missing.")
        
# Report the missing-value percentages for the training set.
missing_percent(train)

**It is evident that Cabin has the highest proportion of missing data, followed by Age, and then Embarked. Additionally, there are 6 numerical columns and 5 categorical columns in total.**

2. **Number of those who survived.**

How many people survived?

In [None]:
# Survived is coded 1 (survived) / 0 (perished), so the column total is the survivor count.
survived = train.loc[:, "Survived"].sum()
print(f"{survived} people survived.")

What was the survival rate?

In [None]:
# Overall survival rate: survivors as a percentage of all rows in the training set.
total_passengers = train.shape[0]
survived_fraction = survived / total_passengers
survival_rate = np.round(survived_fraction * 100, 2)
print(f"The survival rate of boarding the titanic was {survival_rate}%.")

Of those who survived how many were female and male?

In [None]:
# Survived holds 1/0, so summing it over the rows of one sex counts that sex's survivors.
is_female = train["Sex"] == "female"
is_male = train["Sex"] == "male"
survived_female = train.loc[is_female, "Survived"].sum()
survived_male = train.loc[is_male, "Survived"].sum()
print(f"{survived_female} females survived while {survived_male} males survived.")

Which gender had a better survival rate?

In [None]:
# Head-count per sex.
total_female = len(train.query("Sex == 'female'"))
total_male = len(train.query("Sex == 'male'"))

# Survival rate within each sex: survivors of that sex / passengers of that sex.
female_rate = np.round((survived_female/total_female)*100, 2)
male_rate = np.round((survived_male/total_male)*100, 2)

# Survivors of each sex as a share of ALL passengers. This is not a survival
# rate, so the message below describes it as a share of the passenger list.
overall_female_rate = np.round((survived_female/total_passengers)*100, 2)
overall_male_rate = np.round((survived_male/total_passengers)*100, 2)

print(f"Among females, the survival rate was {female_rate}% whereas among males it was {male_rate}%.")
print(f"Female survivors made up {overall_female_rate}% of all passengers aboard the titanic whereas male survivors made up {overall_male_rate}%.")

Did age affect survival rate?

*To answer this question, I'll create age buckets to make analysis easier*

In [None]:
# Youngest and oldest recorded ages; these guide the choice of bucket edges.
youngest = train["Age"].min()
oldest = train["Age"].max()
print(f" Minimum age: {youngest} \n Maximum age: {oldest}")

In [None]:
# Bucket edges. Intervals are right-closed (pd.cut's default, spelled out below):
# (0, 12], (12, 18], (18, 35], (35, 60], (60, 100].
bins = [0, 12, 18, 35, 60, 100]
# One label per interval, in the same order as the edges.
labels = ["Child", "Teen", "Young Adult", "Adult", "Senior"]

# Assign each passenger to a bucket; a missing age, or one outside the edges, yields NaN.
train["Age_group"] = pd.cut(x=train["Age"], bins=bins, labels=labels, right=True)

*Next, I'll calculate how many people survived per age group*

In [None]:
# Survivors per age bucket, keyed by bucket label; this dict is reused when
# calculating survival rates.
# Grouping skips rows whose Age_group is NaN. Looping over .unique() instead
# visited NaN as if it were a bucket, and because an equality filter never
# matches NaN it always reported "0 nan survived".
survived_age_group_dict = (
    train.groupby("Age_group", observed=True)["Survived"].sum().to_dict()
)
for group, survived_age_group in survived_age_group_dict.items():
    print(f"{survived_age_group} {group} survived.")

# Passengers without a bucket are counted separately so their survivors are not lost.
survived_unknown_age = train.loc[train["Age_group"].isna(), "Survived"].sum()
print(f"{survived_unknown_age} passengers with no age group survived.")

*I can now answer the question of whether age affected survival rate*

In [None]:
for group, survival_count in survived_age_group_dict.items():

    if pd.isna(group): # No bucket label (age unknown), so there is no group to rate.
        continue

    # Number of passengers that fall in this bucket.
    total_age_group = (train["Age_group"] == group).sum()
    # Survival rate within the bucket.
    age_group_rate = np.round((survival_count/total_age_group)*100, 2)
    # Bucket's survivors as a share of ALL passengers. This is not a survival
    # rate, so the message describes it as a share of the passenger list.
    overall_age_group_rate = np.round((survival_count/total_passengers)*100, 2)

    print(f"Among {group}, the survival rate was {age_group_rate}%.")
    print(f"{group} survivors made up {overall_age_group_rate}% of all passengers aboard the titanic.")
    print(f"{'-'*60}")

How did passenger classes affect survival?

In [None]:
# Survivors per passenger class, keyed by class; reused when calculating survival rates.
survived_pclass_dict = {}
for pclass in train["Pclass"].unique():
    in_class = train["Pclass"] == pclass
    survived_pclass = train.loc[in_class, "Survived"].sum()
    survived_pclass_dict[pclass] = survived_pclass
    print(f"{survived_pclass} passengers from passenger class {pclass} survived.")

In [None]:
for pclass, survival_count in survived_pclass_dict.items():

    # Number of passengers travelling in this class.
    total_pclass = len(train.query("Pclass == @pclass"))
    # Survival rate within the class.
    pclass_rate = np.round((survival_count/total_pclass)*100, 2)
    # Class's survivors as a share of ALL passengers. This is not a survival
    # rate, so the message describes it as a share of the passenger list.
    overall_pclass_rate = np.round((survival_count/total_passengers)*100, 2)

    print(f"Among passenger class {pclass}, the survival rate was {pclass_rate}%.")
    print(f"Survivors from class {pclass} made up {overall_pclass_rate}% of all passengers aboard the titanic.")
    print(f"{'-'*70}")

**The overall survival rate on the Titanic was just 38%. Interestingly, women had a higher likelihood of survival compared to men. Among the age groups, children stood out with better chances of survival. Unsurprisingly, first-class passengers had a significantly higher probability of making it through the disaster.**

3. **How expensive was the trip.**

Overall, how expensive was the trip?

In [None]:
# Average fare over the training set, rounded up to a whole number.
average_fare = train["Fare"].mean()
mean_fare = np.ceil(average_fare)
print(f"On average the passengers paid {mean_fare} pounds.")

Which age group paid the most? 

In [None]:
# Mean fare per age bucket. Grouping skips rows whose Age_group is NaN; looping
# over .unique() visited NaN as a bucket and printed "nan paid nan pounds".
mean_fare_by_age_group = train.groupby("Age_group", observed=True)["Fare"].mean()
for group, raw_mean_fare in mean_fare_by_age_group.items():
    group_mean_fare = np.ceil(raw_mean_fare)  # Round up to a whole number.
    print(f"{group} paid {group_mean_fare} pounds on average.")
    print(f"{'-'*40}")

# Report passengers without a bucket separately instead of dropping them silently.
no_age_group = train["Age_group"].isna()
if no_age_group.any():
    unknown_mean_fare = np.ceil(train.loc[no_age_group, "Fare"].mean())
    print(f"Passengers with no age group paid {unknown_mean_fare} pounds on average.")
    print(f"{'-'*40}")

**Age had an impact on the fare passengers paid, with older individuals typically paying more. However, young adults differ from this pattern, paying on average less, even less than children.**

4. **How name can be used to predict survivors.**

This is a bit challenging, I'll have to admit. However, let's look at the dataframe and see if there is something we can extract.

In [None]:
# Display the first ten rows to inspect how the Name column is laid out.
train.head(10)

Let's start by extracting the salutations present in each name.

In [None]:
def extract_salutation(data):
    """Return the salutation embedded in a name string.

    The salutation is taken to be the text between the first comma and the
    following full stop, with surrounding whitespace removed.

    Parameters
    ----------
    data : str
        Name containing at least one comma, e.g. "Surname, Title. Other names".

    Returns
    -------
    str
        The stripped salutation. If no full stop follows the comma, the whole
        segment after the first comma (up to any second comma) is returned.

    Raises
    ------
    IndexError
        If ``data`` contains no comma.
    """
    after_surname = data.split(",")[1]  # Segment that follows the surname.
    title, _, _ = after_surname.partition(".")  # Keep what precedes the first full stop.
    return title.strip()

In [None]:
def extract_surname(data):
    """Return the part of a name string that precedes its first comma.

    Parameters
    ----------
    data : str
        Name, e.g. "Surname, Title. Other names".

    Returns
    -------
    str
        Everything before the first comma, or the whole string if there is none.
    """
    family_name, _, _ = data.partition(",")
    return family_name

In [None]:
# Derive two columns from Name: the salutation and the surname.
passenger_names = train["Name"]
train["Salutation"] = passenger_names.map(extract_salutation)
train["Surname"] = passenger_names.map(extract_surname)

Is there any correlation between these salutations and survival?

In [None]:
# Survivor count per salutation (Survived is 1/0, so the group sum is a count).
salutation_survived = train["Survived"].groupby(train["Salutation"]).sum()
salutation_survived

What were the survival rates of these groups?

In [None]:
# Head-count per salutation.
salutation_passenger_count = train.groupby("Salutation")["Survived"].count()

# The loop names are deliberately distinct from the notebook-level `survived`
# and `survival_rate`, so running this cell does not overwrite the overall
# figures computed earlier.
for salutation, group_survived in salutation_survived.items():
    # Look the head-count up by label instead of relying on matching positions.
    group_total = salutation_passenger_count[salutation]
    # Survival rate within the salutation group.
    group_survival_rate = np.round((group_survived/group_total)*100, 2)
    # Group's survivors as a share of ALL passengers (not a survival rate).
    group_share_of_passengers = np.round((group_survived/total_passengers)*100, 2)
    print(f"{salutation} had a survival rate of {group_survival_rate}% in their group.")
    print(f"{salutation} survivors made up {group_share_of_passengers}% of all passengers on the titanic.")
    print(f"{'-'*50}")

5. **Similarly, how is ticket related to survivors.**

I'm just trying to see if any part of ticket correlates to survival.

In [None]:
def extract_first(data):
    """Return the first element of ``data`` (the first character of a string).

    Raises
    ------
    IndexError
        If ``data`` is empty.
    """
    first_item = data[0]
    return first_item

In [None]:
# NOTE: despite its name, this column stores the FIRST CHARACTER of each ticket
# (what extract_first returns), not the ticket's length.
train["Ticket_Length"] = train["Ticket"].map(extract_first)

Let's see if the first character of the ticket (stored in the `Ticket_Length` column) had a correlation to survival.

In [None]:
# Mean of Survived (i.e. the survival fraction) for each Ticket_Length value.
train.groupby("Ticket_Length")[["Survived"]].mean()

## Data Cleaning.

In [None]:
# Relationships = SibSp + Parch, added row by row.
train["Relationships"] = train["SibSp"].add(train["Parch"])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Categorical branch: fill missing values with the most frequent one, then one-hot encode.
cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # `sparse_output` replaces the `sparse` keyword, which scikit-learn
        # deprecated in 1.2 and removed in 1.4; False still returns a dense array.
        # handle_unknown="ignore" encodes a category that was not seen during
        # fit as all zeros instead of raising at transform time.
        ("cat_encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ])

In [None]:
from sklearn.compose import ColumnTransformer

# Columns routed to the numeric and the categorical branch respectively.
num_attribs = ["Age", "Relationships", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked", "Salutation"]

# Apply each branch to its own columns and concatenate the results.
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ]
)

In [None]:
# Fit the preprocessing on the selected training columns and transform them in one step.
feature_columns = num_attribs + cat_attribs
X_train = preprocess_pipeline.fit_transform(train[feature_columns])
X_train

In [None]:
# Target vector: the Survived column of the training set.
y_train = train.loc[:, "Survived"]

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Mean 10-fold cross-validation score of an SVC on the preprocessed features.
# NOTE(review): X_train was imputed and scaled on the whole training set before
# the folds are split, so each validation fold has already influenced the
# preprocessing statistics; the score may be slightly optimistic.
svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(estimator=svm_clf, X=X_train, y=y_train, cv=10)
svm_scores.mean()