In [None]:
# Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Data Operations
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings("ignore")


### Importing libraries

### Import data

In [None]:
# Load the training CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv("./data/train.csv")

### Data Understanding

In [None]:
# (rows, columns) of the freshly loaded table.
df.shape

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
df.describe()

In [None]:
# Five randomly chosen rows. No random_state is passed, so a different
# selection is shown on every run.
df.sample(5)

### Data Wrangling

#### Missing Values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

##### Understanding age to choose the most appropriate fill value

In [None]:
# Every row whose age is missing.
df[df["age"].isnull()]

In [None]:
# Summary statistics of age within each `who` group, to see how much age
# differs between groups before choosing a fill value.
df.groupby(["who"])["age"].describe()

In [None]:
# Overall median age (missing values are skipped by median()).
df.age.median()

In [None]:
# Fill missing ages with the overall median age.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the df["age"] selection: that chained in-place form operates on an
# intermediate object, is deprecated in pandas 2.x, and leaves df unchanged
# once Copy-on-Write is enabled.
df["age"] = df["age"].fillna(df["age"].median())

#### Outlier Detection and Removal

##### Plots before removing outliers

In [None]:
# Three side-by-side views of age and fare before any rows are removed.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
scatter_ax, fare_ax, age_ax = axes

# Fare against age.
scatter_ax.scatter(df["age"], df["fare"])
scatter_ax.set(title="Fare vs Age", xlabel="Age", ylabel="Fare")

# One boxplot per column; points beyond the whiskers are the outlier candidates.
for box_ax, column, title in ((fare_ax, "fare", "Fare"), (age_ax, "age", "Age")):
    box_ax.boxplot(df[column])
    box_ax.set_title(title)

plt.show()

#### IQR Method (fare)

In [None]:
# Tukey fences for fare: keep rows strictly inside
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR).
q1, q3 = df["fare"].quantile([0.25, 0.75])
iqr = q3 - q1
low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Both bounds are exclusive. A missing fare fails both comparisons, so such
# rows are dropped as well.
fare_inside_fences = df["fare"].gt(low) & df["fare"].lt(high)
df = df.loc[fare_inside_fences].reset_index(drop=True)

#### IQR Method (age)

In [None]:
# Tukey fences for age, computed on the rows that are still in df at this point.
age = df["age"]
q1 = age.quantile(0.25)
q3 = age.quantile(0.75)
iqr = q3 - q1
low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Keep rows strictly between the fences, then renumber the index from 0.
age_inside_fences = (low < age) & (age < high)
df = df.loc[age_inside_fences].reset_index(drop=True)

##### Plots after removing outliers

In [None]:
# Same three views as before the filtering, drawn from the filtered df so the
# effect of the outlier removal can be compared visually.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
scatter_ax, fare_ax, age_ax = axes

# Fare against age.
scatter_ax.scatter(df["age"], df["fare"])
scatter_ax.set(title="Fare vs Age", xlabel="Age", ylabel="Fare")

# One boxplot per column.
for box_ax, column, title in ((fare_ax, "fare", "Fare"), (age_ax, "age", "Age")):
    box_ax.boxplot(df[column])
    box_ax.set_title(title)

plt.show()


#### Column Equivalence

In [None]:
# Distinct values of each column, then a few random rows side by side, to
# check whether the two columns carry the same information.
column_pair = ["embarked", "embark_town"]
print(*(df[column].unique() for column in column_pair))
df[column_pair].sample(5)

In [None]:
# Distinct values of each column, then a few random rows side by side, to
# check whether the two columns carry the same information.
column_pair = ["pclass", "class"]
print(*(df[column].unique() for column in column_pair))
df[column_pair].sample(5)

In [None]:
# Distinct values of each column, then a few random rows side by side.
# Neither column is dropped later; both feed into family_size.
column_pair = ["sibsp", "parch"]
print(*(df[column].unique() for column in column_pair))
df[column_pair].sample(5)

In [None]:
# Distinct values of each column, then a few random rows side by side, to
# check whether the two columns carry the same information.
column_pair = ["survived", "alive"]
print(*(df[column].unique() for column in column_pair))
df[column_pair].sample(5)

#### Drop Features


In [None]:
# Columns removed before modelling, and why:
#   alone, adult_male      - the same information is available from parch and who
#   deck                   - a large share of its values is missing
#   embarked, class, alive - duplicates of embark_town, pclass and survived
redundant_columns = ["alone", "deck", "embarked", "adult_male", "class", "alive"]
df = df.drop(columns=redundant_columns)

#### Converting data types

In [None]:
# Shrink the column dtypes with one explicit per-column mapping.
df = df.astype({
    # category: low-cardinality label columns
    "embark_town": "category",
    "sex": "category",
    "who": "category",
    # uint8: small non-negative integer codes and counts
    "pclass": "uint8",
    "sibsp": "uint8",
    "parch": "uint8",
    "survived": "uint8",
    # float32: continuous values. Casting these to unsigned integers would
    # silently truncate the fractional part, so they stay floating point
    # at half the width of the default float64.
    "age": "float32",
    "fare": "float32",
})

### Exploratory Data Analysis

#### Collinearity

In [None]:
# Pairwise correlation between the numeric predictors. The target column is
# left out so the heatmap only shows predictor-to-predictor relationships.
numeric_predictors = df.select_dtypes("number").drop(columns=["survived"])
corr = numeric_predictors.corr()
sns.heatmap(corr, annot=True)

In [None]:
# Column dtypes, non-null counts and memory usage of the cleaned frame.
df.info()

#### Data visualization

##### Survivors by Passenger Type

In [None]:
# Head-count per `who` group, computed separately for each outcome.
survived_rows = df.loc[df["survived"] == True]
not_survived_rows = df.loc[df["survived"] == False]
survivors_count = survived_rows.groupby("who")["survived"].count()
non_survivors_count = not_survived_rows.groupby("who")["survived"].count()

# One row per `who` group, one column per outcome.
combined_counts = pd.DataFrame({
    "Survivors": survivors_count,
    "Non-Survivors": non_survivors_count
})

# Grouped bar chart with horizontal tick labels, followed by the same numbers as text.
ax = combined_counts.plot(kind="bar")
ax.set_title("Survivors and Non-Survivors by Passenger Type")
ax.set_xlabel("Passenger Type")
ax.set_ylabel("Count")
ax.tick_params(axis="x", labelrotation=0)
plt.show()
print(combined_counts)

##### Survivors & Non-Survivors by Embark Town


In [None]:
# Passenger counts per embarkation town: everyone, survivors only, and
# non-survivors only.
total_count = df["embark_town"].value_counts()

# value_counts() sorts each Series by its own counts, so the three Series do
# not necessarily list the towns in the same order. Align the two subsets to
# total_count's order (0 where a town has no rows) so that a given bar
# position means the same town in every panel.
survived_count = (
    df.loc[df["survived"] == 1, "embark_town"]
    .value_counts()
    .reindex(total_count.index, fill_value=0)
)
non_survivors_count = (
    df.loc[df["survived"] == 0, "embark_town"]
    .value_counts()
    .reindex(total_count.index, fill_value=0)
)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (counts, bar colour, panel title) for each of the three panels.
panels = [
    (total_count, "skyblue", "Embark Town Distribution"),
    (survived_count, "green", "Survivors from Embark Town"),
    (non_survivors_count, "red", "Non-Survivors from Embark Town"),
]

for ax, (counts, color, title) in zip(axes, panels):
    # Tick labels come from the index of the Series being plotted, so a label
    # can never be attached to the wrong bar; rot=0 keeps them horizontal.
    counts.plot(kind="bar", ax=ax, color=color, title=title, rot=0)
    ax.set_xlabel("Embark Town")
    ax.set_ylabel("Count")

plt.show()

### Feature Engineering

#### Family Size

In [None]:
# family_size = siblings/spouses + parents/children + 1
# (the + 1 presumably counts the passenger themself - confirm).
df["family_size"] = df["sibsp"] + df["parch"] + 1

# Histogram of the new feature; the y-axis is fixed to the 0-500 range.
fig, ax = plt.subplots()
ax.hist(df["family_size"])
ax.set(
    xlabel="Family Size",
    ylabel="Count",
    ylim=(0, 500),
    title="Family Size Distribution",
)
plt.show()

### Predictive Modeling

#### Features & Target Selection

In [None]:
# Predictor columns passed to every model, including the engineered family_size.
feature_columns = ["age", "pclass", "fare", "sex", "who", "embark_town", "family_size"]
features = df[feature_columns]

In [None]:
# Prediction target: the `survived` column.
target = df["survived"]

#### Model Selection

In [None]:
# Candidate classifiers, keyed by the display name used in the result tables
# and plots.
classifiers = {
    # max_iter is raised from the default of 100: the features are not scaled,
    # so the solver can stop before converging, and the resulting
    # ConvergenceWarning would be hidden because warnings are suppressed at the
    # top of the notebook.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Fixed seed so the forest, and therefore its metrics, are the same on
    # every run.
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(kernel="linear"),
    "Naive Bayes": GaussianNB(),
}

# Every classifier gets the same preprocessing and its own fresh transformer
# instances: one-hot encode the categorical columns, impute any remaining
# missing values, then fit the estimator.
models = {
    model_name: make_pipeline(OneHotEncoder(), SimpleImputer(), classifier)
    for model_name, classifier in classifiers.items()
}

#### Train & Test data splitting

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=42,
)

#### Training/Testing and Result

In [None]:
# Fit every pipeline on the training split and score it on the held-out split.
# Precision, recall and F1 are macro-averaged, i.e. the unweighted mean over
# the classes.
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="macro"),
        "Recall": recall_score(y_test, y_pred, average="macro"),
        "F1": f1_score(y_test, y_pred, average="macro"),
    }

#### Model Metrics

In [None]:
# (printed label, key in the per-model metrics dict), in print order.
report_rows = (
    ("Accuracy", "Accuracy"),
    ("Precision", "Precision"),
    ("Recall", "Recall"),
    ("F1 Score", "F1"),
)

# One block per model, each metric shown as a percentage with two decimals.
for name, metrics in results.items():
    print(f"{name}:")
    for label, key in report_rows:
        print(f"  {label} = {metrics[key] * 100:.2f}%")


#### Comparing model metrics

In [None]:
# Pull each metric out of `results` as a list ordered like model_names.
model_names = list(results)
accuracies = [scores["Accuracy"] for scores in results.values()]
precisions = [scores["Precision"] for scores in results.values()]
recalls = [scores["Recall"] for scores in results.values()]
f1_scores = [scores["F1"] for scores in results.values()]

# (panel title, y-axis label, values) in row-major order of the 2x2 grid.
panels = [
    ("Accuracy Comparison", "Accuracy", accuracies),
    ("Precision", "Precision", precisions),
    ("Recall", "Recall", recalls),
    ("F1 Score", "F1 Score", f1_scores),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# One bar chart per metric. All panels share the 0-1 y-range so bar heights
# are comparable across panels.
for ax, (title, ylabel, values) in zip(axes.flat, panels):
    ax.bar(model_names, values, width=0.5, align="center")
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0, 1)
    ax.tick_params(axis="x", labelrotation=45)

plt.tight_layout()
plt.show()
