# Setup & Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
from constants import PROJECT_ROOT

# Base directory for this chapter; the data and results paths used below are built from it.
CHAPTER_ROOT = PROJECT_ROOT / "notebooks" / "chapter_3"

In [None]:
# Load the Titanic train and test CSV files from the chapter's data directory.
df_train, df_test = (
    pl.read_csv(CHAPTER_ROOT / "data" / "titanic_data" / csv_name)
    for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Split the training frame into the target series and the remaining feature columns.
y = df_train.get_column("Survived")
X = df_train.select(pl.exclude("Survived"))

In [None]:
# Display the training frame as the cell output.
df_train

# EDA (Exploratory Data Analysis)

#### Pclass

In [None]:
# Mean of the 0/1 "Survived" flag per ticket class, i.e. the survival rate per class.
pclass_vs_survival_mean = (
    df_train.group_by("Pclass")
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort(by="Pclass", descending=False)
)

fig, ax = plt.subplots()

pclass = pclass_vs_survival_mean.get_column("Pclass").to_list()
survivability = pclass_vs_survival_mean.get_column("survival_mean").to_list()
# Pair each class with its own rate by position instead of indexing the rate list
# with the class value, and format the 0-1 fraction as a real percentage.
bar_labels = [
    f"Class: {ticket_class} | Survivability: {rate:.2%}"
    for ticket_class, rate in zip(pclass, survivability)
]
bar_colors = ["yellow", "orange", "red"]

ax.bar(pclass, survivability, label=bar_labels, color=bar_colors)

ax.set_ylabel("Survival Mean")
ax.set_xlabel("Ticket Class")
ax.legend(title="Survival Mean across different Ticket Classes")
ax.set_xticks(pclass)
# The plotted value is a fraction, so pin the axis to the full 0-1 range.
ax.set_ylim(0, 1)

plt.show()

In [None]:
"""Map Pclass values: 3 -> 1, 2 -> 2, and 1 -> 3 so that they are consistent with the increase in survivability rates."""

#### Name

In [None]:
# Extract the title from names shaped like "Surname, Title. Given names".
# Capturing the text between the first comma and the following period keeps the
# result correct for multi-word surnames, where the second space-separated token
# would be part of the surname rather than the title.
prefix_vs_survival_mean = df_train.with_columns(
    pl.col("Name").str.extract(r",\s*([^.]+)\.", 1).alias("prefix")
)
prefix_vs_survival_mean.get_column("prefix").value_counts()

#### Sex

In [None]:
# Mean of the 0/1 "Survived" flag per sex, highest survival rate first.
sex_vs_survival_mean = (
    df_train.group_by("Sex")
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort(by="survival_mean", descending=True)
)

fig, ax = plt.subplots()

genders = sex_vs_survival_mean.get_column("Sex").to_list()
survivability = sex_vs_survival_mean.get_column("survival_mean").to_list()
# The rate is a 0-1 fraction; ":.2%" renders it as a real percentage.
bar_labels = [
    f"{gender} | Survivability: {surv_rate:.2%}"
    for gender, surv_rate in zip(genders, survivability)
]
bar_colors = ["yellow", "red"]

ax.bar(genders, survivability, label=bar_labels, color=bar_colors)

ax.set_ylabel("Survival Rate")
ax.set_xlabel("Gender")
ax.legend(title="Survival Rate across Genders")
ax.set_xticks(genders)

# The plotted value is a fraction, so pin the axis to the full 0-1 range.
ax.set_ylim(0, 1)

plt.show()

In [None]:
"""Encode gender in a binary column where female = 1 and male = 0 to maintain the meaningful difference in survival rates."""

In [None]:
# Survival rate (mean of the 0/1 "Survived" flag) for each sex.
women_survival_rate = df_train.filter(pl.col("Sex") == "female").get_column("Survived").mean()
men_survival_rate = df_train.filter(pl.col("Sex") == "male").get_column("Survived").mean()

print(f"Women's survival rate: {women_survival_rate:.3f}")
print(f"Men's survival rate: {men_survival_rate:.3f}")

In [None]:
# Mean fare paid by survivors versus non-survivors.
survivor_mean_fare = df_train.filter(pl.col("Survived") == 1).get_column("Fare").mean()
non_survivor_mean_fare = df_train.filter(pl.col("Survived") == 0).get_column("Fare").mean()

print(f"Survivor's mean fare: {survivor_mean_fare:.3f}")
print(f"Non-Survivor's mean fare: {non_survivor_mean_fare:.3f}")

In [None]:
# Convert the polars training frame to pandas.
# NOTE(review): `x` is not referenced by any other cell visible here — confirm it is still needed.
x = df_train.to_pandas()

In [None]:
# Show the number of null values in each column of the training frame.
df_train.null_count()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class AttributeTransformer(BaseEstimator, TransformerMixin):
    """Engineer Titanic passenger features from a pandas DataFrame.

    Every step is switched on or off by a constructor flag:

    * ``transform_cabin``: replace ``Cabin`` with its first character, using
      ``"0"`` when the value is missing or empty.
    * ``transform_family``: add ``FamilySize = SibSp + Parch + 1`` and drop
      ``SibSp`` and ``Parch``.
    * ``extract_prefix``: add ``Prefix`` (the title found in ``Name``) and
      drop ``Name``.
    * ``extract_ticket_class``: add ``TicketClass`` (first character of
      ``Ticket``) and drop ``Ticket``.
    * ``drop_id``: drop ``PassengerId``.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def __init__(
        self,
        drop_id: bool = True,
        transform_cabin: bool = True,
        transform_family: bool = True,
        extract_prefix: bool = True,
        extract_ticket_class: bool = True,
    ):
        self.drop_id = drop_id
        self.transform_cabin = transform_cabin
        self.transform_family = transform_family
        self.extract_prefix = extract_prefix
        self.extract_ticket_class = extract_ticket_class

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn from ``X``."""
        return self

    @staticmethod
    def _cabin_deck(cabin):
        """Return the first character of a cabin string, or "0" when missing/empty."""
        # Missing values can arrive as None or as a float NaN; NaN is truthy, so a
        # plain truthiness test would try to index a float.
        if isinstance(cabin, str) and cabin:
            return cabin[0]
        return "0"

    @staticmethod
    def _name_prefix(name):
        """Return the title from a name shaped like "Surname, Title. Given names"."""
        # Split on the comma rather than on whitespace so multi-word surnames do
        # not shift the title to a different token.
        after_surname = name.split(",", 1)[-1]
        return after_surname.split(".", 1)[0].strip()

    def transform(self, X):
        """Apply the enabled feature steps and return the result as a 2-D array.

        ``X`` must be a pandas DataFrame holding the columns required by the
        enabled steps. The input frame is not modified.
        """
        # Work on a copy so the caller's DataFrame keeps its original columns.
        X = X.copy()
        if self.transform_cabin:
            X["Cabin"] = X["Cabin"].map(self._cabin_deck)
        if self.transform_family:
            X["FamilySize"] = X["SibSp"] + X["Parch"] + 1
            X = X.drop(columns=["SibSp", "Parch"])
        if self.extract_prefix:
            X["Prefix"] = X["Name"].map(self._name_prefix)
            X = X.drop(columns="Name")
        if self.extract_ticket_class:
            # Store the extracted character under its own name before dropping the
            # raw column; writing it back to "Ticket" and then dropping "Ticket"
            # would throw the feature away.
            X["TicketClass"] = X["Ticket"].map(lambda ticket: ticket[0])
            X = X.drop(columns="Ticket")
        if self.drop_id:
            X = X.drop(columns="PassengerId")
        return np.c_[X]

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Columns routed through the categorical branch of the preprocessing.
cat_features = [
    "Pclass",
    "Sex",
    "SibSp",
    "Parch",
    "Cabin",
    "Name",
    "PassengerId",
    "Ticket",
]

# NOTE(review): the imputer is placed after the dense one-hot encoder, so it only
# ever receives the encoded matrix — confirm this ordering is intended.
cat_pipeline = Pipeline(
    steps=[
        ("attribs_transformer", AttributeTransformer()),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ]
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns: fill missing values with the column mean, then standardize.
num_features = ["Age", "Fare"]

num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("standard_scaler", StandardScaler()),
    ]
)

In [None]:
# Embarkation port: fill missing values with the most frequent one, then one-hot encode.
embarked_feature = ["Embarked"]

embark_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

In [None]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own sub-pipeline; columns not listed in any
# group are passed through untouched.
full_pipeline = ColumnTransformer(
    transformers=[
        ("embarked", embark_pipeline, embarked_feature),
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ],
    remainder="passthrough",
)

In [None]:
from sklearn.model_selection import train_test_split

# Seed used for the data splits in this notebook.
random_state = 42

# Hold out 20% of the labelled data; the polars objects are converted to pandas first.
X_train, X_test, y_train, y_test = train_test_split(
    *(polars_obj.to_pandas() for polars_obj in (X, y)),
    test_size=0.2,
    random_state=random_state,
)

In [None]:
# Split the held-out part in half: one half for validation, the other for the final test.
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    random_state=random_state,
    test_size=0.5,
)

In [None]:
# Fit the preprocessing on the training split only, then apply it to every split.
fitted_pipeline = full_pipeline.fit(X_train)

X_train_transformed, X_val_transformed, X_test_transformed = (
    fitted_pipeline.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for the random forest.
rf_param_grid = {
    "n_estimators": [10, 13, 15, 20],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 15, 20, 30, 50],
    "min_samples_split": [2, 3, 4, 5, 10],
    "min_samples_leaf": [1, 2, 3, 5],
    "max_features": ["sqrt", "log2"],
}

# Seed the forest with the notebook's `random_state` so that the scores compared
# by the search, and the selected parameters, are reproducible between runs.
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=random_state),
    param_grid=rf_param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=20,
    verbose=1,
).fit(X_train_transformed, y_train)

In [None]:
# Best cross-validated accuracy and the parameter combination that produced it.
print(rf_grid_search.best_score_, rf_grid_search.best_params_, sep="\n")

In [None]:
from sklearn.model_selection import cross_val_score

rf_clf = rf_grid_search.best_estimator_

# Score the already-fitted best estimator on the validation split.
# cross_val_score would clone the estimator and refit it on folds of the small
# validation split, so the model selected by the grid search would never be
# evaluated on this data.
rf_val_score = rf_clf.score(X_val_transformed, y_val)

print(rf_val_score)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the tuned random forest on the held-out test split.
rf_preds = rf_clf.predict(X_test_transformed)
rf_score_test = accuracy_score(y_test, rf_preds)
print(f"Random Forest score on test set: {rf_score_test:.3f}")

In [None]:
from sklearn.linear_model import LogisticRegression

# Settings shared by every solver/penalty combination.
lr_shared_grid = {
    "tol": [0.001, 0.01, 0.1, 1],
    "C": [0.1, 1, 10, 20, 30, 40, 50, 100],
    "max_iter": [1000, 1500, 2000],
}

# Only pair each solver with penalties it supports. A single flat grid also
# generates combinations such as sag + l1, liblinear + elasticnet, or elasticnet
# without an l1_ratio, whose fits fail and only add warnings and wasted work.
lr_param_grid = [
    {**lr_shared_grid, "solver": ["liblinear", "saga"], "penalty": ["l1", "l2"]},
    {**lr_shared_grid, "solver": ["sag"], "penalty": ["l2"]},
    {
        **lr_shared_grid,
        "solver": ["saga"],
        "penalty": ["elasticnet"],
        "l1_ratio": [0.25, 0.5, 0.75],
    },
]

# Seeded with the notebook's `random_state` so the search is reproducible.
lr_grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=random_state),
    param_grid=lr_param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=20,
    verbose=1,
).fit(X_train_transformed, y_train)

In [None]:
# Best cross-validated accuracy and the parameter combination that produced it.
print(lr_grid_search.best_score_, lr_grid_search.best_params_, sep="\n")

In [None]:
lr_clf = lr_grid_search.best_estimator_

# Score the already-fitted best estimator on the validation split.
# cross_val_score would clone the estimator and refit it on folds of the small
# validation split, so the model selected by the grid search would never be
# evaluated on this data.
lr_val_score = lr_clf.score(X_val_transformed, y_val)

print(lr_val_score)

In [None]:
# Accuracy of the tuned logistic regression on the held-out test split.
lr_preds = lr_clf.predict(X_test_transformed)
lr_score_test = accuracy_score(y_test, lr_preds)
print(f"Logistic Regression score on test set: {lr_score_test:.3f}")

In [None]:
from sklearn.svm import SVC

# Values shared by both kernels.
svc_shared_grid = {
    "C": [10, 20, 30],
    "tol": [0.1, 1, 3, 5],
}

# Give each kernel only the parameters it uses: `degree` applies to the
# polynomial kernel (not searched here) and `gamma` does not apply to the linear
# kernel. Crossing them in one flat grid refits identical models many times.
svc_param_grid = [
    {**svc_shared_grid, "kernel": ["linear"]},
    {**svc_shared_grid, "kernel": ["rbf"], "gamma": [0.001, 0.1, 1, 3, 5]},
]

svc_grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=svc_param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=20,
    verbose=1,
).fit(X_train_transformed, y_train)

In [None]:
# Best cross-validated accuracy and the parameter combination that produced it.
print(svc_grid_search.best_score_, svc_grid_search.best_params_, sep="\n")

In [None]:
svc_clf = svc_grid_search.best_estimator_

# Score the already-fitted best estimator on the validation split.
# cross_val_score would clone the estimator and refit it on folds of the small
# validation split, so the model selected by the grid search would never be
# evaluated on this data.
svc_val_score = svc_clf.score(X_val_transformed, y_val)

print(svc_val_score)

In [None]:
# Accuracy of the tuned SVC on the held-out test split.
svc_preds = svc_clf.predict(X_test_transformed)
svc_score_test = accuracy_score(y_test, svc_preds)
print(f"SVC score on test set: {svc_score_test:.3f}")

In [None]:
from sklearn.ensemble import VotingClassifier

# Majority-vote ensemble over the three tuned classifiers, fitted on the training split.
voting_members = [
    ("random_forest", rf_clf),
    ("logistic_regression", lr_clf),
    ("svc", svc_clf),
]

vote_clf = VotingClassifier(estimators=voting_members, voting="hard")
vote_clf = vote_clf.fit(X_train_transformed, y_train)

In [None]:
# Score the already-fitted voting ensemble on the validation split.
# cross_val_score would clone the ensemble and refit it on folds of the small
# validation split instead of evaluating the model fitted on the training data.
vote_val_score = vote_clf.score(X_val_transformed, y_val)

print(vote_val_score)

In [None]:
# Accuracy of the voting ensemble on the held-out test split.
vote_preds = vote_clf.predict(X_test_transformed)
vote_score_test = accuracy_score(y_test, vote_preds)
# The message names the voting classifier; it previously carried the SVC label
# copied from the SVC cell.
print(f"Voting Classifier score on test set: {vote_score_test:.3f}")

In [None]:
# Apply the preprocessing fitted on the training split to the test CSV frame
# (converted to pandas first); the result feeds the prediction cells below.
X_submit = fitted_pipeline.transform(df_test.to_pandas())

In [None]:
import os

# Directory that receives the prediction CSV files written below.
RESULTS_DIR = CHAPTER_ROOT / "results"

# exist_ok=True creates the directory (and parents) when missing and is a no-op
# when it already exists, without a separate check-then-create step.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [None]:
# Single-column frame of test passenger ids, joined to each model's predictions below.
passenger_ids = df_test.select("PassengerId")

In [None]:
# Random forest predictions for the test CSV, paired with passenger ids and written to disk.
rf_preds = pl.DataFrame({"Survived": rf_clf.predict(X_submit).tolist()})

rf_preds_df = pl.concat([passenger_ids, rf_preds], how="horizontal")

rf_preds_df.write_csv(RESULTS_DIR / "rf_preds.csv")

In [None]:
# Logistic regression predictions for the test CSV, paired with passenger ids and written to disk.
lr_preds = pl.DataFrame({"Survived": lr_clf.predict(X_submit).tolist()})

lr_preds_df = pl.concat([passenger_ids, lr_preds], how="horizontal")

lr_preds_df.write_csv(RESULTS_DIR / "lr_preds.csv")

In [None]:
# SVC predictions for the test CSV, paired with passenger ids and written to disk.
svc_preds = pl.DataFrame({"Survived": svc_clf.predict(X_submit).tolist()})

svc_preds_df = pl.concat([passenger_ids, svc_preds], how="horizontal")

svc_preds_df.write_csv(RESULTS_DIR / "svc_preds.csv")

In [None]:
# Voting ensemble predictions for the test CSV, paired with passenger ids and written to disk.
vote_preds = pl.DataFrame({"Survived": vote_clf.predict(X_submit).tolist()})

vote_preds_df = pl.concat([passenger_ids, vote_preds], how="horizontal")

vote_preds_df.write_csv(RESULTS_DIR / "vote_preds.csv")