# Setup & Imports

In [None]:
%load_ext autoreload
%autoreload 2

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl

from src.titanic.constants import PROJECT_ROOT

DATA_DIR = PROJECT_ROOT / "data"
RESULTS_DIR = PROJECT_ROOT / "results"

In [None]:
df_train = pl.read_csv(DATA_DIR / "train.csv")

In [None]:
df_train

# EDA (Exploratory Data Analysis)

In [None]:
c_map = "cividis"


def get_colormap(df: pl.DataFrame, column: str = "survival_mean", cmap=c_map) -> list:
    """Return one colour per row of ``df[column]``.

    The column's values are scaled linearly between its minimum and maximum
    and then looked up in the requested matplotlib colormap.

    Args:
        df: Frame holding the values that drive the colours.
        column: Name of the column to colour by.
        cmap: Colormap passed to ``plt.get_cmap``; defaults to the module-level ``c_map``.

    Returns:
        A list with one colormap lookup result per row, in row order.
    """
    values = df[column]
    norm = mcolors.Normalize(vmin=values.min(), vmax=values.max())
    # Look up the colormap from the ``cmap`` argument (not the global ``c_map``)
    # so that callers can actually override the palette.
    colormap = plt.get_cmap(cmap)
    return [colormap(norm(value)) for value in values.to_list()]


def plot_barplot(
    df: pl.DataFrame,
    x: list,
    y: list,
    labels: list,
    title: str,
    xlabel: str,
    different_labels: list | None = None,
    fontsize: str = "large",
    title_fontsize: str = "xx-large",
    ylabel: str = "Survival Mean",
    figsize: tuple[int, int] = (12, 6),
    set_ylim: bool = True,
    **kwargs,
):
    """Draw a colour-coded bar chart with a number printed above each bar.

    Args:
        df: Frame handed to ``get_colormap`` to derive one colour per bar.
        x: Bar positions (or category names).
        y: Bar heights.
        labels: Per-bar legend entries.
        title: Text used as the legend title.
        xlabel: Label of the x axis.
        different_labels: Numbers to print above the bars instead of ``y``.
        fontsize: Font size of the legend entries.
        title_fontsize: Font size of the legend title.
        ylabel: Label of the y axis.
        figsize: Figure size passed to ``plt.subplots``.
        set_ylim: When true, fix the y axis to the range 0-1.
        **kwargs: Forwarded to ``get_colormap`` (e.g. ``column``, ``cmap``).

    Returns:
        The matplotlib axes holding the chart.
    """
    _fig, ax = plt.subplots(figsize=figsize)

    bars = ax.bar(x, y, label=labels, color=get_colormap(df, **kwargs))

    if set_ylim:
        ax.set_ylim(0, 1)

    # Decide once which numbers annotate the bars; an empty or missing
    # ``different_labels`` falls back to the bar heights.
    annotations = different_labels if different_labels else y

    for bar, value in zip(bars, annotations):
        # Write on the axes created above rather than on pyplot's implicit
        # "current" axes, so the text always lands on this chart.
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height(),
            f"{value:.2f}",
            ha="center",
            va="bottom",
        )

    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    ax.legend(title=title, title_fontsize=title_fontsize, fontsize=fontsize)

    return ax

#### Pclass

In [None]:
# Average of "Survived" per ticket class, ordered by class number.
pclass_vs_survival_mean = (
    df_train.group_by("Pclass")
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort("Pclass")
)

pclass = pclass_vs_survival_mean.get_column("Pclass").to_list()
survivability = pclass_vs_survival_mean.get_column("survival_mean").to_list()
pclass_bar_labels = pclass

ax = plot_barplot(
    pclass_vs_survival_mean,
    pclass,
    survivability,
    pclass_bar_labels,
    "Survival Rate across different Ticket Classes",
    "Ticket Class",
    fontsize="xx-large",
)

# One tick per class instead of matplotlib's automatic numeric ticks.
ax.set_xticks(pclass)

plt.show()

In [None]:
# Number of passengers per ticket class, ordered by class number.
pclass_prevalence = df_train["Pclass"].value_counts().sort("Pclass")

# Bars are coloured by their own height (the passenger count).
colors = get_colormap(pclass_prevalence, column="count")

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(
    pclass_prevalence["Pclass"].cast(pl.String),
    pclass_prevalence["count"],
    color=colors,
    label=pclass_bar_labels,
)
# The y axis spans the whole data set so bar heights read as shares of all rows.
ax.set(xlabel="Pclass", ylabel="Prevalence", ylim=(0, df_train.height))
ax.legend(title="Pclass Distribution", title_fontsize="xx-large", fontsize="xx-large")

plt.show()

In [None]:
"""Map Pclass values: 3 -> 0, 2 -> 1, and 1 -> 2 so that the encoded value increases with the survival rate."""

#### Name

In [None]:
# Title prefix of a name: the last word before the first "." (for a name
# shaped like "Last, Mr. First" this yields "Mr").
prefix_expr = (
    pl.col("Name").str.split(by=".").list.get(0).str.split(by=" ").list.get(-1).alias("prefix")
)

# Survival rate and passenger count per prefix in a single aggregation pass,
# so both statistics are guaranteed to be row-aligned.
prefix_stats = (
    df_train.with_columns(prefix_expr)
    .group_by("prefix")
    .agg(
        pl.col("Survived").mean().alias("survival_mean"),
        pl.col("Survived").len().alias("count"),
    )
    .sort(by="survival_mean", descending=False)
)

prefix_vs_survival_mean = prefix_stats.select("prefix", "survival_mean")
prefix_prevalence = prefix_stats.select("prefix", "count")

prefixes = prefix_stats["prefix"].to_list()
survivability = prefix_stats["survival_mean"].to_list()
prevalence = prefix_stats["count"].to_list()
prefix_bar_labels = prefixes

# Bar height is the prevalence; colour and the printed number are the survival rate.
ax = plot_barplot(
    df=prefix_vs_survival_mean,
    x=prefixes,
    y=prevalence,
    labels=prefix_bar_labels,
    different_labels=survivability,
    title="Survival Rate across different Prefixes",
    ylabel="Prevalence",
    xlabel="Prefix",
    title_fontsize="x-large",
    fontsize="medium",
    figsize=(15, 6),
    set_ylim=False,
)

plt.show()

In [None]:
"""Use one hot encoding to encode prefixes."""

#### Sex

In [None]:
# Average of "Survived" per sex, highest survival rate first.
sex_vs_survival_mean = (
    df_train.group_by("Sex")
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort("survival_mean", descending=True)
)

genders = sex_vs_survival_mean.get_column("Sex").to_list()
survivability = sex_vs_survival_mean.get_column("survival_mean").to_list()
sex_bar_labels = genders

ax = plot_barplot(
    sex_vs_survival_mean,
    genders,
    survivability,
    sex_bar_labels,
    "Survival Rate across different Genders",
    "Gender",
    fontsize="xx-large",
)

plt.show()

In [None]:
# Number of passengers per sex, ordered alphabetically.
sex_prevalence = df_train.get_column("Sex").value_counts().sort(by="Sex")

colors = get_colormap(sex_prevalence, column="count")

fig, ax = plt.subplots(figsize=(12, 6))

# Take the legend entries from this frame's own row order. The labels built in
# the survival-rate cell are sorted by survival rate, which only matches the
# alphabetical order used here by coincidence.
sex_categories = sex_prevalence["Sex"].to_list()

ax.bar(sex_categories, sex_prevalence["count"].to_list(), label=sex_categories, color=colors)
ax.set_xlabel("Sex")
ax.set_ylabel("Prevalence")
ax.legend(title="Sex Distribution", title_fontsize="xx-large", fontsize="xx-large")

# The y axis spans the whole data set so bar heights read as shares of all rows.
ax.set_ylim(0, df_train.height)

plt.show()

In [None]:
"""Encode gender in a binary column where female = 1 and male = 0 to maintain the meaningful difference in survival rates."""

#### Age

In [None]:
jump = 10
# Round the bracket count up so the oldest passenger always falls inside a
# bracket, even when the maximum age is not close to a multiple of ``jump``.
n_age_brackets = int(np.ceil(df_train["Age"].max() / jump))
# Bracket label ("0-10", "10-20", ...) -> bracket index.
age_grouping = {f"{i*jump}-{i*jump+jump}": i for i in range(n_age_brackets)}


def map_age(age):
    """Return the index of the first bracket whose upper bound is >= ``age``.

    Brackets are read from the module-level ``age_grouping``; upper bounds are
    inclusive, so an age equal to a bound belongs to the lower bracket.
    Returns ``None`` when ``age`` exceeds every bracket.
    """
    for bracket, index in age_grouping.items():
        upper = float(bracket.split("-")[-1])
        if float(age) <= upper:
            return index
    return None


age_group_vs_survival_mean = (
    df_train.with_columns(
        # Ages that could not be mapped (missing values) get the sentinel group -1.
        pl.col("Age").map_elements(map_age, return_dtype=pl.Int8).fill_null(-1).alias("age_group")
    )
    .group_by("age_group")
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort(by="age_group", descending=False)
)

# Bracket index -> bracket label, used to name exactly the groups that occur.
bracket_by_index = {index: bracket for bracket, index in age_grouping.items()}

age_groups = age_group_vs_survival_mean["age_group"].to_list()
survivability = age_group_vs_survival_mean["survival_mean"].to_list()
# Derive the labels from the groups that are present instead of assuming that
# an "Unknown" group exists and that every bracket is populated.
age_bar_labels = [bracket_by_index.get(group, "Unknown") for group in age_groups]

ax = plot_barplot(
    df=age_group_vs_survival_mean,
    x=age_groups,
    y=survivability,
    labels=age_bar_labels,
    title="Survival Rate across different Age Groups",
    xlabel="Age Group",
    fontsize="medium",
)

ax.set_xticks(age_groups)
ax.set_xticklabels(age_bar_labels)

plt.show()

In [None]:
age_group_prevalence = (
    df_train.with_columns(
        # Ages that could not be mapped (missing values) get the sentinel group -1
        # so that they are drawn as a regular bar.
        pl.col("Age").map_elements(map_age, return_dtype=pl.Int8).fill_null(-1).alias("age_group")
    )
    .get_column("age_group")
    .value_counts()
    .sort(by="age_group", descending=False)
)

colors = get_colormap(age_group_prevalence, column="count")

# Positions and labels both come from this frame, so every tick sits under the
# bar it names (shifting the ticks by one would mislabel every bar).
age_group_positions = age_group_prevalence["age_group"].to_list()
bracket_by_index = {index: bracket for bracket, index in age_grouping.items()}
age_group_labels = [bracket_by_index.get(position, "Unknown") for position in age_group_positions]

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(
    age_group_positions,
    age_group_prevalence["count"].to_list(),
    label=age_group_labels,
    color=colors,
)
ax.set_xlabel("Age Group")
ax.set_ylabel("Prevalence")
ax.set_xticks(age_group_positions)
ax.set_xticklabels(age_group_labels)
ax.legend(title="Age Distribution", title_fontsize="xx-large", fontsize="x-large")

# The y axis spans the whole data set so bar heights read as shares of all rows.
ax.set_ylim(0, df_train.height)

plt.show()

In [None]:
"""Split into age categories (every 10 years), fill the nulls with median value for age."""

#### SibSp

In [None]:
# Average of "Survived" per SibSp value, ordered by SibSp.
sibsp_vs_survival_mean = (
    df_train.group_by("SibSp")
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort("SibSp")
)

sibsp_sizes = sibsp_vs_survival_mean.get_column("SibSp").to_list()
survivability = sibsp_vs_survival_mean.get_column("survival_mean").to_list()
sibsp_bar_labels = sibsp_sizes

ax = plot_barplot(
    sibsp_vs_survival_mean,
    sibsp_sizes,
    survivability,
    sibsp_bar_labels,
    "Survival Rate across different SibSp values",
    "SibSp",
    fontsize="x-large",
)

# One tick per observed SibSp value.
ax.set_xticks(sibsp_sizes)

plt.show()

In [None]:
# Number of passengers per SibSp value, ordered by SibSp.
sibsp_prevalence = df_train["SibSp"].value_counts().sort("SibSp")

# Bars are coloured by their own height (the passenger count).
colors = get_colormap(sibsp_prevalence, column="count")

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(
    sibsp_prevalence["SibSp"].cast(pl.String),
    sibsp_prevalence["count"],
    color=colors,
    label=sibsp_bar_labels,
)
# The y axis spans the whole data set so bar heights read as shares of all rows.
ax.set(xlabel="SibSp", ylabel="Prevalence", ylim=(0, df_train.height))
ax.legend(title="SibSp Distribution", title_fontsize="xx-large", fontsize="xx-large")

plt.show()

#### Parch

In [None]:
# Average of "Survived" per Parch value, ordered by Parch.
parch_vs_survival_mean = (
    df_train.group_by("Parch")
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort("Parch")
)

parch_sizes = parch_vs_survival_mean.get_column("Parch").to_list()
survivability = parch_vs_survival_mean.get_column("survival_mean").to_list()
parch_bar_labels = parch_sizes

ax = plot_barplot(
    parch_vs_survival_mean,
    parch_sizes,
    survivability,
    parch_bar_labels,
    "Survival Rate across different Parch values",
    "Parch",
)

# One tick per observed Parch value.
ax.set_xticks(parch_sizes)

plt.show()

In [None]:
# Number of passengers per Parch value, ordered by Parch.
parch_prevalence = df_train["Parch"].value_counts().sort("Parch")

# Bars are coloured by their own height (the passenger count).
colors = get_colormap(parch_prevalence, column="count")

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(
    parch_prevalence["Parch"].cast(pl.String),
    parch_prevalence["count"],
    color=colors,
    label=parch_bar_labels,
)
# The y axis spans the whole data set so bar heights read as shares of all rows.
ax.set(xlabel="Parch", ylabel="Prevalence", ylim=(0, df_train.height))
ax.legend(title="Parch Distribution", title_fontsize="xx-large", fontsize="xx-large")

plt.show()

#### Family Size

In [None]:
# Family size is the sum of the SibSp and Parch columns; the survival rate is
# the average of "Survived" within each size.
family_size_vs_survival_mean = (
    df_train.group_by((pl.col("SibSp") + pl.col("Parch")).alias("family_size"))
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort("family_size")
)

family_sizes = family_size_vs_survival_mean.get_column("family_size").to_list()
survivability = family_size_vs_survival_mean.get_column("survival_mean").to_list()
family_bar_labels = family_sizes

ax = plot_barplot(
    family_size_vs_survival_mean,
    family_sizes,
    survivability,
    family_bar_labels,
    "Survival Rate across different Family Sizes",
    "Family Size",
)

# One tick per observed family size.
ax.set_xticks(family_sizes)

plt.show()

In [None]:
# Number of passengers per family size (SibSp + Parch), ordered by size.
family_size_prevalence = (
    df_train.select((pl.col("SibSp") + pl.col("Parch")).alias("family_size"))
    .get_column("family_size")
    .value_counts()
    .sort("family_size")
)

# Bars are coloured by their own height (the passenger count).
colors = get_colormap(family_size_prevalence, column="count")

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(
    family_size_prevalence["family_size"].cast(pl.String),
    family_size_prevalence["count"],
    color=colors,
    label=family_bar_labels,
)
# The y axis spans the whole data set so bar heights read as shares of all rows.
ax.set(xlabel="Family Size", ylabel="Prevalence", ylim=(0, df_train.height))
ax.legend(title="Family Size Distribution", title_fontsize="xx-large", fontsize="xx-large")

plt.show()

In [None]:
"""Combine SibSp and Parch to one family_size feature with 0 as no family on board, 1 for 1, 2, 3 family members, and 2 for > 3."""

#### Fare

In [None]:
# How many passengers paid each distinct fare, ordered by fare.
fare_prevalence = df_train["Fare"].value_counts().sort("Fare")

# Points are coloured by the fare itself, not by the count.
colors = get_colormap(fare_prevalence, column="Fare")

fig, ax = plt.subplots(figsize=(12, 6))

ax.scatter(fare_prevalence["Fare"], fare_prevalence["count"], color=colors)

plt.show()

In [None]:
fare_jump = 30
# Round the bracket count up so the most expensive fare always falls inside a bracket.
n_fare_brackets = int(np.ceil(df_train["Fare"].max() / fare_jump))
# Bracket label ("0-30", "30-60", ...) -> bracket index. Named ``fare_brackets``
# so it is not overwritten by the ``fare_mapping`` function defined in the
# data-preparation section.
fare_brackets = {
    f"{i*fare_jump}-{(i+1)*fare_jump}": i for i in range(n_fare_brackets)
}


def group_fare(price):
    """Return the index of the first fare bracket whose upper bound is >= ``price``.

    Upper bounds are inclusive. Returns ``None`` when ``price`` exceeds every bracket.
    """
    for bracket, index in fare_brackets.items():
        if price <= float(bracket.split("-")[-1]):
            return index
    return None


fare_groups_df = df_train.with_columns(
    pl.col("Fare").map_elements(group_fare, return_dtype=pl.Int8).alias("fare_group")
)

# Average of "Survived" per fare bracket, ordered by bracket index.
fare_groups_vs_survival_mean = (
    fare_groups_df.group_by("fare_group")
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort(by="fare_group", descending=False)
)

fare_groups = fare_groups_vs_survival_mean["fare_group"].to_list()
survivability = fare_groups_vs_survival_mean["survival_mean"].to_list()
fare_groups_bar_labels = fare_groups

ax = plot_barplot(
    df=fare_groups_vs_survival_mean,
    x=fare_groups,
    y=survivability,
    labels=fare_groups_bar_labels,
    title="Survival Rate across different Fare Groups",
    xlabel="Fare Group",
    title_fontsize="x-large",
    fontsize="large",
)

ax.set_xticks(fare_groups_bar_labels)

plt.show()

In [None]:
# Number of passengers per fare bracket, ordered by bracket index.
fare_groups_prevalence = fare_groups_df["fare_group"].value_counts().sort("fare_group")

# Bars are coloured by their own height (the passenger count).
colors = get_colormap(fare_groups_prevalence, column="count")

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(
    fare_groups_prevalence["fare_group"].cast(pl.String),
    fare_groups_prevalence["count"],
    color=colors,
    label=fare_groups_bar_labels,
)
# The y axis spans the whole data set so bar heights read as shares of all rows.
ax.set(xlabel="Fare Group", ylabel="Prevalence", ylim=(0, df_train.height))
ax.legend(title="Fare Group Distribution", title_fontsize="xx-large", fontsize="xx-large")

plt.show()

In [None]:
"""Transform into categorical variable with groups every 30 price units paid for fare. Then do one hot encoding."""

#### Cabin

In [None]:
# Flag passengers with a recorded cabin: 1 when "Cabin" is present, 0 when it is null.
cabin_binary = df_train.with_columns(
    pl.when(pl.col("Cabin").is_not_null()).then(1).otherwise(0).alias("has_cabin")
)

# Average of "Survived" for passengers without (0) and with (1) a cabin.
cabin_vs_survival_mean = (
    cabin_binary.group_by("has_cabin")
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort("has_cabin")
)

cabin_flags = cabin_vs_survival_mean.get_column("has_cabin").to_list()

ax = plot_barplot(
    cabin_vs_survival_mean,
    cabin_flags,
    cabin_vs_survival_mean.get_column("survival_mean").to_list(),
    cabin_flags,
    "Survival Rate for (not) having a Cabin",
    "Has Cabin",
    title_fontsize="xx-large",
    fontsize="xx-large",
)

# Only the two flag values are meaningful tick positions.
ax.set_xticks(cabin_flags)

plt.show()

In [None]:
# Number of passengers without (0) and with (1) a recorded cabin.
cabin_prevalence = cabin_binary["has_cabin"].value_counts().sort("has_cabin")

# Bars are coloured by their own height (the passenger count).
colors = get_colormap(cabin_prevalence, column="count")

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(
    cabin_prevalence["has_cabin"].cast(pl.String),
    cabin_prevalence["count"],
    color=colors,
    label=cabin_prevalence["has_cabin"].to_list(),
)
# The y axis spans the whole data set so bar heights read as shares of all rows.
ax.set(xlabel="Has Cabin", ylabel="Prevalence", ylim=(0, df_train.height))
ax.legend(title="Has Cabin Distribution", title_fontsize="xx-large", fontsize="xx-large")

plt.show()

In [None]:
"""Transform into binary feature 0 -> does not have cabin, 1 -> has cabin."""

#### Embarked

In [None]:
# Average of "Survived" per port; rows without a port are left out. Ports are
# listed in reverse alphabetical order.
embarked_vs_survival_mean = (
    df_train.filter(pl.col("Embarked").is_not_null())
    .group_by("Embarked")
    .agg(pl.col("Survived").mean().alias("survival_mean"))
    .sort("Embarked", descending=True)
)

ports = embarked_vs_survival_mean.get_column("Embarked").to_list()
survivability = embarked_vs_survival_mean.get_column("survival_mean").to_list()
port_bar_labels = ports

ax = plot_barplot(
    embarked_vs_survival_mean,
    ports,
    survivability,
    port_bar_labels,
    "Survival Rate across different Ports",
    "Embarked",
    title_fontsize="xx-large",
    fontsize="xx-large",
)

plt.show()

In [None]:
# Number of passengers per port (rows without a port are left out), in reverse
# alphabetical order.
embarked_prevalence = (
    df_train.filter(pl.col("Embarked").is_not_null())["Embarked"]
    .value_counts()
    .sort("Embarked", descending=True)
)

# Bars are coloured by their own height (the passenger count).
colors = get_colormap(embarked_prevalence, column="count")

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(
    embarked_prevalence["Embarked"].cast(pl.String),
    embarked_prevalence["count"],
    color=colors,
    label=port_bar_labels,
)
# The y axis spans the whole data set so bar heights read as shares of all rows.
ax.set(xlabel="Embarked", ylabel="Prevalence", ylim=(0, df_train.height))
ax.legend(title="Embarked Distribution", title_fontsize="xx-large", fontsize="xx-large")

plt.show()

In [None]:
"""Leave categorical, do one hot encoding."""

#### Ticket

In [None]:
"""Drop the Ticket feature: it carries too much irrelevant data, which risks confusing the models."""

In [None]:
df_train.null_count()

In [None]:
"""We will impute 'Age' with the median age of the passenger's sex group, transform 'Cabin' into a binary cabin/no-cabin flag, and impute 'Embarked' with the most frequent value."""

# Data Preparation

In [None]:
def pclass_mapping(pclass):
    """Re-encode a ticket class so that 3 -> 0, 2 -> 1 and 1 -> 2.

    Raises:
        KeyError: If ``pclass`` is not 1, 2 or 3.
    """
    return {3: 0, 2: 1, 1: 2}[pclass]


def age_mapping(age, jump: int = 10):
    for i in range(int((df_train["Age"].max() + 1) // jump)):
        if age <= ((i + 1) * jump):
            return i


def fare_mapping(price, jump: int = 30):
    for i in range(int((df_train["Fare"].max() + (jump - 1)) // jump)):
        if price <= ((i + 1) * jump):
            return i


# Median age per sex, used to fill in missing ages. The names and the
# preparation notes call for the median, so compute the median, not the mean.
female_age_median = df_train.filter(pl.col("Sex") == "female").get_column("Age").median()
male_age_median = df_train.filter(pl.col("Sex") == "male").get_column("Age").median()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class AttributeTransformer(BaseEstimator, TransformerMixin):
    """Turn the raw passenger columns into the engineered model features.

    ``fit`` learns the median age of each sex from the frame it receives, so
    the imputation values come only from the data the pipeline is fitted on.
    ``transform`` fills missing ages with those medians and returns
    ``Embarked`` plus the derived columns ``p_class``, ``prefix``, ``gender``,
    ``age_group``, ``family_size``, ``fare_group`` and ``has_cabin``.

    Both methods expect a polars DataFrame.
    """

    def fit(self, X, y=None):
        """Learn the per-sex median of ``Age`` from ``X`` and return ``self``."""
        self.female_age_median_ = X.filter(pl.col("Sex") == "female").get_column("Age").median()
        self.male_age_median_ = X.filter(pl.col("Sex") == "male").get_column("Age").median()
        return self

    def transform(self, X):
        """Return the engineered feature frame for ``X`` (requires a prior ``fit``)."""
        age_is_missing = pl.col("Age").is_null()
        relatives = pl.col("SibSp") + pl.col("Parch")

        # Impute first so that ``age_group`` below is computed on the filled ages.
        X = X.with_columns(
            pl.when(age_is_missing & (pl.col("Sex") == "female"))
            .then(self.female_age_median_)
            .when(age_is_missing & (pl.col("Sex") == "male"))
            .then(self.male_age_median_)
            .otherwise(pl.col("Age"))
            .alias("Age")
        )

        # Keep "Embarked" as-is and replace every other raw column by its derived feature.
        return X.select(
            "Embarked",
            p_class=pl.col("Pclass").map_elements(pclass_mapping, return_dtype=pl.Int8),
            prefix=pl.col("Name").str.split(by=".").list.get(0).str.split(by=" ").list.get(-1),
            gender=pl.when(pl.col("Sex") == "male").then(0).otherwise(1),
            age_group=pl.col("Age").map_elements(age_mapping, return_dtype=pl.Int8),
            # 0 = travelling alone, 1 = one to three relatives, 2 = more than three.
            family_size=(
                pl.when(relatives == 0).then(0).when(relatives.is_in([1, 2, 3])).then(1).otherwise(2)
            ),
            fare_group=pl.col("Fare").map_elements(fare_mapping, return_dtype=pl.Int8),
            has_cabin=pl.when(pl.col("Cabin").is_null()).then(0).otherwise(1),
        )

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Columns that are one-hot encoded; all remaining columns pass through unchanged.
cat_features_to_ohe = ["prefix", "age_group", "Embarked", "family_size", "fare_group"]

# NOTE(review): the imputer runs after the encoder, so it only ever sees the
# dense one-hot output. If missing "Embarked" values are meant to be replaced by
# the most frequent port, the imputer presumably has to come first - confirm.
cat_ohe_pipeline = Pipeline(
    steps=[
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ]
)

# Feature engineering followed by the encoding of the categorical features.
full_pipeline = Pipeline(
    [
        ("attr_transformer", AttributeTransformer()),
        (
            "encoder",
            ColumnTransformer(
                [("encode_ohe", cat_ohe_pipeline, cat_features_to_ohe)],
                remainder="passthrough",
            ),
        ),
    ]
)

In [None]:
from sklearn.model_selection import train_test_split

random_state = 42

# Target column and the remaining feature columns.
y = df_train.get_column("Survived")
X = df_train.drop("Survived")

# Hold out 20% of the rows for validation, keeping the class balance of ``y``.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=random_state
)

In [None]:
# Fit the preprocessing on the training split only, then apply it to both splits.
fitted_pipeline = full_pipeline.fit(X_train)

X_train_transformed, X_val_transformed = (
    fitted_pipeline.transform(split) for split in (X_train, X_val)
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf_param_grid = {
    "n_estimators": [10, 13, 15, 20],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 15, 20, 30, 50],
    "min_samples_split": [2, 3, 4, 5, 10],
    "min_samples_leaf": [1, 2, 3, 5],
    "max_features": ["sqrt", "log2"],
}

# Seed the forest so that the search (and the selected parameters) can be
# reproduced; an unseeded forest gives different scores on every run.
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=random_state),
    param_grid=rf_param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=20,
    verbose=1,
)
rf_grid_search.fit(X_train_transformed, y_train)

In [None]:
print(rf_grid_search.best_score_)
print(rf_grid_search.best_params_)

In [None]:
from sklearn.model_selection import cross_val_score

# Best forest found by the grid search.
rf_clf = rf_grid_search.best_estimator_

# NOTE(review): cross_val_score refits copies of the estimator on folds of the
# validation split, so this measures how the chosen settings do on that small
# split rather than how the already-trained model generalises - confirm intent.
rf_scores = cross_val_score(
    estimator=rf_clf, X=X_val_transformed, y=y_val, scoring="accuracy", cv=10
)

print(rf_scores.mean())

In [None]:
from sklearn.metrics import accuracy_score

# Score the tuned forest on the held-out validation split. The notebook never
# defines a labelled test split, so the validation data is the only unseen,
# labelled data available.
rf_preds = rf_clf.predict(X_val_transformed)
rf_score_test = accuracy_score(y_val, rf_preds)
print(f"Random Forest score on validation set: {rf_score_test:.3f}")

In [None]:
from sklearn.linear_model import LogisticRegression

# Settings searched for every solver/penalty pairing.
lr_shared_grid = {
    "tol": [0.001, 0.01, 0.1, 1],
    "C": [0.1, 1, 10, 20, 30, 40, 50, 100],
    "max_iter": [1000, 1500, 2000],
}

# Only pair each solver with penalties it supports: "sag" handles "l2" only,
# "liblinear" and "saga" handle "l1" and "l2". "elasticnet" is left out because
# it needs an ``l1_ratio`` that this grid never supplies, so those fits could
# only fail.
lr_param_grid = [
    {"solver": ["liblinear", "saga"], "penalty": ["l1", "l2"], **lr_shared_grid},
    {"solver": ["sag"], "penalty": ["l2"], **lr_shared_grid},
]

# Seeded for reproducibility; only some solvers use the seed.
lr_grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=random_state),
    param_grid=lr_param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=20,
    verbose=1,
)
lr_grid_search.fit(X_train_transformed, y_train)

In [None]:
print(lr_grid_search.best_score_)
print(lr_grid_search.best_params_)

In [None]:
# Best logistic regression found by the grid search.
lr_clf = lr_grid_search.best_estimator_

# NOTE(review): cross_val_score refits copies of the estimator on folds of the
# validation split rather than scoring the already-trained model - confirm intent.
lr_scores = cross_val_score(
    estimator=lr_clf, X=X_val_transformed, y=y_val, scoring="accuracy", cv=10
)

print(lr_scores.mean())

In [None]:
# Score the tuned model on the held-out validation split. The notebook never
# defines a labelled test split, so the validation data is used instead.
lr_preds = lr_clf.predict(X_val_transformed)
lr_score_test = accuracy_score(y_val, lr_preds)
print(f"Logistic Regression score on validation set: {lr_score_test:.3f}")

In [None]:
from sklearn.svm import SVC

# Settings searched for both kernels.
svc_shared_grid = {
    "C": [10, 20, 30],
    "tol": [0.1, 1, 3, 5],
}

# ``gamma`` only affects the "rbf" kernel here, and ``degree`` is used by the
# "poly" kernel alone, so neither is varied where it cannot change the model.
# This avoids fitting many identical candidates.
svc_param_grid = [
    {"kernel": ["linear"], **svc_shared_grid},
    {"kernel": ["rbf"], "gamma": [0.001, 0.1, 1, 3, 5], **svc_shared_grid},
]

svc_grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=svc_param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=20,
    verbose=1,
)
svc_grid_search.fit(X_train_transformed, y_train)

In [None]:
print(svc_grid_search.best_score_)
print(svc_grid_search.best_params_)

In [None]:
# Best support vector classifier found by the grid search.
svc_clf = svc_grid_search.best_estimator_

# NOTE(review): cross_val_score refits copies of the estimator on folds of the
# validation split rather than scoring the already-trained model - confirm intent.
svc_scores = cross_val_score(
    estimator=svc_clf, X=X_val_transformed, y=y_val, scoring="accuracy", cv=10
)

print(svc_scores.mean())

In [None]:
# Score the tuned model on the held-out validation split. The notebook never
# defines a labelled test split, so the validation data is used instead.
svc_preds = svc_clf.predict(X_val_transformed)
svc_score_test = accuracy_score(y_val, svc_preds)
print(f"SVC score on validation set: {svc_score_test:.3f}")

In [None]:
from sklearn.ensemble import VotingClassifier

# Majority vote ("hard" voting) over the three tuned models.
vote_clf = VotingClassifier(
    voting="hard",
    estimators=[
        ("random_forest", rf_clf),
        ("logistic_regression", lr_clf),
        ("svc", svc_clf),
    ],
)
vote_clf.fit(X_train_transformed, y_train)

In [None]:
# NOTE(review): cross_val_score refits copies of the ensemble on folds of the
# validation split rather than scoring the already-trained ensemble - confirm intent.
vote_scores = cross_val_score(
    estimator=vote_clf, X=X_val_transformed, y=y_val, scoring="accuracy", cv=10
)

print(vote_scores.mean())

In [None]:
# Score the ensemble on the held-out validation split. The notebook never
# defines a labelled test split, so the validation data is used instead.
vote_preds = vote_clf.predict(X_val_transformed)
vote_score_test = accuracy_score(y_val, vote_preds)
print(f"Voting Classifier score on validation set: {vote_score_test:.3f}")

In [None]:
# Unlabelled passengers to predict for the submission files.
# NOTE(review): assumes the file sits next to train.csv as "test.csv" - confirm.
df_test = pl.read_csv(DATA_DIR / "test.csv")

# Pass the polars frame directly: AttributeTransformer is written against the
# polars API, so a pandas frame would fail inside ``transform``.
X_submit = fitted_pipeline.transform(df_test)

In [None]:
import os

# RESULTS_DIR is already defined in the setup cell (PROJECT_ROOT / "results");
# just make sure the directory exists before the prediction files are written.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [None]:
# Single-column frame of passenger ids, placed next to each model's predictions below.
passenger_ids = df_test.select("PassengerId")

In [None]:
# Random-forest predictions for the submission rows, as plain Python ints.
rf_preds = pl.DataFrame({"Survived": rf_clf.predict(X_submit).tolist()})

# PassengerId next to the predicted label, written as a CSV file.
rf_preds_df = pl.concat([passenger_ids, rf_preds], how="horizontal")

rf_preds_df.write_csv(RESULTS_DIR / "rf_preds.csv")

In [None]:
# Logistic-regression predictions for the submission rows, as plain Python ints.
lr_preds = pl.DataFrame({"Survived": lr_clf.predict(X_submit).tolist()})

# PassengerId next to the predicted label, written as a CSV file.
lr_preds_df = pl.concat([passenger_ids, lr_preds], how="horizontal")

lr_preds_df.write_csv(RESULTS_DIR / "lr_preds.csv")

In [None]:
# SVC predictions for the submission rows, as plain Python ints.
svc_preds = pl.DataFrame({"Survived": svc_clf.predict(X_submit).tolist()})

# PassengerId next to the predicted label, written as a CSV file.
svc_preds_df = pl.concat([passenger_ids, svc_preds], how="horizontal")

svc_preds_df.write_csv(RESULTS_DIR / "svc_preds.csv")

In [None]:
# Voting-ensemble predictions for the submission rows, as plain Python ints.
vote_preds = pl.DataFrame({"Survived": vote_clf.predict(X_submit).tolist()})

# PassengerId next to the predicted label, written as a CSV file.
vote_preds_df = pl.concat([passenger_ids, vote_preds], how="horizontal")

vote_preds_df.write_csv(RESULTS_DIR / "vote_preds.csv")