In [41]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
from itertools import product
import warnings
warnings.filterwarnings("error", category=FutureWarning)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MultiLabelBinarizer

In [42]:
# Since MultiLabelBinarizer only take (y) during fit and fit_transform, we create this wrapper
# to align it with other transformers.
# It's the same idea as OneHotEncoder -> y is passed to fit() but not used
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, **kwargs):
        self.classes_ = None
        self.mlb = MultiLabelBinarizer(**kwargs)

    def fit(self, X, y=None):  # accepts (X, y)
        self.mlb.fit(X)
        # This is important because scikit-learn verify estimator's fittedness based on existence of
        # attribute ending with "_" and not starting with "__"
        # Here classes_ is just overwritten from MultiLabelBinarizer
        self.classes_ = self.mlb.classes_
        return self

    def transform(self, X):
        return self.mlb.transform(X)

    def fit_transform(self, X, y=None):
        return self.mlb.fit_transform(X)

    def get_feature_names_out(self, input_features=None):
        return self.mlb.classes_

In [43]:
def engineer_features():
    # Engineer "Cabin" -> "HasCabin"
    has_cabin_transformer = FunctionTransformer(lambda x: x.notnull().astype(int), validate=False)
    decks = Pipeline(
        [
            ("cabin_to_deck", FunctionTransformer(lambda y: y.iloc[:, 0].apply(
                lambda c: ['U'] if pd.isna(c) else list({part[0] for part in c.split()})).tolist(), validate=False)),
            ("mlb", MultiLabelBinarizerTransformer())
        ]
    )
    cat_variables_transf = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    return ColumnTransformer(
        [
            ("has_cabin", has_cabin_transformer, ["Cabin"]),
            ("decks", decks, ["Cabin"]),
            ("dropper", "drop", ["Name", "Ticket", "Cabin"]),
            ("cat_variables", cat_variables_transf, ["Sex", "Embarked"])
        ]
    )


df = pd.read_csv("../data/train.csv", delimiter=",")

features = [x for x in df.columns if x != "Survived"]
x, y = df[features], df["Survived"].T
print(x.shape)
print(y.shape)

(891, 11)
(891,)


In [44]:
X_train, X_cv, y_train, y_cv = train_test_split(x, y, train_size=0.8, random_state=1)

In [45]:
print(f"X train: {X_train.shape}")
print(f"Y train: {y_train.shape}")
print(f"X cv: {X_cv.shape}")
print(f"y cv: {y_cv.shape}")


X train: (712, 11)
Y train: (712,)
X cv: (179, 11)
y cv: (179,)


In [46]:
max_depth_list = [2, 3, 4, 5, 6, 7, 8]
min_samples_split_list = [30, 50, 100, 150, 200, 250, 300]
n_estimators_list = [50, 100, 150, 200]

In [47]:
def eval_models(X_train, y_train, X_cv, y_cv, max_depth, min_samples_split, n_estimators, feature_transformer):
    model = Pipeline(steps=[
        ("preprocessor", feature_transformer),
        ("classifier", RandomForestClassifier(n_estimators=n_estimators,
                                              max_depth=max_depth,
                                              min_samples_split=min_samples_split,
                                              random_state=1))
    ])
    model.fit(X_train, y_train)
    print(f"The model is fitted: {check_is_fitted(model)}")
    print(f"The classifier is fitted: {check_is_fitted(model["classifier"])}")
    print(f"The transformer is fitted: {check_is_fitted(model["preprocessor"])}")
    y_cv_hat = model.predict(X_cv)
    cv_error = np.mean(y_cv != y_cv_hat)
    print(
        f"Max depth: {max_depth}, Min sample split: {min_samples_split}, N estimators: {n_estimators}, Error: {cv_error}")
    return model, max_depth, min_samples_split, n_estimators, cv_error


best_model_params = min(
    [eval_models(X_train, y_train, X_cv, y_cv, max_depth, min_samples_split, n_estimators, engineer_features()) for
     max_depth, min_samples_split, n_estimators in
     product(max_depth_list, min_samples_split_list, n_estimators_list)], key=lambda x: x[4])[:4]
print(best_model_params[-3:])

The model is fitted: None
The classifier is fitted: None
The transformer is fitted: None
Max depth: 2, Min sample split: 30, N estimators: 50, Error: 0.30726256983240224
The model is fitted: None
The classifier is fitted: None
The transformer is fitted: None
Max depth: 2, Min sample split: 30, N estimators: 100, Error: 0.21787709497206703
The model is fitted: None
The classifier is fitted: None
The transformer is fitted: None
Max depth: 2, Min sample split: 30, N estimators: 150, Error: 0.22346368715083798
The model is fitted: None
The classifier is fitted: None
The transformer is fitted: None
Max depth: 2, Min sample split: 30, N estimators: 200, Error: 0.22346368715083798
The model is fitted: None
The classifier is fitted: None
The transformer is fitted: None
Max depth: 2, Min sample split: 50, N estimators: 50, Error: 0.30726256983240224
The model is fitted: None
The classifier is fitted: None
The transformer is fitted: None
Max depth: 2, Min sample split: 50, N estimators: 100, Err

In [50]:
df_test = pd.read_csv("../data/test.csv", delimiter=",")

best_model = best_model_params[0]
y_test_hat = best_model.predict(df_test)
output = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Survived": y_test_hat
})
output.to_csv("submission.csv", index=False)