In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from itertools import product

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MultiLabelBinarizer

In [36]:
def engineer_features():
    # Engineer "Cabin" -> "HasCabin"
    has_cabin_transformer = FunctionTransformer(lambda x: x.notnull().astype(int), validate=False)
    decks = Pipeline(
        [
            ("cabin_to_deck", FunctionTransformer(lambda y: y.iloc[:, 0].apply(lambda c: ['U'] if pd.isna(c) else list({part[0] for part in c.split()})).tolist(), validate=False)),
            ("mlb", MultiLabelBinarizer())
        ]
    )
    cat_variables_transf = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    return ColumnTransformer(
        [
            ("has_cabin", has_cabin_transformer, ["Cabin"]),
            ("decks", decks, ["Cabin"]),
            ("dropper", "drop", ["Name", "Ticket", "Cabin"]),
            ("cat_variables",  cat_variables_transf, ["Sex", "Embarked"])
        ]
    )



df = pd.read_csv("data/train.csv", delimiter=",")

features = [x for x in df.columns if x != "Survived"]
x, y = df[features], df["Survived"].T
print(x.shape)
print(y.shape)

(891, 11)
(891,)


In [3]:
X_train, X_cv, y_train, y_cv = train_test_split(x, y, train_size=0.8, random_state=1)

In [4]:
print(f"X train: {X_train.shape}")
print(f"Y train: {y_train.shape}")
print(f"X cv: {X_cv.shape}")
print(f"y cv: {y_cv.shape}")


X train: (712, 11)
Y train: (712,)
X cv: (179, 11)
y cv: (179,)


In [5]:
max_depth_list = [2, 3, 4, 5, 6, 7, 8]
min_samples_split_list = [30, 50, 100, 150, 200, 250, 300]
n_estimators_list = [50, 100, 150, 200]

In [37]:
def eval_models(X_train, y_train, X_cv, y_cv, max_depth, min_samples_split, n_estimators, feature_transformer):
    model = Pipeline(steps=[
        ("preprocessor", feature_transformer),
        ("classifier", RandomForestClassifier(n_estimators=n_estimators,
                                              max_depth=max_depth,
                                              min_samples_split=min_samples_split,
                                              random_state=1))
    ])
    model.fit(X_train, y_train)
    y_cv_hat = model.predict(X_cv)
    cv_error = np.mean(y_cv != y_cv_hat)
    print(
        f"Max depth: {max_depth}, Min sample split: {min_samples_split}, N estimators: {n_estimators}, Error: {cv_error}")
    return model,feature_transformer, max_depth, min_samples_split, n_estimators, cv_error


best_model_params = min(
    [eval_models(X_train, y_train, X_cv, y_cv, max_depth, min_samples_split, n_estimators, engineer_features()) for
     max_depth, min_samples_split, n_estimators in
     product(max_depth_list, min_samples_split_list, n_estimators_list)], key=lambda x: x[5])[:5]
print(best_model_params[-3:])

TypeError: MultiLabelBinarizer.fit_transform() takes 2 positional arguments but 3 were given

In [None]:
# df_test = pd.read_csv("data/test.csv", delimiter=",")
# df_test = engineer_features(df_test)
# predictions = best_model_params[0].predict(df_test)
# output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
# output.to_csv('submission.csv', index=False)