In [29]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from itertools import product

In [31]:
df = pd.read_csv("data/train.csv", delimiter=",").drop(columns=["Name", "Ticket"])
cat_variables = ["Pclass", "Sex", "SibSp", "Parch"]
le = OneHotEncoder(handle_unknown= "ignore", sparse_output=False)
df["Cabin"] = le.fit_transform(df["Cabin"])
df["Embarked"] = le.fit_transform(df["Embarked"])
df = pd.get_dummies(df, prefix=cat_variables, columns=cat_variables)
features = [x for x in df.columns if x != "Survived"]
x, y = df[features], df["Survived"].T
print(x.shape)
print(y.shape)

ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.

In [20]:
X_train, X_cv, y_train, y_cv = train_test_split(x, y, train_size=0.8, random_state=1)

In [21]:
print(f"X train: {X_train.shape}")
print(f"Y train: {y_train.shape}")
print(f"X cv: {X_cv.shape}")
print(f"y cv: {y_cv.shape}")


X train: (712, 24)
Y train: (712,)
X cv: (179, 24)
y cv: (179,)


In [22]:
max_depth_list = [2, 3, 4, 5, 6, 7, 8]
min_samples_split_list = [30, 50, 100, 150, 200, 250, 300]
n_estimators_list = [50, 100, 150, 200]

In [26]:
def eval_models(X_train, y_train, X_cv, y_cv, max_depth, min_samples_split, n_estimators):
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   min_samples_split=min_samples_split,
                                   random_state=1)
    model.fit(X_train, y_train)
    y_cv_hat = model.predict(X_cv)
    cv_error = np.mean(y_cv != y_cv_hat)
    print(f"Max depth: {max_depth}, Min sample split: {min_samples_split}, N estimators: {n_estimators}, Error: {cv_error}")
    return model, max_depth, min_samples_split, n_estimators, cv_error


best_model_params = min([eval_models(X_train, y_train, X_cv, y_cv, max_depth, min_samples_split, n_estimators) for
                         max_depth, min_samples_split, n_estimators in
                         product(max_depth_list, min_samples_split_list, n_estimators_list)], key=lambda x: x[4])[:4]
print(best_model_params[-3:])

Max depth: 2, Min sample split: 30, N estimators: 50, Error: 0.2737430167597765
Max depth: 2, Min sample split: 30, N estimators: 100, Error: 0.25139664804469275
Max depth: 2, Min sample split: 30, N estimators: 150, Error: 0.26256983240223464
Max depth: 2, Min sample split: 30, N estimators: 200, Error: 0.2569832402234637
Max depth: 2, Min sample split: 50, N estimators: 50, Error: 0.2681564245810056
Max depth: 2, Min sample split: 50, N estimators: 100, Error: 0.25139664804469275
Max depth: 2, Min sample split: 50, N estimators: 150, Error: 0.26256983240223464
Max depth: 2, Min sample split: 50, N estimators: 200, Error: 0.26256983240223464
Max depth: 2, Min sample split: 100, N estimators: 50, Error: 0.2681564245810056
Max depth: 2, Min sample split: 100, N estimators: 100, Error: 0.25139664804469275
Max depth: 2, Min sample split: 100, N estimators: 150, Error: 0.26256983240223464
Max depth: 2, Min sample split: 100, N estimators: 200, Error: 0.26256983240223464
Max depth: 2, Min s

In [28]:
df_test = pd.read_csv("data/test.csv", delimiter=",").drop(columns=["Name", "Ticket"])
df_test["Cabin"] = le.transform(df_test["Cabin"])
df_test["Embarked"] = le.transform(df_test["Embarked"])
predictions = best_model_params[0].predict(df_test)
output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)

ValueError: y contains previously unseen labels: 'B45'