In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from itertools import product
from competition.titanic.FeaturesEngineering import *
import warnings
warnings.filterwarnings("error", category=FutureWarning)

from sklearn.pipeline import Pipeline

(891, 11)
(891,)


In [2]:
# Load the training set and separate the "Survived" target from the predictor columns.
df = pd.read_csv("../data/train.csv", sep=",")

# Every column except the target is treated as a raw input feature.
features = [column for column in df.columns if column != "Survived"]
x = df[features]
y = df["Survived"]
# Show both shapes, one per line.
print(x.shape, y.shape, sep="\n")

(891, 11)
(891,)


In [3]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_cv, y_train, y_cv = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=1,
)

In [4]:
# Report the shapes of the train / validation partitions, one per line.
print(
    f"X train: {X_train.shape}",
    f"Y train: {y_train.shape}",
    f"X cv: {X_cv.shape}",
    f"y cv: {y_cv.shape}",
    sep="\n",
)


X train: (712, 11)
Y train: (712,)
X cv: (179, 11)
y cv: (179,)


In [5]:
# Hyper-parameter grids explored by the random-forest search below.
max_depth_list = list(range(2, 9))                    # 2, 3, ..., 8
min_samples_split_list = [30, *range(50, 301, 50)]    # 30, then 50..300 in steps of 50
n_estimators_list = list(range(50, 201, 50))          # 50, 100, 150, 200

In [6]:
def eval_models(X_train, y_train, X_cv, y_cv, max_depth, min_samples_split, n_estimators, feature_transformer):
    """Fit one preprocessing + random-forest pipeline and score it on the hold-out split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``Pipeline.fit``.
    X_cv, y_cv
        Hold-out features and labels; accuracy is the mean of ``y_cv == predictions``.
    max_depth, min_samples_split, n_estimators
        Forwarded unchanged to ``RandomForestClassifier`` (with ``random_state=1``).
    feature_transformer
        Object installed as the pipeline's ``"preprocessor"`` step.

    Returns
    -------
    tuple
        ``(fitted_pipeline, max_depth, min_samples_split, n_estimators, cv_accuracy)``.

    Side effect: prints one summary line with the hyper-parameters and accuracy.
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=1,
    )
    model = Pipeline(steps=[("preprocessor", feature_transformer), ("classifier", forest)])
    model.fit(X_train, y_train)

    # Fraction of hold-out rows whose predicted label matches the true label.
    predictions = model.predict(X_cv)
    cv_accuracy = np.mean(y_cv == predictions)
    print(
        f"Max depth: {max_depth}, Min sample split: {min_samples_split}, N estimators: {n_estimators}, Accuracy: {cv_accuracy}")
    return model, max_depth, min_samples_split, n_estimators, cv_accuracy


# Exhaustive grid search: evaluate every (max_depth, min_samples_split, n_estimators)
# combination and keep the run with the highest hold-out accuracy (index 4 of each result).
# engineer_features() is called once per combination — presumably so each pipeline gets
# its own transformer instance; confirm against FeaturesEngineering.
best_model_params = max(
    (
        eval_models(X_train, y_train, X_cv, y_cv, depth, split, trees, engineer_features())
        for depth, split, trees in product(max_depth_list, min_samples_split_list, n_estimators_list)
    ),
    key=lambda run: run[4],
)[:4]  # drop the accuracy; keep (pipeline, max_depth, min_samples_split, n_estimators)
print(best_model_params)

Max depth: 2, Min sample split: 30, N estimators: 50, Accuracy: 0.6927374301675978
Max depth: 2, Min sample split: 30, N estimators: 100, Accuracy: 0.7821229050279329
Max depth: 2, Min sample split: 30, N estimators: 150, Accuracy: 0.776536312849162
Max depth: 2, Min sample split: 30, N estimators: 200, Accuracy: 0.776536312849162
Max depth: 2, Min sample split: 50, N estimators: 50, Accuracy: 0.6927374301675978
Max depth: 2, Min sample split: 50, N estimators: 100, Accuracy: 0.7821229050279329
Max depth: 2, Min sample split: 50, N estimators: 150, Accuracy: 0.776536312849162
Max depth: 2, Min sample split: 50, N estimators: 200, Accuracy: 0.776536312849162
Max depth: 2, Min sample split: 100, N estimators: 50, Accuracy: 0.6927374301675978
Max depth: 2, Min sample split: 100, N estimators: 100, Accuracy: 0.7821229050279329
Max depth: 2, Min sample split: 100, N estimators: 150, Accuracy: 0.7821229050279329
Max depth: 2, Min sample split: 100, N estimators: 200, Accuracy: 0.776536312849

In [7]:
# Predict on the test set with the selected pipeline and write the submission file.
df_test = pd.read_csv("../data/test.csv", sep=",")

# Element 0 of the selection tuple is the fitted pipeline itself.
best_model = best_model_params[0]
y_test_hat = best_model.predict(df_test)

# Two-column frame: the passenger id taken from the test set plus the predicted label.
output = df_test[["PassengerId"]].assign(Survived=y_test_hat)
output.to_csv("submission.csv", index=False)