In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
accuracy_score,
f1_score,
roc_auc_score
)

models_results = pd.DataFrame()

df = pd.read_csv("final_data/df.csv")
df = (df - df.min()) / (df.max() - df.min())
X_submission = pd.read_csv("final_data/X_submit.csv")
X_submission = (X_submission - X_submission.min()) / (X_submission.max() - X_submission.min())


X = df.drop("Transported", axis = 1)
y = df["Transported"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2)

# Random Forest Classifier

## Validation

In [14]:
param_grid = {
    'n_estimators': [50, 100, 150, 200],       # Number of trees in the forest
    'max_depth': [None, 10, 20],       # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],      # Minimum number of samples required to split an internal node}
}
model = RandomForestClassifier()
kf = KFold(n_splits=4, shuffle=True, random_state=23)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=kf)
grid_search.fit(X_val, y_val)

rf_best_params = grid_search.best_params_
rf_best_score = grid_search.best_score_
rf_best_estimator = grid_search.best_estimator_
print("Best Hyperparameters:")
print(rf_best_params)
print("Best score:")
print(rf_best_score)

Best Hyperparameters:
{'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 150}
Best score:
0.8241064234966674


## Train-Test

In [15]:
model = rf_best_estimator
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(accuracy)
print(f1)
print(auc)

models_results["RFC"] = [accuracy, f1, auc]

0.8021472392638037
0.7981220657276996
0.8020551038843723


# MLP

## Validation

In [16]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

params = {
    'hidden_layer_sizes': [(20,), (35,), (50,), (20, 20)],   # Number of units in the hidden layers
    'activation': ['relu', 'tanh', 'logistic'],                    # Activation function for the hidden layers                              # L2 regularization parameter
    'learning_rate_init': [0.005, 0.01, 0.05],                      # Initial learning rate
    'max_iter': [500]
}
model = MLPClassifier()
kf = KFold(n_splits=4, shuffle=True, random_state=23)
grid_search = GridSearchCV(estimator=model, param_grid=params, cv=kf)
grid_search.fit(X_val, y_val)

mlp_best_params = grid_search.best_params_
mlp_best_score = grid_search.best_score_
mlp_best_estimator = grid_search.best_estimator_
print("Best Hyperparameters:")
print(mlp_best_params)
print("Best score:")
print(mlp_best_score)

Best Hyperparameters:
{'activation': 'logistic', 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.005, 'max_iter': 500}
Best score:
0.814623159745111


## Train-Test

In [17]:
model = mlp_best_estimator
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(accuracy)
print(f1)
print(auc)

models_results["MLP"] = [accuracy, f1, auc]

0.7998466257668712
0.8039068369646881
0.8000037639265282
