In [5]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier

from preprocessing import TrainingPreProcessor

RANDOM_STATE = 0

train_set = pd.read_csv("input/train.csv")

training_preprocessor = TrainingPreProcessor()
training_preprocessor.fit(train_set, ignore_columns=["CLIENTNUM"])

X, y = training_preprocessor.transform(train_set)
class_weight = training_preprocessor.class_weight

In [3]:
rf_model = RandomForestClassifier(random_state=RANDOM_STATE, class_weight=class_weight)

## Grid search para floresta aleatória

In [6]:
kfold = StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE)
parameters = {
    "n_estimators": [50, 100, 500],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [3, 5, 7, None],
    "max_features": ["sqrt", "log2"]
}

In [5]:
grid_search = GridSearchCV(
    rf_model,
    parameters,
    scoring="balanced_accuracy",
    cv=kfold,
    verbose=3,
    n_jobs=-1,
)

In [6]:
grid_search.fit(X, y)
print(grid_search.best_params_)
print(grid_search.best_score_)
grid_search.best_estimator_

Fitting 5 folds for each of 72 candidates, totalling 360 fits
{'criterion': 'entropy', 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 50}
0.9125327647025576


In [3]:
best_model = RandomForestClassifier(
    random_state=RANDOM_STATE,
    class_weight=class_weight,
    **{
        "criterion": "entropy",
        "max_depth": 7,
        "max_features": "sqrt",
        "n_estimators": 50,
    }
)

In [7]:
metrics = cross_validate(
    best_model,
    X,
    y,
    cv=kfold,
    scoring=['accuracy', 'balanced_accuracy', 'f1', 'roc_auc'],
)
print(f"accuracy: {metrics['test_accuracy'].mean():.4f} +- {metrics['test_accuracy'].std():.4f}")
print(f"balanced_accuracy: {metrics['test_balanced_accuracy'].mean():.4f} +- {metrics['test_balanced_accuracy'].std():.4f}")
print(f"f1: {metrics['test_f1'].mean():.4f} +- {metrics['test_f1'].std():.4f}")
print(f"roc_auc: {metrics['test_roc_auc'].mean():.4f} +- {metrics['test_roc_auc'].std():.4f}")

accuracy: 0.9315 +- 0.0048
balanced_accuracy: 0.9125 +- 0.0040
f1: 0.9584 +- 0.0031
roc_auc: 0.9726 +- 0.0042
