# Random Forest

In [11]:
import json
import pickle
import warnings
from pickle import dump

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

## Cargar el dataset

In [12]:
# Read the already-processed train/test splits (the file names indicate the
# variant that keeps outliers).
train_data = pd.read_csv("../data/processed/clean_train_con_outliers.csv")
test_data = pd.read_csv("../data/processed/clean_test_con_outliers.csv")

# "Outcome" is the target column; every remaining column is used as a feature.
X_train, y_train = train_data.drop(columns=["Outcome"]), train_data["Outcome"]
X_test, y_test = test_data.drop(columns=["Outcome"]), test_data["Outcome"]

## Predicción 

In [13]:
# Baseline forest: library defaults, with only the seed pinned for reproducibility.
# `fit` returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
model

In [14]:
# Predict with the baseline forest on both splits; a large train/test accuracy
# gap is the overfitting signal discussed in the following markdown cell.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

for split_name, y_true, y_hat in (
    ("Train", y_train, y_pred_train),
    ("Test", y_test, y_pred_test),
):
    print(f"{split_name}: {accuracy_score(y_true, y_hat)}")

Train: 1.0
Test: 0.8701298701298701


Sigue habiendo overfitting aun con este modelo (accuracy de 1.0 en train frente a 0.87 en test). Además, la mejora es muy ligera, ya que el accuracy score del decision tree optimizado era de 0.8636363636363636.

## Hiperparametrización

In [None]:
# Search space for the random forest. The grid is exhaustive:
# 7 * 4 * 2 * 3 * 3 * 2 = 1008 combinations, each evaluated with 5-fold CV
# (5040 fits), so fitting it is by far the slowest step of the notebook.
param_grid = {
    'n_estimators': [10, 20, 30, 40, 50, 100, 150],
    'max_depth': [4, 6, 8, None],
    'bootstrap':[True, False],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'criterion': ['gini', 'entropy']
}

# GridSearchCV works on clones of the estimator it receives, so the already
# fitted `model` only contributes its configuration (random_state = 42) here.
# GridSearchCV and RandomForestClassifier come from the notebook's import cell.
grid = GridSearchCV(model, param_grid, scoring = "accuracy", cv = 5)
grid

In [18]:
# Run the grid search. Warnings are silenced only while the search runs:
# `catch_warnings` restores the previous filters on exit, so later cells still
# surface warnings (the former approach overwrote `warnings.warn` for the whole
# kernel session and could not silence callers holding the original function).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid.fit(X_train, y_train)

print(f"Mejores hiperparámetros: {grid.best_params_}")


Mejores hiperparámetros: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 40}


In [20]:
# Hyperparameters copied from the grid-search result printed above; they are
# hard-coded so this cell can run without repeating the search.
best_params = {
    "bootstrap": False,
    "criterion": "entropy",
    "max_depth": 8,
    "min_samples_leaf": 5,
    "min_samples_split": 2,
    "n_estimators": 40,
}

# Same seed as the baseline so the two forests are comparable.
final_model = RandomForestClassifier(random_state=42, **best_params)
final_model.fit(X_train, y_train)

In [23]:
# Re-evaluate with the tuned forest; these overwrite the baseline predictions.
y_pred_train = final_model.predict(X_train)
y_pred_test = final_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Train accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")

Train accuracy: 0.9674267100977199
Test accuracy: 0.8506493506493507


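El modelo optimizado reduce el overfitting (accuracy en train de 0.967 frente a 1.0), pero no mejora las predicciones del decision tree optimizado: su accuracy en test (0.8506) queda por debajo del 0.8636 del árbol e incluso del 0.8701 del random forest con los parámetros por defecto.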

## Guardado del modelo

In [22]:
# Persist the model with pickle. The `with` block guarantees the file handle is
# flushed and closed even if serialization fails (a bare `open(...)` passed to
# `dump` is never closed explicitly).
# NOTE(review): this saves the baseline `model`, not the tuned `final_model` —
# consistent with the baseline scoring higher on test, but confirm it is intended.
with open("../models/random_forest_classifier_42.sav", "wb") as model_file:
    dump(model, model_file)