In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

from collections import Counter  
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import RandomForestClassifier


In [2]:
# Load the BRFSS 2015 diabetes health-indicators dataset.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
datos_diabetes = pd.read_csv('/home/juan/machineLearning2025/datasets/diabetes_012_health_indicators_BRFSS2015.csv')

# Create the binary target diabetes_01 by folding class 2 of Diabetes_012 into
# class 1 (merges prediabetes with diabetes).
datos_diabetes['diabetes_01'] = datos_diabetes['Diabetes_012'].replace(2, 1)

# Normalize the column names to a lowerCamelCase-like form:
#   1. trim surrounding whitespace,
#   2. lower-case only the first character,
#   3. replace inner spaces with underscores.
# Trimming happens first so that a name with leading whitespace still gets its
# first real character lowered, and slicing with [:1] (instead of indexing [0])
# keeps an empty column name from raising IndexError.
new_col_names = []

for name in datos_diabetes.columns:
    name_stripped = name.strip()
    name_lowered_first_letter = name_stripped[:1].lower() + name_stripped[1:]
    name_no_spaces = name_lowered_first_letter.replace(' ', '_')
    new_col_names.append(name_no_spaces)

datos_diabetes.columns = new_col_names

# 'BMI' becomes 'bMI' under the rule above; use plain 'bmi' instead.
datos_diabetes = datos_diabetes.rename(columns={'bMI': 'bmi'})

# Remove the physHlth and mentHlth columns (reassignment instead of inplace=True
# so the statement has no hidden in-place side effect).
datos_diabetes = datos_diabetes.drop(columns=['physHlth', 'mentHlth'])

In [3]:
## Class balancing with RandomOverSampler
# Features: every column except the two encodings of the target.
X = datos_diabetes.drop(["diabetes_01", "diabetes_012"], axis=1)
y = datos_diabetes["diabetes_01"]

# Hold out 25% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Oversample the training split only; X_test / y_test are left untouched.
# A fixed random_state makes the set of duplicated rows reproducible across
# runs, consistent with the seeded split above.
# NOTE(review): the name `os` would shadow the stdlib module if it is ever
# imported in this notebook; kept unchanged because later cells may use it.
os = RandomOverSampler(random_state=42)
X_train_res, y_train_res = os.fit_resample(X_train, y_train)

# Class counts before and after resampling.
print("before resampling {}".format(Counter(y_train)))
print("after resampling {}".format(Counter(y_train_res)))

before resampling Counter({0.0: 160225, 1.0: 30035})
after resampling Counter({0.0: 160225, 1.0: 160225})


In [4]:
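# NOTE: disabled alternative split, kept for reference only; the active split is the
# train_test_split call on X, y in the balancing cell above. If re-enabled, also drop
# 'diabetes_012' from the features, otherwise the original target leaks into the model.
#df_train, df_valid = train_test_split(datos_diabetes, test_size=0.25, random_state=54321)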
#features_train = df_train.drop(['diabetes_01'], axis=1)
#target_train = df_train['diabetes_01'] 

#features_valid = df_valid.drop(['diabetes_01'], axis=1)
#target_valid = df_valid['diabetes_01'] 


In [6]:

# Random Forest tuned with Grid Search
model = RandomForestClassifier(random_state=54321)

# 10 x 10 = 100 candidate combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    #'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]
}

# Several metrics are recorded; the final model is refit using average precision.
scoring_metrics = ['average_precision', 'roc_auc', 'accuracy']

# n_jobs=-1 spreads the candidate/fold fits over all available cores. Run
# serially, this search is slow enough that it was interrupted by hand; the
# selected parameters and scores do not depend on the number of workers.
# NOTE(review): the folds are drawn from the already oversampled data, so
# duplicated minority rows can end up on both sides of a fold and make the CV
# scores optimistic; consider resampling inside the CV loop and confirm the
# result on X_test.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring=scoring_metrics,
    refit='average_precision',
    n_jobs=-1,
)
grid_search.fit(X_train_res, y_train_res)
print("Mejores parámetros encontrados por Grid Search:", grid_search.best_params_)
print("(AUC-PR) del mejor modelo en el conjunto de validación:", grid_search.best_score_)

KeyboardInterrupt: 

In [None]:
# Re-runs the grid search; depends on `model` and `param_grid` defined in the
# preceding cell and on the oversampled training data X_train_res / y_train_res.
scoring_metrics = ['average_precision', 'roc_auc', 'accuracy']

# n_jobs=-1 runs the candidate/fold fits on all available cores; the serial
# search is very slow on the oversampled training set. The selected parameters
# and scores do not depend on the number of workers.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring=scoring_metrics,
    refit='average_precision',
    n_jobs=-1,
)
grid_search.fit(X_train_res, y_train_res)
print("Mejores parámetros encontrados por Grid Search:", grid_search.best_params_)
print("(AUC-PR) del mejor modelo en el conjunto de validación:", grid_search.best_score_)

In [None]:
# One row per parameter combination evaluated by the grid search.
results_df = pd.DataFrame(grid_search.cv_results_)

# For every scoring metric, show the mean and standard deviation of the
# cross-validated test score.
metric_cols_to_display = []
for metric in scoring_metrics:
    metric_cols_to_display.extend([f'mean_test_{metric}', f'std_test_{metric}'])

# cv_results_ exposes each searched parameter as a 'param_<name>' column.
param_cols_to_display = [f'param_{param_name}' for param_name in param_grid.keys()]

# Sort by the metric used for refitting. GridSearchCV keeps that metric name in
# its `refit` constructor parameter; there is no fitted `refit_` attribute, so
# reading `grid_search.refit_` raises AttributeError.
print(results_df[param_cols_to_display + metric_cols_to_display].sort_values(by=f'mean_test_{grid_search.refit}', ascending=False))

# The best estimator is the one selected (and refit) according to the 'refit' metric.
best_model_refit = grid_search.best_estimator_
