In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns
from sklearn.metrics import recall_score, precision_score, f1_score 

In [2]:
datos_diabetes = pd.read_csv('/home/juan/machineLearning2025/datasets/diabetes_012_health_indicators_BRFSS2015.csv')
#Crear la columna diabetes_01 que unifique prediabetes con diabetes
datos_diabetes['diabetes_01'] = datos_diabetes['Diabetes_012']
datos_diabetes['diabetes_01'] = datos_diabetes['diabetes_01'].replace(2,1)

#Reparar nombres de columnas. Se usa el formato loweCamelCase para el nombre de las caracteristicas.

new_col_names = []

for name in datos_diabetes.columns:
    # Luego, pon todas las letras en minúsculas
    name_lowered_first_letter = name[0].lower() + name[1:]
    # Elimina los espacios al principio y al final
    name_stripped = name_lowered_first_letter.strip()
    # Por último, reemplaza los espacios entre palabras por guiones bajos
    name_no_spaces = name_stripped.replace(' ', '_')
    # Agrega el nuevo nombre a la lista de nuevos nombres de columna
    new_col_names.append(name_no_spaces)

datos_diabetes.columns = new_col_names

datos_diabetes = datos_diabetes.rename(columns={'bMI':'bmi'})

datos_diabetes.columns

datos_diabetes.drop(columns=['physHlth', 'mentHlth'], inplace=True)

datos_diabetes = datos_diabetes[datos_diabetes['bmi'] <= 50] #FILTRADO DE BMI

In [3]:
#Funciones run_model() y evaluate_model()


def run_model(X_train, X_test, Y_train, Y_test):
    clf_base = LogisticRegression(C=1.0,penalty='l2',random_state=1,solver="newton-cg")
    clf_base.fit(X_train, Y_train)
    return clf_base
def evaluate_model(model, train_features, train_target, test_features, test_target):
   
    eval_stats = {}
    
    fig, axs = plt.subplots(1, 3, figsize=(20, 6)) 
    
    for type, features, target in (('train', train_features, train_target), ('test', test_features, test_target)):
        
        eval_stats[type] = {}
    
        pred_target = model.predict(features)
        pred_proba = model.predict_proba(features)[:, 1]
        
        # F1
        f1_thresholds = np.arange(0, 1.01, 0.05)
        f1_scores = [metrics.f1_score(target, pred_proba>=threshold) for threshold in f1_thresholds]
        
        # ROC
        fpr, tpr, roc_thresholds = metrics.roc_curve(target, pred_proba)
        roc_auc = metrics.roc_auc_score(target, pred_proba)    
        eval_stats[type]['ROC AUC'] = roc_auc

        # PRC
        precision, recall, pr_thresholds = metrics.precision_recall_curve(target, pred_proba)
        aps = metrics.average_precision_score(target, pred_proba)
        eval_stats[type]['APS'] = aps
        
        if type == 'train':
            color = 'blue'
        else:
            color = 'green'

        # Valor F1
        ax = axs[0]
        max_f1_score_idx = np.argmax(f1_scores)
        ax.plot(f1_thresholds, f1_scores, color=color, label=f'{type}, max={f1_scores[max_f1_score_idx]:.2f} @ {f1_thresholds[max_f1_score_idx]:.2f}')
        # establecer cruces para algunos umbrales        
        for threshold in (0.2, 0.4, 0.5, 0.6, 0.8):
            closest_value_idx = np.argmin(np.abs(f1_thresholds-threshold))
            marker_color = 'orange' if threshold != 0.5 else 'red'
            ax.plot(f1_thresholds[closest_value_idx], f1_scores[closest_value_idx], color=marker_color, marker='X', markersize=7)
        ax.set_xlim([-0.02, 1.02])    
        ax.set_ylim([-0.02, 1.02])
        ax.set_xlabel('threshold')
        ax.set_ylabel('F1')
        ax.legend(loc='lower center')
        ax.set_title(f'Valor F1') 

        # ROC
        ax = axs[1]    
        ax.plot(fpr, tpr, color=color, label=f'{type}, ROC AUC={roc_auc:.2f}')
        # establecer cruces para algunos umbrales        
        for threshold in (0.2, 0.4, 0.5, 0.6, 0.8):
            closest_value_idx = np.argmin(np.abs(roc_thresholds-threshold))
            marker_color = 'orange' if threshold != 0.5 else 'red'            
            ax.plot(fpr[closest_value_idx], tpr[closest_value_idx], color=marker_color, marker='X', markersize=7)
        ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
        ax.set_xlim([-0.02, 1.02])    
        ax.set_ylim([-0.02, 1.02])
        ax.set_xlabel('FPR')
        ax.set_ylabel('TPR')
        ax.legend(loc='lower center')        
        ax.set_title(f'Curva ROC')
        
        # PRC
        ax = axs[2]
        ax.plot(recall, precision, color=color, label=f'{type}, AP={aps:.2f}')
        # establecer cruces para algunos umbrales        
        for threshold in (0.2, 0.4, 0.5, 0.6, 0.8):
            closest_value_idx = np.argmin(np.abs(pr_thresholds-threshold))
            marker_color = 'orange' if threshold != 0.5 else 'red'
            ax.plot(recall[closest_value_idx], precision[closest_value_idx], color=marker_color, marker='X', markersize=7)
        ax.set_xlim([-0.02, 1.02])    
        ax.set_ylim([-0.02, 1.02])
        ax.set_xlabel('recall')
        ax.set_ylabel('precision')
        ax.legend(loc='lower center')
        ax.set_title(f'PRC')   
        
        eval_stats[type]['Exactitud'] = metrics.accuracy_score(target, pred_target)
        eval_stats[type]['F1'] = metrics.f1_score(target, pred_target)
    
    df_eval_stats = pd.DataFrame(eval_stats)
    df_eval_stats = df_eval_stats.round(2)
    df_eval_stats = df_eval_stats.reindex(index=('Exactitud', 'F1', 'APS', 'ROC AUC'))
    
    print(df_eval_stats)
    
    return eval_stats['train']['F1'], eval_stats['test']['F1']

In [4]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import Counter  
from imblearn.over_sampling import RandomOverSampler


X = datos_diabetes.drop(["diabetes_01", "diabetes_012"], axis=1)
y = datos_diabetes["diabetes_01"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

#Balanceamos con RandomOverSampler
os =  RandomOverSampler()
X_train_res, y_train_res = os.fit_resample(X_train, y_train)
 
print ("before resampling {}".format(Counter(y_train)))
print ("after resampling {}".format(Counter(y_train_res)))


before resampling Counter({0.0: 159361, 1.0: 29267})
after resampling Counter({0.0: 159361, 1.0: 159361})


In [5]:
#gridsearch para extratreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import mean_squared_error

model_extratree= ExtraTreesClassifier(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    #'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]
}



scoring_metrics = ['average_precision', 'roc_auc', 'accuracy']
grid_search_extratree = GridSearchCV(model_extratree, param_grid, cv=5, scoring=scoring_metrics, refit='average_precision')
grid_search_extratree.fit(X_train_res, y_train_res)
print("Mejores parámetros encontrados por Grid Search:", grid_search_extratree.best_params_)
print("(AUC-PR) del mejor modelo en el conjunto de validación:", grid_search_extratree.best_score_)

KeyboardInterrupt: 

In [5]:
#Veremos ahora las caracteristicas mas importantes de este dataset balanceado:

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

# Entrenar el modelo ExtraTreesRegressor
model = ExtraTreesRegressor(n_estimators=100, random_state=42) ##ensamble de 100 arboles
model.fit(X_train_res, y_train_res)

In [6]:
# Obtener las importancias de las características

##Significado de importacia: reduccion de indice gini de cada caracteristica.
importances = model.feature_importances_

# Crear un DataFrame con las características y sus importancias
feature_importances = pd.DataFrame({"Feature": X.columns, "Importance": importances})

# Ordenar las características por importancia
feature_importances = feature_importances.sort_values(by="Importance", ascending=False)

# Mostrar las características más importantes
print(feature_importances)

                 Feature  Importance
3                    bmi    0.155545
0                 highBP    0.142158
16                   age    0.119236
18                income    0.105907
13               genHlth    0.089687
17             education    0.083617
4                 smoker    0.040570
8                 fruits    0.038987
7           physActivity    0.034106
9                veggies    0.032248
15                   sex    0.028682
14              diffWalk    0.026015
1               highChol    0.022785
12           noDocbcCost    0.019018
6   heartDiseaseorAttack    0.017283
5                 stroke    0.015275
11         anyHealthcare    0.011792
10     hvyAlcoholConsump    0.010673
2              cholCheck    0.006416


In [7]:
# Ahora evaluaremos le modelo de ensamble variando el numero de caracteristicas para evaluar su rendimiento.

results = []
for i in range(1, len(feature_importances) + 1):
    selected_features = feature_importances["Feature"][:i]
    X_train_selected = X_train_res[selected_features]
    X_test_selected = X_test[selected_features]

    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)
    mse = mean_squared_error(y_test, y_pred)
    results.append((i, mse))

# Convertir los resultados a un DataFrame
results_df = pd.DataFrame(results, columns=["Number of Features", "MSE"])

# Encontrar el número óptimo de características
optimal_features = results_df.loc[results_df["MSE"].idxmin(), "Number of Features"]
print(f"Número óptimo de características: {optimal_features}")

# Graficar el ECM en función del número de características
plt.figure(figsize=(10, 6))
plt.plot(
    results_df["Number of Features"], results_df["MSE"], marker="o", linestyle="--"
)
plt.xlabel("Número de Características")
plt.ylabel("ECM")
plt.title("ECM en función del Número de Características")
plt.axvline(optimal_features, color="r", linestyle="--")
plt.show()

ValueError: Found input variables with inconsistent numbers of samples: [318722, 188628]

PROBAR REGRESION LOGISTICA Y ARBOL DE DESICION PARA LAS 4 MEJORES CARACTERISTICAS QUE SE EXTRAJERON DESPUES DE BALANCEAR CON RandomOversample.

In [8]:
X_train_selected = X_train_res[['bmi', 'highBP', 'age']]
X_test_selected = X_test[['bmi', 'highBP', 'age']]

#model_feature_selection = ExtraTreesRegressor(n_estimators=100, random_state=42) ##ensamble de 100 arboles
#model_feature_selection.fit(X_train_selected, y_train_res)
#train_f1, test_f1 = evaluate_model(model, X_train_selected, y_train_res, X_test_selected, y_test)

X_test_selected


Unnamed: 0,bmi,highBP,age
76193,38.0,0.0,9.0
108390,31.0,1.0,9.0
165658,25.0,1.0,13.0
81213,37.0,1.0,10.0
67061,28.0,0.0,10.0
...,...,...,...
20598,21.0,0.0,2.0
186955,25.0,0.0,5.0
137544,24.0,0.0,3.0
198352,30.0,0.0,4.0
