In [18]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to save the model
import joblib

# to build the model
from sklearn.linear_model import Lasso

# to evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split

# Show every column when a DataFrame is displayed (no truncation with "...").
# Uses the public pd.set_option entry point rather than the incidental
# pd.pandas self-reference.
pd.set_option('display.max_columns', None)

from pycaret.classification import *
import mlflow
import mlflow.sklearn

In [15]:
# List the feature names stored in the first column of selected_features.csv.
# Forward slashes keep the relative path valid on Windows, Linux and macOS,
# and match the style of the other data paths in this notebook.
list(pd.read_csv('../data/processed/selected_features.csv').iloc[:, 0])

['Gender',
 'Polyuria',
 'Polydipsia',
 'sudden weight loss',
 'Genital thrush',
 'Itching',
 'Irritability',
 'delayed healing',
 'partial paresis',
 'Alopecia']

In [17]:

# Read the selected feature names once (first column of the CSV). The path uses
# forward slashes so it resolves on any OS, like the other reads below.
selected_features = pd.read_csv('../data/processed/selected_features.csv').iloc[:, 0].tolist()

# Assemble the modelling frame: the selected feature columns of X_train.csv
# side by side with the contents of y_train.csv.
# NOTE: despite its name, X also carries the target column at this point.
X = pd.concat(
    [
        pd.read_csv('../data/processed/X_train.csv')[selected_features],
        pd.read_csv('../data/processed/y_train.csv'),
    ],
    axis=1,
)
print(X.shape)
X.head()

(468, 11)


Unnamed: 0,Gender,Polyuria,Polydipsia,sudden weight loss,Genital thrush,Itching,Irritability,delayed healing,partial paresis,Alopecia,diabetes
0,0,1,1,1,0,0,0,0,1,0,1
1,0,0,1,0,0,1,1,1,1,0,1
2,1,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,1,0,0,1,0,0,1
4,0,1,1,1,0,0,0,1,1,0,1


In [19]:
# Separate the predictors from the 'diabetes' target, then hold out 10% of the
# rows as a test set. The fixed random_state makes the split repeatable.
predictors = X.drop(columns='diabetes')
labels = X['diabetes']
X_train, X_test, y_train, y_test = train_test_split(
    predictors, labels, test_size=0.1, random_state=42
)

# Shapes of the four resulting pieces (displayed as the cell output).
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((421, 10), (47, 10), (421,), (47,))

In [21]:
# Glue the training labels back onto the training features (column-wise),
# giving a single frame that holds both the predictors and the target.
train_parts = [X_train, y_train]
df_train = pd.concat(train_parts, axis='columns')
print(df_train.shape)
df_train.head()

(421, 11)


Unnamed: 0,Gender,Polyuria,Polydipsia,sudden weight loss,Genital thrush,Itching,Irritability,delayed healing,partial paresis,Alopecia,diabetes
462,1,1,1,0,0,0,1,0,1,0,1
405,1,1,1,1,1,1,0,0,0,1,1
392,1,1,1,0,0,1,0,0,0,0,1
455,1,1,0,0,0,1,0,1,1,1,0
307,1,0,0,0,0,1,0,1,0,1,0


In [23]:

# Initialise the PyCaret experiment on the training frame.
# 'diabetes' is the target column. session_id seeds the experiment, and
# log_experiment / log_plots record the experiment and its plots in MLflow
# under the 'diabetes_model' experiment name.
exp_clf = setup(
    data=df_train,
    target='diabetes',
    session_id=42,
    log_experiment=True,
    experiment_name='diabetes_model',
    log_plots=True,
)

# Compare the candidate models and keep the best-ranked one
best_model = compare_models()

# Finalise the best model (final PyCaret training step before deployment)
final_model = finalize_model(best_model)

# Log the finalised model to MLflow, then explicitly close the run it was
# logged into so it is not left open for the rest of the kernel session.
mlflow.sklearn.log_model(final_model, "best_diabetes_model")
mlflow.end_run()

# Show the best model
print(best_model)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,diabetes
2,Target type,Binary
3,Original data shape,"(421, 11)"
4,Transformed data shape,"(421, 11)"
5,Transformed train set shape,"(294, 11)"
6,Transformed test set shape,"(127, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple


2024/09/16 21:12:33 INFO mlflow.tracking.fluent: Experiment with name 'diabetes_model' does not exist. Creating a new experiment.


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9286,0.9819,0.9456,0.9434,0.9424,0.8482,0.8551,0.139
knn,K Neighbors Classifier,0.9285,0.975,0.9228,0.9621,0.9408,0.8503,0.8546,0.04
gbc,Gradient Boosting Classifier,0.9254,0.9797,0.9234,0.9578,0.938,0.8443,0.8517,0.076
lightgbm,Light Gradient Boosting Machine,0.9253,0.9786,0.9181,0.9638,0.9367,0.8456,0.8555,0.079
et,Extra Trees Classifier,0.9184,0.9686,0.9289,0.9431,0.9333,0.8277,0.8359,0.138
dt,Decision Tree Classifier,0.9115,0.9456,0.9123,0.9478,0.9269,0.8143,0.8224,0.022
nb,Naive Bayes,0.8877,0.9568,0.9117,0.9108,0.9093,0.762,0.7677,0.024
lr,Logistic Regression,0.8876,0.9623,0.895,0.9269,0.9077,0.7637,0.7734,1.45
ridge,Ridge Classifier,0.8844,0.9598,0.8512,0.9651,0.8998,0.7643,0.7829,0.027
lda,Linear Discriminant Analysis,0.8809,0.9589,0.8456,0.9651,0.8964,0.7578,0.7775,0.018




RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       monotonic_cst=None, n_estimators=100, n_jobs=-1,
                       oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Initialise the model. The seed is fixed so that the forest, and therefore
# the outcome of the hyper-parameter search, is reproducible across runs
# (42 matches the seed used for the train/test split and the PyCaret session).
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter values to try (3 * 4 * 3 * 3 * 2 = 216 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Configure the exhaustive search: 5-fold cross-validation, all CPU cores,
# verbose progress output
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Best hyper-parameter combination found
print("Mejores hiperparámetros encontrados:\n", grid_search.best_params_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Mejores hiperparámetros encontrados:
 {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [25]:
import joblib

# Runs after the grid search above has been fitted.

# Take the best estimator found by the search
best_model = grid_search.best_estimator_

# Persist that estimator to a .joblib file
model_path = '../data/processed/best_random_forest_model.joblib'
joblib.dump(best_model, model_path)

print("Modelo exportado correctamente.")


Modelo exportado correctamente.
