# 3. Preparación de datos:

In [1]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import joblib
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Cargar el dataset
file_path = "heart.csv"
df = pd.read_csv(file_path)

In [3]:
# Separar variables de entrada y salida
X = df.drop("output", axis=1)
y = df["output"]

In [4]:
# Definir las columnas para escalar y codificar
numeric_features = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
categorical_features = ['cp', 'thall', 'caa']

# Crear el preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Crear el pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier())
])

In [5]:
# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 4. Modelación

## Modelacion sin PyCaret

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

## Busqueda de hiperparametros

In [7]:
# Definir la búsqueda de hiperparámetros
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
}

# Realizar la búsqueda de hiperparámetros
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [8]:
# Mejor modelo
best_model = grid_search.best_estimator_
print("\nMejores hiperparámetros para Random Forest:")
print(grid_search.best_params_)



Mejores hiperparámetros para Random Forest:
{'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}


In [9]:
# Evaluar el mejor modelo
y_pred_best = best_model.predict(X_test)
print("\nBest Model Accuracy: {:.2f}".format(accuracy_score(y_test, y_pred_best)))
print("Classification Report for Best Model:\n")
print(classification_report(y_test, y_pred_best))


Best Model Accuracy: 0.82
Classification Report for Best Model:

              precision    recall  f1-score   support

           0       0.76      0.90      0.83        29
           1       0.89      0.75      0.81        32

    accuracy                           0.82        61
   macro avg       0.83      0.82      0.82        61
weighted avg       0.83      0.82      0.82        61



In [10]:
# Modelos base
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
}

# Entrenar y evaluar modelos
best_base_model = None
best_accuracy = 0
best_model_name = ""

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.2f}")
    print(f"Classification Report for {name}:\n")
    print(classification_report(y_test, y_pred))

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_base_model = model
        best_model_name = name



Logistic Regression Accuracy: 0.89
Classification Report for Logistic Regression:

              precision    recall  f1-score   support

           0       0.89      0.86      0.88        29
           1       0.88      0.91      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61


Decision Tree Accuracy: 0.80
Classification Report for Decision Tree:

              precision    recall  f1-score   support

           0       0.74      0.90      0.81        29
           1       0.88      0.72      0.79        32

    accuracy                           0.80        61
   macro avg       0.81      0.81      0.80        61
weighted avg       0.82      0.80      0.80        61


Random Forest Accuracy: 0.84
Classification Report for Random Forest:

              precision    recall  f1-score   support

           0       0.83      0.83      0.83        29
         

In [11]:
print("\nBest Base Model Accuracy: {:.2f}".format(best_accuracy))
print("Classification Report for Best Base Model:\n")
y_pred_best_base = best_base_model.predict(X_test)
print(classification_report(y_test, y_pred_best_base))
print(f"\nBest Base Model Name: {best_model_name}")


Best Base Model Accuracy: 0.89
Classification Report for Best Base Model:

              precision    recall  f1-score   support

           0       0.89      0.86      0.88        29
           1       0.88      0.91      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61


Best Base Model Name: Logistic Regression


In [12]:
# Guardar el mejor modelo
joblib.dump(best_model, "mejor_modelo_pipeline.joblib")

['mejor_modelo_pipeline.joblib']

## Modelacion con PyCaret

In [13]:
#!pip install pycaret
from pycaret.classification import *

In [14]:
# Cargar el dataset
file_path = "heart.csv"
data = pd.read_csv(file_path)

In [15]:
# Configuración inicial
clf_setup = setup(data=data, target="output",
                  train_size=0.8,  # 80% para entrenamiento
                  normalize=True,  # Normalizar datos
                  session_id=42,  # Reproducibilidad
                  fold=5)  # Validación cruzada


Unnamed: 0,Description,Value
0,Session id,42
1,Target,output
2,Target type,Binary
3,Original data shape,"(303, 14)"
4,Transformed data shape,"(303, 14)"
5,Transformed train set shape,"(242, 14)"
6,Transformed test set shape,"(61, 14)"
7,Numeric features,13
8,Preprocess,True
9,Imputation type,simple


In [16]:
# Comparar y seleccionar los 5 mejores modelos
top_models = compare_models(n_select=5)

# Optimizar los 5 modelos seleccionados
tuned_models = [tune_model(model) for model in top_models]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8435,0.9129,0.8712,0.8527,0.8593,0.6824,0.6885,0.036
ridge,Ridge Classifier,0.831,0.9057,0.9091,0.8094,0.8552,0.6537,0.6627,0.008
lda,Linear Discriminant Analysis,0.831,0.9061,0.9091,0.8094,0.8552,0.6537,0.6627,0.008
lr,Logistic Regression,0.8309,0.904,0.8937,0.8176,0.8526,0.6547,0.6616,0.296
rf,Random Forest Classifier,0.8309,0.9125,0.8786,0.8269,0.8504,0.6559,0.662,0.044
ada,Ada Boost Classifier,0.8062,0.8632,0.8484,0.8139,0.8271,0.606,0.6161,0.022
lightgbm,Light Gradient Boosting Machine,0.8061,0.8882,0.8336,0.8164,0.8235,0.6082,0.6116,0.028
nb,Naive Bayes,0.8058,0.8947,0.8322,0.8215,0.8222,0.6071,0.6157,0.22
qda,Quadratic Discriminant Analysis,0.8019,0.8782,0.8026,0.8344,0.8147,0.6012,0.6074,0.01
knn,K Neighbors Classifier,0.8018,0.8797,0.863,0.7991,0.8261,0.5957,0.6058,0.224


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7347,0.8064,0.7407,0.7692,0.7547,0.4661,0.4664
1,0.7959,0.8889,0.9259,0.7576,0.8333,0.5769,0.5964
2,0.8958,0.9143,0.9615,0.8621,0.9091,0.788,0.7944
3,0.9167,0.9545,0.9231,0.9231,0.9231,0.8322,0.8322
4,0.8542,0.951,0.8462,0.88,0.8627,0.7073,0.7079
Mean,0.8395,0.903,0.8795,0.8384,0.8566,0.6741,0.6795
Std,0.0667,0.0541,0.0789,0.0645,0.0602,0.1355,0.1339


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7755,0.8485,0.8519,0.7667,0.807,0.5405,0.5448
1,0.8571,0.8603,0.9259,0.8333,0.8772,0.7076,0.7132
2,0.8333,0.9266,0.8462,0.8462,0.8462,0.6643,0.6643
3,0.875,0.9458,0.8462,0.9167,0.88,0.75,0.7526
4,0.8542,0.9493,0.8846,0.8519,0.8679,0.7053,0.7059
Mean,0.839,0.9061,0.8709,0.8429,0.8557,0.6735,0.6762
Std,0.0344,0.0431,0.031,0.0478,0.0271,0.0718,0.0714


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7959,0.8384,0.8889,0.7742,0.8276,0.5805,0.5888
1,0.7959,0.867,0.9259,0.7576,0.8333,0.5769,0.5964
2,0.875,0.9108,0.9231,0.8571,0.8889,0.7465,0.7492
3,0.875,0.9353,0.8462,0.9167,0.88,0.75,0.7526
4,0.8542,0.958,0.9231,0.8276,0.8727,0.7032,0.7089
Mean,0.8392,0.9019,0.9014,0.8266,0.8605,0.6714,0.6792
Std,0.0361,0.0438,0.0308,0.0575,0.0251,0.0775,0.0724


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7755,0.8401,0.8148,0.7857,0.8,0.5444,0.5448
1,0.8163,0.862,0.8889,0.8,0.8421,0.624,0.629
2,0.8542,0.9248,0.8462,0.88,0.8627,0.7073,0.7079
3,0.8542,0.9406,0.8462,0.88,0.8627,0.7073,0.7079
4,0.875,0.9528,0.9231,0.8571,0.8889,0.7465,0.7492
Mean,0.835,0.904,0.8638,0.8406,0.8513,0.6659,0.6678
Std,0.0353,0.0447,0.0379,0.0401,0.0296,0.0727,0.0728


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8163,0.8266,0.8519,0.8214,0.8364,0.6272,0.6278
1,0.8367,0.9175,0.963,0.7879,0.8667,0.6615,0.6839
2,0.8542,0.9196,0.8846,0.8519,0.8679,0.7053,0.7059
3,0.875,0.9615,0.8462,0.9167,0.88,0.75,0.7526
4,0.875,0.9528,0.9231,0.8571,0.8889,0.7465,0.7492
Mean,0.8514,0.9156,0.8937,0.847,0.868,0.6981,0.7039
Std,0.0227,0.0478,0.0442,0.0427,0.0178,0.0479,0.0461


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [17]:
# Evaluar cada modelo optimizado y seleccionar el mejor
best_model = None
best_combined_score = 0

for tuned_model in tuned_models:
    evaluate_model(tuned_model)
    metrics = pull().iloc[0]
    combined_score = (metrics['F1'] + metrics['Accuracy'] + metrics['Recall']) / 3

    if combined_score > best_combined_score:
        best_combined_score = combined_score
        best_model = tuned_model

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [18]:
best_model

In [19]:
best_combined_score

0.8348666666666666

In [20]:
# Entrenar el modelo final con todos los datos
final_model = finalize_model(best_model)  # Selecciona el mejor modelo optimizado

In [21]:
from pycaret.classification import save_model

# Guardar el modelo
save_model(final_model, "mejor_modelo_pipeline")


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['age', 'sex', 'cp', 'trtbps',
                                              'chol', 'fbs', 'restecg',
                                              'thalachh', 'exng', 'oldpeak',
                                              'slp', 'caa', 'thall'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categor...
                  ExtraTreesClassifier(bootstrap=Fa

## FAST API con PyCared

In [31]:
from fastapi import FastAPI
import pandas as pd
from pycaret.classification import load_model, predict_model

# Cargar el modelo guardado
model = load_model("mejor_modelo_pipeline")

# Inicializar la aplicación FastAPI
app = FastAPI()

@app.post("/predict/")
def predict(data: dict):
    """
    Endpoint para predecir la salida de un modelo entrenado.
    """
    # Convertir los datos de entrada en un DataFrame
    input_data = pd.DataFrame([data])

    # Realizar la predicción
    prediction = predict_model(model, data=input_data)

    # Retornar la predicción
    return {
        "prediction_label": prediction["Label"][0],
        "prediction_score": prediction["Score"][0],
    }


Transformation Pipeline and Model Successfully Loaded


## FastAPI con Joblib

In [None]:
from fastapi import FastAPI
import pandas as pd
import joblib

# Cargar el modelo guardado
model = joblib.load('mejor_modelo_pipeline.pkl')

# Inicializar la aplicación FastAPI
app = FastAPI()

@app.post("/predict/")
def predict(data: dict):
    """
    Endpoint para predecir la salida de un modelo entrenado.
    """
    # Convertir los datos de entrada en un DataFrame
    input_data = pd.DataFrame([data])

    # Realizar la predicción
    prediction = model.predict(input_data)
    prediction_proba = model.predict_proba(input_data)

    # Retornar la predicción
    return {
        "prediction_label": int(prediction[0]),
        "prediction_score": float(prediction_proba[0][1]),
    }

Vamos a crear una API REST con FastAPI que reciba datos de entrada en formato JSON y devuelva una predicción en formato JSON.

Aunque de los modelos con scikit-learn el mejor fue el modelo 'Logistic Regression', con PyCaret el mejor fue el ExtraTreesClassifier.