#**Carga de Datos**

In [1]:
# Montar Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Importar librerías necesarias
import pandas as pd

# Leer el dataset desde Google Drive
# Cambia la ruta del archivo según su ubicación en tu Google Drive
file_path = "/content/drive/My Drive/databankrupcy.csv"  # Cambia "data.csv" al nombre de tu archivo
df = pd.read_csv(file_path)

# Verificar los datos
print(df.head())
print(df.info())


Mounted at /content/drive
   Bankrupt?   ROA(C) before interest and depreciation before interest  \
0          1                                           0.370594          
1          1                                           0.464291          
2          1                                           0.426071          
3          1                                           0.399844          
4          1                                           0.465022          

    ROA(A) before interest and % after tax  \
0                                 0.424389   
1                                 0.538214   
2                                 0.499019   
3                                 0.451265   
4                                 0.538432   

    ROA(B) before interest and depreciation after tax  \
0                                           0.405750    
1                                           0.516730    
2                                           0.472295    
3                       

#**Preparacion de datos**

In [14]:
# Convertir columnas enteras a flotantes
X = X.astype({col: 'float64' for col in X.select_dtypes('int').columns})

# Dividir los datos en entrenamiento y prueba
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Tamaño del conjunto de entrenamiento:", X_train.shape)
print("Tamaño del conjunto de prueba:", X_test.shape)


Tamaño del conjunto de entrenamiento: (5455, 95)
Tamaño del conjunto de prueba: (1364, 95)


#**Dividir los Datos**

In [7]:
from sklearn.model_selection import train_test_split

# Separar características (X) y variable objetivo (y)
X = df.drop("Bankrupt?", axis=1)  # Eliminar la columna objetivo de las características
y = df["Bankrupt?"]  # Variable objetivo

# Dividir en datos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#**Configurar MLFlow**

In [8]:
# Instalar MLFlow si no está ya instalado
!pip install mlflow

# Importar MLFlow
import mlflow
import mlflow.sklearn

# Configurar el experimento en MLFlow
mlflow.set_experiment("Bankruptcy Prediction")


Collecting mlflow
  Downloading mlflow-2.18.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.18.0 (from mlflow)
  Downloading mlflow_skinny-2.18.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.18.0->mlflow)
  Downloading databricks_sdk-0.38.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Colle

2024/12/01 15:21:31 INFO mlflow.tracking.fluent: Experiment with name 'Bankruptcy Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/104313688846476588', creation_time=1733066491939, experiment_id='104313688846476588', last_update_time=1733066491939, lifecycle_stage='active', name='Bankruptcy Prediction', tags={}>

#**Entrenar el Modelo y Registrar Métricas**

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Iniciar un experimento en MLFlow
with mlflow.start_run(run_name="Random Forest"):
    # Entrenar el modelo
    model = RandomForestClassifier(max_depth=20, min_samples_split=5, n_estimators=50, random_state=42)
    model.fit(X_train, y_train)

    # Realizar predicciones
    y_pred = model.predict(X_test)

    # Calcular métricas
    acc = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Registrar métricas en MLFlow
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # Guardar el modelo en MLFlow
    mlflow.sklearn.log_model(model, "random_forest_model")

# Mostrar los resultados en la consola
print(f"Accuracy: {acc:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")




Accuracy: 0.97
Recall: 0.20
F1 Score: 0.31


#**Optimización de Hiperparámetros con Optuna**

In [10]:
# Instalar Optuna si no está ya instalado
!pip install optuna

import optuna
from sklearn.model_selection import cross_val_score

# Definir la función objetivo para Optuna
def objective(trial):
    # Sugerir valores para los hiperparámetros
    max_depth = trial.suggest_int("max_depth", 10, 30)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    n_estimators = trial.suggest_int("n_estimators", 50, 200)

    # Modelo con los hiperparámetros sugeridos
    model = RandomForestClassifier(max_depth=max_depth, min_samples_split=min_samples_split, n_estimators=n_estimators, random_state=42)

    # Validación cruzada
    return cross_val_score(model, X_train, y_train, cv=3, scoring="accuracy").mean()

# Iniciar la optimización
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Imprimir los mejores hiperparámetros
print("Mejores parámetros:", study.best_params)


Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/364.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.1.0


[I 2024-12-01 15:22:32,665] A new study created in memory with name: no-name-5426aa7e-7897-4668-b244-ea21474d152b
[I 2024-12-01 15:22:51,960] Trial 0 finished with value: 0.9701192219276903 and parameters: {'max_depth': 25, 'min_samples_split': 5, 'n_estimators': 186}. Best is trial 0 with value: 0.9701192219276903.
[I 2024-12-01 15:23:00,323] Trial 1 finished with value: 0.9714022804149574 and parameters: {'max_depth': 27, 'min_samples_split': 9, 'n_estimators': 107}. Best is trial 1 with value: 0.9714022804149574.
[I 2024-12-01 15:23:14,443] Trial 2 finished with value: 0.9710357786740742 and parameters: {'max_depth': 21, 'min_samples_split': 2, 'n_estimators': 168}. Best is trial 1 with value: 0.9714022804149574.
[I 2024-12-01 15:23:20,638] Trial 3 finished with value: 0.9701190203315733 and parameters: {'max_depth': 10, 'min_samples_split': 9, 'n_estimators': 65}. Best is trial 1 with value: 0.9714022804149574.
[I 2024-12-01 15:23:30,657] Trial 4 finished with value: 0.970486026062

Mejores parámetros: {'max_depth': 27, 'min_samples_split': 9, 'n_estimators': 107}


#**Registro del Modelo Inicial**

In [15]:
from mlflow.models.signature import infer_signature

# Generar un ejemplo de entrada representativo
input_example = X_test.iloc[:1]  # Usamos el primer registro del conjunto de prueba
# Opcionalmente, puedes agregar valores faltantes al input_example si es representativo de datos reales.

# Inferir la firma del modelo
signature = infer_signature(X_test, model.predict(X_test))

# Registrar el modelo con input_example y signature
mlflow.sklearn.log_model(
    sk_model=model,
    artifact_path="random_forest_model",
    input_example=input_example,
    signature=signature
)


<mlflow.models.model.ModelInfo at 0x7fad041579a0>

#**Registro del Modelo Optimizado**

In [16]:
from mlflow.models.signature import infer_signature

# Generar un ejemplo de entrada representativo
input_example = X_test.iloc[:1]  # Usamos el primer registro del conjunto de prueba
# Opcionalmente, puedes agregar valores faltantes al input_example si es representativo de datos reales.

# Inferir la firma del modelo
signature = infer_signature(X_test, model.predict(X_test))

# Registrar el modelo con input_example y signature
mlflow.sklearn.log_model(
    sk_model=model,
    artifact_path="random_forest_model",
    input_example=input_example,
    signature=signature
)


<mlflow.models.model.ModelInfo at 0x7facfe9b38e0>

In [17]:
# Instalar ngrok si no está instalado
!pip install pyngrok

# Exponer el servidor de MLFlow
from pyngrok import ngrok
mlflow_ui = ngrok.connect(port="5000")
print(f"MLFlow UI available at: {mlflow_ui}")
!mlflow ui --port 5000


Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Downloading pyngrok-7.2.1-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.1


ERROR:pyngrok.process.ngrok:t=2024-12-01T15:46:47+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2024-12-01T15:46:47+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2024-12-01T15:46:47+0000 lvl=eror msg="terminating with error" obj=app err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your aut

PyngrokNgrokError: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.

In [19]:
!ngrok authtoken 2pcb0NRrykNCTdohIids2WMqu5L_5ML23hn9GHoN9xSYYEn8s


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from pyngrok import ngrok

# Exponer el puerto 5000 con un túnel HTTP
mlflow_ui = ngrok.connect(5000, "http")
print(f"MLFlow UI disponible en: {mlflow_ui}")
!mlflow ui --port 5000



MLFlow UI disponible en: NgrokTunnel: "https://0574-35-245-56-243.ngrok-free.app" -> "http://localhost:5000"
[2024-12-01 16:04:06 +0000] [12587] [INFO] Starting gunicorn 23.0.0
[2024-12-01 16:04:06 +0000] [12587] [INFO] Listening at: http://127.0.0.1:5000 (12587)
[2024-12-01 16:04:06 +0000] [12587] [INFO] Using worker: sync
[2024-12-01 16:04:06 +0000] [12588] [INFO] Booting worker with pid: 12588
[2024-12-01 16:04:06 +0000] [12589] [INFO] Booting worker with pid: 12589
[2024-12-01 16:04:06 +0000] [12594] [INFO] Booting worker with pid: 12594
[2024-12-01 16:04:06 +0000] [12595] [INFO] Booting worker with pid: 12595
