In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import (
    f1_score, precision_score, recall_score, classification_report,
    confusion_matrix, roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

src_path = os.path.abspath(os.path.join(os.getcwd(),  '..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from data_preprocess import DataCleaner, DataPreprocessor

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    f1_score, precision_score, recall_score, roc_auc_score,
    confusion_matrix, classification_report
)
from mlflow.models.signature import infer_signature
import mlflow
from collections import Counter

In [2]:
# ================================
# 1. CONFIGURACIÓN
# ================================
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("alerta_certificado_invalido_v3")

2025/05/07 23:50:03 INFO mlflow.tracking.fluent: Experiment with name 'alerta_certificado_invalido_v3' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/924702083764126140', creation_time=1746679803865, experiment_id='924702083764126140', last_update_time=1746679803865, lifecycle_stage='active', name='alerta_certificado_invalido_v3', tags={}>

In [3]:
df_raw = pd.read_excel(r'../data/raw/Incapacidades_Empresa.xlsx')
features =  pd.read_csv(r'../data/processed/features.csv').iloc[:, 0].to_list()

# Limpieza
cleaner = DataCleaner(df_raw)
df_clean = cleaner.limpiar()

# Preprocesamiento
processor = DataPreprocessor(df_clean, target='alerta_certificado_invalido')
df_ready = processor.procesar()
df = df_ready.copy()

In [4]:
X = df.drop(columns=["alerta_certificado_invalido","c.c_colaborador"])
X = X[features]
y = df["alerta_certificado_invalido"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)
y_test = y_test.astype(int)
print(f"✅ X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"✅ X_test: {X_test.shape}, y_test: {y_test.shape}")

✅ X_train: (5022, 373), y_train: (5022,)
✅ X_test: (558, 373), y_test: (558,)


In [5]:
# ================================
# 3. GRIDSEARCHCV + RANDOM FOREST
# ================================
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [ 10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['log2']
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# ================================
# 4. ENTRENAMIENTO + LOGGING RF
# ================================
with mlflow.start_run(run_name="RandomForest_HPO"):

    print("🔍 Buscando mejores hiperparámetros...")
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]
    y_pred = y_pred.astype(int)

    # Métricas
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("roc_auc", auc)

    input_example = X_test.iloc[:1]
    signature = infer_signature(X_test, y_pred)

    mlflow.sklearn.log_model(best_model, "RandomForest_best", signature=signature, input_example=input_example)

    print("✅ RandomForest -> F1: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, AUC: {:.4f}".format(f1, precision, recall, auc))
    print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("🧾 Classification Report:\n", classification_report(y_test, y_pred))


🔍 Buscando mejores hiperparámetros...
Fitting 5 folds for each of 24 candidates, totalling 120 fits


 nan nan nan nan nan nan]


✅ RandomForest -> F1: 0.3667, Precision: 0.5347, Recall: 0.2790, AUC: 0.5657
📊 Confusion Matrix:
 [[215  67]
 [199  77]]
🧾 Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.76      0.62       282
           1       0.53      0.28      0.37       276

    accuracy                           0.52       558
   macro avg       0.53      0.52      0.49       558
weighted avg       0.53      0.52      0.49       558

🏃 View run RandomForest_HPO at: http://127.0.0.1:5000/#/experiments/924702083764126140/runs/ac440ff2243d468098635660e89c35e0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/924702083764126140


In [6]:
# ================================
# 5. ENTRENAMIENTO XGBoost
# ================================
with mlflow.start_run(run_name="XGBoostClassifier"):

    y_train = y_train.astype(int)  # Asegura que sean enteros
    class_counts = Counter(y_train)

    neg = class_counts.get(0, 0)
    pos = class_counts.get(1, 1)  # evita división por cero

    scale_pos_weight = neg / pos

    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                              scale_pos_weight=scale_pos_weight, random_state=42)
    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_test)
    y_proba = xgb_model.predict_proba(X_test)[:, 1]
    y_pred = y_pred.astype(int)

    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    mlflow.log_param("scale_pos_weight", scale_pos_weight)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("roc_auc", auc)

    signature = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(xgb_model, "XGBoost", signature=signature, input_example=X_test.iloc[:1])

    print("✅ XGBoost -> F1: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, AUC: {:.4f}".format(f1, precision, recall, auc))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ XGBoost -> F1: 0.4851, Precision: 0.5000, Recall: 0.4710, AUC: 0.5231
🏃 View run XGBoostClassifier at: http://127.0.0.1:5000/#/experiments/924702083764126140/runs/ad08ba4c85c44aacb82b31c81b594500
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/924702083764126140


In [7]:
# ================================
# 6. ENTRENAMIENTO LightGBM
# ================================
with mlflow.start_run(run_name="LightGBMClassifier"):

    lgb_model = LGBMClassifier(class_weight='balanced', random_state=42)
    lgb_model.fit(X_train, y_train)

    y_pred = lgb_model.predict(X_test)
    y_proba = lgb_model.predict_proba(X_test)[:, 1]
    y_pred = y_pred.astype(int)

    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("roc_auc", auc)

    signature = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(lgb_model, "LightGBM", signature=signature, input_example=X_test.iloc[:1])

    print("✅ LightGBM -> F1: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, AUC: {:.4f}".format(f1, precision, recall, auc))

[LightGBM] [Info] Number of positive: 2487, number of negative: 2535
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 5022, number of used features: 104
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
✅ LightGBM -> F1: 0.4901, Precision: 0.4875, Recall: 0.4928, AUC: 0.4945
🏃 View run LightGBMClassifier at: http://127.0.0.1:5000/#/experiments/924702083764126140/runs/9a769fd16c624f8fb9937f28b8553930
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/924702083764126140


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    f1_score, precision_score, recall_score, roc_auc_score,
    confusion_matrix, classification_report
)
from mlflow.models.signature import infer_signature
import mlflow

# Asegura que y_train es numérico
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# ================================
# GRIDSEARCHCV + LOGISTIC REGRESSION
# ================================
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'solver': ['liblinear', 'saga'],
    'l1_ratio': [0.0, 0.5, 1.0]  # solo para elasticnet
}

grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# ================================
# ENTRENAMIENTO + LOGGING
# ================================
with mlflow.start_run(run_name="LogisticRegression_HPO"):

    print("🔍 Buscando mejores hiperparámetros para regresión logística...")
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Métricas
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    # Logging
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("roc_auc", auc)

    signature = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(
        best_model,
        "LogisticRegression_best",
        signature=signature,
        input_example=X_test.iloc[:1]
    )

    print("✅ Mejor configuración encontrada:", grid_search.best_params_)
    print(f"🎯 F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | AUC: {auc:.4f}")
    print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("🧾 Classification Report:\n", classification_report(y_test, y_pred))


🔍 Buscando mejores hiperparámetros para regresión logística...
Fitting 5 folds for each of 24 candidates, totalling 120 fits




✅ Mejor configuración encontrada: {'C': 10.0, 'l1_ratio': 0.0, 'solver': 'saga'}
🎯 F1: 0.5620 | Precision: 0.5662 | Recall: 0.5580 | AUC: 0.6209
📊 Confusion Matrix:
 [[164 118]
 [122 154]]
🧾 Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.58      0.58       282
           1       0.57      0.56      0.56       276

    accuracy                           0.57       558
   macro avg       0.57      0.57      0.57       558
weighted avg       0.57      0.57      0.57       558

🏃 View run LogisticRegression_HPO at: http://127.0.0.1:5000/#/experiments/924702083764126140/runs/bb9c6b87173c4c8a8187847edf2a1b32
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/924702083764126140
