In [10]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import mlflow
import mlflow.sklearn
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import os
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report
import mlflow
import mlflow.sklearn

In [3]:
# ========================
# 2. CONFIG MLflow
# ========================
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("modelo_clasificacion_var_rpta_alt_v2")

<Experiment: artifact_location='mlflow-artifacts:/546444954564698688', creation_time=1743978996493, experiment_id='546444954564698688', last_update_time=1743978996493, lifecycle_stage='active', name='modelo_clasificacion_var_rpta_alt_v2', tags={}>

In [3]:
import os
import sys

# Obtener la ruta del directorio base (nivel superior del proyecto)
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
SRC_PATH = os.path.join(BASE_DIR, "src")

# Agregar SRC_PATH a sys.path si no está presente
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

In [4]:
import sys
import os

# Agrega el path al directorio src
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from data_engineer import FeatureSelector

df = pd.read_csv(r"..\data\procesed\df_train.csv")
features = pd.read_csv(r'..\data\procesed\features.csv')

limpieza = FeatureSelector(df, features)
df = limpieza.fit_transform()

Limpiando variables numéricas...
Separando X e y...
Detectando tipos de variables...
Preprocesando variables...
Transformando variables...
Reconstruyendo nombres de columnas...
Seleccionando variables...
Ajustando columnas faltantes...
Ordenando columnas...
Finalizando DataFrame...


In [5]:
df.head()

Unnamed: 0,id,var_rpta_alt,subsegm_PREF CONCILIACION,producto_SOBREGIRO,producto_LIBRANZA,marca_pago_NO_PAGO,ctrl_terc_EXCLIENTE,producto_CREDIPAGO,producto_ROTATIVOS,producto_LEASING HABITACIONAL,...,tot_activos,pago_total,personas_dependientes,num_hijos,origen_fondos_OTROS,nit_enmascarado,canal_actualizacion_PIC,canal_actualizacion_SVP,aplicativo_V,egresos_mes
0,536377#353056.0#726815.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.254922,0.055124,-0.2834,-0.237131,1.0,1.224157,0.0,0.0,0.0,-0.240882
1,245279#974375.0#105967.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.179708,-0.321321,0.946537,-0.237131,1.0,-0.363812,0.0,1.0,0.0,-0.069276
2,389195#nan#nan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.274307,-0.328143,-0.2834,-0.237131,0.0,0.421265,0.0,0.0,0.0,-0.163659
3,63486#nan#nan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.080262,-0.328143,-0.2834,-0.237131,0.0,-1.355511,0.0,1.0,0.0,0.016527
4,306792#nan#nan,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.690084,-0.328143,-0.2834,-0.237131,0.0,-0.028252,0.0,0.0,0.0,2.599624


In [6]:
# Separar features y target
X = df.drop(columns=["var_rpta_alt", "id"])
y = df["var_rpta_alt"]

# Split 60% train / 40% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (43150, 113), y_train: (43150,)
X_test: (28767, 113), y_test: (28767,)


In [7]:
# ========================
# 4. MODELOS A ENTRENAR
# ========================
modelos = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier()
}

In [8]:
# ========================
# 5. ENTRENAMIENTO + MLFLOW
# ========================
for model_name, model in modelos.items():
    with mlflow.start_run(run_name=model_name):
        print(f"\nEntrenando {model_name}...")

        # Entrenamiento
        model.fit(X_train, y_train)

        # Predicción y métrica
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        # Dentro del loop
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
        print(f"Cross-validated F1: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

        # Confusion matrix
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))

        # Log en MLflow
        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("f1_score", f1)
        #muestrame la presicion y recall
        precision = f1_score(y_test, y_pred, average='weighted')
        recall = f1_score(y_test, y_pred, average='weighted')
        mlflow.sklearn.log_model(model, model_name)

        print(f"✅ {model_name} -> F1 score: {f1:.4f}")
        print(f"✅ {model_name} -> Precision: {precision:.4f}")
        print(f"✅ {model_name} -> Recall: {recall:.4f}")


Entrenando LogisticRegression...
Cross-validated F1: 0.5799 ± 0.0062
Confusion Matrix:
 [[15900  2983]
 [ 4608  5276]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.78      0.84      0.81     18883
         1.0       0.64      0.53      0.58      9884

    accuracy                           0.74     28767
   macro avg       0.71      0.69      0.69     28767
weighted avg       0.73      0.74      0.73     28767





✅ LogisticRegression -> F1 score: 0.5816
✅ LogisticRegression -> Precision: 0.7297
✅ LogisticRegression -> Recall: 0.7297
🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/546444954564698688/runs/19efd6e9adc745a1b387aa5d543d2cd9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/546444954564698688

Entrenando KNeighborsClassifier...
Cross-validated F1: 0.5228 ± 0.0054
Confusion Matrix:
 [[15490  3393]
 [ 5090  4794]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.75      0.82      0.79     18883
         1.0       0.59      0.49      0.53      9884

    accuracy                           0.71     28767
   macro avg       0.67      0.65      0.66     28767
weighted avg       0.70      0.71      0.70     28767





✅ KNeighborsClassifier -> F1 score: 0.5306
✅ KNeighborsClassifier -> Precision: 0.6976
✅ KNeighborsClassifier -> Recall: 0.6976
🏃 View run KNeighborsClassifier at: http://127.0.0.1:5000/#/experiments/546444954564698688/runs/993ed5109dd947cf9b51b2121007a527
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/546444954564698688

Entrenando DecisionTreeClassifier...
Cross-validated F1: 0.5488 ± 0.0058
Confusion Matrix:
 [[14291  4592]
 [ 4418  5466]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.76      0.76      0.76     18883
         1.0       0.54      0.55      0.55      9884

    accuracy                           0.69     28767
   macro avg       0.65      0.65      0.65     28767
weighted avg       0.69      0.69      0.69     28767





✅ DecisionTreeClassifier -> F1 score: 0.5482
✅ DecisionTreeClassifier -> Precision: 0.6874
✅ DecisionTreeClassifier -> Recall: 0.6874
🏃 View run DecisionTreeClassifier at: http://127.0.0.1:5000/#/experiments/546444954564698688/runs/91e379658490429595d90389d4dc8986
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/546444954564698688

Entrenando RandomForestClassifier...
Cross-validated F1: 0.6116 ± 0.0101
Confusion Matrix:
 [[16762  2121]
 [ 4499  5385]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.79      0.89      0.84     18883
         1.0       0.72      0.54      0.62      9884

    accuracy                           0.77     28767
   macro avg       0.75      0.72      0.73     28767
weighted avg       0.76      0.77      0.76     28767





✅ RandomForestClassifier -> F1 score: 0.6193
✅ RandomForestClassifier -> Precision: 0.7610
✅ RandomForestClassifier -> Recall: 0.7610
🏃 View run RandomForestClassifier at: http://127.0.0.1:5000/#/experiments/546444954564698688/runs/2f887a5ba94e4896929445fee1ce0ff8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/546444954564698688


In [13]:

# ========================
# Hiperparámetros a buscar
# ========================
param_grid = {
    'n_estimators': [100,200],
    'max_depth': [5, 10],
    'min_samples_split': [1,2,5],
    'min_samples_leaf': [1, 2 , 5],
    'max_features': ['sqrt', 'log2']
}

# ========================
# GridSearchCV con CV interno
# ========================
rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    rf,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2
)

# ========================
# MLflow Run
# ========================
with mlflow.start_run(run_name="RandomForest_HPO"):
    print("🔍 Buscando mejores hiperparámetros...")

    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Métricas
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Logging
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.sklearn.log_model(best_model, "RandomForest_best")

    print("✅ Mejor configuración encontrada:", grid_search.best_params_)
    print(f"🎯 F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")
    print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("🧾 Classification Report:\n", classification_report(y_test, y_pred))


🔍 Buscando mejores hiperparámetros...
Fitting 5 folds for each of 72 candidates, totalling 360 fits


120 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Desktop\juan_guzman_prueba\env\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Desktop\juan_guzman_prueba\env\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "d:\Desktop\juan_guzman_prueba\env\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "d:\Desktop\juan_guzman_prueba\env\lib\site-packages\sklearn\utils\_param_validation.py", line 98, in validate_parameter

✅ Mejor configuración encontrada: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
🎯 F1: 0.5835 | Precision: 0.7116 | Recall: 0.4944
📊 Confusion Matrix:
 [[16902  1981]
 [ 4997  4887]]
🧾 Classification Report:
               precision    recall  f1-score   support

         0.0       0.77      0.90      0.83     18883
         1.0       0.71      0.49      0.58      9884

    accuracy                           0.76     28767
   macro avg       0.74      0.69      0.71     28767
weighted avg       0.75      0.76      0.74     28767

🏃 View run RandomForest_HPO at: http://127.0.0.1:5000/#/experiments/546444954564698688/runs/9d04117e510942ae8c70b8ca7d575205
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/546444954564698688
