In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import mlflow
import mlflow.sklearn
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import os
import sys

In [2]:
# ========================
# 2. CONFIG MLflow
# ========================
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("modelo_clasificacion_var_rpta_alt")

<Experiment: artifact_location='mlflow-artifacts:/418343190953855177', creation_time=1743867162970, experiment_id='418343190953855177', last_update_time=1743867162970, lifecycle_stage='active', name='modelo_clasificacion_var_rpta_alt', tags={}>

In [3]:
import os
import sys

# Obtener la ruta del directorio base (nivel superior del proyecto)
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
SRC_PATH = os.path.join(BASE_DIR, "src")

# Agregar SRC_PATH a sys.path si no está presente
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

In [4]:
from data_engineer import FeatureSelector

df = pd.read_csv(r"..\data\raw\prueba_op_base_pivot_var_rpta_alt_enmascarado_trtest.csv")
features = pd.read_csv(r'..\data\procesed\features.csv')

limpieza = FeatureSelector(df, features)
df = limpieza.fit_transform()

Limpiando variables numéricas...
Separando X e y...
Detectando tipos de variables...
Preprocesando variables...
Transformando variables...
Reconstruyendo nombres de columnas...
Seleccionando variables...
Ajustando columnas faltantes...
Ordenando columnas...
Finalizando DataFrame...


In [5]:
df.head()

Unnamed: 0,id,var_rpta_alt,marca_alternativa_No Acepta Alternativa,marca_alternativa_SIN_INFO,descripcion_ranking_mejor_ult_INTERESADO ALTERNATIVA,marca_alt_apli_SI,pagos_tanque_Sin pago,marca_pago_Sin pago,marca_agrupada_rgo_MANTENIMIENTO,promesas_cumplidas,...,valor_cuota_mes,pago_mes,cant_gestiones_binario,descripcion_ranking_post_ult_NO ACEPTA ALTERNATIVA,rango_mora_b.31-90,aplicativo_V,cant_gestiones,num_oblig_orig_enmascarado,nit_enmascarado,min_mora
0,630611#219718#863073,1,0.0,0,0.0,1.0,1.0,1.0,0.0,-0.306304,...,0.085543,-0.126673,0.183695,0.0,1.0,1.0,0.516621,-1.015716,1.77099,0.742248
1,59412#789567#290775,1,0.0,0,0.0,1.0,1.0,1.0,1.0,-0.306304,...,-0.345952,-0.126673,0.183695,0.0,0.0,0.0,-0.918933,0.814605,-1.359317,-0.419785
2,277595#1045909#34433,1,0.0,0,0.0,0.0,1.0,1.0,0.0,-0.306304,...,2.996446,-0.126673,0.183695,0.0,1.0,0.0,0.676127,1.63796,-0.163622,0.354904
3,26897#585786#494556,1,0.0,0,0.0,1.0,0.0,0.0,1.0,-0.306304,...,-0.472363,-0.124356,0.183695,0.0,0.0,0.0,-0.83918,0.160073,-1.537507,-0.342316
4,24588#1061389#18953,1,0.0,0,0.0,0.0,1.0,1.0,0.0,-0.306304,...,-0.631972,-0.126673,0.183695,0.0,0.0,0.0,-0.918933,1.687681,-1.550161,-0.342316


In [6]:
# Separar features y target
X = df.drop(columns=["var_rpta_alt", "id"])
y = df["var_rpta_alt"]

# Split 60% train / 40% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (340950, 44), y_train: (340950,)
X_test: (227301, 44), y_test: (227301,)


In [7]:
# ========================
# 4. MODELOS A ENTRENAR
# ========================
modelos = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier()
}

In [8]:
print(y.value_counts(normalize=True))  # ¿Qué proporción hay de cada clase?


var_rpta_alt
0    0.519987
1    0.480013
Name: proportion, dtype: float64


In [10]:
X.columns

Index(['marca_alternativa_No Acepta Alternativa', 'marca_alternativa_SIN_INFO',
       'descripcion_ranking_mejor_ult_INTERESADO ALTERNATIVA',
       'marca_alt_apli_SI', 'pagos_tanque_Sin pago', 'marca_pago_Sin pago',
       'marca_agrupada_rgo_MANTENIMIENTO', 'promesas_cumplidas',
       'descripcion_ranking_mejor_ult_NO ACEPTA ACUERDO',
       'desc_alternativa2_Sin alivio', 'cant_alter_posibles',
       'descripcion_ranking_post_ult_PLAN DE COMPROMISO CUMPLIDO',
       'desc_alternativa3_Sin alivio',
       'descripcion_ranking_post_ult_ACEPTA ALTERNATIVA',
       'descripcion_ranking_mejor_ult_ACEPTA ALTERNATIVA',
       'alternativa_aplicada_agr_CONSOLIDACION',
       'desc_alternativa2_Prorroga  ', 'marca_agrupada_rgo_MODIFICACIÓN',
       'segmento_Personal', 'segmento_Personal plus', 'banca_Independientes',
       'cant_acuerdo_binario', 'desc_alternativa3_Consolidación de pasivos',
       'desc_alternativa1_Reestructuración novacion', 'porc_pago_mes',
       'marca_pago_Pago 

In [11]:
a = ['marca_alternativa_No Acepta Alternativa', 'marca_alternativa_SIN_INFO',
       'descripcion_ranking_mejor_ult_INTERESADO ALTERNATIVA',
       'marca_alt_apli_SI', 'pagos_tanque_Sin pago', 'marca_pago_Sin pago',
       'marca_agrupada_rgo_MANTENIMIENTO', 'promesas_cumplidas',
       'descripcion_ranking_mejor_ult_NO ACEPTA ACUERDO',
       'desc_alternativa2_Sin alivio', 'cant_alter_posibles',
       'descripcion_ranking_post_ult_PLAN DE COMPROMISO CUMPLIDO',
       'desc_alternativa3_Sin alivio',
       'descripcion_ranking_post_ult_ACEPTA ALTERNATIVA',
       'descripcion_ranking_mejor_ult_ACEPTA ALTERNATIVA',
       'alternativa_aplicada_agr_CONSOLIDACION',
       'desc_alternativa2_Prorroga  ', 'marca_agrupada_rgo_MODIFICACIÓN',
       'segmento_Personal', 'segmento_Personal plus', 'banca_Independientes',
       'cant_acuerdo_binario', 'desc_alternativa3_Consolidación de pasivos',
       'desc_alternativa1_Reestructuración novacion', 'porc_pago_mes',
       'marca_pago_Pago parcial', 'rpc',
       'desc_alternativa1_Prorroga más Ampliación de plazo',
       'descripcion_ranking_post_ult_PLAN DE PAGO', 'dias_mora_fin',
       'pago_cuota', 'alternativa_aplicada_agr_SIN_INFO', 'producto_ROTATIVOS',
       'endeudamiento', 'valor_cuota_mes', 'pago_mes',
       'cant_gestiones_binario',
       'descripcion_ranking_post_ult_NO ACEPTA ALTERNATIVA',
       'rango_mora_b.31-90', 'aplicativo_V', 'cant_gestiones',
       'num_oblig_orig_enmascarado', 'nit_enmascarado', 'min_mora']

'id' in a

False

In [13]:
'id' in X_train.columns

False

In [9]:
# ========================
# 5. ENTRENAMIENTO + MLFLOW
# ========================
for model_name, model in modelos.items():
    with mlflow.start_run(run_name=model_name):
        print(f"\nEntrenando {model_name}...")

        # Entrenamiento
        model.fit(X_train, y_train)

        # Predicción y métrica
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        # Dentro del loop
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
        print(f"Cross-validated F1: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

        # Confusion matrix
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))

        # Log en MLflow
        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("f1_score", f1)
        #muestrame la presicion y recall
        precision = f1_score(y_test, y_pred, average='weighted')
        recall = f1_score(y_test, y_pred, average='weighted')
        mlflow.sklearn.log_model(model, model_name)

        print(f"✅ {model_name} -> F1 score: {f1:.4f}")
        print(f"✅ {model_name} -> Precision: {precision:.4f}")
        print(f"✅ {model_name} -> Recall: {recall:.4f}")


Entrenando LogisticRegression...
Cross-validated F1: 0.9867 ± 0.0005
Confusion Matrix:
 [[117476   1017]
 [  1934 106874]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99    118493
           1       0.99      0.98      0.99    108808

    accuracy                           0.99    227301
   macro avg       0.99      0.99      0.99    227301
weighted avg       0.99      0.99      0.99    227301





✅ LogisticRegression -> F1 score: 0.9864
✅ LogisticRegression -> Precision: 0.9870
✅ LogisticRegression -> Recall: 0.9870
🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/418343190953855177/runs/95e2a20448a5483197d39bb9d3653832
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/418343190953855177

Entrenando KNeighborsClassifier...
Cross-validated F1: 0.9818 ± 0.0004
Confusion Matrix:
 [[117660    833]
 [  2882 105926]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98    118493
           1       0.99      0.97      0.98    108808

    accuracy                           0.98    227301
   macro avg       0.98      0.98      0.98    227301
weighted avg       0.98      0.98      0.98    227301





✅ KNeighborsClassifier -> F1 score: 0.9828
✅ KNeighborsClassifier -> Precision: 0.9836
✅ KNeighborsClassifier -> Recall: 0.9836
🏃 View run KNeighborsClassifier at: http://127.0.0.1:5000/#/experiments/418343190953855177/runs/a0ae7a428844429c828ba0d9d7e9832f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/418343190953855177

Entrenando DecisionTreeClassifier...
Cross-validated F1: 0.9901 ± 0.0003
Confusion Matrix:
 [[117421   1072]
 [  1010 107798]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99    118493
           1       0.99      0.99      0.99    108808

    accuracy                           0.99    227301
   macro avg       0.99      0.99      0.99    227301
weighted avg       0.99      0.99      0.99    227301





✅ DecisionTreeClassifier -> F1 score: 0.9904
✅ DecisionTreeClassifier -> Precision: 0.9908
✅ DecisionTreeClassifier -> Recall: 0.9908
🏃 View run DecisionTreeClassifier at: http://127.0.0.1:5000/#/experiments/418343190953855177/runs/7096c0b396494b328e8dc4267cea7b48
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/418343190953855177

Entrenando RandomForestClassifier...
Cross-validated F1: 0.9947 ± 0.0004
Confusion Matrix:
 [[118252    241]
 [   923 107885]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00    118493
           1       1.00      0.99      0.99    108808

    accuracy                           0.99    227301
   macro avg       1.00      0.99      0.99    227301
weighted avg       0.99      0.99      0.99    227301





✅ RandomForestClassifier -> F1 score: 0.9946
✅ RandomForestClassifier -> Precision: 0.9949
✅ RandomForestClassifier -> Recall: 0.9949
🏃 View run RandomForestClassifier at: http://127.0.0.1:5000/#/experiments/418343190953855177/runs/175e04d0c5764dacabd1cfd8b99ce597
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/418343190953855177

Entrenando GradientBoostingClassifier...
Cross-validated F1: 0.9927 ± 0.0003
Confusion Matrix:
 [[118039    454]
 [  1143 107665]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    118493
           1       1.00      0.99      0.99    108808

    accuracy                           0.99    227301
   macro avg       0.99      0.99      0.99    227301
weighted avg       0.99      0.99      0.99    227301





✅ GradientBoostingClassifier -> F1 score: 0.9926
✅ GradientBoostingClassifier -> Precision: 0.9930
✅ GradientBoostingClassifier -> Recall: 0.9930
🏃 View run GradientBoostingClassifier at: http://127.0.0.1:5000/#/experiments/418343190953855177/runs/8f0aca13f6dc4304ae5126828f6279c9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/418343190953855177

Entrenando AdaBoostClassifier...
Cross-validated F1: 0.9731 ± 0.0002
Confusion Matrix:
 [[117542    951]
 [  4800 104008]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98    118493
           1       0.99      0.96      0.97    108808

    accuracy                           0.97    227301
   macro avg       0.98      0.97      0.97    227301
weighted avg       0.98      0.97      0.97    227301





✅ AdaBoostClassifier -> F1 score: 0.9731
✅ AdaBoostClassifier -> Precision: 0.9747
✅ AdaBoostClassifier -> Recall: 0.9747
🏃 View run AdaBoostClassifier at: http://127.0.0.1:5000/#/experiments/418343190953855177/runs/3e2645c71f5543f5b3fb1a9dfc0ff850
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/418343190953855177
