In [None]:
!pip install xgboost lightgbm

In [None]:
# 01: Importar librerías
import pandas as pd
import numpy as np
import json
import joblib
import os

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 02: Load and prepare the data
# Single root for every input file, so relocating the project means editing
# one line instead of four hard-coded absolute paths.
PROJECT_DIR = 'C:/Users/Rodrigo/Actividad_Feedback'
DATA_DIR = f'{PROJECT_DIR}/data'

billing = pd.read_csv(f'{DATA_DIR}/billing.csv', sep=';')
clients = pd.read_parquet(f'{DATA_DIR}/clients.parquet')
# JSON text is UTF-8 by specification; stating the encoding avoids depending
# on the platform's locale default when decoding the file.
with open(f'{DATA_DIR}/tenure.json', encoding='utf-8') as f:
    tenure = pd.DataFrame(json.load(f))
# The labels file is read from the project root, not from data/.
data_churn = pd.read_parquet(f'{PROJECT_DIR}/data_churn.parquet')

# Only the id and the label are taken from the labels table; 'target' is
# renamed to 'churn' for the rest of the script.
etiquetas = data_churn[['customerid', 'target']].rename(columns={'target': 'churn'})

# Left joins keep every row of `clients`, whether or not the other tables
# have a matching customerid.
df = (
    clients
    .merge(billing, on='customerid', how='left')
    .merge(tenure, on='customerid', how='left')
    .merge(etiquetas, on='customerid', how='left')
)

# Keep only customers whose id ends in one of these characters, then drop
# the rows that have no label.
terminaciones = ['7', '1', '6', '8']
df = df[df['customerid'].astype(str).str[-1].isin(terminaciones)]
df = df[df['churn'].notna()]

# Features are every column except the id and the label.
X = df.drop(columns=['customerid', 'churn'])
y = df['churn']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# NOTE(review): a column whose dtype is in neither list (e.g. bool, int32,
# pandas 'string') is not passed to the ColumnTransformer below and is
# therefore dropped from the model input -- confirm that is intended.
numericas = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categoricas = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric columns: median imputation + standardisation.
# Categorical columns: most-frequent imputation + one-hot encoding; categories
# unseen during fit are ignored at transform time instead of raising.
preprocesador = ColumnTransformer(transformers=[
    ('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numericas),
    ('cat', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categoricas)
])

# 03: Try multiple classifiers
# Every estimator that accepts a seed gets the same one used for the
# train/test split (42), so the scores and the resulting ranking can be
# reproduced from run to run.
# `use_label_encoder` is no longer passed to XGBClassifier: it is a deprecated
# argument that current XGBoost releases ignore with a warning.
modelos = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Gaussian NB': GaussianNB(),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42)
}

# Model name -> macro-averaged F1 on the held-out split.
resultados = {}

# NOTE(review): the ranking is computed on X_test, and the same split is used
# later to score the stacked model, so that final score is optimistic.
# Ranking with cross-validation on the training data would avoid this.
for nombre, modelo in modelos.items():
    # Preprocessing is part of the pipeline, so it is fitted on X_train only.
    pipeline = Pipeline([
        ('preprocesador', preprocesador),
        ('clasificador', modelo)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    resultados[nombre] = f1
    print(f"{nombre}: F1 Macro = {f1:.4f}")

# 04: Pick the three best-scoring models
# Rank the (name, F1) pairs from best to worst. sorted() is stable, so models
# with equal scores keep the order in which they were evaluated.
ranking = sorted(resultados.items(), key=lambda par: par[1], reverse=True)
top3 = ranking[:3]

print("\nTop 3 Modelos:")
for nombre_modelo, f1_modelo in top3:
    print(f"{nombre_modelo}: {f1_modelo:.4f}")

# 05: Hyperparameter tuning
# Candidates are fixed by hand. Each stochastic estimator is seeded (42, the
# same value as the train/test split) so the search result is reproducible.
# `use_label_encoder` is no longer passed to XGBClassifier: it is a deprecated
# argument that current XGBoost releases ignore with a warning.
mejores_modelos = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42)
}

# Step 04 ranks the models, but the candidates above do not come from that
# ranking. Report a mismatch instead of silently tuning different models.
nombres_top3 = [nombre for nombre, _ in top3]
if set(nombres_top3) != set(mejores_modelos):
    print(f"\nAviso: los modelos a ajustar {list(mejores_modelos)} no coinciden con el Top 3 {nombres_top3}")

# Search grids. The 'clasificador__' prefix routes each parameter to the
# pipeline step of that name.
parametros = {
    'Random Forest': {
        'clasificador__n_estimators': [100, 200],
        'clasificador__max_depth': [None, 10, 20]
    },
    'XGBoost': {
        'clasificador__n_estimators': [100, 200],
        'clasificador__learning_rate': [0.05, 0.1]
    },
    'LightGBM': {
        'clasificador__n_estimators': [100, 200],
        'clasificador__learning_rate': [0.05, 0.1]
    }
}

# Model name -> best pipeline found by the grid search.
mejores_estimadores = {}

for nombre, modelo in mejores_modelos.items():
    pipeline = Pipeline([
        ('preprocesador', preprocesador),
        ('clasificador', modelo)
    ])
    # 3-fold cross-validation on the training data, scored by macro F1,
    # using all available cores.
    grid = GridSearchCV(pipeline, parametros[nombre], cv=3, scoring='f1_macro', n_jobs=-1)
    grid.fit(X_train, y_train)
    mejores_estimadores[nombre] = grid.best_estimator_
    print(f"\nMejor configuración para {nombre}: {grid.best_params_}")

# 06: Stacking ensemble
# Base learners: the tuned pipelines as (name, estimator) pairs.
estimators = list(mejores_estimadores.items())

# A logistic regression combines the base learners' outputs.
meta_modelo = LogisticRegression()
stacking = StackingClassifier(estimators=estimators, final_estimator=meta_modelo, n_jobs=-1)
stacking.fit(X_train, y_train)

# Macro-averaged F1 of the ensemble on the held-out split.
y_pred_stack = stacking.predict(X_test)
f1_stack = f1_score(y_test, y_pred_stack, average='macro')
print(f"\nStacking F1 Macro Score: {f1_stack:.4f}")

# 07: Serialize the final model
# The path is relative, so the file is written to the current working directory.
ruta_modelo = 'modelo_final_stack.pkl'
joblib.dump(stacking, ruta_modelo)

# 08: Save the batch-inference CSV
# Export the held-out features together with their customer ids. The ids are
# looked up in `df` by the rows' index labels, which X_test still carries.
X_test_inf = X_test.copy()
X_test_inf['customerid'] = df.loc[X_test.index, 'customerid'].values

# to_csv does not create missing parent directories, so make sure the
# relative data/ folder exists before writing into it.
os.makedirs('data', exist_ok=True)
X_test_inf.to_csv('data/nuevos_clientes.csv', index=False)