# Telecom X — Parte 2: Predicción de Cancelación (Churn)

**Fecha:** 2025-08-10

Este notebook está pensado para ejecutarse en **Google Colab**. Contiene el pipeline completo para:

- Preparación y preprocesamiento de datos para modelado.
- Entrenamiento y evaluación de varios modelos de clasificación (Regresión Logística, KNN, Random Forest, SVM).
- Interpretación de resultados (coeficientes, importancia de variables) y recomendaciones estratégicas.

Dataset (raw): https://raw.githubusercontent.com/ingridcristh/challenge2-data-science-LATAM/main/TelecomX_Data.json

---


In [None]:
# Ejecutar en Google Colab (descomenta si necesitas instalar paquetes)
# !pip install --upgrade scikit-learn matplotlib pandas requests

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import requests, json
# Notebook-wide plotting defaults: figure size and grid on every axes.
plt.rcParams.update({
    'figure.figsize': (8, 5),
    'axes.grid': True,
})
print('Librerías cargadas')

In [None]:
# Load the raw customer records from GitHub into a DataFrame.
url = "https://raw.githubusercontent.com/ingridcristh/challenge2-data-science-LATAM/main/TelecomX_Data.json"
# requests applies no timeout by default; without one a stalled connection
# would block this cell indefinitely.
resp = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
resp.raise_for_status()
data = resp.json()
# Turn the JSON records into a tabular DataFrame.
df = pd.json_normalize(data)
# Normalise column names: trimmed, lower-case, spaces replaced by underscores.
df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
print('Registros:', len(df))
df.head()

In [None]:
# Locate the target column: any column whose name contains one of these
# keywords is a candidate, and the first candidate (if any) is used.
churn_keywords = ('churn', 'cancel', 'exit')
possible_churn = [
    col for col in df.columns
    if any(keyword in col for keyword in churn_keywords)
]
print('Columnas candidatas churn:', possible_churn)
# None signals that no candidate column was found.
churn_col = next(iter(possible_churn), None)
print('Usando churn_col =', churn_col)

In [None]:
# Basic preparation: work on a copy so the raw frame `df` is left untouched.
df_model = df.copy()

# Trim surrounding whitespace in text cells. Only genuine strings are touched,
# because the `.str` accessor would replace any non-string cell with NaN.
for c in df_model.select_dtypes(include='object').columns:
    df_model[c] = df_model[c].map(lambda v: v.strip() if isinstance(v, str) else v)

# Convert text columns to numbers where that makes sense.
# A column is converted when every non-blank cell parses as a number. Blank
# cells become NaN instead of vetoing the whole conversion, so a numeric
# column with a few empty strings is not left behind as text.
for c in df_model.select_dtypes(include='object').columns:
    as_text = df_model[c].astype(str)
    has_value = df_model[c].notna() & (as_text != '')
    if not has_value.any():
        continue  # nothing to parse; leave the column as it is
    # Drop thousands separators before parsing; unparseable cells become NaN.
    parsed = pd.to_numeric(as_text.str.replace(',', '', regex=False), errors='coerce')
    if parsed[has_value].notna().all():
        df_model[c] = parsed.where(has_value).astype(float)

# Map the churn label to 0/1 when a churn column was identified.
if churn_col:
    churn_map = {'yes': 1, 'no': 0, 'y': 1, 'n': 0, 'true': 1, 'false': 0}
    target = df_model[churn_col].astype(str).str.strip().str.lower().map(churn_map)
    # Rows whose label is missing or not in the map cannot be used for training.
    keep = target.notna()
    # `.copy()` makes the filtered frame independent of the original, so the
    # assignment below does not raise pandas' SettingWithCopyWarning.
    df_model = df_model[keep].copy()
    df_model[churn_col] = target[keep].astype(int).to_numpy()

print('Registros para modelado:', len(df_model))

In [None]:
# Feature selection: leave out the target and identifier columns, then drop
# any column that has at most one distinct non-null value.
exclude = [churn_col, 'id', 'customerid', 'customer_id']
features = []
for col in df_model.columns:
    if col in exclude:
        continue
    if df_model[col].nunique() > 1:
        features.append(col)
print('Features seleccionadas:', len(features))

In [None]:
# Feature types: numeric dtypes go to `num_cols`, everything else is treated
# as categorical. Only the first ten names of each group are printed.
numeric_frame = df_model[features].select_dtypes(include=[np.number])
num_cols = numeric_frame.columns.tolist()
cat_cols = []
for col in features:
    if col not in num_cols:
        cat_cols.append(col)
print('Numéricas:', num_cols[:10])
print('Categóricas:', cat_cols[:10])

In [None]:
# Train/test split: 80/20, stratified on the target so both parts keep the
# same churn proportion, with a fixed seed for repeatability.
X, y = df_model[features], df_model[churn_col]
from sklearn.model_selection import train_test_split  # also imported in the setup cell
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print('Train:', X_train.shape, 'Test:', X_test.shape)

In [None]:
# Preprocessing
# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                      ('scaler', StandardScaler())])

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4, where passing it raises TypeError. Try the current name first and fall
# back to the old one so the cell runs on both old and new versions.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time rather than raising.
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', onehot_encoder)])

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, num_cols),
                                               ('cat', categorical_transformer, cat_cols)])

In [None]:
# Models
from sklearn.base import clone  # local import: only this cell needs it

# Each pipeline gets its own unfitted copy of the preprocessor. Sharing one
# instance would make every `fit` overwrite the fitted state that the other
# pipelines hold.
models = {
    'LogisticRegression': Pipeline([('pre', clone(preprocessor)), ('clf', LogisticRegression(max_iter=1000, solver='liblinear'))]),
    'KNN': Pipeline([('pre', clone(preprocessor)), ('clf', KNeighborsClassifier())]),
    'RandomForest': Pipeline([('pre', clone(preprocessor)), ('clf', RandomForestClassifier(n_estimators=200, random_state=42))]),
    # random_state fixes the data shuffling SVC performs for its probability
    # estimates, so the reported AUC is reproducible between runs.
    'SVM': Pipeline([('pre', clone(preprocessor)), ('clf', SVC(probability=True, kernel='rbf', random_state=42))])
}

# Fit every model on the training split and score it on the held-out split.
# `results` maps model name -> fitted pipeline plus its test metrics.
results = {}
for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Probability of the positive class; None when the classifier has no predict_proba.
    y_proba = pipe.predict_proba(X_test)[:,1] if hasattr(pipe.named_steps['clf'], 'predict_proba') else None
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    results[name] = {'pipe': pipe, 'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'auc': auc}
    print('==', name, '==')
    print('Accuracy:', acc, 'Precision:', prec, 'Recall:', rec, 'F1:', f1, 'AUC:', auc)
    print(classification_report(y_test, y_pred, zero_division=0))
    print('\n')

In [None]:
# Summary table: one row per model, one column per test metric.
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'auc']
summary_rows = []
for model_name in results:
    row = {'model': model_name}
    for metric in metric_names:
        row[metric] = results[model_name][metric]
    summary_rows.append(row)
res_df = pd.DataFrame(summary_rows).set_index('model')
res_df

In [None]:
# Interpretability: recover the names of the columns the preprocessor emits.
# The preprocessor is fitted here on the training split and the names are
# read from it: numeric columns keep their names, categorical columns expand
# into one name per one-hot category.
pre = preprocessor.fit(X_train)
X_train_t = pre.transform(X_train)
num_features = num_cols
if len(cat_cols) > 0:
    onehot_step = pre.named_transformers_['cat'].named_steps['onehot']
    cat_features = list(onehot_step.get_feature_names_out(cat_cols))
else:
    cat_features = []
# Numeric names first, then one-hot names, matching the transformer order.
feature_names = num_features + cat_features
print('Features transformadas:', len(feature_names))

In [None]:
# Logistic Regression: show the 20 coefficients with the largest magnitude.
if 'LogisticRegression' in results:
    lr = results['LogisticRegression']['pipe'].named_steps['clf']
    coefs = lr.coef_[0]
    coef_df = (
        pd.DataFrame({'feature': feature_names, 'coef': coefs})
        # Rank by absolute value so strong negative effects are not buried.
        .assign(abs=lambda frame: frame['coef'].abs())
        .sort_values('abs', ascending=False)
        .head(20)
    )
    display(coef_df)

In [None]:
# Random Forest: show the 20 features with the highest importance.
if 'RandomForest' in results:
    rf = results['RandomForest']['pipe'].named_steps['clf']
    importances = rf.feature_importances_
    imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
    imp_df = imp_df.sort_values('importance', ascending=False)
    imp_df = imp_df.head(20)
    display(imp_df)

In [None]:
# ROC curves: one line per model that can output probabilities, plus the
# diagonal as the chance-level reference.
plt.figure(figsize=(8, 6))
for name, info in results.items():
    pipe = info['pipe']
    if not hasattr(pipe.named_steps['clf'], 'predict_proba'):
        continue  # no probabilities, so no ROC curve for this model
    y_proba = pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)
    plt.plot(fpr, tpr, label=f'{name} (AUC={auc:.3f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.legend()
plt.show()

## Conclusiones y recomendaciones

- Resume los modelos comparados y su rendimiento (F1 y AUC).
- Menciona las variables con mayor impacto según regresión logística y Random Forest.
- Recomendaciones tácticas para retención basadas en las variables importantes.

---

**Siguientes pasos sugeridos:** grid-search de hiperparámetros, validación cruzada, probar XGBoost/LightGBM y despliegue del scoring en producción.