# Limpieza y Análisis Exploratorio

In [1]:
# ----- IMPORTANDO LIBRERIAS ----

import pandas as pd
import numpy as np
import requests
from sklearn.preprocessing import MinMaxScaler

In [5]:
# --- 1. DATA RETRIEVAL ---
# Purpose of this cell: download the raw Telecom X JSON, flatten its nested
# sections, clean it, one-hot encode the categorical columns, scale the
# numeric columns to [0, 1] and write the result to a CSV file.
print("üì° Descargando datos originales")
url_api = "https://raw.githubusercontent.com/ingridcristh/challenge2-data-science-LATAM/main/TelecomX_Data.json"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (404, 500, ...) into an explicit
# exception instead of an obscure JSON decoding failure on the error page.
response = requests.get(url_api, timeout=30)
response.raise_for_status()
df_datos = pd.DataFrame(response.json())

# Flatten the nested columns: each one holds a dict per row, which
# json_normalize expands into one column per key. The expanded frame gets a
# fresh 0..n-1 index, so the column-wise concat lines up row by row as long as
# df_datos still carries its default index (it does: it was just created).
for col in ['customer', 'phone', 'internet', 'account']:
    df_datos = pd.concat(
        [df_datos.drop(columns=[col]), pd.json_normalize(df_datos[col].tolist())],
        axis=1,
    )

# Basic cleaning
# Drop rows whose target is an empty string.
df = df_datos[df_datos['Churn'] != ''].copy()

# Binary-encode the target. Any label other than 'Yes'/'No' would become NaN.
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
# Non-numeric entries are coerced to NaN so they can be dropped below.
df['Charges.Total'] = pd.to_numeric(df['Charges.Total'], errors='coerce')

# Drop rows where the total charge could not be parsed as a number.
df = df.dropna(subset=['Charges.Total'])

print(f"\n‚úÖ Datos base cargados: {df.shape}")

# --- 2. ENCODING ---
# Set aside the customer ID (an identifier, not a predictor) and the target.
customer_id = df['customerID']
target = df['Churn']
df_features = df.drop(columns=['customerID', 'Churn'])

# Split the feature columns by dtype.
cat_cols = df_features.select_dtypes(include=['object']).columns
num_cols = df_features.select_dtypes(include=['float64', 'int64']).columns

print(f"\nüîÑ Codificando {len(cat_cols)} variables categ√≥ricas...")

# One-hot encoding. drop_first=True omits the first level of every categorical
# column, so a two-level column such as 'gender' yields a single indicator
# column ('gender_Male').
df_encoded = pd.get_dummies(df_features, columns=cat_cols, drop_first=True)

# --- 3. NORMALIZATION (scaling) ---
# Bring every numeric column (e.g. 'tenure', 'Charges.Monthly') to the same
# [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split
# happens later, fit it on the training portion only to avoid leakage.
print("\n‚öñÔ∏è Escalando variables num√©ricas al rango [0,1]...")

scaler = MinMaxScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

# Rebuild the final frame: ID first, then features, then target. All three
# pieces derive from df, so they share the same index and align row by row.
df_ml = pd.concat([customer_id, df_encoded, target], axis=1)

print(f"\n‚úÖ DATASET LISTO PARA ML: {df_ml.shape}")
print(df_ml.head(5))

# --- 4. SAVE ---
# The file is written to the current working directory (no sub-folder).
df_ml.to_csv('telecom_churn_ml_preparado.csv', index=False)
print("\nüíæ Archivo 'telecom_churn_ml_preparado.csv' guardado.")

üì° Descargando datos originales

‚úÖ Datos base cargados: (7032, 21)

üîÑ Codificando 15 variables categ√≥ricas...

‚öñÔ∏è Escalando variables num√©ricas al rango [0,1]...

‚úÖ DATASET LISTO PARA ML: (7032, 32)
   customerID  SeniorCitizen    tenure  Charges.Monthly  Charges.Total  \
0  0002-ORFBO            0.0  0.112676         0.471144       0.066294   
1  0003-MKNFE            0.0  0.112676         0.414428       0.060420   
2  0004-TLHLJ            0.0  0.042254         0.553731       0.030239   
3  0011-IGKFF            1.0  0.169014         0.793532       0.140670   
4  0013-EXCHZ            1.0  0.028169         0.653234       0.028687   

   gender_Male  Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0        False         True            True              True   
1         True        False           False              True   
2         True        False           False              True   
3         True         True           False              True   
4        False  