#📌 Extracción

In [10]:
import pandas as pd
import json

with open('/content/TelecomX_Data.json') as f:
    raw_data = json.load(f)

df_raw = pd.json_normalize(raw_data)




Index(['customerID', 'Churn', 'customer.gender', 'customer.SeniorCitizen',
       'customer.Partner', 'customer.Dependents', 'customer.tenure',
       'phone.PhoneService', 'phone.MultipleLines', 'internet.InternetService',
       'internet.OnlineSecurity', 'internet.OnlineBackup',
       'internet.DeviceProtection', 'internet.TechSupport',
       'internet.StreamingTV', 'internet.StreamingMovies', 'account.Contract',
       'account.PaperlessBilling', 'account.PaymentMethod',
       'account.Charges.Monthly', 'account.Charges.Total'],
      dtype='object')
   customerID Churn customer.gender  customer.SeniorCitizen customer.Partner  \
0  0002-ORFBO    No          Female                       0              Yes   
1  0003-MKNFE    No            Male                       0               No   
2  0004-TLHLJ   Yes            Male                       0               No   

  customer.Dependents  customer.tenure phone.PhoneService phone.MultipleLines  \
0                 Yes             

#🔧 Transformación

In [16]:
import logging

logging.basicConfig(filename='telecomx_etl.log', level=logging.INFO)

# Diagnóstico
mem_mb = df.memory_usage(deep=True).sum() / (1024**2)
print(f"Memoria total: {mem_mb:.2f} MB")
logging.info(f'Memoria total: {mem_mb:.2f} MB')

# Limpieza
low_variance = [col for col in df.columns if df[col].nunique() <= 1]
high_cardinality = [col for col in df.columns if df[col].nunique() > 1000]
nested_cols = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, (list, dict))).any()]
cols_to_drop = list(set(low_variance + high_cardinality + nested_cols))

df_clean = df.drop(columns=cols_to_drop)
logging.info(f'Columnas eliminadas: {len(cols_to_drop)}')


Memoria total: 95.97 MB


#📊 Carga y análisis

In [17]:
df_clean.to_csv('telecomx_reducido.csv', index=False)
logging.info('Archivo telecomx_reducido.csv guardado exitosamente')



In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X = df_clean.drop('Churn_Yes', axis=1)
y = df_clean['Churn_Yes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.96      0.97      0.97      1082
        True       0.92      0.88      0.90       372

    accuracy                           0.95      1454
   macro avg       0.94      0.93      0.93      1454
weighted avg       0.95      0.95      0.95      1454



#📄Informe final

In [20]:
importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).head(10)
print("Variables más influyentes en el churn:")
print(top_features)


Variables más influyentes en el churn:
Churn_No                          0.331478
tenure                            0.058509
InternetService_Fiber optic       0.018920
PaymentMethod_Electronic check    0.017039
Contract_Two year                 0.014868
TechSupport_Yes                   0.013447
OnlineSecurity_Yes                0.012570
Contract_One year                 0.011576
PaperlessBilling_Yes              0.008558
OnlineBackup_Yes                  0.008473
dtype: float64


#Informe final
##teniendo en cuenta las variables: tipo de contrato, metodo de pago, servicios contratados, tenure y Churn_no las remendaciones son:                                                     -incentiivar contratos largos
##-mejorar servicios de soporte y seguridad
## -fidelizacion de clientes nuevos
## -monitorear clientes con pago electronico