In [47]:
import pandas as pd
import os
import numpy as np
from IPython.display import display

In [48]:
data=pd.read_csv(os.path.join('..','DataFrames','Churn.csv'))

In [51]:
display(data.head())

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [52]:
# Eliminar columnas irrelevantes
data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

# Convertir variables categóricas a variables dummy
data = pd.get_dummies(data, columns=['Geography', 'Gender'], drop_first=True)

# Verificar valores nulos
print(data.isnull().sum())

CreditScore            0
Age                    0
Tenure               909
Balance                0
NumOfProducts          0
HasCrCard              0
IsActiveMember         0
EstimatedSalary        0
Exited                 0
Geography_Germany      0
Geography_Spain        0
Gender_Male            0
dtype: int64


In [53]:
# Eliminar filas con valores nulos
data.dropna(subset=['Tenure'], inplace=True)


In [54]:
# Verificar valores nulos
print(data.isnull().sum())

CreditScore          0
Age                  0
Tenure               0
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
Geography_Germany    0
Geography_Spain      0
Gender_Male          0
dtype: int64


In [55]:
# Ver el balance de clases en la variable objetivo 'Exited'
print(data['Exited'].value_counts())

Exited
0    7237
1    1854
Name: count, dtype: int64


In [61]:

# Entrenar el modelo con los datos equilibrados y escalados
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler

# Dividir los datos en características (X) y objetivo (y)
X = data.drop('Exited', axis=1)
y = data['Exited']

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Submuestreo de la clase mayoritaria
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

# Escalar las características (después de realizar el submuestreo)
scaler = StandardScaler()
X_res_scaled = scaler.fit_transform(X_res)
X_test_scaled = scaler.transform(X_test)  # Escalar también el conjunto de prueba

# Configurar y entrenar el modelo con los datos equilibrados y escalados
model = LogisticRegression(class_weight='balanced', max_iter=20000, random_state=42, solver='saga',C=1)

# Entrenar el modelo
model.fit(X_res_scaled, y_res)

# Hacer predicciones
y_pred = model.predict(X_test_scaled)

# Evaluar el modelo
print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

           0       0.90      0.69      0.78      2173
           1       0.37      0.70      0.48       555

    accuracy                           0.70      2728
   macro avg       0.63      0.70      0.63      2728
weighted avg       0.79      0.70      0.72      2728



In [64]:
from imblearn.over_sampling import SMOTE

# Sobremuestreo de la clase minoritaria
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Entrenar nuevamente el modelo con los datos equilibrados
model.fit(X_res, y_res)


In [65]:
# Evaluar el modelo en el conjunto de prueba
y_pred = model.predict(X_test)

# Reporte final
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.51      0.63      2173
           1       0.25      0.65      0.36       555

    accuracy                           0.54      2728
   macro avg       0.55      0.58      0.50      2728
weighted avg       0.73      0.54      0.58      2728

