In [10]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

import tensorflow as tf
import keras_tuner as kt
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.utils import plot_model
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scikeras.wrappers import KerasClassifier

In [11]:
# Read the churn dataset from the working directory and preview its first rows.
churn_csv_path = 'churn_data.csv'
data = pd.read_csv(churn_csv_path)
data.head()

Unnamed: 0,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,32.0,Female,50.0,16.0,1.0,9.0,Basic,Quarterly,774.06,2.0,0.0
1,27.0,Female,5.0,1.0,10.0,29.0,Premium,Annual,295.0,25.0,1.0
2,42.0,Male,3.0,16.0,2.0,11.0,Premium,Annual,623.61,25.0,0.0
3,27.0,Female,35.0,29.0,1.0,2.0,Basic,Quarterly,567.96,10.0,0.0
4,32.0,Male,51.0,18.0,0.0,5.0,Premium,Annual,831.21,11.0,0.0


In [12]:
# Print the frame summary: index range, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165000 entries, 0 to 164999
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Age                165000 non-null  float64
 1   Gender             165000 non-null  object 
 2   Tenure             165000 non-null  float64
 3   Usage Frequency    165000 non-null  float64
 4   Support Calls      165000 non-null  float64
 5   Payment Delay      165000 non-null  float64
 6   Subscription Type  165000 non-null  object 
 7   Contract Length    165000 non-null  object 
 8   Total Spend        165000 non-null  float64
 9   Last Interaction   165000 non-null  float64
 10  Churn              165000 non-null  float64
dtypes: float64(8), object(3)
memory usage: 13.8+ MB


In [13]:
# Build a ydata-profiling report over the whole frame; being the cell's last
# expression, the report object is what the notebook displays.
ProfileReport(data)

100%|██████████| 11/11 [00:00<00:00, 23.92it/s]<00:00,  9.47it/s, Describe variable: Churn]          
Summarize dataset: 100%|██████████| 69/69 [00:05<00:00, 13.65it/s, Completed]                                 
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.71it/s]




In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Drop rows whose 'Churn' target is NaN from BOTH partitions. A row without a
# label can neither be trained on nor scored, so cleaning only `train` would
# leave unlabeled rows in the evaluation set.
train = train.dropna(subset=['Churn'])
test = test.dropna(subset=['Churn'])

# Separate the explanatory variables from the target variable.
X_train = train.drop('Churn', axis=1)
y_train = train['Churn']

# Preview the first rows of the features and of the target.
display(X_train.head())
display(y_train.head())

Unnamed: 0,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction
82562,46.0,Male,8.0,28.0,4.0,10.0,Basic,Monthly,238.0,12.0
13148,42.0,Female,45.0,30.0,1.0,20.0,Premium,Quarterly,659.3,2.0
10615,21.0,Male,57.0,15.0,1.0,20.0,Standard,Annual,511.55,21.0
38965,21.0,Male,19.0,12.0,1.0,3.0,Premium,Quarterly,890.55,14.0
24572,30.0,Male,57.0,23.0,1.0,14.0,Basic,Annual,553.09,18.0


82562    1.0
13148    0.0
10615    0.0
38965    0.0
24572    0.0
Name: Churn, dtype: float64

In [16]:
# One-hot encode the target variable. The encoder expects a 2-D input, so the
# target Series is reshaped into a single column first.
target_column = y_train.to_numpy().reshape(-1, 1)
encoder_y = OneHotEncoder(sparse_output=False)
y_train_encoded = encoder_y.fit_transform(target_column)

# Step four: StandardScaler for the numeric variables, OneHotEncoder for the
# categorical ones. Here the feature columns are only grouped by dtype:
# int64/float64 columns count as numeric, object columns as categorical.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

print(f'Numerical {numeric_features}')
print(f'Categorical {categorical_features}')

Numerical Index(['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay',
       'Total Spend', 'Last Interaction'],
      dtype='object')
Categorical Index(['Gender', 'Subscription Type', 'Contract Length'], dtype='object')


In [None]:
# Numeric columns: fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array (handle_unknown='ignore').
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Transformation pipeline: route each column group to its transformer; columns
# listed in neither group are passed through as-is (remainder='passthrough').
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')

# Wrap the preprocessor in a single-step pipeline.
full_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing on the training features and transform them in one pass.
X_train_processed = full_pipeline.fit_transform(X_train)