In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder
)

from feature_engine.transformation import YeoJohnsonTransformer

from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

In [2]:
data = pd.read_csv('customer_data_edited.csv')
data.head()

Unnamed: 0,recordID,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,...,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn,customer_id
0,1,HI,101,510,no,no,0,70.9,123,12.05,...,18.01,236.0,73,10.62,10.6,3,2.86,3,no,23383607.0
1,2,MT,137,510,no,no,0,223.6,86,38.01,...,20.81,94.2,81,4.24,9.5,7,2.57,0,no,22550362.0
2,3,OH,103,408,no,yes,29,294.7,95,50.1,...,20.17,300.3,127,13.51,13.7,6,3.7,1,no,59063354.0
3,4,NM,99,415,no,no,0,216.8,123,36.86,...,10.74,220.6,82,9.93,15.7,2,4.24,1,no,25464504.0
4,5,SC,108,415,no,no,0,197.4,78,33.56,...,10.54,204.5,107,9.2,7.7,4,2.08,2,no,691824.0


In [3]:
data = pd.get_dummies(data, columns=['churn'], drop_first=True)
data.rename(columns = {'churn_yes':'churn'}, inplace = True)
data

Unnamed: 0,recordID,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,...,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,customer_id,churn
0,1,HI,101,510,no,no,0,70.9,123,12.05,...,18.01,236.0,73,10.62,10.6,3,2.86,3,23383607.0,0
1,2,MT,137,510,no,no,0,223.6,86,38.01,...,20.81,94.2,81,4.24,9.5,7,2.57,0,22550362.0,0
2,3,OH,103,408,no,yes,29,294.7,95,50.10,...,20.17,300.3,127,13.51,13.7,6,3.70,1,59063354.0,0
3,4,NM,99,415,no,no,0,216.8,123,36.86,...,10.74,220.6,82,9.93,15.7,2,4.24,1,25464504.0,0
4,5,SC,108,415,no,no,0,197.4,78,33.56,...,10.54,204.5,107,9.20,7.7,4,2.08,2,691824.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12887,12888,MT,25,415,no,no,0,134.3,98,22.83,...,17.20,195.9,100,8.82,12.6,5,3.40,2,3785730.0,0
12888,12889,MT,113,415,no,no,0,215.9,93,36.70,...,20.41,156.7,123,7.05,4.9,5,1.32,3,25171109.0,0
12889,12890,ID,88,415,no,yes,31,181.6,91,30.87,...,18.12,207.8,104,9.35,11.4,4,3.08,1,12126991.0,0
12890,12891,AK,120,415,no,no,0,178.4,97,30.33,...,14.31,120.5,93,5.42,9.3,9,2.51,1,33084674.0,0


In [4]:
#Cast de variables categóricas
data['area_code'] = data['area_code'].astype('O')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['recordID', 'customer_id'], axis=1),
        data['churn'],
        test_size=0.3,
        random_state=2022)

X_train.shape, X_test.shape

((9024, 20), (3868, 20))

In [6]:
#Balanceo de Datos

dataNegativa = X_train[X_train['churn']==0]
dataPositiva = X_train[X_train['churn']==1]

cantidadDataNegativa = 2*dataPositiva.shape[0]

dataNegativa = dataNegativa.sample(n=cantidadDataNegativa, random_state=2021)

X_train=pd.concat([dataPositiva,dataNegativa])

X_train = X_train.drop(['churn'], axis=1)

In [7]:
X_train.shape

(3801, 19)

In [8]:
y_train = y_train.sample(n=3801, axis=0)

In [9]:
X_train.head()

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
7479,ID,116,510,yes,yes,29,162.3,91,27.59,279.3,79,23.74,192.7,131,8.67,11.7,2,3.16,3
1465,TN,222,415,no,no,0,256.6,119,43.62,230.8,98,19.62,236.0,97,10.62,12.2,5,3.29,1
207,ME,79,510,yes,no,0,300.4,113,51.07,210.8,93,17.92,173.4,121,7.8,11.9,3,3.21,2
3421,NC,54,415,yes,no,0,190.5,108,32.39,259.7,108,22.07,141.5,111,6.37,9.7,2,2.62,2
222,SD,108,510,yes,no,0,275.9,84,46.9,203.0,91,17.26,211.4,108,9.51,6.7,4,1.81,2


In [10]:
y_train

11803    0
7278     0
10385    0
395      0
2897     0
        ..
866      0
7014     0
1997     0
3279     0
10632    0
Name: churn, Length: 3801, dtype: uint8

In [11]:
y_train.shape

(3801,)

## Configuración del Machine Learning Pipeline

In [12]:
#Varibles para transformación YEO JOHNSON
NUMERICALS_YEO_JOHNSON_VARS = ['total_intl_calls']

#Variables para binarización por sesgo fuerte
BINARIZE_VARS = ['number_vmail_messages']

#Variables categoricas a codificar sin ordinalidad
CATEGORICAL_VARS = ['state', 'area_code']

#Variables para OHE
OHE_VARS = ['international_plan', 'voice_mail_plan']

#PCA
pca = PCA(n_components=1)

#Variables seleccionadas para uso
FEATURES = ['state', 
            'account_length', 
            'area_code', 
            'international_plan', 
            'voice_mail_plan', 
            'number_vmail_messages', 
            'total_day_minutes',
            'total_day_calls', 
            'total_day_charge', 
            'total_eve_minutes',
            'total_eve_calls',
            'total_eve_charge',
            'total_night_minutes',
            'total_night_calls',
            'total_night_charge',
            'total_intl_minutes',
            'total_intl_calls',
            'total_intl_charge',
            'number_customer_service_calls']

In [13]:
#Selección de variables para entrenamiento
X_train = X_train[FEATURES]
X_train.shape

(3801, 19)

## Configuración del Machine Learning Pipeline

In [14]:
churn_pipeline = Pipeline([

    #============= TRANSFORMACIÓN DE VARIABLES NUMÉRICAS =============
    
    # Transformación Yeo Johnson
    ('yeo_johnson', YeoJohnsonTransformer(variables=NUMERICALS_YEO_JOHNSON_VARS)
    ),
    
    # Binarización de Variables con Sesgo Fuerte
    ('binarizer', SklearnTransformerWrapper(
        transformer=Binarizer(threshold=0), variables=BINARIZE_VARS)
    ),
    
    
    #============ CODIFICACION DE VARIABLES CATEGORICAS NOMINALES ============
    
    ('rare_label_encoder', RareLabelEncoder(
        tol=0.01, n_categories=1, variables=CATEGORICAL_VARS)),
    
    ('categorical_encoder', OrdinalEncoder(
        encoding_method='ordered', variables=CATEGORICAL_VARS)),
    
    
    #============ ONE HOT ENCODING ============
    
    #6. Codificación por frecuencia de la variable
    ('ohe_encoder', OneHotEncoder(
        variables=OHE_VARS, drop_last=True)
    ),
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    ('pca', pca),
    ('Log_reg', LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
])

In [15]:
churn_pipeline.fit(X_train, y_train)

Pipeline(steps=[('yeo_johnson',
                 YeoJohnsonTransformer(variables=['total_intl_calls'])),
                ('binarizer',
                 SklearnTransformerWrapper(transformer=Binarizer(threshold=0),
                                           variables=['number_vmail_messages'])),
                ('rare_label_encoder',
                 RareLabelEncoder(n_categories=1, tol=0.01,
                                  variables=['state', 'area_code'])),
                ('categorical_encoder',
                 OrdinalEncoder(variables=['state', 'area_code'])),
                ('ohe_encoder',
                 OneHotEncoder(drop_last=True,
                               variables=['international_plan',
                                          'voice_mail_plan'])),
                ('scaler', MinMaxScaler()), ('pca', PCA(n_components=1)),
                ('Log_reg',
                 LogisticRegression(multi_class='multinomial',
                                    random_state=0))])

In [16]:
X_test = X_test[FEATURES]

In [17]:
test_predictions = churn_pipeline.predict(X_test)
test_predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [18]:
precision = accuracy_score(test_predictions, y_test) * 100
print("Accuracy con Logistic Regression: {0:.6f}%".format(precision))

Accuracy con Logistic Regression: 85.625646%


In [19]:
import joblib

In [20]:
#Guardamos pipeline
joblib.dump(churn_pipeline, 'churn_pipeline.pkl')

['churn_pipeline.pkl']