In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import joblib

In [2]:
pd.pandas.set_option('display.max_columns', None) #habilitamos despliegue maximo de columnas

In [3]:
data = pd.read_csv('customer_data_edited.csv')
print(data.shape)

(12892, 22)


In [4]:
data.head()

Unnamed: 0,recordID,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn,customer_id
0,1,HI,101,510,no,no,0,70.9,123,12.05,211.9,73,18.01,236.0,73,10.62,10.6,3,2.86,3,no,23383607.0
1,2,MT,137,510,no,no,0,223.6,86,38.01,244.8,139,20.81,94.2,81,4.24,9.5,7,2.57,0,no,22550362.0
2,3,OH,103,408,no,yes,29,294.7,95,50.1,237.3,105,20.17,300.3,127,13.51,13.7,6,3.7,1,no,59063354.0
3,4,NM,99,415,no,no,0,216.8,123,36.86,126.4,88,10.74,220.6,82,9.93,15.7,2,4.24,1,no,25464504.0
4,5,SC,108,415,no,no,0,197.4,78,33.56,124.0,101,10.54,204.5,107,9.2,7.7,4,2.08,2,no,691824.0


In [5]:
#Separamos data para entrenamiento y prueba,
X_train, X_test, y_train, y_test = train_test_split(
                        data.drop(['recordID', 'customer_id'], axis=1),
                        data['churn'],
                        test_size=0.15,
                        random_state=2021)

In [6]:
X_train.shape, X_test.shape

((10958, 20), (1934, 20))

## 2. Balanceo de Datos

#### 2.1 X_train

In [7]:
X_train['churn'].value_counts()

no     9417
yes    1541
Name: churn, dtype: int64

In [8]:
dataNegativa=X_train[X_train['churn']=='no']
dataNegativa.shape

(9417, 20)

In [9]:
dataPositiva=X_train[X_train['churn']=='yes']
dataPositiva.shape

(1541, 20)

In [10]:
cantidadDataNegativa=2*dataPositiva.shape[0]

In [11]:
dataNegativa = dataNegativa.sample(n=cantidadDataNegativa,random_state=2021)
dataNegativa.shape

(3082, 20)

In [12]:
X_train=pd.concat([dataPositiva,dataNegativa])
X_train.shape

(4623, 20)

#### 2.2 X_test

In [13]:
X_test['churn'].value_counts()

no     1652
yes     282
Name: churn, dtype: int64

In [14]:
dataNegativa=X_test[X_test['churn']=='no']
dataNegativa.shape

(1652, 20)

In [15]:
dataPositiva=X_test[X_test['churn']=='yes']
dataPositiva.shape

(282, 20)

In [16]:
cantidadDataNegativa=2*dataPositiva.shape[0]

In [17]:
dataNegativa = dataNegativa.sample(n=cantidadDataNegativa,random_state=2021)
dataNegativa.shape

(564, 20)

In [18]:
X_test=pd.concat([dataPositiva,dataNegativa])
X_test.shape

(846, 20)

## 3. Missing Values
* La data no cuenta con valores faltantes.

## 4. Transformación de Variables Numéricas

In [19]:
#Aplicamos transoformación de Yeo-Jonhson
X_train['total_intl_calls'], param = stats.yeojohnson(X_train['total_intl_calls'])

In [20]:
X_test['total_intl_calls'] = stats.yeojohnson(X_test['total_intl_calls'], lmbda=param)

#### Binarización de Variables con Sesgo fuerte

In [21]:
sesgadas = ['number_vmail_messages']

In [22]:
for var in sesgadas:
    X_train[var] = np.where(X_train[var] == 0, 0, 1)
    X_test[var] = np.where(X_test[var] == 0, 0, 1)

## 5. Codificación de Variables Categóricas

In [23]:
cat_vars = [var for var in data.columns if data[var].dtype == 'O']

#Agregamos el caso especial de MSSubClass ya que por definición es categórica
cat_vars = cat_vars + ['area_code']
cat_vars

['state', 'international_plan', 'voice_mail_plan', 'churn', 'area_code']

### 5.1 One Hot Encoding

In [24]:
binary_vars = ['churn', 'international_plan', 'voice_mail_plan']

 ### 5.1.1. X_train

In [25]:
new_Xtrain = pd.get_dummies(X_train[['churn', 'international_plan', 'voice_mail_plan']], drop_first=True)
new_Xtrain

Unnamed: 0,churn_yes,international_plan_yes,voice_mail_plan_yes
1523,1,0,0
694,1,0,0
3766,1,0,0
4818,1,0,0
5975,1,0,0
...,...,...,...
4446,0,0,0
7709,0,0,1
4812,0,0,0
7558,0,0,0


In [26]:
#Se unen ambos dataframes
X_train = pd.concat([X_train, new_Xtrain], axis = 1)

#se eliminan las columnas duplicadas
X_train.drop(['international_plan', 'voice_mail_plan', 'churn' ], axis=1, inplace=True)

### 5.1.2 X_test

In [27]:
new_Xtest = pd.get_dummies(X_test[['churn', 'international_plan', 'voice_mail_plan']], drop_first=True)
new_Xtest

Unnamed: 0,churn_yes,international_plan_yes,voice_mail_plan_yes
8280,1,0,0
8477,1,1,0
4568,1,0,1
1986,1,1,1
5181,1,0,0
...,...,...,...
8909,0,0,0
10704,0,0,0
1991,0,0,0
7671,0,0,0


In [28]:
#Se unen ambos dataframes
X_test = pd.concat([X_test, new_Xtest], axis = 1)

#se eliminan las columnas duplicadas
X_test.drop(['international_plan', 'voice_mail_plan', 'churn' ], axis=1, inplace=True)

### 5.2 Freq Encoding

In [29]:
def freq_map(train, test, var):
    
    train_freq_map = (train[var].value_counts().sort_values(ascending = False)).to_dict()
    test_freq_map = (test[var].value_counts().sort_values(ascending = False)).to_dict()
    
    train[var] = data[var].map(train_freq_map)
    test[var] = data[var].map(test_freq_map)

In [30]:
big_cat_vars = [var for var in cat_vars if(var not in binary_vars)]
big_cat_vars

['state', 'area_code']

In [31]:
for var in big_cat_vars:
    freq_map(X_train, X_test, var)

In [32]:
X_train

Unnamed: 0,state,account_length,area_code,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn_yes,international_plan_yes,voice_mail_plan_yes
1523,110,14,2290,0,271.9,104,46.22,171.4,104,14.57,239.2,111,10.76,13.8,1.662758,3.73,2,1,0,0
694,92,67,1186,0,125.0,96,21.25,294.5,114,25.03,205.7,75,9.26,10.2,1.662758,2.75,1,1,0,0
3766,88,71,1186,0,290.4,108,49.37,253.9,92,21.58,263.3,126,11.85,10.1,1.858008,2.73,3,1,0,0
4818,90,56,1147,0,221.9,112,37.72,278.2,122,23.65,288.1,85,12.96,7.1,1.858008,1.92,0,1,0,0
5975,80,129,1186,0,334.3,118,56.83,192.1,104,16.33,191.0,83,8.59,10.4,2.024212,2.81,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446,61,112,2290,0,208.7,150,35.48,212.8,104,18.09,178.1,98,8.01,8.5,1.662758,2.30,0,0,0,0
7709,90,193,2290,1,71.2,58,12.10,124.7,105,10.60,155.5,108,7.00,11.7,1.425734,3.16,0,0,0,1
4812,71,62,2290,0,186.8,94,31.76,207.6,92,17.65,195.0,98,8.78,8.8,1.662758,2.38,3,0,0,0
7558,81,94,1147,0,136.2,114,23.15,165.1,118,14.03,137.9,71,6.21,9.6,1.858008,2.59,0,0,0,0


In [33]:
y_train

8047      no
7171      no
9422      no
6361      no
10114     no
        ... 
2669      no
1152      no
6201      no
11605    yes
9332      no
Name: churn, Length: 10958, dtype: object

## 6. Scaling

In [34]:
scaler = MinMaxScaler()

scaler.fit(X_train)

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns
)


X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns
)

In [35]:
#Guardamos dataset con data preparada para entrenamiento.

X_train.to_csv('preprocess_data/prep_Xtrain.csv', index=False)
X_test.to_csv('preprocess_data/prep_Xtest.csv', index=False)

y_train.to_csv('preprocess_data/prep_ytrain.csv', index=False)
y_test.to_csv('preprocess_data/prep_ytest.csv', index=False)

In [36]:
joblib.dump(scaler, 'preprocess_data/minmax_scaler.joblib')

['preprocess_data/minmax_scaler.joblib']

In [37]:
np.sum(X_train[X_train == 'Unf'].sum(axis=0))

0.0