In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
# Uses the documented `pd.set_option` entry point; the `pd.pandas` indirection
# used previously is not part of the public pandas API.
pd.set_option('display.max_columns', None)

In [3]:
# Load the edited customer dataset from the working directory and print its
# (rows, columns) shape as a quick sanity check.
data = pd.read_csv('customer_data_edited.csv')
print(data.shape)

(12892, 22)


In [4]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,recordID,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn,customer_id
0,1,HI,101,510,no,no,0,70.9,123,12.05,211.9,73,18.01,236.0,73,10.62,10.6,3,2.86,3,no,23383607.0
1,2,MT,137,510,no,no,0,223.6,86,38.01,244.8,139,20.81,94.2,81,4.24,9.5,7,2.57,0,no,22550362.0
2,3,OH,103,408,no,yes,29,294.7,95,50.1,237.3,105,20.17,300.3,127,13.51,13.7,6,3.7,1,no,59063354.0
3,4,NM,99,415,no,no,0,216.8,123,36.86,126.4,88,10.74,220.6,82,9.93,15.7,2,4.24,1,no,25464504.0
4,5,SC,108,415,no,no,0,197.4,78,33.56,124.0,101,10.54,204.5,107,9.2,7.7,4,2.08,2,no,691824.0


In [5]:
# Split the data into training and test sets: 15% of the rows are held out,
# with a fixed seed so the split is reproducible. The identifier columns
# `recordID` and `customer_id` are removed from the feature frame.
# NOTE(review): `churn` is used as the target (`y_*`) but is also still a column
# of `X_*`; later cells encode it into `churn_yes` inside the feature frames.
# Confirm the downstream consumer removes it before fitting, otherwise the
# target leaks into the features.
feature_frame = data.drop(columns=['recordID', 'customer_id'])
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    data['churn'],
    test_size=0.15,
    random_state=2021,
)

In [6]:
# Shapes of the training and test feature frames.
X_train.shape, X_test.shape

((10958, 20), (1934, 20))

## 2. Missing Values
* La data no cuenta con valores faltantes.

## 4. Transformación de Variables Numéricas

In [7]:
# Apply the Yeo-Johnson transform to the training column. Calling it without
# `lmbda` also returns the fitted lambda, kept in `param` so the identical
# transform can be applied to the test split.
intl_calls_transformed, param = stats.yeojohnson(X_train['total_intl_calls'])
X_train['total_intl_calls'] = intl_calls_transformed

In [8]:
# Transform the test column with the lambda (`param`) fitted on the training
# split, so no parameter is estimated from test data.
X_test['total_intl_calls'] = stats.yeojohnson(X_test['total_intl_calls'], lmbda=param)

#### Binarización de Variables con Sesgo fuerte

In [9]:
# Strongly skewed columns that the next cell reduces to a 0/1 flag
# (zero vs. non-zero).
sesgadas = ['number_vmail_messages']

In [10]:
# Binarize each skewed column in both splits: 0 stays 0, anything else becomes 1.
for var in sesgadas:
    for split in (X_train, X_test):
        split[var] = (split[var] != 0).astype(int)

## 5. Codificación de Variables Categóricas

In [11]:
# Columns stored with object dtype are treated as categorical.
object_cols = [col for col in data.columns if data[col].dtype == 'O']

# `area_code` is stored as a number but is categorical by definition,
# so it is appended explicitly.
cat_vars = [*object_cols, 'area_code']
cat_vars

['state', 'international_plan', 'voice_mail_plan', 'churn', 'area_code']

### 5.1 One Hot Encoding

In [12]:
# Categorical columns that are one-hot encoded in this section
# (the sample rows shown above contain only 'yes'/'no' for them).
binary_vars = ['churn', 'international_plan', 'voice_mail_plan']

### 5.1.1 X_train

In [13]:
# One-hot encode the columns listed in `binary_vars` for the training split.
# `drop_first=True` drops the first level of each column, leaving the single
# `<column>_yes` indicator shown in the output.
new_Xtrain = pd.get_dummies(X_train[binary_vars], drop_first=True)
new_Xtrain

Unnamed: 0,churn_yes,international_plan_yes,voice_mail_plan_yes
8047,0,0,0
7171,0,0,0
9422,0,0,0
6361,0,0,0
10114,0,0,1
...,...,...,...
2669,0,0,1
1152,0,0,0
6201,0,0,1
11605,1,0,0


In [14]:
# Append the indicator columns to the training features, then discard the
# original text columns they replace.
X_train = pd.concat([X_train, new_Xtrain], axis=1).drop(
    columns=['international_plan', 'voice_mail_plan', 'churn']
)

### 5.1.2 X_test

In [15]:
# One-hot encode the columns listed in `binary_vars` for the test split.
# `drop_first=True` drops the first level of each column, leaving the single
# `<column>_yes` indicator shown in the output.
new_Xtest = pd.get_dummies(X_test[binary_vars], drop_first=True)
new_Xtest

Unnamed: 0,churn_yes,international_plan_yes,voice_mail_plan_yes
5276,0,1,0
2952,0,0,1
8280,1,0,0
7532,0,0,0
9965,0,0,0
...,...,...,...
166,0,0,0
6454,1,0,1
9756,0,1,1
3862,0,0,1


In [16]:
# Append the indicator columns to the test features, then discard the
# original text columns they replace.
X_test = pd.concat([X_test, new_Xtest], axis=1).drop(
    columns=['international_plan', 'voice_mail_plan', 'churn']
)

### 5.2 Freq Encoding

In [17]:
def freq_map(train, test, var):
    """Frequency-encode the categorical column ``var`` of both splits in place.

    Each category is replaced by the number of times it occurs in ``train``.
    The same training-derived mapping is applied to ``test`` so that a given
    category receives the same code in both splits. Categories that occur in
    ``test`` but not in ``train`` are encoded as 0.

    Parameters
    ----------
    train : pandas.DataFrame
        Training split; ``train[var]`` is overwritten with its counts.
    test : pandas.DataFrame
        Test split; ``test[var]`` is overwritten using the training counts.
    var : str
        Name of the column to encode; must exist in both frames.

    Returns
    -------
    None
        Both frames are modified in place.
    """
    # Counts are learned from the training split only; deriving a second map
    # from the test split would give the same category different codes in
    # each split (and expose test-set statistics to the encoding).
    train_freq_map = train[var].value_counts().to_dict()

    # Map each frame's own column rather than the notebook-level `data` frame,
    # so the function depends only on its arguments and not on a global or on
    # index alignment with it.
    train[var] = train[var].map(train_freq_map)
    test[var] = test[var].map(train_freq_map).fillna(0).astype('int64')

In [18]:
# Categorical columns that are not in `binary_vars`; these are the ones
# passed to `freq_map` below.
big_cat_vars = [name for name in cat_vars if name not in binary_vars]
big_cat_vars

['state', 'area_code']

In [19]:
# Frequency-encode each remaining categorical column of both splits in place.
for column in big_cat_vars:
    freq_map(X_train, X_test, column)

In [20]:
# Inspect the training features after encoding.
X_train

Unnamed: 0,state,account_length,area_code,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn_yes,international_plan_yes,voice_mail_plan_yes
8047,225,88,5446,0,172.8,81,29.38,193.4,90,16.44,89.6,107,4.03,12.8,1.983028,3.46,2,0,0,0
7171,219,41,2774,0,182.1,89,30.96,211.5,104,17.98,207.4,124,9.33,6.8,0.720602,1.84,1,0,0,0
9422,197,85,2774,0,211.5,100,35.96,184.6,88,15.69,164.3,131,7.39,13.3,1.762687,3.59,2,0,0,0
6361,252,64,5446,0,206.2,76,35.05,232.4,76,19.75,251.6,96,11.32,13.6,1.168644,3.67,1,0,0,0
10114,252,201,5446,1,192.0,97,32.64,239.1,81,20.32,116.1,125,5.22,15.1,1.499031,4.08,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2669,238,93,2738,1,138.1,91,23.48,167.3,72,14.22,238.9,115,10.75,6.8,1.499031,1.84,2,0,0,1
1152,185,57,5446,0,214.0,100,36.38,209.9,80,17.84,105.1,112,4.73,11.3,0.720602,3.05,0,0,0,0
6201,258,119,5446,1,217.1,92,36.91,220.8,134,18.77,249.5,93,11.23,8.0,1.983028,2.16,2,0,0,1
11605,237,133,2774,0,295.0,141,50.15,223.6,101,19.01,229.4,109,10.32,12.9,1.762687,3.48,2,1,0,0


In [21]:
# Inspect the training target; it still holds the raw 'yes'/'no' labels.
y_train

8047      no
7171      no
9422      no
6361      no
10114     no
        ... 
2669      no
1152      no
6201      no
11605    yes
9332      no
Name: churn, Length: 10958, dtype: object

## 6. Scaling

In [22]:
# Fit min-max scaling on the training features only, then apply it to both
# splits so the test data does not influence the scaling parameters.
scaler = MinMaxScaler()
scaler.fit(X_train)

# The transformed values are wrapped back into DataFrames. Passing the original
# index (in addition to the column names) keeps every row aligned with
# `y_train` / `y_test`, which still carry the shuffled row labels from the split;
# without it the frames would get a fresh 0..n-1 index.
X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [23]:
# Save the prepared splits for the training stage (row labels are not written).
# The output folder is created first so a fresh checkout does not fail on a
# missing `preprocess_data/` directory.
output_dir = Path('preprocess_data')
output_dir.mkdir(parents=True, exist_ok=True)

X_train.to_csv(output_dir / 'prep_Xtrain.csv', index=False)
X_test.to_csv(output_dir / 'prep_Xtest.csv', index=False)

y_train.to_csv(output_dir / 'prep_ytrain.csv', index=False)
y_test.to_csv(output_dir / 'prep_ytest.csv', index=False)

In [24]:
# Persist the fitted MinMaxScaler so the same scaling can be reapplied later.
# NOTE(review): the Yeo-Johnson lambda (`param`) is not saved anywhere in the
# cells visible here — confirm whether it also needs to be persisted.
joblib.dump(scaler, 'preprocess_data/minmax_scaler.joblib')

['preprocess_data/minmax_scaler.joblib']

In [25]:
# Sum, over all columns, of the cells of X_train that equal the string 'Unf'.
# NOTE(review): X_train was rebuilt from the scaler output in the scaling cell,
# and 'Unf' does not appear in any data shown in this notebook — this looks
# like a leftover check from another dataset; confirm whether a missing-value
# check was intended instead.
np.sum(X_train[X_train == 'Unf'].sum(axis=0))

0.0