In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

print(sklearn.__version__)

1.5.1


In [2]:
df_treino = pd.read_csv('./data/train.csv')
df_teste = pd.read_csv('./data/test.csv')

In [3]:
df_treino.dtypes

id                    int64
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure              float64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                 int64
dtype: object

In [3]:
df_treino.head(10)

Unnamed: 0,id,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,4030,Female,0,No,No,56.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,Two year,No,,45.05,2560.1,0
1,6731,Male,0,Yes,Yes,,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,19.65,332.65,0
2,6479,Female,0,Yes,No,60.0,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),110.8,6640.7,0
3,6861,Female,0,No,No,37.0,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),101.9,3545.35,1
4,3266,Male,0,Yes,Yes,29.0,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.75,1974.8,1
5,6368,Male,0,No,No,70.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,Yes,Two year,No,Bank transfer (automatic),48.4,3442.8,0
6,5830,Male,0,Yes,Yes,31.0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,No,Electronic check,59.95,1848.8,0
7,4476,Female,0,No,No,3.0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Electronic check,19.55,61.05,0
8,1508,Male,0,No,No,54.0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),19.65,1008.7,0
9,2921,Female,0,Yes,No,72.0,Yes,No,DSL,Yes,...,Yes,Yes,Yes,Yes,Two year,No,Mailed check,85.1,6155.4,0


In [4]:
df_treino.columns

Index(['id', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
#Checando balanceamento da target (binário)
df_treino['Churn'].value_counts()

Churn
0    4139
1    1495
Name: count, dtype: int64

In [6]:
df_treino['Churn'].value_counts()[0] / len(df_treino['Churn'])

0.7346467873624423

# Underbalance com todas as colunas

In [7]:
df_slice_0 = df_treino[df_treino.Churn == 0].sample(1500)
df_slice_1 = df_treino[df_treino.Churn == 1].sample(1495)

df_treino = pd.concat([df_slice_0, df_slice_1], ignore_index=True)

In [8]:
df_treino['Churn'].value_counts()

Churn
0    1500
1    1495
Name: count, dtype: int64

In [9]:
df_treino.head()

Unnamed: 0,id,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,5526,Female,0,Yes,Yes,54.0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Bank transfer (automatic),24.75,1342.15,0
1,6058,Male,0,No,No,19.0,Yes,No,DSL,No,...,No,Yes,Yes,No,Month-to-month,Yes,Credit card (automatic),61.55,1093.2,0
2,6032,Female,1,Yes,Yes,12.0,No,No phone service,DSL,No,...,No,Yes,No,No,Month-to-month,Yes,Mailed check,29.3,355.9,0
3,4806,Male,0,Yes,Yes,52.0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,24.55,1331.05,0
4,3608,Female,0,No,No,13.0,Yes,No,DSL,Yes,...,No,Yes,No,No,One year,No,Mailed check,55.15,742.9,0


In [10]:
df_treino['TotalCharges'].dtype

dtype('O')

In [11]:
df_treino['TotalCharges'] = pd.to_numeric(df_treino['TotalCharges'], errors='coerce')
df_teste['TotalCharges'] = pd.to_numeric(df_teste['TotalCharges'], errors='coerce')


In [12]:
df_treino.dtypes

id                    int64
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure              float64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

In [13]:
df_treino.isna().sum()

id                    0
gender                0
SeniorCitizen         0
Partner               0
Dependents          138
tenure              260
PhoneService          0
MultipleLines         0
InternetService       0
OnlineSecurity        0
OnlineBackup          0
DeviceProtection      0
TechSupport           0
StreamingTV           0
StreamingMovies       0
Contract              0
PaperlessBilling      0
PaymentMethod        56
MonthlyCharges        0
TotalCharges          4
Churn                 0
dtype: int64

In [14]:
df_treino = df_treino.dropna()
df_treino.isna().sum()

id                  0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [15]:
X = df_treino.drop(['id', 'Churn'], axis=1) #features
y = df_treino[['Churn']] #target

In [16]:
colunas_categoricas = X.select_dtypes(include="object").columns
colunas_numericas = X.select_dtypes(exclude="object").columns

In [17]:
colunas_categoricas

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

In [18]:
colunas_numericas

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')

In [19]:
X_treino, X_valid, y_treino, y_valid = train_test_split(X, y)

In [20]:
OHE = OneHotEncoder()
scaler = StandardScaler()
model_rfc = RandomForestClassifier()

transformer = ColumnTransformer([('cat_cols', OHE, colunas_categoricas),
                                ('num_cols', scaler, colunas_numericas)])

pipe = Pipeline([("preprocessing", transformer),
                ("classifier", model_rfc)])

pipe.fit(X_treino, y_treino)

  return fit_method(estimator, *args, **kwargs)


In [21]:
X_valid # tal como virá em produção, sem transformações. Aí passa pelo pipeline

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
2773,Female,0,No,No,7.0,No,No phone service,DSL,No,Yes,No,No,No,Yes,Month-to-month,Yes,Credit card (automatic),40.10,293.30
1874,Female,0,No,No,10.0,Yes,Yes,Fiber optic,No,No,No,No,No,Yes,Month-to-month,No,Electronic check,86.45,830.85
2883,Female,1,Yes,No,68.0,Yes,Yes,Fiber optic,No,Yes,No,No,No,Yes,Month-to-month,Yes,Bank transfer (automatic),89.60,6127.60
289,Female,0,Yes,Yes,63.0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),20.00,1209.25
166,Female,0,Yes,Yes,67.0,Yes,Yes,Fiber optic,Yes,No,No,Yes,Yes,Yes,Two year,No,Credit card (automatic),106.70,7009.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2431,Male,0,No,No,3.0,Yes,Yes,Fiber optic,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,85.30,264.80
1116,Female,0,Yes,No,72.0,Yes,Yes,Fiber optic,Yes,Yes,Yes,No,Yes,Yes,Two year,Yes,Electronic check,109.55,8165.10
1671,Male,1,No,No,3.0,No,No phone service,DSL,No,No,Yes,No,No,Yes,Month-to-month,Yes,Electronic check,41.15,132.20
2400,Male,0,No,No,1.0,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,No,Mailed check,70.60,70.60


In [22]:
predicoes = pipe.predict(X_valid)
predicoes[:10]

array([1, 1, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int64)

In [20]:
y_valid.values[:10].flatten()

array([1, 1, 0, 1, 0, 0, 1, 1, 0, 0], dtype=int64)

In [23]:
accuracy_score(y_valid, predicoes)

0.7480438184663537

In [None]:
# Exportando o modelo final para implantação
import pickle

pickle.dump(pipe, open('./models/pipe.pkl', 'wb'))
