# 🛠️ Preparação dos Dados

## Extração do Arquivo

In [4]:
import pandas as pd

In [5]:
url = 'https://raw.githubusercontent.com/isenf/telecomx2/refs/heads/main/dados.csv'
df = pd.read_csv(url)

In [6]:
df.head()

Unnamed: 0,customerID,Churn,customer.gender,customer.SeniorCitizen,customer.Partner,customer.Dependents,customer.tenure,phone.PhoneService,phone.MultipleLines,internet.InternetService,...,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies,account.Contract,account.PaperlessBilling,account.PaymentMethod,Total.Day,account.Charges.Monthly,account.Charges.Total
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,...,No,Yes,Yes,No,One year,Yes,Mailed check,2.2,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,Yes,Month-to-month,No,Mailed check,2.01,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,...,Yes,No,No,No,Month-to-month,Yes,Electronic check,2.34,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,3.17,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,2.97,83.9,267.4


## Remoção de Colunas Irrelevantes

In [7]:
df.columns

Index(['customerID', 'Churn', 'customer.gender', 'customer.SeniorCitizen',
       'customer.Partner', 'customer.Dependents', 'customer.tenure',
       'phone.PhoneService', 'phone.MultipleLines', 'internet.InternetService',
       'internet.OnlineSecurity', 'internet.OnlineBackup',
       'internet.DeviceProtection', 'internet.TechSupport',
       'internet.StreamingTV', 'internet.StreamingMovies', 'account.Contract',
       'account.PaperlessBilling', 'account.PaymentMethod', 'Total.Day',
       'account.Charges.Monthly', 'account.Charges.Total'],
      dtype='object')

In [8]:
df = df.drop(['customerID', "Total.Day"], axis = 1)

## Encoding

In [9]:
for col in df.columns:
  print(f"{col}  ->  {df[col].unique()}")

Churn  ->  ['No' 'Yes']
customer.gender  ->  ['Female' 'Male']
customer.SeniorCitizen  ->  [0 1]
customer.Partner  ->  ['Yes' 'No']
customer.Dependents  ->  ['Yes' 'No']
customer.tenure  ->  [ 9  4 13  3 71 63  7 65 54 72  5 56 34  1 45 50 23 55 26 69 37 49 66 67
 20 43 59 12 27  2 25 29 14 35 64 39 40 11  6 30 70 57 58 16 32 33 10 21
 61 15 44 22 24 19 47 62 46 52  8 60 48 28 41 53 68 31 36 17 18 51 38 42
  0]
phone.PhoneService  ->  ['Yes' 'No']
phone.MultipleLines  ->  ['No' 'Yes' 'No phone service']
internet.InternetService  ->  ['DSL' 'Fiber optic' 'No']
internet.OnlineSecurity  ->  ['No' 'Yes' 'No internet service']
internet.OnlineBackup  ->  ['Yes' 'No' 'No internet service']
internet.DeviceProtection  ->  ['No' 'Yes' 'No internet service']
internet.TechSupport  ->  ['Yes' 'No' 'No internet service']
internet.StreamingTV  ->  ['Yes' 'No' 'No internet service']
internet.StreamingMovies  ->  ['No' 'Yes' 'No internet service']
account.Contract  ->  ['One year' 'Month-to-month' 'Two

In [10]:
df['customer.gender'] = df['customer.gender'].map({'Female': 0, 'Male': 1})

In [11]:
col_binarias = ['Churn', 'customer.Partner', 'customer.Dependents', 'phone.PhoneService','account.PaperlessBilling']

for col in col_binarias:
  df[col] = df[col].map({'Yes': 1, 'No': 0})

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [13]:
col_categoricas = ['phone.MultipleLines', 'internet.InternetService', 'internet.OnlineSecurity', 'internet.OnlineBackup', 'internet.DeviceProtection',
                   'internet.TechSupport', 'internet.StreamingTV', 'internet.StreamingMovies', 'account.Contract', 'account.PaymentMethod']

In [14]:
pre_processador = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), col_categoricas)],
    remainder='passthrough'
)

In [15]:
dados_transformados = pre_processador.fit_transform(df)

In [16]:
col_novas = pre_processador.get_feature_names_out()
col_novas

array(['onehot__phone.MultipleLines_No',
       'onehot__phone.MultipleLines_No phone service',
       'onehot__phone.MultipleLines_Yes',
       'onehot__internet.InternetService_DSL',
       'onehot__internet.InternetService_Fiber optic',
       'onehot__internet.InternetService_No',
       'onehot__internet.OnlineSecurity_No',
       'onehot__internet.OnlineSecurity_No internet service',
       'onehot__internet.OnlineSecurity_Yes',
       'onehot__internet.OnlineBackup_No',
       'onehot__internet.OnlineBackup_No internet service',
       'onehot__internet.OnlineBackup_Yes',
       'onehot__internet.DeviceProtection_No',
       'onehot__internet.DeviceProtection_No internet service',
       'onehot__internet.DeviceProtection_Yes',
       'onehot__internet.TechSupport_No',
       'onehot__internet.TechSupport_No internet service',
       'onehot__internet.TechSupport_Yes',
       'onehot__internet.StreamingTV_No',
       'onehot__internet.StreamingTV_No internet service',
       'on

In [17]:
df_final = pd.DataFrame(dados_transformados, columns=col_novas)

In [18]:
df_final.shape

(7043, 41)

In [19]:
for col in df_final.columns:
  print(f"{col}  ->  {df_final[col].unique()}")

onehot__phone.MultipleLines_No  ->  [1. 0.]
onehot__phone.MultipleLines_No phone service  ->  [0. 1.]
onehot__phone.MultipleLines_Yes  ->  [0. 1.]
onehot__internet.InternetService_DSL  ->  [1. 0.]
onehot__internet.InternetService_Fiber optic  ->  [0. 1.]
onehot__internet.InternetService_No  ->  [0. 1.]
onehot__internet.OnlineSecurity_No  ->  [1. 0.]
onehot__internet.OnlineSecurity_No internet service  ->  [0. 1.]
onehot__internet.OnlineSecurity_Yes  ->  [0. 1.]
onehot__internet.OnlineBackup_No  ->  [0. 1.]
onehot__internet.OnlineBackup_No internet service  ->  [0. 1.]
onehot__internet.OnlineBackup_Yes  ->  [1. 0.]
onehot__internet.DeviceProtection_No  ->  [1. 0.]
onehot__internet.DeviceProtection_No internet service  ->  [0. 1.]
onehot__internet.DeviceProtection_Yes  ->  [0. 1.]
onehot__internet.TechSupport_No  ->  [0. 1.]
onehot__internet.TechSupport_No internet service  ->  [0. 1.]
onehot__internet.TechSupport_Yes  ->  [1. 0.]
onehot__internet.StreamingTV_No  ->  [0. 1.]
onehot__inte

In [30]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 41 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   onehot__phone.MultipleLines_No                           7032 non-null   int64  
 1   onehot__phone.MultipleLines_No phone service             7032 non-null   int64  
 2   onehot__phone.MultipleLines_Yes                          7032 non-null   int64  
 3   onehot__internet.InternetService_DSL                     7032 non-null   int64  
 4   onehot__internet.InternetService_Fiber optic             7032 non-null   int64  
 5   onehot__internet.InternetService_No                      7032 non-null   int64  
 6   onehot__internet.OnlineSecurity_No                       7032 non-null   int64  
 7   onehot__internet.OnlineSecurity_No internet service      7032 non-null   int64  
 8   onehot__internet.OnlineSecurity_Y

In [21]:
col_float_bin = [
  col for col in df_final.columns
  if all(val in {0.0, 1.0} for val in df_final[col].unique())
]

In [22]:
excessao = ['remainder__account.Charges.Monthly', 'remainder__account.Charges.Total']

In [23]:
df_final = df_final.drop(columns=excessao).astype(int).join(df_final[excessao])

## Verificação da Proporção de Evasão

In [24]:
print(df.groupby('Churn').size() / df.shape[0])

Churn
0    0.73463
1    0.26537
dtype: float64


## Balanceamento de Classes

In [32]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter

df_final = df_final.dropna()

X = df_final.drop('remainder__Churn', axis=1)
y = df_final['remainder__Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

smote = SMOTE(random_state=42)

X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

df_train = pd.DataFrame(X_train_res, columns=X.columns)
df_train['remainder__Churn'] = y_train_res

df_test = pd.DataFrame(X_test, columns=X.columns)
df_test['remainder__Churn'] = y_test

In [35]:
print(Counter(y_train_res), Counter(y_test))

Counter({0: 3609, 1: 3609}) Counter({0: 1554, 1: 556})


## Normalização

In [38]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

numerical_cols = ['remainder__account.Charges.Monthly', 'remainder__account.Charges.Total']

df_train_scaled = df_train.copy()
df_test_scaled = df_test.copy()

scaler = StandardScaler()

scaler.fit(df_train_scaled[numerical_cols])

df_train_scaled[numerical_cols] = scaler.transform(df_train_scaled[numerical_cols])
df_test_scaled[numerical_cols] = scaler.transform(df_test_scaled[numerical_cols])