In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import pickle
from sklearn.metrics import precision_score
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from statsmodels.formula.api import ols
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from prettytable import PrettyTable

In [41]:

df = pd.read_csv('/content/dados_tratados.csv')
df.head()

Unnamed: 0,customer_id,churn,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,contract,paper_less_billing,payment_method,charges_monthly,charges_total,daily_accounts
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,...,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3,2.16
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4,1.97
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,...,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85,2.43
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85,3.22
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4,2.76


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7267 entries, 0 to 7266
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id         7267 non-null   object 
 1   churn               7043 non-null   object 
 2   gender              7267 non-null   object 
 3   senior_citizen      7267 non-null   int64  
 4   partner             7267 non-null   object 
 5   dependents          7267 non-null   object 
 6   tenure              7267 non-null   int64  
 7   phone_service       7267 non-null   object 
 8   multiple_lines      7267 non-null   object 
 9   internet_service    7267 non-null   object 
 10  online_security     7267 non-null   object 
 11  online_backup       7267 non-null   object 
 12  device_protection   7267 non-null   object 
 13  tech_support        7267 non-null   object 
 14  streaming_tv        7267 non-null   object 
 15  streaming_movies    7267 non-null   object 
 16  contra

In [43]:

df.isna().sum()

Unnamed: 0,0
customer_id,0
churn,224
gender,0
senior_citizen,0
partner,0
dependents,0
tenure,0
phone_service,0
multiple_lines,0
internet_service,0


In [44]:
df = df.dropna(subset=['churn'])


df.isna().sum()

Unnamed: 0,0
customer_id,0
churn,0
gender,0
senior_citizen,0
partner,0
dependents,0
tenure,0
phone_service,0
multiple_lines,0
internet_service,0


In [45]:
df.churn.unique()

data = df.copy()
data.drop(columns='customer_id', axis=1, inplace = True)


y = data['churn']
x = data.drop(columns='churn', axis=1)

#Tratamento de dados

In [46]:

categorics_columns = x.select_dtypes(include='object')
categorics_columns.head()

Unnamed: 0,gender,partner,dependents,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paper_less_billing,payment_method
0,Female,Yes,Yes,Yes,No,DSL,No,Yes,No,Yes,Yes,No,One year,Yes,Mailed check
1,Male,No,No,Yes,Yes,DSL,No,No,No,No,No,Yes,Month-to-month,No,Mailed check
2,Male,No,No,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,Yes,Electronic check
3,Male,Yes,No,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check
4,Female,Yes,No,Yes,No,Fiber optic,No,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check


In [47]:
cols = x.columns

one_hot = make_column_transformer((
    OneHotEncoder(drop='if_binary'),
    ['gender',	'partner',	'dependents',	'phone_service',	'multiple_lines',	'internet_service',
     'online_security',	'online_backup',	'device_protection',	'tech_support',	'streaming_tv',
     'streaming_movies',	'contract',	'paper_less_billing',	'payment_method']
),
    remainder='passthrough', sparse_threshold=0)

x = one_hot.fit_transform(x)



feature_names_out  = one_hot.get_feature_names_out(cols)
feature_names_out

array(['onehotencoder__gender_Male', 'onehotencoder__partner_Yes',
       'onehotencoder__dependents_Yes',
       'onehotencoder__phone_service_Yes',
       'onehotencoder__multiple_lines_No',
       'onehotencoder__multiple_lines_No phone service',
       'onehotencoder__multiple_lines_Yes',
       'onehotencoder__internet_service_DSL',
       'onehotencoder__internet_service_Fiber optic',
       'onehotencoder__internet_service_No',
       'onehotencoder__online_security_No',
       'onehotencoder__online_security_No internet service',
       'onehotencoder__online_security_Yes',
       'onehotencoder__online_backup_No',
       'onehotencoder__online_backup_No internet service',
       'onehotencoder__online_backup_Yes',
       'onehotencoder__device_protection_No',
       'onehotencoder__device_protection_No internet service',
       'onehotencoder__device_protection_Yes',
       'onehotencoder__tech_support_No',
       'onehotencoder__tech_support_No internet service',
       'oneh

In [48]:
label_enconder = LabelEncoder()
y = label_enconder.fit_transform(y)
y


array([0, 0, 1, ..., 0, 0, 0])

#Criando Dataframe

In [49]:
reaname_cols = [col.replace('onehotencoder__', '').replace('remainder__', '') for col in feature_names_out]
print(reaname_cols)

['gender_Male', 'partner_Yes', 'dependents_Yes', 'phone_service_Yes', 'multiple_lines_No', 'multiple_lines_No phone service', 'multiple_lines_Yes', 'internet_service_DSL', 'internet_service_Fiber optic', 'internet_service_No', 'online_security_No', 'online_security_No internet service', 'online_security_Yes', 'online_backup_No', 'online_backup_No internet service', 'online_backup_Yes', 'device_protection_No', 'device_protection_No internet service', 'device_protection_Yes', 'tech_support_No', 'tech_support_No internet service', 'tech_support_Yes', 'streaming_tv_No', 'streaming_tv_No internet service', 'streaming_tv_Yes', 'streaming_movies_No', 'streaming_movies_No internet service', 'streaming_movies_Yes', 'contract_Month-to-month', 'contract_One year', 'contract_Two year', 'paper_less_billing_Yes', 'payment_method_Bank transfer (automatic)', 'payment_method_Credit card (automatic)', 'payment_method_Electronic check', 'payment_method_Mailed check', 'senior_citizen', 'tenure', 'charges_

In [50]:
new_data = pd.DataFrame(x, columns=reaname_cols)
new_data['churn'] = y
new_data.head()

Unnamed: 0,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_No,multiple_lines_No phone service,multiple_lines_Yes,internet_service_DSL,internet_service_Fiber optic,internet_service_No,...,payment_method_Bank transfer (automatic),payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,senior_citizen,tenure,charges_monthly,charges_total,daily_accounts,churn
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,9.0,65.6,593.3,2.16,0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,9.0,59.9,542.4,1.97,0
2,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,4.0,73.9,280.85,2.43,1
3,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,13.0,98.0,1237.85,3.22,1
4,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,3.0,83.9,267.4,2.76,1


In [51]:

x = new_data.drop(columns='churn', axis=1)
y = new_data['churn']

In [52]:

x, x_test, y, y_test = train_test_split(x,y, random_state=101, stratify=y, test_size=0.20)
x_train, x_val, y_train, y_val = train_test_split(x,y, random_state=101, stratify=y)

print(f"Dimensões de x_train: {x_train.shape}")
print(f"Dimensões de x_test: {x_test.shape}")

print(f'{df.churn.value_counts(normalize=True)*100}%')

Dimensões de x_train: (4225, 41)
Dimensões de x_test: (1409, 41)
churn
No     73.463013
Yes    26.536987
Name: proportion, dtype: float64%


In [53]:

smote = SMOTE(random_state=101)
x_train_balanced, y_train_balanced = smote.fit_resample(x_train, y_train)
print(pd.Series(y_train_balanced).value_counts(normalize=True)*100)
print("\n")


churn
0    50.0
1    50.0
Name: proportion, dtype: float64




In [54]:

x_train = x_train_balanced
y_train = y_train_balanced

In [55]:
def dummy_model_1():
    dummy = DummyClassifier()
    dummy.fit(x_train, y_train)

    print(f'Dummy 1 Train: {dummy.score(x_train, y_train)}')
    print(f'Dummy  1 Validation: {dummy.score(x_val, y_val)}')
    print(f'Dummy 1 Acuracy: {dummy.score(x_val, y_val)}')

dummy_model_1()

Dummy 1 Train: 0.5
Dummy  1 Validation: 0.7345635202271115
Dummy 1 Acuracy: 0.7345635202271115


In [56]:
def decision_tree_cassifier_1():
  tree_class = DecisionTreeClassifier(max_depth=3, random_state=101)
  tree_class.fit(x_train, y_train)

  print(f'Tree Classifier 1 Train: {tree_class.score(x_train, y_train)}')
  print(f'Tree Classifier 1 Validation: {tree_class.score(x_val, y_val)}')
  print(f'Tree Classifier 1 Acuracy: {tree_class.score(x_val, y_val)}')

decision_tree_cassifier_1()

Tree Classifier 1 Train: 0.7996134020618557
Tree Classifier 1 Validation: 0.723207948899929
Tree Classifier 1 Acuracy: 0.723207948899929


In [57]:
def logist_regression_1():
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(x_train)

  log_reg = LogisticRegression(random_state=101, max_iter=1000)

  log_reg.fit(X_train_scaled, y_train)

  y_pred_log_reg = log_reg.predict(x_test)

  print("\nClassification Report - Logistic Regression model 1:")
  print(classification_report(y_test, y_pred_log_reg, target_names=['Churn: No', 'Churn: Yes']))

logist_regression_1()


Classification Report - Logistic Regression model 1:
              precision    recall  f1-score   support

   Churn: No       0.37      0.05      0.08      1035
  Churn: Yes       0.23      0.78      0.35       374

    accuracy                           0.24      1409
   macro avg       0.30      0.41      0.22      1409
weighted avg       0.33      0.24      0.15      1409





In [58]:
train_data_ols = pd.concat([x_train, y_train], axis=1)
formula_data = 'churn ~' + ' + '.join([f'Q("{col}")' for col in x_train.columns])
model_ols_1 = ols(formula_data, data = train_data_ols).fit()


print('Remumo modelo:  modelo_ols_1\n')
print(model_ols_1.summary())

Remumo modelo:  modelo_ols_1

                            OLS Regression Results                            
Dep. Variable:                  churn   R-squared:                       0.380
Model:                            OLS   Adj. R-squared:                  0.377
Method:                 Least Squares   F-statistic:                     157.7
Date:                Wed, 20 Aug 2025   Prob (F-statistic):               0.00
Time:                        01:18:08   Log-Likelihood:                -3023.7
No. Observations:                6208   AIC:                             6097.
Df Residuals:                    6183   BIC:                             6266.
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------

In [59]:
def ols_model_1():
  print(f'OLS R-squared Train: {model_ols_1.rsquared}')

  y_train_pred_ols = model_ols_1.predict(x_train)
  y_val_pred_ols = model_ols_1.predict(x_val)

  y_train_pred_binary = (y_train_pred_ols > 0.5).astype(int)
  y_val_pred_binary = (y_val_pred_ols > 0.5).astype(int)

  precision_train = precision_score(y_train, y_train_pred_binary)
  precision_val = precision_score(y_val, y_val_pred_binary)

  print(f'OLS Precision Train: {precision_train}')
  print(f'OLS Precision Validation: {precision_val}')

ols_model_1()

OLS R-squared Train: 0.37963926012934557
OLS Precision Train: 0.7531927531927531
OLS Precision Validation: 0.4965635738831615


In [60]:
remove_columns = ['gender_Male','charges_monthly','daily_accounts','multiple_lines_No',
                  'contract_One year','partner_Yes',"payment_method_Mailed check"]

# Creating new DataFrame for Model 2 features
x_2 = pd.DataFrame(x, columns=reaname_cols).drop(columns=remove_columns, axis=1)
y_2 = y

In [61]:
x_2_train, x_2_test, y_2_train, y_2_test = train_test_split(x_2, y_2, random_state=101, stratify=y_2, test_size=0.20)
x_2_train, x_2_val, y_2_train, y_2_val = train_test_split(x_2_train, y_2_train, random_state=101, stratify=y_2_train)

print(f"Dimensões de x_2_train: {x_2_train.shape}")
print(f"Dimensões de x_2_test: {x_2_test.shape}")

Dimensões de x_2_train: (3380, 34)
Dimensões de x_2_test: (1127, 34)


In [62]:

def dummy_model_2():
  dummy_2 = DummyClassifier()
  dummy_2.fit(x_2_train, y_2_train)

  print(f'Dummy_2 Train: {dummy_2.score(x_2_train, y_2_train)}')
  print(f'Dummy_2 Validation: {dummy_2.score(x_2_val, y_2_val)}')
  print(f'Dummy_2 Accuracy: {dummy_2.score(x_2_val, y_2_val)}')

dummy_model_2()


Dummy_2 Train: 0.7346153846153847
Dummy_2 Validation: 0.7346938775510204
Dummy_2 Accuracy: 0.7346938775510204


In [63]:
def decision_tree_cassifier_2():
  tree_class_2 = DecisionTreeClassifier(max_depth=3, random_state=101)
  tree_class_2.fit(x_2_train, y_2_train)

  print(f'Tree Classifier 2 Train: {tree_class_2.score(x_2_train, y_2_train)}')
  print(f'Tree Classifier 2 Validation: {tree_class_2.score(x_2_val, y_2_val)}')
  print(f'Tree Classifier 2 Accuracy: {tree_class_2.score(x_2_val, y_2_val)}')

decision_tree_cassifier_2()

Tree Classifier 2 Train: 0.7878698224852071
Tree Classifier 2 Validation: 0.7985803016858918
Tree Classifier 2 Accuracy: 0.7985803016858918


In [64]:
def logist_regression_2():
  scaler = StandardScaler()
  X_train_scaled_2 = scaler.fit_transform(x_2_train)

  log_reg = LogisticRegression(random_state=101, max_iter=1000)

  log_reg.fit(X_train_scaled_2, y_2_train)

  y_pred_log_reg = log_reg.predict(x_2_test)

  print("\nClassification Report - Logistic Regression model 2:")
  print(classification_report(y_2_test, y_pred_log_reg, target_names=['Churn: No', 'Churn: Yes']))

logist_regression_2()


Classification Report - Logistic Regression model 2:
              precision    recall  f1-score   support

   Churn: No       1.00      0.00      0.00       828
  Churn: Yes       0.27      1.00      0.42       299

    accuracy                           0.27      1127
   macro avg       0.63      0.50      0.21      1127
weighted avg       0.81      0.27      0.11      1127





#MODELO OLS

In [65]:

train_data_ols_2 = pd.concat([x_2_train, y_2_train], axis=1)
formula_data_2 = 'churn ~' + ' + '.join([f'Q("{col}")' for col in x_2_train.columns])
model_ols_2 = ols(formula_data_2, data = train_data_ols_2).fit()


print('Remumo modelo:  modelo_ols_2\n')
print(model_ols_2.summary())

Remumo modelo:  modelo_ols_2

                            OLS Regression Results                            
Dep. Variable:                  churn   R-squared:                       0.286
Model:                            OLS   Adj. R-squared:                  0.282
Method:                 Least Squares   F-statistic:                     67.30
Date:                Wed, 20 Aug 2025   Prob (F-statistic):          6.46e-228
Time:                        01:18:08   Log-Likelihood:                -1463.4
No. Observations:                3380   AIC:                             2969.
Df Residuals:                    3359   BIC:                             3097.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------

In [66]:
def ols_model_2():
  print(f'OLS R-squared Train 2: {model_ols_2.rsquared}')

  y_2_train_pred_ols = model_ols_2.predict(x_2_train)
  y_2_val_pred_ols = model_ols_2.predict(x_2_val)

  y_2_train_pred_binary = (y_2_train_pred_ols > 0.5).astype(int)
  y_2_val_pred_binary = (y_2_val_pred_ols > 0.5).astype(int)

  precision_train_2 = precision_score(y_2_train, y_2_train_pred_binary)
  precision_val_2 = precision_score(y_2_val, y_2_val_pred_binary)

  print(f'OLS Precision Train 2: {precision_train_2}')
  print(f'OLS Precision Validation 2: {precision_val_2}')

ols_model_2()

OLS R-squared Train 2: 0.2860668013320309
OLS Precision Train 2: 0.6715116279069767
OLS Precision Validation 2: 0.6651982378854625


In [67]:
remove_columns_3 = ["phone_service_Yes","internet_service_No","online_security_No internet service","online_backup_No internet service",
                  "device_protection_No internet service","tech_support_No internet service","streaming_tv_No internet service",
                  "tech_support_No internet service","streaming_tv_No internet service","streaming_movies_No internet service",
                  "online_security_Yes","payment_method_Credit card (automatic)","streaming_tv_No","dependents_Yes","tech_support_Yes",
                  "streaming_movies_No","online_backup_Yes","contract_Two year","device_protection_Yes","charges_total"]

# Creating new DataFrame for Model 3 features
x_3 = pd.DataFrame(x, columns=reaname_cols).drop(columns=remove_columns_3, axis=1)
y_3 = y


# Splitting data for Model 3
x_3_train, x_3_test, y_3_train, y_3_test = train_test_split(x_3, y_3, random_state=101, stratify=y_3, test_size=0.20)
x_3_train, x_3_val, y_3_train, y_3_val = train_test_split(x_3_train, y_3_train, random_state=101, stratify=y_3_train)

print(f"Dimensões de x_3_train: {x_3_train.shape}")
print(f"Dimensões de x_3_test: {x_3_test.shape}")

Dimensões de x_3_train: (3380, 23)
Dimensões de x_3_test: (1127, 23)


In [68]:
def dummy_model_3():
  dummy_3 = DummyClassifier()
  dummy_3.fit(x_3_train, y_3_train)

  print(f'Dummy_3 Train: {dummy_3.score(x_3_train, y_3_train)}')
  print(f'Dummy_3 Validation: {dummy_3.score(x_3_val, y_3_val)}')
  print(f'Dummy_3 Accuracy: {dummy_3.score(x_3_val, y_3_val)}')

dummy_model_3()

Dummy_3 Train: 0.7346153846153847
Dummy_3 Validation: 0.7346938775510204
Dummy_3 Accuracy: 0.7346938775510204


In [69]:
def dummy_model_3():
  dummy_3 = DummyClassifier()
  dummy_3.fit(x_3_train, y_3_train)

  print(f'Dummy_3 Train: {dummy_3.score(x_3_train, y_3_train)}')
  print(f'Dummy_3 Validation: {dummy_3.score(x_3_val, y_3_val)}')
  print(f'Dummy_3 Accuracy: {dummy_3.score(x_3_val, y_3_val)}')

dummy_model_3()

Dummy_3 Train: 0.7346153846153847
Dummy_3 Validation: 0.7346938775510204
Dummy_3 Accuracy: 0.7346938775510204


In [70]:
def decision_tree_cassifier_3():
  tree_class_3 = DecisionTreeClassifier(max_depth=3, random_state=101)
  tree_class_3.fit(x_3_train, y_3_train)

  print(f'Tree Classifier 3 Train: {tree_class_3.score(x_3_train, y_3_train)}')
  print(f'Tree Classifier 3 Validation: {tree_class_3.score(x_3_val, y_3_val)}')
  print(f'Tree Classifier 3 Accuracy: {tree_class_3.score(x_3_val, y_3_val)}')

decision_tree_cassifier_3()

Tree Classifier 3 Train: 0.7860946745562131
Tree Classifier 3 Validation: 0.8039041703637977
Tree Classifier 3 Accuracy: 0.8039041703637977


#REGRESSÃO LOGÍSTICA

In [71]:
def logist_regression_3():
  scaler = StandardScaler()
  X_train_scaled_3 = scaler.fit_transform(x_3_train)

  log_reg = LogisticRegression(random_state=101, max_iter=1000)

  log_reg.fit(X_train_scaled_3, y_3_train)

  y_pred_log_reg = log_reg.predict(scaler.transform(x_3_test))

  print("\nClassification Report - Logistic Regression model 3:")
  print(classification_report(y_3_test, y_pred_log_reg, target_names=['Churn: No', 'Churn: Yes']))

  return log_reg, scaler

# Call the function and store the returned model and scaler
modelo_final_3, scaler_final_3 = logist_regression_3()


Classification Report - Logistic Regression model 3:
              precision    recall  f1-score   support

   Churn: No       0.83      0.89      0.86       828
  Churn: Yes       0.63      0.51      0.56       299

    accuracy                           0.79      1127
   macro avg       0.73      0.70      0.71      1127
weighted avg       0.78      0.79      0.78      1127



In [72]:
train_data_ols_3 = pd.concat([x_3_train, y_3_train], axis=1)
formula_data_3 = 'churn ~' + ' + '.join([f'Q("{col}")' for col in x_3_train.columns])
model_ols_3 = ols(formula_data_3, data = train_data_ols_3).fit()


print('Remumo modelo:  modelo_ols_3\n')
print(model_ols_3.summary())


Remumo modelo:  modelo_ols_3

                            OLS Regression Results                            
Dep. Variable:                  churn   R-squared:                       0.282
Model:                            OLS   Adj. R-squared:                  0.277
Method:                 Least Squares   F-statistic:                     59.84
Date:                Wed, 20 Aug 2025   Prob (F-statistic):          1.08e-221
Time:                        01:18:08   Log-Likelihood:                -1473.8
No. Observations:                3380   AIC:                             2994.
Df Residuals:                    3357   BIC:                             3134.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------

In [73]:
def ols_model_3():
  print(f'OLS R-squared Train 3: {model_ols_3.rsquared}')

  y_3_train_pred_ols = model_ols_3.predict(x_3_train)
  y_3_val_pred_ols = model_ols_3.predict(x_3_val)

  y_3_train_pred_binary = (y_3_train_pred_ols > 0.5).astype(int)
  y_3_val_pred_binary = (y_3_val_pred_ols > 0.5).astype(int)

  precision_train_3 = precision_score(y_3_train, y_3_train_pred_binary)
  precision_val_3 = precision_score(y_3_val, y_3_val_pred_binary)

  print(f'OLS Precision Train 3: {precision_train_3}')
  print(f'OLS Precision Validation 3: {precision_val_3}')

ols_model_3()

OLS R-squared Train 3: 0.2816854463755821
OLS Precision Train 3: 0.6774668630338734
OLS Precision Validation 3: 0.6578947368421053


In [74]:
remove_columns_4 = ["payment_method_Bank transfer (automatic)","device_protection_No","multiple_lines_No phone service"]

# Creating new DataFrame for Model 4 features
x_4 = pd.DataFrame(x, columns=reaname_cols).drop(columns=remove_columns_4, axis=1)
y_4 = y


# Splitting data for Model 4
x_4_train, x_4_test, y_4_train, y_4_test = train_test_split(x_4, y_4, random_state=101, stratify=y_4, test_size=0.20)
x_4_train, x_4_val, y_4_train, y_4_val = train_test_split(x_4_train, y_4_train, random_state=101, stratify=y_4_train)

print(f"Dimensões de x_4_train: {x_4_train.shape}")
print(f"Dimensões de x_4_test: {x_4_test.shape}")

Dimensões de x_4_train: (3380, 38)
Dimensões de x_4_test: (1127, 38)


In [75]:
def dummy_model_4():
  dummy_4 = DummyClassifier()
  dummy_4.fit(x_4_train, y_4_train)

  print(f'Dummy_4 Train: {dummy_4.score(x_4_train, y_4_train)}')
  print(f'Dummy_4 Validation: {dummy_4.score(x_4_val, y_4_val)}')
  print(f'Dummy_4 Accuracy: {dummy_4.score(x_4_val, y_4_val)}')

dummy_model_4()

Dummy_4 Train: 0.7346153846153847
Dummy_4 Validation: 0.7346938775510204
Dummy_4 Accuracy: 0.7346938775510204


In [76]:
def decision_tree_cassifier_4():
  tree_class_4 = DecisionTreeClassifier(max_depth=3, random_state=101)
  tree_class_4.fit(x_4_train, y_4_train)

  print(f'Tree Classifier 4 Train: {tree_class_4.score(x_4_train, y_4_train)}')
  print(f'Tree Classifier 4 Validation: {tree_class_4.score(x_4_val, y_4_val)}')
  print(f'Tree Classifier 4 Accuracy: {tree_class_4.score(x_4_val, y_4_val)}')

decision_tree_cassifier_4()


Tree Classifier 4 Train: 0.7878698224852071
Tree Classifier 4 Validation: 0.7985803016858918
Tree Classifier 4 Accuracy: 0.7985803016858918


In [77]:
def logist_regression_4():
  scaler = StandardScaler()
  X_train_scaled_4 = scaler.fit_transform(x_4_train)

  log_reg = LogisticRegression(random_state=101, max_iter=1000)

  log_reg.fit(X_train_scaled_4, y_4_train)

  y_pred_log_reg = log_reg.predict(x_4_test)

  print("\nClassification Report - Logistic Regression model 4:")
  print(classification_report(y_4_test, y_pred_log_reg, target_names=['Churn: No', 'Churn: Yes']))

logist_regression_4()


Classification Report - Logistic Regression model 4:
              precision    recall  f1-score   support

   Churn: No       1.00      0.00      0.01       828
  Churn: Yes       0.27      1.00      0.42       299

    accuracy                           0.27      1127
   macro avg       0.63      0.50      0.22      1127
weighted avg       0.81      0.27      0.12      1127





In [78]:
train_data_ols_4 = pd.concat([x_4_train, y_4_train], axis=1)
formula_data_4 = 'churn ~' + ' + '.join([f'Q("{col}")' for col in x_4_train.columns])
model_ols_4 = ols(formula_data_4, data = train_data_ols_4).fit()


print('Remumo modelo:  modelo_ols_4\n')
print(model_ols_4.summary())


Remumo modelo:  modelo_ols_4

                            OLS Regression Results                            
Dep. Variable:                  churn   R-squared:                       0.286
Model:                            OLS   Adj. R-squared:                  0.281
Method:                 Least Squares   F-statistic:                     56.10
Date:                Wed, 20 Aug 2025   Prob (F-statistic):          1.32e-224
Time:                        01:18:09   Log-Likelihood:                -1462.7
No. Observations:                3380   AIC:                             2975.
Df Residuals:                    3355   BIC:                             3129.
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [79]:

def ols_model_4():
  print(f'OLS R-squared Train 4: {model_ols_4.rsquared}')

  y_4_train_pred_ols = model_ols_4.predict(x_4_train)
  y_4_val_pred_ols = model_ols_4.predict(x_4_val)

  y_4_train_pred_binary = (y_4_train_pred_ols > 0.5).astype(int)
  y_4_val_pred_binary = (y_4_val_pred_ols > 0.5).astype(int)

  precision_train_4 = precision_score(y_4_train, y_4_train_pred_binary)
  precision_val_4 = precision_score(y_4_val, y_4_val_pred_binary)

  print(f'OLS Precision Train 4: {precision_train_4}')
  print(f'OLS Precision Validation 4: {precision_val_4}')

ols_model_4()

OLS R-squared Train 4: 0.2863715598886316
OLS Precision Train 4: 0.6729377713458755
OLS Precision Validation 4: 0.6493506493506493


In [80]:
remove_columns_5 = ["internet_service_DSL","streaming_tv_Yes"]

# Creating new DataFrame for Model 5 features
x_5 = pd.DataFrame(x, columns=reaname_cols).drop(columns=remove_columns_5, axis=1)
y_5 = y


# Splitting data for Model 5
x_5_train, x_5_test, y_5_train, y_5_test = train_test_split(x_5, y_5, random_state=101, stratify=y_5, test_size=0.20)
x_5_train, x_5_val, y_5_train, y_5_val = train_test_split(x_5_train, y_5_train, random_state=101, stratify=y_5_train)

print(f"Dimensões de x_5_train: {x_5_train.shape}")
print(f"Dimensões de x_5_test: {x_5_test.shape}")

Dimensões de x_5_train: (3380, 39)
Dimensões de x_5_test: (1127, 39)


In [81]:
def dummy_model_5():
  dummy_5 = DummyClassifier()
  dummy_5.fit(x_5_train, y_5_train)

  print(f'Dummy_5 Train: {dummy_5.score(x_5_train, y_5_train)}')
  print(f'Dummy_5 Validation: {dummy_5.score(x_5_val, y_5_val)}')
  print(f'Dummy_5 Accuracy: {dummy_5.score(x_5_val, y_5_val)}')

dummy_model_5()


Dummy_5 Train: 0.7346153846153847
Dummy_5 Validation: 0.7346938775510204
Dummy_5 Accuracy: 0.7346938775510204


In [82]:
def decision_tree_cassifier_5():
  tree_class_5 = DecisionTreeClassifier(max_depth=3, random_state=101)
  tree_class_5.fit(x_5_train, y_5_train)

  print(f'Tree Classifier 5 Train: {tree_class_5.score(x_5_train, y_5_train)}')
  print(f'Tree Classifier 5 Validation: {tree_class_5.score(x_5_val, y_5_val)}')
  print(f'Tree Classifier 5 Accuracy: {tree_class_5.score(x_5_val, y_5_val)}')

decision_tree_cassifier_5()

Tree Classifier 5 Train: 0.7878698224852071
Tree Classifier 5 Validation: 0.7985803016858918
Tree Classifier 5 Accuracy: 0.7985803016858918


In [83]:
def logist_regression_5():
  scaler = StandardScaler()
  X_train_scaled_5 = scaler.fit_transform(x_5_train)

  log_reg = LogisticRegression(random_state=101, max_iter=1000)

  log_reg.fit(X_train_scaled_5, y_5_train)

  y_pred_log_reg = log_reg.predict(x_5_test)

  print("\nClassification Report - Logistic Regression model 5:")
  print(classification_report(y_5_test, y_pred_log_reg, target_names=['Churn: No', 'Churn: Yes']))

logist_regression_5()


Classification Report - Logistic Regression model 5:
              precision    recall  f1-score   support

   Churn: No       1.00      0.00      0.00       828
  Churn: Yes       0.27      1.00      0.42       299

    accuracy                           0.27      1127
   macro avg       0.63      0.50      0.21      1127
weighted avg       0.81      0.27      0.11      1127





In [84]:
train_data_ols_5 = pd.concat([x_5_train, y_5_train], axis=1)
formula_data_5 = 'churn ~' + ' + '.join([f'Q("{col}")' for col in x_5_train.columns])
model_ols_5 = ols(formula_data_5, data = train_data_ols_5).fit()


print('Remumo modelo:  modelo_ols_5\n')
print(model_ols_5.summary())


Remumo modelo:  modelo_ols_5

                            OLS Regression Results                            
Dep. Variable:                  churn   R-squared:                       0.286
Model:                            OLS   Adj. R-squared:                  0.281
Method:                 Least Squares   F-statistic:                     56.10
Date:                Wed, 20 Aug 2025   Prob (F-statistic):          1.32e-224
Time:                        01:18:09   Log-Likelihood:                -1462.7
No. Observations:                3380   AIC:                             2975.
Df Residuals:                    3355   BIC:                             3129.
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------

In [85]:
def ols_model_5():
  print(f'OLS R-squared Train 5: {model_ols_5.rsquared}')

  y_5_train_pred_ols = model_ols_5.predict(x_5_train)
  y_5_val_pred_ols = model_ols_5.predict(x_5_val)

  y_5_train_pred_binary = (y_5_train_pred_ols > 0.5).astype(int)
  y_5_val_pred_binary = (y_5_val_pred_ols > 0.5).astype(int)

  precision_train_5 = precision_score(y_5_train, y_5_train_pred_binary)
  precision_val_5 = precision_score(y_5_val, y_5_val_pred_binary)

  print(f'OLS Precision Train 5: {precision_train_5}')
  print(f'OLS Precision Validation 5: {precision_val_5}')

ols_model_5()

OLS R-squared Train 5: 0.2863715598886317
OLS Precision Train 5: 0.6729377713458755
OLS Precision Validation 5: 0.6493506493506493


In [86]:

dummy_model_1()
print(50*'-')

dummy_model_2()
print(50*'-')

dummy_model_3()
print(50*'-')

dummy_model_4()
print(50*'-')

dummy_model_5()
print(50*'-')

Dummy 1 Train: 0.5
Dummy  1 Validation: 0.7345635202271115
Dummy 1 Acuracy: 0.7345635202271115
--------------------------------------------------
Dummy_2 Train: 0.7346153846153847
Dummy_2 Validation: 0.7346938775510204
Dummy_2 Accuracy: 0.7346938775510204
--------------------------------------------------
Dummy_3 Train: 0.7346153846153847
Dummy_3 Validation: 0.7346938775510204
Dummy_3 Accuracy: 0.7346938775510204
--------------------------------------------------
Dummy_4 Train: 0.7346153846153847
Dummy_4 Validation: 0.7346938775510204
Dummy_4 Accuracy: 0.7346938775510204
--------------------------------------------------
Dummy_5 Train: 0.7346153846153847
Dummy_5 Validation: 0.7346938775510204
Dummy_5 Accuracy: 0.7346938775510204
--------------------------------------------------


In [87]:
decision_tree_cassifier_1()
print(50*'-')

decision_tree_cassifier_2()
print(50*'-')

decision_tree_cassifier_3()
print(50*'-')

decision_tree_cassifier_4()
print(50*'-')

decision_tree_cassifier_5()
print(50*'-')

Tree Classifier 1 Train: 0.7996134020618557
Tree Classifier 1 Validation: 0.723207948899929
Tree Classifier 1 Acuracy: 0.723207948899929
--------------------------------------------------
Tree Classifier 2 Train: 0.7878698224852071
Tree Classifier 2 Validation: 0.7985803016858918
Tree Classifier 2 Accuracy: 0.7985803016858918
--------------------------------------------------
Tree Classifier 3 Train: 0.7860946745562131
Tree Classifier 3 Validation: 0.8039041703637977
Tree Classifier 3 Accuracy: 0.8039041703637977
--------------------------------------------------
Tree Classifier 4 Train: 0.7878698224852071
Tree Classifier 4 Validation: 0.7985803016858918
Tree Classifier 4 Accuracy: 0.7985803016858918
--------------------------------------------------
Tree Classifier 5 Train: 0.7878698224852071
Tree Classifier 5 Validation: 0.7985803016858918
Tree Classifier 5 Accuracy: 0.7985803016858918
--------------------------------------------------


In [88]:
logist_regression_1()
print(50*'-')

logist_regression_2()
print(50*'-')

logist_regression_3()
print(50*'-')

logist_regression_4()
print(50*'-')

logist_regression_5()
print(50*'-')


Classification Report - Logistic Regression model 1:
              precision    recall  f1-score   support

   Churn: No       0.37      0.05      0.08      1035
  Churn: Yes       0.23      0.78      0.35       374

    accuracy                           0.24      1409
   macro avg       0.30      0.41      0.22      1409
weighted avg       0.33      0.24      0.15      1409

--------------------------------------------------

Classification Report - Logistic Regression model 2:
              precision    recall  f1-score   support

   Churn: No       1.00      0.00      0.00       828
  Churn: Yes       0.27      1.00      0.42       299

    accuracy                           0.27      1127
   macro avg       0.63      0.50      0.21      1127
weighted avg       0.81      0.27      0.11      1127

--------------------------------------------------





Classification Report - Logistic Regression model 3:
              precision    recall  f1-score   support

   Churn: No       0.83      0.89      0.86       828
  Churn: Yes       0.63      0.51      0.56       299

    accuracy                           0.79      1127
   macro avg       0.73      0.70      0.71      1127
weighted avg       0.78      0.79      0.78      1127

--------------------------------------------------

Classification Report - Logistic Regression model 4:
              precision    recall  f1-score   support

   Churn: No       1.00      0.00      0.01       828
  Churn: Yes       0.27      1.00      0.42       299

    accuracy                           0.27      1127
   macro avg       0.63      0.50      0.22      1127
weighted avg       0.81      0.27      0.12      1127

--------------------------------------------------

Classification Report - Logistic Regression model 5:
              precision    recall  f1-score   support

   Churn: No       1.00    



In [89]:
ols_model_1()
print(50*'-')

ols_model_2()
print(50*'-')

ols_model_3()
print(50*'-')

ols_model_4()
print(50*'-')

ols_model_5()
print(50*'-')

OLS R-squared Train: 0.37963926012934557
OLS Precision Train: 0.7531927531927531
OLS Precision Validation: 0.4965635738831615
--------------------------------------------------
OLS R-squared Train 2: 0.2860668013320309
OLS Precision Train 2: 0.6715116279069767
OLS Precision Validation 2: 0.6651982378854625
--------------------------------------------------
OLS R-squared Train 3: 0.2816854463755821
OLS Precision Train 3: 0.6774668630338734
OLS Precision Validation 3: 0.6578947368421053
--------------------------------------------------
OLS R-squared Train 4: 0.2863715598886316
OLS Precision Train 4: 0.6729377713458755
OLS Precision Validation 4: 0.6493506493506493
--------------------------------------------------
OLS R-squared Train 5: 0.2863715598886317
OLS Precision Train 5: 0.6729377713458755
OLS Precision Validation 5: 0.6493506493506493
--------------------------------------------------


In [90]:

print('Decision Tree Classifier model\n')
decision_tree_cassifier_3()
print(50*'-')

print('\nLogistic Regression model\n')
logist_regression_3()
print(50*'-')

print('\nOLS model\n')
ols_model_2()
print(50*'-')

Decision Tree Classifier model

Tree Classifier 3 Train: 0.7860946745562131
Tree Classifier 3 Validation: 0.8039041703637977
Tree Classifier 3 Accuracy: 0.8039041703637977
--------------------------------------------------

Logistic Regression model


Classification Report - Logistic Regression model 3:
              precision    recall  f1-score   support

   Churn: No       0.83      0.89      0.86       828
  Churn: Yes       0.63      0.51      0.56       299

    accuracy                           0.79      1127
   macro avg       0.73      0.70      0.71      1127
weighted avg       0.78      0.79      0.78      1127

--------------------------------------------------

OLS model

OLS R-squared Train 2: 0.2860668013320309
OLS Precision Train 2: 0.6715116279069767
OLS Precision Validation 2: 0.6651982378854625
--------------------------------------------------


#CONCLUSÃO

In [91]:
myTable = PrettyTable(['Model', 'Main Metric', 'Strength ✅', 'Weakness ❌', 'Suitability'])


myTable.add_row(['Decision Tree 3', 'Accuracy: 80.4%', 'Highest overall accuracy among the finalists.',
                 'Blind metrics. We do not know its Precision/Recall. The high accuracy could be hiding poor performance in predicting "Churn".',
                 'High'])

myTable.add_row(['Logistic Regression 3', 'Complete Report',
                 'High Precision (70%). Its churn predictions are reliable and actionable, preventing wasted resources.',
                 'Low Recall (30%). It fails to identify 70% of the customers who will actually churn.', 'High'])

myTable.add_row(['OLS 2', 'Precision: 66.5%', 'Reasonable performance.',
                 'Wrong tool. It is a regression model for a classification task. Inferior to Logistic Regression.', 'Low'])

print(myTable)


+-----------------------+------------------+-------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+-------------+
|         Model         |   Main Metric    |                                              Strength ✅                                              |                                                          Weakness ❌                                                          | Suitability |
+-----------------------+------------------+-------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+-------------+
|    Decision Tree 3    | Accuracy: 80.4%  |                             Highest overall accuracy among the finalists.           

In [92]:
new_clients = {
    'gender': ['Female', 'Male', 'Female', 'Male'],
    'senior_citizen': [0, 0, 0, 1],
    'partner': ['No', 'Yes', 'Yes', 'No'],
    'dependents': ['No', 'Yes', 'Yes', 'No'],
    'tenure': [2, 68, 24, 5],
    'phone_service': ['Yes', 'Yes', 'Yes', 'Yes'],
    'multiple_lines': ['No', 'Yes', 'No', 'Yes'],
    'internet_service': ['Fiber optic', 'DSL', 'DSL', 'Fiber optic'],
    'online_security': ['No', 'Yes', 'No', 'No'],
    'online_backup': ['No', 'Yes', 'Yes', 'No'],
    'device_protection': ['No', 'Yes', 'No', 'No'],
    'tech_support': ['No', 'Yes', 'No', 'No'],
    'streaming_tv': ['No', 'Yes', 'Yes', 'Yes'],
    'streaming_movies': ['Yes', 'Yes', 'Yes', 'Yes'],
    'contract': ['Month-to-month', 'Two year', 'One year', 'Month-to-month'],
    'paper_less_billing': ['Yes', 'Yes', 'No', 'Yes'],
    'payment_method': ['Electronic check', 'Bank transfer (automatic)', 'Mailed check', 'Electronic check'],
    'charges_monthly': [75.0, 85.0, 65.0, 95.5],
    'charges_total': [150.0, 5780.0, 1560.0, 477.5],
    'daily_accounts': [2.5, 2.8, 2.1, 3.1]
}

new_clients_df = pd.DataFrame(new_clients)

new_clients_df.to_csv('new_clients.csv', index=False)

print("File 'new_clients.csv' saved!")
display(new_clients_df)

File 'new_clients.csv' saved!


Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paper_less_billing,payment_method,charges_monthly,charges_total,daily_accounts
0,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,75.0,150.0,2.5
1,Male,0,Yes,Yes,68,Yes,Yes,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),85.0,5780.0,2.8
2,Female,0,Yes,Yes,24,Yes,No,DSL,No,Yes,No,No,Yes,Yes,One year,No,Mailed check,65.0,1560.0,2.1
3,Male,1,No,No,5,Yes,Yes,Fiber optic,No,No,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,95.5,477.5,3.1


In [99]:

with open('one_hot_encoder.pkl', 'wb') as file:
    pickle.dump(one_hot, file)

cols = x_3_train.columns.tolist()
with open('colunas_finais.pkl', 'wb') as file:
    pickle.dump(cols, file)

with open('scaler_churn.pkl', 'wb') as file:
    pickle.dump(scaler_final_3, file)


with open('modelo_churn.pkl', 'wb') as file:
    pickle.dump(modelo_final_3, file)

with open('one_hot_encoder.pkl', 'rb') as file:
    one_hot_loaded = pickle.load(file)

with open('colunas_finais.pkl', 'rb') as file:
    columns_loaded = pickle.load(file)

with open('scaler_churn.pkl', 'rb') as file:
    scaler_loaded = pickle.load(file)

with open('modelo_churn.pkl', 'rb') as file:
    model_loaded = pickle.load(file)
print("Artefatos carregados com sucesso!\n")



dados_encoded_array = one_hot_loaded.transform(new_clients_df)
encoded_cols = [col.replace('onehotencoder__', '').replace('remainder__', '') for col in one_hot_loaded.get_feature_names_out()]
df_encoded = pd.DataFrame(dados_encoded_array, columns=encoded_cols)


df_features = df_encoded[columns_loaded]
data_scaled = scaler_loaded.transform(df_features)
churn_prediction = model_loaded.predict(data_scaled)



new_clients_df['Churn_Prediction'] = churn_prediction
new_clients_df['Churn_Prediction'] =new_clients_df['Churn_Prediction'].map({0: 'Sem possibilidade Churn ', 1: 'Possível CHURN!'})

#Show a few coluns
print("--- Churn ---")
display(new_clients_df[['gender', 'partner', 'tenure', 'contract', 'charges_total',	'daily_accounts', 'Churn_Prediction']])


Artefatos carregados com sucesso!

--- Churn ---


Unnamed: 0,gender,partner,tenure,contract,charges_total,daily_accounts,Churn_Prediction
0,Female,No,2,Month-to-month,150.0,2.5,Possível CHURN!
1,Male,Yes,68,Two year,5780.0,2.8,Sem possibilidade Churn
2,Female,Yes,24,One year,1560.0,2.1,Sem possibilidade Churn
3,Male,No,5,Month-to-month,477.5,3.1,Possível CHURN!
