In [41]:
import pandas as pd

import imblearn
import sqlalchemy
import sdv
from sqlalchemy import create_engine
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [42]:
# Report the versions of the key libraries so the results can be tied to an environment.
# The label is left-aligned in a 10-character field to keep the version numbers in one column.
print(
    "\n".join(
        f" {label:<10} {module.__version__}"
        for label, module in (
            ("pandas", pd),
            ("imblearn", imblearn),
            ("sqlalchemy", sqlalchemy),
            ("sdv", sdv),
        )
    )
)

 pandas     2.2.3
 imblearn   0.14.0
 sqlalchemy 2.0.38
 sdv        1.9.0


In [43]:
# Load the churn dataset; the path is relative to the notebook's working directory.
df_churn = pd.read_csv("../src/dataset_churn.csv")

In [44]:
# Dimensions of the loaded table as (rows, columns).
df_churn.shape

(9455, 31)

In [45]:
# Class balance of the target before resampling; normalize=True yields proportions, not counts.
df_churn['Churn'].value_counts(normalize=True)

Churn
0    0.812692
1    0.187308
Name: proportion, dtype: float64

In [46]:
# Columns used as model inputs. Order matters: it defines the column order of X.
# NOTE: 'ratio_events_sessios' is spelled exactly as the dataset column is named;
# do not "correct" it here without renaming the column in the data as well.
features = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
    'days_since_last_tx',
    'txs_avg_amount',
    'amount_std',
    'avg_cashout_amount',
    'ratio_recent_vs_past_txs',
    'ratio_recent_vs_past_amount',
    'ratio_cashouts',
    'ratio_transfers',
    'inflation_pressure',
    'days_since_last_ss',
    'total_ss_past30d',
    'total_ss_past90d',
    'avg_ss_per_wk',
    'avg_ss_duration_min',
    'std_ss_duration_min',
    'ratio_ss_time_recent_vs_past',
    'ratio_events_sessios',
    'ratio_failed_ss',
    'total_opened_push',
]

In [47]:
# Feature matrix: every row, restricted to the columns listed in `features`.
X = df_churn.loc[:, features]

In [48]:
# Target as a one-column DataFrame (not a Series), so it keeps its 'Churn' column label.
y = df_churn.loc[:, ['Churn']]

In [49]:
# SMOTE oversampler; the fixed random_state makes the resampling repeatable across runs.
sm = SMOTE(random_state=42)

In [50]:
# Oversample the data; returns the resampled feature table and the resampled target.
data_x, data_y = sm.fit_resample(X=X, y=y)

In [52]:
# Dimensions of the resampled feature table as (rows, columns).
data_x.shape

(15368, 27)

In [53]:
# Dimensions of the resampled target; the row count should match data_x.
data_y.shape

(15368, 1)

In [51]:
# Class balance of the target after resampling, as proportions.
data_y['Churn'].value_counts(normalize=True)

Churn
1    0.5
0    0.5
Name: proportion, dtype: float64

In [54]:
# Preview the first rows of the resampled target.
data_y.head()

Unnamed: 0,Churn
0,1
1,0
2,1
3,0
4,0


In [56]:
# Split the resampled data into two parts:
#   *_balanceado -> everything except the held-out rows
#   *_database   -> a held-out set of exactly 3000 rows
# Stratifying on data_y keeps the class proportions of the target in both parts.
X_balanceado, X_database, y_balanceado, y_database = train_test_split(
    data_x,
    data_y,
    shuffle=True,      # shuffle the rows before splitting
    random_state=42,   # fixed seed for reproducibility
    stratify=data_y,   # preserve the churn 0 / churn 1 ratio
    test_size=3000,    # absolute row count, not a fraction
)

In [57]:
# Put features and target side by side, aligned on the original row index.
dataset_balanceado = pd.concat(
    objs=[X_balanceado, y_balanceado],
    axis=1,
    ignore_index=False,
)

In [58]:
# Preview the first rows of the combined features + target table.
dataset_balanceado.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,days_since_last_tx,txs_avg_amount,...,total_ss_past30d,total_ss_past90d,avg_ss_per_wk,avg_ss_duration_min,std_ss_duration_min,ratio_ss_time_recent_vs_past,ratio_events_sessios,ratio_failed_ss,total_opened_push,Churn
4891,591,27,5,107812.67,1,0,1,162501.83,1,111524.559643,...,2,8,1.238095,12.453194,8.350533,0.117002,-2.197225,-3.295837,3,1
9010,603,38,8,59360.77,1,1,1,191457.06,1,133911.997589,...,5,12,0.96,20.413926,16.404494,-0.48913,-1.139434,-2.525729,4,0
8658,778,24,4,0.0,2,1,1,162809.2,2,131753.792842,...,1,9,1.105263,11.637624,7.867073,-0.845461,-2.397895,-3.091042,5,0
9548,511,53,1,75535.30918,1,0,0,143880.531485,18,101093.825387,...,2,12,0.963621,20.816539,16.271408,-0.825373,-1.0518,-2.620171,2,1
3731,728,45,3,108924.33,2,1,0,84300.4,25,166440.485556,...,1,11,0.923077,14.32301,12.189964,-0.819413,-1.139434,-3.218876,4,1


In [59]:
# Dimensions of the combined table: the feature columns plus the Churn column.
dataset_balanceado.shape

(12368, 28)

In [60]:
# Put the held-out features and target side by side, aligned on the original row index.
dataset_database = pd.concat(
    objs=[X_database, y_database],
    axis=1,
    ignore_index=False,
)

In [61]:
# Dimensions of the held-out table: the feature columns plus the Churn column.
dataset_database.shape

(3000, 28)

In [None]:
# Preview the first rows of the held-out table.
dataset_database.head()

In [62]:
# Persist both tables next to the source dataset.
# NOTE: to_csv writes the row index by default, so each file starts with an unnamed
# index column; readers of these files need index_col=0 (or must drop that column).
dataset_balanceado.to_csv(path_or_buf="../src/dataset_churn_balanceado.csv")
dataset_database.to_csv(path_or_buf="../src/dataset_churn_database.csv")