# CTGAN
***

In [4]:
import pandas as pd
import numpy as np

from ctgan.synthesizer import CTGANSynthesizer

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

In [192]:
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [193]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## 전처리
- 간단한 전처리만 진행

In [194]:
data['Churn'] = [1 if c == 'Yes' else 0 for c in data['Churn']]

In [195]:
data['gender'] = [1 if g == 'Male' else 0 for g in data['gender']]

In [196]:
data['Partner'] = [1 if p == 'Yes' else 0 for p in data['Partner']]

In [197]:
data['Dependents'] = [1 if d == 'Yes' else 0 for d in data['Dependents']]

In [198]:
data['PhoneService'] = [1 if ps == 'Yes' else 0 for ps in data['PhoneService']]

In [199]:
data['PaperlessBilling'] = [1 if pb == 'Yes' else 0 for pb in data['PaperlessBilling']]

In [200]:
data['MonthlyCharges'] = data['MonthlyCharges'].astype(float)

In [201]:
data['TotalCharges'] = data['TotalCharges'].str.replace(' ', '0').astype(float)

In [209]:
X = data.drop(['customerID', 'Churn'], axis = 1)
y = data.Churn

## Train & Test Split

실제 ML 문제와 유사한 환경 구성을 위해 아래와 같이 데이터를 구성함
- train : 학습 및 검증에만 사용 6063개
- test : 예측시에만 사용(submission에 해당하며 1000개)

In [210]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1000, random_state = 42, stratify = y)

In [119]:
gen_target = X_train.copy()

In [120]:
gen_target['Churn'] = y_train

In [211]:
encoding_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

In [212]:
le = LabelEncoder()

In [213]:
for col in encoding_cols :
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [94]:
gen_samples = 5000

In [121]:
d_cols = gen_target.columns.tolist()

In [109]:
%%time
ctgan = CTGANSynthesizer()

CPU times: user 30 µs, sys: 0 ns, total: 30 µs
Wall time: 33.9 µs


In [122]:
%%time
ctgan.fit(gen_target, d_cols, epochs = 50)

Fitting CTGAN transformers for each column: 100%|██████████| 20/20 [00:00<00:00, 1364.58it/s]


CPU times: user 3h 12min 49s, sys: 38min 27s, total: 3h 51min 17s
Wall time: 17min


In [123]:
train_generated = ctgan.sample(gen_samples)

In [241]:
new_X_train = pd.concat([X_train, train_generated.iloc[:, :-1]], axis = 0, ignore_index = True)

In [245]:
new_y_train = pd.Series(y_train.tolist() + train_generated.Churn.tolist())

***
## CatBoost Fit & Predict Ensemble

In [144]:
skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)

In [247]:
def ensemble_result(X, y, test) :
    
    cb_pred = np.zeros((test.shape[0], ))
    cb_score = []
    
    for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :
        
        tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
        
        cb = CatBoostClassifier(random_state = 42, max_depth = 5, learning_rate = 0.03, iterations = 2000, eval_metric = 'AUC')
        cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 200, verbose = 0)
        
        val_pred = cb.predict_proba(val_x)[:, 1]
        fold_roc_auc = roc_auc_score(val_y, val_pred)
        cb_score.append(fold_roc_auc)
        print(f"{i + 1} Fold ROC_AUC = {round(fold_roc_auc, 4)}")
        
        fold_pred = cb.predict_proba(test)[:, 1] / skf.n_splits
        cb_pred += fold_pred
        
    print(f"\n {cb.__class__.__name__} AVG score = {np.mean(cb_score)}")
    
    return cb_pred

In [248]:
base_line = ensemble_result(X_train, y_train, X_test)

1 Fold ROC_AUC = 0.8672
2 Fold ROC_AUC = 0.8504
3 Fold ROC_AUC = 0.8506
4 Fold ROC_AUC = 0.8299
5 Fold ROC_AUC = 0.8155
6 Fold ROC_AUC = 0.8463
7 Fold ROC_AUC = 0.8625
8 Fold ROC_AUC = 0.8717
9 Fold ROC_AUC = 0.861
10 Fold ROC_AUC = 0.8356

 CatBoostClassifier AVG score = 0.8490749330001297


In [249]:
ctgan_pred = ensemble_result(new_X_train, new_y_train, X_test)

1 Fold ROC_AUC = 0.7814
2 Fold ROC_AUC = 0.7922
3 Fold ROC_AUC = 0.7668
4 Fold ROC_AUC = 0.765
5 Fold ROC_AUC = 0.8024
6 Fold ROC_AUC = 0.7811
7 Fold ROC_AUC = 0.7858
8 Fold ROC_AUC = 0.8029
9 Fold ROC_AUC = 0.7871
10 Fold ROC_AUC = 0.7927

 CatBoostClassifier AVG score = 0.7857296918937233


각 데이터를 학습한 결과 학습 데이터에서는 데이터를 보강하지 않았을 때 더 좋은 성능을 보였다.
***
## 성능 비교

In [250]:
roc_auc_score(y_test, base_line)

0.8504505198305736

In [251]:
roc_auc_score(y_test, ctgan_pred)

0.8510358105506353

CTGAN으로 학습데이터를 보강한 결과가 상대적으로 좋은 성능을 보이는 것으로 나타났다.