## Full DPWGAN rewrite

In [1]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from models import dp_wgan, pate_gan, ron_gauss
import numpy as np
import pandas as pd
import collections, time

In [2]:
# HYPERPARAMETERS
MODEL_NAME = 'DPWGAN' # Don't change this
DATASET_NAME = 'churn' # Choose either 'churn' or 'marketing' exactly!
TARGET_VARIABLE = 'Exited' # either 'Exited' or 'Response'
TRAIN_TEST_RATIO = 0.25
LEAKY = True # Put false for normal relu.

# These seem to be good/important to tune from what I can tell from the github.
TARGET_EPSILON = 10
TARGET_DELTA = 1e-4
SIGMA = 0.8
NUM_EPOCHS = 500
LEARNING_RATE = 5e-5

# Following defaults in the toolbox. Might not be crucial to tune these
MICRO_BATCH_SIZE = 8
BATCH_SIZE = 64
ENABLE_PRIVACY = True
CLIP_COEFF = 0.1
CLAMP_LOWER = -0.01
CLAMP_UPPER = 0.01

In [3]:
# Read in data and do train test split
df = pd.read_csv(f'{DATASET_NAME}_processed.csv')
df_train, df_test = train_test_split(df, test_size=TRAIN_TEST_RATIO, random_state=42, stratify = df[TARGET_VARIABLE])

# Initialise logfile path
timestamp = int(time.time())
logfile = f'log_{DATASET_NAME}_{MODEL_NAME}_{timestamp}.csv'

In [4]:
# Grab x and y from the respective dataframes and convert to numpy arrays.
train_x = df_train.drop(columns=TARGET_VARIABLE).values
train_y = df_train[TARGET_VARIABLE].values
test_x = df_test.drop(columns=TARGET_VARIABLE).values
test_y = df_test[TARGET_VARIABLE].values

# Initialise scaler and use this to normalize the inputs.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# Some misc variables for pategan 
data_columns = [col for col in df_train.columns if col != TARGET_VARIABLE]
class_ratios = df_train[TARGET_VARIABLE].sort_values().groupby(df_train[TARGET_VARIABLE]).size().values/train_x.shape[0]
input_dim = train_x.shape[1]
z_dim = int(input_dim / 4 + 1) if input_dim % 4 == 0 else int(input_dim / 4)
conditional = True

In [5]:
# Initialise hyperparams and do TRAINING
Hyperparams = collections.namedtuple(
        'Hyperarams',
        'batch_size micro_batch_size clamp_lower clamp_upper clip_coeff sigma class_ratios lr num_epochs')
Hyperparams.__new__.__defaults__ = (None, None, None, None, None, None, None, None, None)

model = dp_wgan.DP_WGAN(LEAKY, logfile, input_dim, z_dim, TARGET_EPSILON, TARGET_DELTA, conditional)
model.train(train_x, train_y, Hyperparams(batch_size=BATCH_SIZE, micro_batch_size=MICRO_BATCH_SIZE,
                                              clamp_lower=CLAMP_LOWER, clamp_upper=CLAMP_UPPER,
                                              clip_coeff=CLIP_COEFF, sigma=SIGMA, class_ratios=class_ratios, lr=LEARNING_RATE, 
                                              num_epochs=NUM_EPOCHS), private=ENABLE_PRIVACY)


Epoch : 1 Loss D real :  0.010714411980568423 Loss D fake :  0.010565648695409038 Loss G :  0.010633213267995894 Epsilon spent :  2.623153807427913
Epoch : 2 Loss D real :  0.010862018328220055 Loss D fake :  0.010656586445916558 Loss G :  0.010703517456732443 Epsilon spent :  2.9437225218617797
Epoch : 3 Loss D real :  0.010819494412316784 Loss D fake :  0.010821979859933582 Loss G :  0.010839053784986325 Epsilon spent :  3.224422132116374
Epoch : 4 Loss D real :  0.011106140410721886 Loss D fake :  0.010958945494663145 Loss G :  0.01094224207200507 Epsilon spent :  3.339819054074313
Epoch : 5 Loss D real :  0.011274419941166205 Loss D fake :  0.011028902119101746 Loss G :  0.011236928058126532 Epsilon spent :  3.455215976032252
Epoch : 6 Loss D real :  0.010773818496483938 Loss D fake :  0.01116729468878628 Loss G :  0.011298002508992348 Epsilon spent :  3.5706128979901908
Epoch : 7 Loss D real :  0.011116367533215982 Loss D fake :  0.01120728272566263 Loss G :  0.011383343818824759 

Epoch : 57 Loss D real :  0.01072805911413499 Loss D fake :  0.011086269023043313 Loss G :  0.011157023167082508 Epsilon spent :  8.198761846989168
Epoch : 58 Loss D real :  0.010861468766195313 Loss D fake :  0.010934986540215224 Loss G :  0.010910634755485114 Epsilon spent :  8.275272724626578
Epoch : 59 Loss D real :  0.011016693464063046 Loss D fake :  0.011084017991827346 Loss G :  0.011005661293023687 Epsilon spent :  8.351783602263989
Epoch : 60 Loss D real :  0.011113357908168359 Loss D fake :  0.010942385972109175 Loss G :  0.011067815049257104 Epsilon spent :  8.428294479901401
Epoch : 61 Loss D real :  0.01124603302902224 Loss D fake :  0.010866413861424225 Loss G :  0.010969408958357688 Epsilon spent :  8.504805357538812
Epoch : 62 Loss D real :  0.011186023165965913 Loss D fake :  0.011065262554048653 Loss G :  0.011053722421445693 Epsilon spent :  8.581316235176223
Epoch : 63 Loss D real :  0.010766347148978391 Loss D fake :  0.011084791617975596 Loss G :  0.0111982878822

## Generate synthetic data using trained model, then save in CSV

In [6]:
# Helper functions for saving the synthetic data...
def update_array(indexes):
    b = np.zeros((indexes.size, indexes.max()+1))
    b[np.arange(indexes.size), indexes] = 1
    return b

def save_marketing():
    # Some fancy indexing to get the actual synthetic data..
    accepted = np.argmax(syn_save[:,16:21], axis=1)
    education = np.argmax(syn_save[:, 22:27], axis=1)
    marital = np.argmax(syn_save[:, 27:34], axis=1)
    country = np.argmax(syn_save[:, 34:], axis=1)

    syn_save[:,16:21] = update_array(accepted)
    syn_save[:, 22:27] = update_array(education)
    syn_save[:, 27:34] = update_array(marital)
    syn_save[:, 34:] = update_array(country)

    df1 = pd.DataFrame(syn_save, columns = df.columns.drop(TARGET_VARIABLE))
    df2 = pd.DataFrame(syn_y, columns = [TARGET_VARIABLE])
    df_save = pd.concat([df1,df2], axis =1)
    df_save.to_csv(f'synthetic_{MODEL_NAME}_{DATASET_NAME}.csv')

def save_churn():
    geography = np.argmax(syn_save[:,8:11], axis=1)
    gender = np.argmax(syn_save[:,11:], axis=1)
    
    syn_save[:,8:11] = update_array(geography)
    syn_save[:, 11:] = update_array(gender)
    
    df1 = pd.DataFrame(syn_save, columns = df.columns.drop(TARGET_VARIABLE))
    df2 = pd.DataFrame(syn_y, columns = [TARGET_VARIABLE])
    df_save = pd.concat([df1,df2], axis =1)
    df_save.to_csv(f'synthetic_{MODEL_NAME}_{DATASET_NAME}.csv')


In [13]:
# Run this cell to generate data and save them.
syn_data = model.generate(train_x.shape[0], class_ratios)
syn_x, syn_y = syn_data[:, :-1], syn_data[:, -1]

# Make a copy for saving
syn_save = scaler.inverse_transform(syn_x)

# Save data to csv using functions
if DATASET_NAME == 'churn': save_churn()
else: save_marketing()


## Do classification using Neural Networks and look at ROC Score

In [19]:
# Train using Synthetic data, using simple neural network.
mlp = MLPClassifier((32,8), max_iter=1000, random_state=42)
mlp.fit(syn_x, syn_y)
pred_y = mlp.predict(test_x)

print('ROC Score', roc_auc_score(test_y, pred_y))
print('Accuracy', mlp.score(test_x,test_y))

ROC Score 0.5491247943841591
Accuracy 0.4612


In [18]:
# Train using REAL data, using simple neural network.
mlp = MLPClassifier((32,8), max_iter=1000, random_state=42)
mlp.fit(train_x, train_y)
pred_y = mlp.predict(test_x)

print('ROC Score', roc_auc_score(test_y, pred_y))
print('Accuracy', mlp.score(test_x,test_y))

ROC Score 0.7328967583990432
Accuracy 0.8564
