## Full DPWGAN rewrite

In [1]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from models import dp_wgan, pate_gan, ron_gauss
import numpy as np
import pandas as pd
import collections, time

In [2]:
# HYPERPARAMETERS
MODEL_NAME = 'DPWGAN' # Don't change this
DATASET_NAME = 'churn' # Choose either 'churn' or 'marketing' exactly!

# The label column is determined by the dataset, so it is looked up instead of being
# typed in separately. A DATASET_NAME that is not listed fails right here with a KeyError.
TARGET_VARIABLE_BY_DATASET = {'churn': 'Exited', 'marketing': 'Response'}
TARGET_VARIABLE = TARGET_VARIABLE_BY_DATASET[DATASET_NAME]

TRAIN_TEST_RATIO = 0.25 # Used as `test_size` in the train/test split
LEAKY = True # Put false for normal relu.

# These seem to be good/important to tune from what I can tell from the github.
TARGET_EPSILON = 10
TARGET_DELTA = 1e-4
SIGMA = 0.8 # presumably the DP noise scale -- TODO confirm against models/dp_wgan
NUM_EPOCHS = 500
LEARNING_RATE = 5e-5

# Following defaults in the toolbox. Might not be crucial to tune these
MICRO_BATCH_SIZE = 8
BATCH_SIZE = 64
ENABLE_PRIVACY = True # Forwarded as `private=` to model.train
CLIP_COEFF = 0.1
# NOTE(review): the clamp bounds look like WGAN weight-clipping limits -- confirm in models/dp_wgan
CLAMP_LOWER = -0.01
CLAMP_UPPER = 0.01

In [3]:
# Load the processed dataset and carve out a test split stratified on the label column.
csv_path = f'{DATASET_NAME}_processed.csv'
df = pd.read_csv(csv_path)
df_train, df_test = train_test_split(
    df,
    test_size=TRAIN_TEST_RATIO,
    random_state=42,
    stratify=df[TARGET_VARIABLE],
)

# Log file path handed to the model; the launch time keeps separate runs apart.
timestamp = int(time.time())
logfile = f'log_{DATASET_NAME}_{MODEL_NAME}_{timestamp}.csv'

In [4]:
# Separate the features from the label column; labels are kept as numpy arrays.
features_train = df_train.drop(columns=TARGET_VARIABLE)
features_test = df_test.drop(columns=TARGET_VARIABLE)
train_y = df_train[TARGET_VARIABLE].values
test_y = df_test[TARGET_VARIABLE].values

# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
scaler = StandardScaler()
train_x = scaler.fit_transform(features_train.values)
test_x = scaler.transform(features_test.values)

# Quantities derived from the training split that the model set-up needs.
data_columns = features_train.columns.tolist()
# Share of training rows per label value, ordered by label value.
class_ratios = df_train.groupby(TARGET_VARIABLE).size().values / train_x.shape[0]
input_dim = train_x.shape[1]
# A quarter of the input width, plus one when the width is an exact multiple of 4.
quarter, leftover = divmod(input_dim, 4)
z_dim = quarter + 1 if leftover == 0 else quarter
conditional = True

In [None]:
# Initialise hyperparams and do TRAINING
# Immutable bundle of the training hyperparameters handed to `model.train`.
# The typename must match the variable name so that repr() and pickling refer
# to a name that actually exists.
Hyperparams = collections.namedtuple(
        'Hyperparams',
        'batch_size micro_batch_size clamp_lower clamp_upper clip_coeff sigma class_ratios lr num_epochs')
# Every field defaults to None; the tuple is sized from the field list so the
# two cannot drift apart when a field is added or removed.
Hyperparams.__new__.__defaults__ = (None,) * len(Hyperparams._fields)

# Collect every training setting from the configuration cell in one place.
train_hyperparams = Hyperparams(
    batch_size=BATCH_SIZE,
    micro_batch_size=MICRO_BATCH_SIZE,
    clamp_lower=CLAMP_LOWER,
    clamp_upper=CLAMP_UPPER,
    clip_coeff=CLIP_COEFF,
    sigma=SIGMA,
    class_ratios=class_ratios,
    lr=LEARNING_RATE,
    num_epochs=NUM_EPOCHS,
)

# Build the DP-WGAN and train it on the standardised training split.
model = dp_wgan.DP_WGAN(LEAKY, logfile, input_dim, z_dim, TARGET_EPSILON, TARGET_DELTA, conditional)
model.train(train_x, train_y, train_hyperparams, private=ENABLE_PRIVACY)


Epoch : 1 Loss D real :  0.009998809366402354 Loss D fake :  0.010185346211993226 Loss G :  0.010272271634336137 Epsilon spent :  2.623153807427913
Epoch : 2 Loss D real :  0.009980876158773903 Loss D fake :  0.010269886215483387 Loss G :  0.010298191653309585 Epsilon spent :  2.9437225218617797
Epoch : 3 Loss D real :  0.009979570551506892 Loss D fake :  0.010317872401817045 Loss G :  0.010317506082561073 Epsilon spent :  3.224422132116374
Epoch : 4 Loss D real :  0.009999396572235376 Loss D fake :  0.010336159857109239 Loss G :  0.010385298509679692 Epsilon spent :  3.339819054074313
Epoch : 5 Loss D real :  0.009959406347372277 Loss D fake :  0.010422250987970857 Loss G :  0.010432118975295718 Epsilon spent :  3.455215976032252
Epoch : 6 Loss D real :  0.010124009351318521 Loss D fake :  0.010453518492777609 Loss G :  0.010461647396901662 Epsilon spent :  3.5706128979901908
Epoch : 7 Loss D real :  0.010083417815733528 Loss D fake :  0.010402372263996425 Loss G :  0.0104674642047140

## Generate synthetic data with the trained model and save it to CSV

In [129]:
# Helper functions for saving the synthetic data...
def update_array(indexes, num_classes=None):
    """One-hot encode a 1-D array of integer class indexes.

    Parameters
    ----------
    indexes : numpy.ndarray of int, shape (n,)
        Class index of every row.
    num_classes : int, optional
        Number of columns in the result. When omitted it falls back to
        ``indexes.max() + 1``, which is narrower than the real category
        block whenever the last category never occurs in ``indexes``.

    Returns
    -------
    numpy.ndarray of float, shape (n, num_classes)
        Rows of zeros with a single 1 at the given index.
    """
    indexes = np.asarray(indexes)
    if num_classes is None:
        num_classes = int(indexes.max()) + 1 if indexes.size else 0
    one_hot = np.zeros((indexes.size, num_classes))
    one_hot[np.arange(indexes.size), indexes] = 1
    return one_hot


def _snap_to_one_hot(data, start, stop=None):
    """Replace columns ``start:stop`` of ``data`` in place by the one-hot of their row-wise argmax."""
    block = data[:, start:stop]
    # Pass the block width explicitly: a one-hot sized from the largest index seen
    # could be too narrow and would then fail to assign or broadcast across the block.
    data[:, start:stop] = update_array(np.argmax(block, axis=1), block.shape[1])


def _write_synthetic_csv():
    """Combine the global ``syn_save`` features with ``syn_y`` and write them to the synthetic CSV."""
    df1 = pd.DataFrame(syn_save, columns = df.columns.drop(TARGET_VARIABLE))
    df2 = pd.DataFrame(syn_y, columns = [TARGET_VARIABLE])
    df_save = pd.concat([df1,df2], axis =1)
    df_save.to_csv(f'synthetic_{MODEL_NAME}_{DATASET_NAME}.csv')


def save_marketing():
    """Snap the marketing category blocks to one-hot and save the synthetic data.

    Reads the notebook globals ``syn_save``, ``syn_y``, ``df``, ``TARGET_VARIABLE``,
    ``MODEL_NAME`` and ``DATASET_NAME``; ``syn_save`` is modified in place.
    """
    # Column ranges of the groups: accepted, education, marital, country.
    # NOTE(review): column 21 is left untouched, as in the original slices --
    # confirm it is not meant to be part of a category block.
    for start, stop in ((16, 21), (22, 27), (27, 34), (34, None)):
        _snap_to_one_hot(syn_save, start, stop)
    _write_synthetic_csv()


def save_churn():
    """Snap the churn category blocks to one-hot and save the synthetic data.

    Reads the notebook globals ``syn_save``, ``syn_y``, ``df``, ``TARGET_VARIABLE``,
    ``MODEL_NAME`` and ``DATASET_NAME``; ``syn_save`` is modified in place.
    """
    # Column ranges of the groups: geography, gender.
    for start, stop in ((8, 11), (11, None)):
        _snap_to_one_hot(syn_save, start, stop)
    _write_synthetic_csv()


In [130]:
# Run this cell to generate data and save them.
# Ask the model for synthetic rows (requested count = size of the training split);
# the last column of the result is used as the label, the rest as features.
syn_data = model.generate(train_x.shape[0], class_ratios)
syn_x, syn_y = syn_data[:, :-1], syn_data[:, -1]

# Undo the standardisation on a separate array so the saved features are on the original scale.
syn_save = scaler.inverse_transform(syn_x)

# Pick the save routine from the dataset name. The label column name is
# 'Exited'/'Response', so comparing it to 'churn' could never match.
if DATASET_NAME == 'churn':
    save_churn()
elif DATASET_NAME == 'marketing':
    save_marketing()
else:
    raise ValueError(f"No save routine for dataset {DATASET_NAME!r}")


## Classify with neural networks and compare ROC AUC scores

In [110]:
# Train using Synthetic data, using simple neural network.
mlp = MLPClassifier((32,8), max_iter=1000, random_state=42)
mlp.fit(syn_x, syn_y)

# Evaluate on the real hold-out split. `syn_x` is in the standardised feature space
# (it is what `scaler.inverse_transform` is applied to), so the standardised `test_x`
# is the matching input. The earlier `test_x_syn` is not defined in the visible notebook.
pred_y = mlp.predict(test_x)

# NOTE(review): the AUC is computed from hard class predictions, same as in the
# real-data cell; switch both cells to predict_proba together if a score-based AUC is wanted.
print(roc_auc_score(test_y, pred_y))
print(mlp.score(test_x, test_y))

0.38112961399739087
0.6227436823104693


In [11]:
# Train using REAL data, using simple neural network.
# Baseline: the same network architecture fitted on the real, standardised training split.
mlp = MLPClassifier(hidden_layer_sizes=(32, 8), max_iter=1000, random_state=42).fit(train_x, train_y)

# Accuracy first, then the AUC of the hard predictions, both on the hold-out split.
real_accuracy = mlp.score(test_x, test_y)
print(real_accuracy)

pred_y = mlp.predict(test_x)
real_auc = roc_auc_score(test_y, pred_y)
print(real_auc)

0.8664259927797834
0.7328677768398435
