## Full PATEGAN rewrite

In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from models import dp_wgan, pate_gan, ron_gauss
import numpy as np
import pandas as pd
import collections, time

In [3]:
# HYPERPARAMETERS
MODEL_NAME = 'PATEGAN' # Don't change this
DATASET_NAME = 'churn' # Choose either 'churn' or 'marketing' exactly!
TARGET_VARIABLE = 'Exited' # either 'Exited' or 'Response'
TRAIN_TEST_RATIO = 0.25

# These seem to be good to tune from what I can tell from the github.
NUM_TEACHERS = 20
TARGET_EPSILON = 1
TARGET_DELTA = 1e-4
LEARNING_RATE = 1e-4

# Following defaults in the toolbox. Might not be crucial to tune these
BATCH_SIZE = 64
TEACHER_ITER = 5
STUDENT_ITER = 5
NUM_MOMENTS= 100
LAP_SCALE = 1e-4

In [4]:
# Read in data and do train test split
df = pd.read_csv(f'{DATASET_NAME}_processed.csv')
df_train, df_test = train_test_split(df, test_size=TRAIN_TEST_RATIO, random_state=42, stratify = df[TARGET_VARIABLE])

# Initialise logfile path
timestamp = int(time.time())
logfile = f'log_{DATASET_NAME}_{MODEL_NAME}_{timestamp}.csv'

In [5]:
# Grab x and y from the respective dataframes and convert to numpy arrays.
train_x = df_train.drop(columns=TARGET_VARIABLE).values
train_y = df_train[TARGET_VARIABLE].values
test_x = df_test.drop(columns=TARGET_VARIABLE).values
test_y = df_test[TARGET_VARIABLE].values

# Initialise scaler and use this to normalize the inputs.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# Some misc variables for pategan 
data_columns = [col for col in df_train.columns if col != TARGET_VARIABLE]
class_ratios = df_train[TARGET_VARIABLE].sort_values().groupby(df_train[TARGET_VARIABLE]).size().values/train_x.shape[0]
input_dim = train_x.shape[1]
z_dim = int(input_dim / 4 + 1) if input_dim % 4 == 0 else int(input_dim / 4)
conditional = True

In [None]:
# Initialise hyperparams and do TRAINING
Hyperparams = collections.namedtuple(
        'Hyperarams',
        'batch_size num_teacher_iters num_student_iters num_moments lap_scale class_ratios lr')
Hyperparams.__new__.__defaults__ = (None, None, None, None, None, None, None)

model = pate_gan.PATE_GAN(logfile, input_dim, z_dim, NUM_TEACHERS, TARGET_EPSILON, TARGET_DELTA, conditional)
model.train(train_x, train_y, Hyperparams(batch_size=BATCH_SIZE, num_teacher_iters=TEACHER_ITER,
                                              num_student_iters=STUDENT_ITER, num_moments=NUM_MOMENTS,
                                              lap_scale=LAP_SCALE, class_ratios=class_ratios, lr=LEARNING_RATE))


## Generate synthetic data using trained model, then save in CSV

In [None]:
# Helper functions for saving the synthetic data...
def update_array(indexes):
    b = np.zeros((indexes.size, indexes.max()+1))
    b[np.arange(indexes.size), indexes] = 1
    return b

def save_marketing():
    # Some fancy indexing to get the actual synthetic data..
    accepted = np.argmax(syn_save[:,16:21], axis=1)
    education = np.argmax(syn_save[:, 22:27], axis=1)
    marital = np.argmax(syn_save[:, 27:34], axis=1)
    country = np.argmax(syn_save[:, 34:], axis=1)

    syn_save[:,16:21] = update_array(accepted)
    syn_save[:, 22:27] = update_array(education)
    syn_save[:, 27:34] = update_array(marital)
    syn_save[:, 34:] = update_array(country)

    df1 = pd.DataFrame(syn_save, columns = df.columns.drop(TARGET_VARIABLE))
    df2 = pd.DataFrame(syn_y, columns = [TARGET_VARIABLE])
    df_save = pd.concat([df1,df2], axis =1)
    df_save.to_csv(f'synthetic_{MODEL_NAME}_{DATASET_NAME}.csv')

def save_churn():
    geography = np.argmax(syn_save[:,8:11], axis=1)
    gender = np.argmax(syn_save[:,11:], axis=1)
    
    syn_save[:,8:11] = update_array(geography)
    syn_save[:, 11:] = update_array(gender)
    
    df1 = pd.DataFrame(syn_save, columns = df.columns.drop(TARGET_VARIABLE))
    df2 = pd.DataFrame(syn_y, columns = [TARGET_VARIABLE])
    df_save = pd.concat([df1,df2], axis =1)
    df_save.to_csv(f'synthetic_{MODEL_NAME}_{DATASET_NAME}.csv')


In [None]:
# Run this cell to generate data and save them.
syn_data = model.generate(train_x.shape[0], class_ratios)
syn_x, syn_y = syn_data[:, :-1], syn_data[:, -1]

# Make a copy for saving
syn_save = scaler.inverse_transform(syn_x)

# Save data to csv using functions
if TARGET_VARIABLE == 'churn': save_churn()
else: save_marketing()


## Do classification using Neural Networks and look at ROC Score

In [30]:
# Train using Synthetic data, using simple neural network.
mlp = MLPClassifier((32,8), max_iter=1000, random_state=42)
mlp.fit(syn_x, syn_y)
pred_y = mlp.predict(test_x)

print('ROC Score', roc_auc_score(test_y, pred_y))
print('Accuracy', mlp.score(test_x,test_y))

ROC Score 0.5844908295602793
Accuracy 0.723826714801444


In [31]:
# Train using REAL data, using simple neural network.
mlp = MLPClassifier((32,8), max_iter=1000, random_state=42)
mlp.fit(train_x, train_y)
pred_y = mlp.predict(test_x)

print('ROC Score', roc_auc_score(test_y, pred_y))
print('Accuracy', mlp.score(test_x,test_y))

ROC Score 0.7328677768398435
Accuracy 0.8664259927797834
