In [None]:
!git clone https://github.com/yeeyangtee/private-data-generation.git

Cloning into 'private-data-generation'...
remote: Enumerating objects: 103, done.[K
remote: Counting objects: 100% (103/103), done.[K
remote: Compressing objects: 100% (80/80), done.[K
remote: Total 103 (delta 35), reused 84 (delta 20), pack-reused 0[K
Receiving objects: 100% (103/103), 1.21 MiB | 9.04 MiB/s, done.
Resolving deltas: 100% (35/35), done.


In [None]:
cd /content/private-data-generation

/content/private-data-generation


In [None]:
# Download datasets
!gdown --id 1PaXMlTVHoB-vv-CaY1SgZDhf3evZjWf9 # churn processed
!gdown --id 1sJRwaeYcNaX_hqRJdhqAtMWnX7h-XKJE # marketing processed

## Full DPWGAN rewrite

In [1]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from models import dp_wgan, pate_gan
import numpy as np
import pandas as pd
import collections, time

In [2]:
# HYPERPARAMETERS
MODEL_NAME = 'DPWGAN' # Don't change this
DATASET_NAME = 'marketing' # Choose either 'churn' or 'marketing' exactly!
TARGET_VARIABLE = 'Response' # either 'Exited' or 'Response'
TRAIN_TEST_RATIO = 0.25
LEAKY = False # Put False for normal relu. The number indicates the amount of negative slope. Default is 0.01

# These seem to be good/important to tune from what I can tell from the github.
TARGET_EPSILON = 10
TARGET_DELTA = 1e-4
SIGMA = 1.2
NUM_EPOCHS = 100
LEARNING_RATE = 5e-5

# Following defaults in the toolbox. Might not be crucial to tune these
MICRO_BATCH_SIZE = 8
BATCH_SIZE = 64
ENABLE_PRIVACY = True
CLIP_COEFF = 0.1
CLAMP_LOWER = -0.01
CLAMP_UPPER = 0.01

In [3]:
# Read in data and do train test split
df = pd.read_csv(f'{DATASET_NAME}_processed.csv')
df_train, df_test = train_test_split(df, test_size=TRAIN_TEST_RATIO, random_state=42, stratify = df[TARGET_VARIABLE])

# Initialise logfile path
timestamp = int(time.time())
logfile = f'logs/log_{DATASET_NAME}_{MODEL_NAME}_{timestamp}_{TARGET_EPSILON}.csv'

In [4]:
# Grab x and y from the respective dataframes and convert to numpy arrays.
train_x = df_train.drop(columns=TARGET_VARIABLE).values
train_y = df_train[TARGET_VARIABLE].values
test_x = df_test.drop(columns=TARGET_VARIABLE).values
test_y = df_test[TARGET_VARIABLE].values

# Initialise scaler and use this to normalize the inputs.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# Some misc variables for pategan 
data_columns = [col for col in df_train.columns if col != TARGET_VARIABLE]
class_ratios = df_train[TARGET_VARIABLE].sort_values().groupby(df_train[TARGET_VARIABLE]).size().values/train_x.shape[0]
input_dim = train_x.shape[1]
z_dim = int(input_dim / 4 + 1) if input_dim % 4 == 0 else int(input_dim / 4)
conditional = True

In [None]:
# Initialise hyperparams and do TRAINING
Hyperparams = collections.namedtuple(
        'Hyperarams',
        'batch_size micro_batch_size clamp_lower clamp_upper clip_coeff sigma class_ratios lr num_epochs')
Hyperparams.__new__.__defaults__ = (None, None, None, None, None, None, None, None, None)

model = dp_wgan.DP_WGAN(LEAKY, logfile, input_dim, z_dim, TARGET_EPSILON, TARGET_DELTA, conditional)
model.train(train_x, train_y, test_x, test_y, data_columns, scaler, DATASET_NAME, Hyperparams(batch_size=BATCH_SIZE, micro_batch_size=MICRO_BATCH_SIZE,
                                              clamp_lower=CLAMP_LOWER, clamp_upper=CLAMP_UPPER,
                                              clip_coeff=CLIP_COEFF, sigma=SIGMA, class_ratios=class_ratios, lr=LEARNING_RATE, 
                                              num_epochs=NUM_EPOCHS), private=ENABLE_PRIVACY)


Best Roc of 0.5153480162688973 found, saving....
saving nice!
Epoch : 1 Loss D real :  0.011415510552723997 Loss D fake :  0.012078700414173518 Loss G :  0.012225620332291652 Epsilon spent :  1.8460928363141051 ROC attained:  0.5153480162688973
Epoch : 2 Loss D real :  0.011933347067052595 Loss D fake :  0.012503993027578722 Loss G :  0.012535728964354249 Epsilon spent :  2.1496228187495907 ROC attained:  0.4781930268846085




Epoch : 3 Loss D real :  0.012136609245305163 Loss D fake :  0.012807176579899276 Loss G :  0.012921056324201477 Epsilon spent :  2.4020824091979414 ROC attained:  0.4186427237612872


In [None]:
# Optional: Look at model architectures
print(model.discriminator)
print(model.generator)

Discriminator(
  (main): Sequential(
    (0): Linear(in_features=43, out_features=21, bias=True)
    (1): ReLU()
    (2): Linear(in_features=21, out_features=1, bias=True)
  )
)
Generator(
  (main): Sequential(
    (0): Linear(in_features=11, out_features=20, bias=True)
    (1): ReLU()
    (2): Linear(in_features=20, out_features=42, bias=True)
  )
)


## Generate synthetic data using trained model, then save in CSV

In [None]:
# Helper functions for saving the synthetic data...
def update_array(indexes, cols = None):
    if cols: colsize = cols
    else: colsize = indexes.max() +1
    b = np.zeros((indexes.size, colsize))
    b[np.arange(indexes.size), indexes] = 1
    return b

def save_marketing():
    # Some fancy indexing to get the actual synthetic data..
    accepted = np.argmax(syn_save[:,16:21], axis=1)
    education = np.argmax(syn_save[:, 22:27], axis=1)
    marital = np.argmax(syn_save[:, 27:34], axis=1)
    country = np.argmax(syn_save[:, 34:], axis=1)

    syn_save[:,16:21] = update_array(accepted, cols=5)
    syn_save[:, 22:27] = update_array(education, cols=5)
    syn_save[:, 27:34] = update_array(marital, cols=7)
    syn_save[:, 34:] = update_array(country, cols=8)

    df1 = pd.DataFrame(syn_save, columns = df.columns.drop(TARGET_VARIABLE))
    df2 = pd.DataFrame(syn_y, columns = [TARGET_VARIABLE])
    df_save = pd.concat([df1,df2], axis =1)
    df_save.to_csv(f'synthetic_{MODEL_NAME}_{DATASET_NAME}_{TARGET_EPSILON}.csv')

def save_churn():
    geography = np.argmax(syn_save[:,8:11], axis=1)
    gender = np.argmax(syn_save[:,11:], axis=1)
    
    syn_save[:,8:11] = update_array(geography, cols=3)
    syn_save[:, 11:] = update_array(gender, cols=2)
    syn_save[:,4] = np.round(syn_save[:,4]) # num products
    syn_save[:,5] = np.round(np.clip(syn_save[:,5],0,1)) # Has card
    syn_save[:,6] = np.round(np.clip(syn_save[:,6],0,1)) # Is active
    
    df1 = pd.DataFrame(syn_save, columns = df.columns.drop(TARGET_VARIABLE))
    df2 = pd.DataFrame(syn_y, columns = [TARGET_VARIABLE])
    df_save = pd.concat([df1,df2], axis =1)
    df_save.to_csv(f'synthetic_{MODEL_NAME}_{DATASET_NAME}_{TARGET_EPSILON}.csv')


In [None]:
# Run this cell to generate data and save them.
syn_data = model.generate(train_x.shape[0], class_ratios)
syn_x, syn_y = syn_data[:, :-1], syn_data[:, -1]

# Make a copy for saving
syn_save = scaler.inverse_transform(syn_x)

# Save data to csv using functions
if DATASET_NAME == 'churn': save_churn()
else: save_marketing()


## Do classification with various classifiers.
- MLP
- RandomForest
- AdaBoost
- Decision Trees
- Gaussian Naive Bayes
- Bernouilli Naive Bayes
- SVM
- Logistic Regression

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
# Initialise all models
classifier_names = ['MLP', 'RF', 'ADABOOST', 'DT', 'GauNB', 'BernNB', 'SVM', 'LogReg']
classifiers = [MLPClassifier((32,8), max_iter=1000, random_state=42), #MLP
               RandomForestClassifier(max_depth=2, random_state=42), #RF
               AdaBoostClassifier(n_estimators=50, random_state=42),
               DecisionTreeClassifier(random_state=42),
               GaussianNB(),
               BernoulliNB(),
               SVC(random_state=42),
               LogisticRegression(random_state=42)
]


In [None]:
syn_clf_accuracy = []
syn_clf_roc_score = []
real_clf_accuracy = []
real_clf_roc_score = []

# Loop thru all classifiers.
for clf in classifiers:

    # Do synthetic
    clf.fit(syn_x, syn_y)
    pred_y = clf.predict(test_x)

    # compute score and append
    syn_clf_roc_score.append(roc_auc_score(test_y, pred_y))
    syn_clf_accuracy.append(clf.score(test_x,test_y))

    # Do real training data
    clf.fit(train_x, train_y)
    pred_y = clf.predict(test_x)

    # compute score and append
    real_clf_roc_score.append(roc_auc_score(test_y, pred_y))
    real_clf_accuracy.append(clf.score(test_x,test_y))

In [None]:
score_df = pd.DataFrame({'Classifier Name':classifier_names,
                         'Accuracy on Synthetic':syn_clf_accuracy,
                         'ROC on Synthetic': syn_clf_roc_score,
                         'Accuracy on Real': real_clf_accuracy,
                         'ROC on Real': real_clf_roc_score,
                         })
score_df.to_csv( 'scores.csv') # Make into a csv that you can download and do computations!

In [None]:
score_df

Unnamed: 0,Classifier Name,Accuracy on Synthetic,ROC on Synthetic,Accuracy on Real,ROC on Real
0,MLP,0.4648,0.466567,0.8564,0.732897
1,RF,0.6808,0.532715,0.808,0.529218
2,ADABOOST,0.2712,0.481752,0.8604,0.72444
3,DT,0.4464,0.443316,0.7948,0.68618
4,GauNB,0.5304,0.570635,0.81,0.654776
5,BernNB,0.7824,0.502909,0.8028,0.622471
6,SVM,0.7792,0.493589,0.8612,0.685458
7,LogReg,0.5488,0.508337,0.8092,0.578961
