# Artificial and synthetic datasets

In this notebook, instead of using and synthesizing *real* datasets, we first create *artificial* datasets and then analyze the outputs. This way, we can engineer some required features in the artificial dataset.

In [None]:
# solve issue with autocomplete
%config Completer.use_jedi = False

# reload imported modules automatically before executing code
%load_ext autoreload
%autoreload 2
# render matplotlib figures inside the notebook
%matplotlib inline

from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [None]:
import copy
from joblib import load as jload
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from privgem import tabular_ppgm
from privgem import tabular_patectgan
from privgem import tabular_metrics
from privgem import tabular_utils
from privgem import tabular_artificial
from privgem import rbo_metric

from utils import train_save_pate_models

# For reproducibility: seeds NumPy's global random state, which the
# np.random.* calls in this notebook draw from.
# NOTE(review): only NumPy is seeded here; any library that keeps its own
# random state would need to be seeded separately -- confirm.
np.random.seed(1364)

## Create an artificial dataset

In [None]:
# --- Size and class structure
n_samples = 10_000
n_classes = 2
class_weights = [0.5] * n_classes
n_clusters_per_class = 1

# --- Feature composition
n_features = 5
n_informative = 5
n_redundant = 0
n_repeated = 0

# --- Control "noise"
flip_y = 0.1
class_sep = 1.0

# --- Categorical columns: how many, and the number of bins for each of them
n_categorical = 5
n_categorical_bins = [5] * n_categorical

In [None]:
# Generate the artificial table from the parameters set in the previous cell.
(X, y, categories) = tabular_artificial.make_table(
    n_samples=n_samples,
    n_classes=n_classes,
    class_weights=class_weights,
    n_clusters_per_class=n_clusters_per_class,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_repeated=n_repeated,
    n_categorical=n_categorical,
    n_categorical_bins=n_categorical_bins,
    flip_y=flip_y,
    class_sep=class_sep,
)

## Utility of original/artificial dataset

In [None]:
# extract numerical and categorical columns
# (both name lists are passed to `create_pipeline` below)
num_columns, cat_columns = tabular_utils.extract_col_names_by_type(X)

In [None]:
# Associations computed on the original table; this matrix is later passed as
# `correlation_matrix` to `correlated_rank_similarity`.
orig_corr_matrix = tabular_metrics.compute_associations(X, cat_columns)

In [None]:
# Pipeline (random-forest classifier) reused by every evaluation below.
# Alternative classifier: inp_classifer=GradientBoostingClassifier()
custom_pipe = tabular_metrics.create_pipeline(
    num_columns,
    cat_columns,
    categories=categories,
    inp_classifer=RandomForestClassifier(),
)

In [None]:
# Split the data: 30% of the rows go to the test set, shuffled before splitting.
test_size = 0.3

(X_train, X_test, y_train, y_test) = train_test_split(
    X, y, test_size=test_size, shuffle=True
)

In [None]:
# quick look at the first rows of the training features
X_train.head()

In [None]:
# Evaluate the pipeline with the original train/test split; the call returns
# three scores followed by three feature vectors (built / perm / shap).
(
    f1_orig,
    auc_orig,
    roc_auc_orig,
    f_orig_built,
    f_orig_perm,
    f_orig_shap,
) = tabular_metrics.performance_classification(
    X_train, y_train,
    X_test, y_test,
    model_imp=custom_pipe,
    pipe_classifier_name="classifier",
)

print(
    f"F1:       {f1_orig:.3f}\n"
    f"AUC:      {auc_orig:.3f}\n"
    f"ROC-AUC:  {roc_auc_orig:.3f}\n"
    f"Features (built): {f_orig_built}\n"
    f"Features (perm) : {f_orig_perm}\n"
    f"Features (shap) : {f_orig_shap}"
)

In [None]:
# Sort each feature vector of the original data against the column names.
# Only the SHAP vector keeps the second return value of the helper.
(sorted_f_orig_built, _) = tabular_utils.sort_feature_vector(
    f_orig_built, X_train.columns.to_list()
)
(sorted_f_orig_perm, _) = tabular_utils.sort_feature_vector(
    f_orig_perm, X_train.columns.to_list()
)
(sorted_f_orig_shap, sorted_f_orig_shap_val) = tabular_utils.sort_feature_vector(
    f_orig_shap, X_train.columns.to_list()
)

## Shuffle columns, independently

In [None]:
def shuffle(df):
    """Return a deep copy of `df` in which every column is permuted independently.

    Each column gets its own random permutation (drawn from NumPy's global
    random state), so the values within a column are preserved while the
    row-wise pairing between columns is broken. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns should be shuffled.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and column names as `df`.
    """
    shuffled = copy.deepcopy(df)

    # columns are addressed by position, one permutation per column, in order
    for col_pos, _ in enumerate(shuffled.columns):
        column_values = shuffled.iloc[:, col_pos]
        shuffled.iloc[:, col_pos] = np.random.permutation(column_values)

    return shuffled

In [None]:
# Shuffle every column independently in both splits; the labels
# (y_train / y_test) are left in their original order.
X_train_shuffled = shuffle(X_train)
X_test_shuffled = shuffle(X_test)

In [None]:
# Same evaluation as for the original data, now on the column-shuffled
# features (labels unchanged).
(
    f1_shuffled,
    auc_shuffled,
    roc_auc_shuffled,
    f_shuffled_built,
    f_shuffled_perm,
    f_shuffled_shap,
) = tabular_metrics.performance_classification(
    X_train_shuffled, y_train,
    X_test_shuffled, y_test,
    model_imp=custom_pipe,
    pipe_classifier_name="classifier",
)

print(
    f"F1:       {f1_shuffled:.3f}\n"
    f"AUC:      {auc_shuffled:.3f}\n"
    f"ROC-AUC:  {roc_auc_shuffled:.3f}\n"
    f"Features (built): {f_shuffled_built}\n"
    f"Features (perm) : {f_shuffled_perm}\n"
    f"Features (shap) : {f_shuffled_shap}"
)

In [None]:
# Sort each feature vector of the shuffled data against the column names
# (shuffling does not change the column names, so X_train's are used).
(sorted_f_shuffled_built, _) = tabular_utils.sort_feature_vector(
    f_shuffled_built, X_train.columns.to_list()
)
(sorted_f_shuffled_perm, _) = tabular_utils.sort_feature_vector(
    f_shuffled_perm, X_train.columns.to_list()
)
(sorted_f_shuffled_shap, sorted_f_shuffled_shap_val) = tabular_utils.sort_feature_vector(
    f_shuffled_shap, X_train.columns.to_list()
)

In [None]:
# sorted SHAP features, original data
sorted_f_orig_shap

In [None]:
# sorted SHAP features, shuffled data
sorted_f_shuffled_shap

In [None]:
# Compare the SHAP feature vector of the shuffled data with the original one:
# cosine similarity and KL divergence on the raw vectors ...
cosine_sim_measure_shuffled = tabular_metrics.cosine_sim(
    f_shuffled_shap, f_orig_shap
)
kl_div_measure_shuffled = tabular_metrics.kl_div(
    f_shuffled_shap, f_orig_shap
)

# ... and RBO on the two sorted feature lists
rbo_shuffled = rbo_metric(sorted_f_orig_shap, sorted_f_shuffled_shap)

## Choose a synthesizer

In [None]:
# Synthesizer to run; the cells below branch on "pgm" and "pate-ctgan".
synthesizer_method = "pgm"

## Synthesize using PATE-CTGAN

In [None]:
if synthesizer_method == "pate-ctgan":
    # One entry per model to train; all lists below are index-aligned.
    list_eps = [0.4, 1, 10]
    list_nm = [4.2e-4, 1.05e-3, 9e-3]  # previously tried: [4.2e-5, 1.05e-4, 9e-4]
    list_mo = [1000, 100, 100]

    # log file path for each model
    list_save_log = [
        "./pate_00_40/patectgan_training.csv",
        "./pate_01_00/patectgan_training.csv",
        "./pate_10_00/patectgan_training.csv",
    ]

    # model file path for each model (loaded back with joblib further down)
    list_save_model = [
        "./pate_00_40/model.pkl",
        "./pate_01_00/model.pkl",
        "./pate_10_00/model.pkl",
    ]

    batch_size = 64
    device = "default"  # or "cpu" or "cuda:1"

    # the label column is listed as discrete together with the categorical columns
    discrete_columns = cat_columns + ["label"]

In [None]:
if synthesizer_method == "pate-ctgan":
    # Training table: a copy of the training features with the label column
    # added, then the categorical columns cast to int.
    Xy = X_train.assign(label=y_train)
    Xy[cat_columns] = Xy[cat_columns].astype("int")

In [None]:
if synthesizer_method == "pate-ctgan":
    from parhugin import multiFunc

    # Queue one training job per configured model; the jobs are started in the
    # next cell.
    myproc = multiFunc(num_req_p=4)

    for eps_i, nm_i, mo_i, log_path_i, model_path_i in zip(
        list_eps, list_nm, list_mo, list_save_log, list_save_model
    ):
        myproc.add_job(
            target_func=train_save_pate_models,
            target_args=(
                Xy,
                discrete_columns,
                eps_i,
                batch_size,
                nm_i,
                mo_i,
                log_path_i,
                device,
                model_path_i,
            ),
        )

In [None]:
if synthesizer_method == "pate-ctgan":
    # start the training jobs queued in the previous cell
    myproc.run_jobs()

In [None]:
if synthesizer_method == "pate-ctgan":
    # Load the saved models back, in the same order as `list_save_model`.
    # NOTE(review): the model files are unpickled with joblib; only load files
    # produced by this notebook.
    list_models = []
    # NOTE(review): `i` stays bound to the last index after this loop.
    for i in range(len(list_save_model)):
        list_models.append(jload(list_save_model[i]))

    # plot the results (training log of the first model only)
    tabular_utils.plot_log_patectgan(filename=list_save_log[0])

## Synthesize using PGM

In [None]:
if synthesizer_method == "pgm":
    # iterations passed to `pgm.train`
    num_iters = 5_000
    # epsilons to sweep, one PGM model per value
    # (shorter sweep for quick runs: [0.005, 1, 10])
    list_eps = [0.005, 0.01, 0.1, 0.4, 1, 4.0, 10]
    # passed as `target_delta` to every PGM model
    delta = 1e-5

## Synthesize the artificial data

In [None]:
# Sweep over the epsilons: obtain synthetic data for each one, evaluate the
# pipeline trained on it against the original test split, and compare its SHAP
# feature vector with the one obtained from the original data.
target_var = "label"
rbo_p = 0.6

# prepare data: training features + label column, categorical columns as int
Xy = X_train.copy()
Xy[target_var] = y_train
Xy[cat_columns] = Xy[cat_columns].astype("int")

# the column names do not depend on epsilon, so compute them once
feature_names = X_train.columns.to_list()

# --- loop over epsilons
list_roc_auc = []
list_f1 = []
list_cosine = []
list_rbo = []
list_rbo_corr = []
list_kl = []
list_syn_features = []
# `i` is the position of `eps` in `list_eps`; the PATE-CTGAN models in
# `list_models` are stored in the same order.
for i, eps in enumerate(list_eps):
    print(f"--- EPS: {eps}")

    if synthesizer_method == "pgm":
        # train a PGM model
        pgm = tabular_ppgm(target_variable=target_var,
                           target_epsilon=eps,
                           target_delta=delta)
        pgm.train(Xy, iters=num_iters)
        # generate synthetic output
        synth_pd = pgm.generate(num_rows=len(Xy))
    elif synthesizer_method == "pate-ctgan":
        # sample from the model that was trained for this epsilon
        synth_pd = list_models[i].sample(len(Xy))
    else:
        # fail loudly instead of silently reusing a stale `synth_pd`
        raise ValueError(
            f"Unknown synthesizer_method: {synthesizer_method!r} "
            "(expected 'pgm' or 'pate-ctgan')"
        )

    # utility of synthetic data: train on synthetic, test on the original test split
    Xsyn_train = synth_pd.drop(columns=[target_var]).astype("str")
    ysyn_train = synth_pd[target_var].to_list()

    f1_tmp, auc_tmp, roc_auc_tmp, f_syn_built, f_syn_perm, f_syn_shap = \
        tabular_metrics.performance_classification(Xsyn_train, ysyn_train,
                                                   X_test, y_test,
                                                   model_imp=custom_pipe,
                                                   pipe_classifier_name="classifier")

    # Sort the features
    sorted_f_syn_built, _ = \
        tabular_utils.sort_feature_vector(f_syn_built, feature_names)
    sorted_f_syn_perm, _ = \
        tabular_utils.sort_feature_vector(f_syn_perm, feature_names)
    sorted_f_syn_shap, sorted_f_syn_shap_val = \
        tabular_utils.sort_feature_vector(f_syn_shap, feature_names)

    # cosine similarity between original and synthetic dataset
    cosine_sim_measure = \
        tabular_metrics.cosine_sim(f_syn_shap, f_orig_shap)

    kl_div_measure = tabular_metrics.kl_div(f_syn_shap, f_orig_shap)

    # RBO
    rbo = rbo_metric(sorted_f_orig_shap, sorted_f_syn_shap)

    # collect results
    list_roc_auc.append(roc_auc_tmp)
    list_f1.append(f1_tmp)
    list_cosine.append(cosine_sim_measure)
    list_rbo.append(rbo.rbo(p=rbo_p))
    list_rbo_corr.append(
        rbo.correlated_rank_similarity(p=rbo_p,
                                       correlation_matrix=orig_corr_matrix))
    list_kl.append(kl_div_measure)
    list_syn_features.append(f_syn_shap)

## Plot the results

In [None]:
%matplotlib inline

var2plot = list_roc_auc
orig_var = roc_auc_orig
ylabel = "ROC-AUC"

plt.figure(figsize=(7, 5))

plt.plot(list_eps, var2plot, 
         lw=3, marker="o", c="k")
plt.axhline(orig_var, 
            ls="--", c="r",
            label="original")
plt.axhline(roc_auc_shuffled, 
            ls="--", c="blue",
            label="shuffled")

plt.xlabel("$\epsilon$", size=20)
plt.ylabel(ylabel, size=20)
plt.xscale("log")
plt.xticks(size=16)
plt.yticks(size=16)
plt.legend(loc='center left', 
           bbox_to_anchor=(1, 0.5), 
           fontsize=16)
plt.grid()
plt.show()

In [None]:
%matplotlib inline

var2plot = list_f1
orig_var = f1_orig
ylabel = "F1"

plt.figure(figsize=(7, 5))

plt.plot(list_eps, var2plot, 
         lw=3, marker="o", c="k")
plt.axhline(orig_var, 
            ls="--", c="r",
            label="original")
plt.axhline(f1_shuffled, 
            ls="--", c="blue",
            label="shuffled")

plt.xlabel("$\epsilon$", size=20)
plt.ylabel(ylabel, size=20)
plt.xscale("log")
plt.xticks(size=16)
plt.yticks(size=16)
plt.legend(loc='center left', 
           bbox_to_anchor=(1, 0.5), 
           fontsize=16)
plt.grid()
plt.show()

In [None]:
%matplotlib inline

ylabel = "RBO score"

plt.figure(figsize=(7, 5))

plt.plot(list_eps, list_rbo, 
         lw=3, marker="o", c="k",
         label="RBO")
plt.plot(list_eps, list_rbo_corr, 
         lw=2, marker="o", c="k", ls="--",
         label="RBOcorr")

plt.axhline(1, 
            ls="--", c="g",
            label="skyline")
plt.axhline(rbo_shuffled.rbo(p=0.6), 
            ls="--", c="blue",
            label="RBO, shuffled")
plt.axhline(rbo_shuffled.correlated_rank_similarity(
                p=0.6, correlation_matrix=orig_corr_matrix), 
            ls=":", c="blue",
            label="RBOcorr, shuffled")

plt.xlabel("$\epsilon$", size=20)
plt.ylabel(ylabel, size=20)
plt.xscale("log")
plt.xticks(size=16)
plt.yticks(size=16)
plt.legend(loc='center left', 
           bbox_to_anchor=(1, 0.5), 
           fontsize=16)
plt.grid()
plt.show()

In [None]:
# `sorted_f_syn_shap` holds the value from the last iteration of the epsilon loop
print(sorted_f_syn_shap)
print(sorted_f_orig_shap)

In [None]:
# SHAP feature vectors of the original and the shuffled data, side by side
f_orig_shap, f_shuffled_shap

In [None]:
from scipy import spatial

# cosine similarity computed directly with SciPy (1 - cosine distance) for the
# same pair of vectors that was passed to `tabular_metrics.cosine_sim` above
1. - spatial.distance.cosine(f_orig_shap, f_shuffled_shap)

In [None]:
# second output of `sort_feature_vector` for the original SHAP vector
sorted_f_orig_shap_val

In [None]:
# SHAP feature vectors per feature: original, shuffled, and one line per
# epsilon for the synthetic data.
plt.figure(figsize=(10, 5))

# original features
plt.plot(f_orig_shap, c="r", 
         lw=2, ls="--", marker="o",
         label="original")

# shuffled features
plt.plot(f_shuffled_shap, c="b", 
         lw=2, ls="--", marker="o",
         label="shuffled")

# synthetic features: one colour per epsilon, taken from the reversed viridis map
colors = pl.cm.viridis_r(np.linspace(0.0,1,len(list_syn_features)))

# raw f-string: "\e" is not a valid escape sequence in a regular string literal
for syn_features, syn_color, syn_eps in zip(list_syn_features, colors, list_eps):
    plt.plot(syn_features, c=syn_color, 
             lw=1., marker="o", label=rf"$\epsilon$:{syn_eps}")

plt.xlabel("Features", size=20)
plt.ylabel("Score", size=20)

list_features = X_train.columns.to_list()
plt.xticks(range(len(list_features)), list_features, 
           size=16)
plt.yticks(size=16)
plt.legend(loc='center left', 
           bbox_to_anchor=(1, 0.5), 
           fontsize=16)
plt.grid()
plt.show()

In [None]:
%matplotlib inline

var2plot = list_cosine
orig_var = 1
ylabel = "Cosine sim"

plt.figure(figsize=(7, 5))

plt.plot(list_eps, var2plot, 
         lw=3, marker="o", c="k")
plt.axhline(orig_var, 
            ls="--", c="r",
            label="original")
plt.axhline(cosine_sim_measure_shuffled, 
            ls="--", c="b",
            label="shuffled")

plt.xlabel("$\epsilon$", size=20)
plt.ylabel(ylabel, size=20)
plt.xscale("log")
plt.xticks(size=16)
plt.yticks(size=16)
plt.legend(loc='center left', 
           bbox_to_anchor=(1, 0.5), 
           fontsize=16)
plt.grid()
plt.show()

In [None]:
%matplotlib inline

var2plot = list_kl
orig_var = 0
ylabel = "KL-divergence"

plt.figure(figsize=(7, 5))

plt.plot(list_eps, var2plot, 
         lw=3, marker="o", c="k")
plt.axhline(orig_var, 
            ls="--", c="r",
            label="original")
plt.axhline(kl_div_measure_shuffled, 
            ls="--", c="b",
            label="shuffled")

plt.xlabel("$\epsilon$", size=20)
plt.ylabel(ylabel, size=20)
plt.xscale("log")
plt.xticks(size=16)
plt.yticks(size=16)
plt.legend(loc='center left', 
           bbox_to_anchor=(1, 0.5), 
           fontsize=16)
plt.grid()
plt.show()

## Why is the cosine similarity between `f_shuffled_shap` and `f_orig_shap` so high?

In [None]:
# Simulation: for several dimensions, draw random vectors, permute each one,
# and summarise the cosine similarity / L2 distance between a vector and its
# permutation.

# Range of values for elements in random vectors
min_val, max_val = 0, 1000
# Number of random vectors drawn per dimension
num_iter = 1_000_000

# Dimensions to test: 2, then 5, 10, ..., 100
list_dims = [2, *range(5, 105, 5)]

# --- per-dimension summaries
dims = []
cs_means, cs_stds = [], []  # cosine
l2_means, l2_stds = [], []  # L2

# j specifies the dimension of random vectors
for j in list_dims:
    print(j, end=" ")

    repetitions = range(num_iter)
    tmp_cs_dists, tmp_l2_dists, x_used = [], [], []

    for i in repetitions:
        v1 = np.random.uniform(min_val, max_val, j)
        # v2 is a permutation of v1
        v2 = np.random.permutation(v1)

        cs_sim = tabular_metrics.cosine_sim(v1, v2)
        l2_dist = tabular_metrics.L2_norm_dist(v1, v2)

        # keep the sample only when both metrics came back as floats
        if not (isinstance(cs_sim, float) and isinstance(l2_dist, float)):
            continue
        tmp_cs_dists.append(cs_sim)
        tmp_l2_dists.append(l2_dist)
        x_used.append(i)

    tmp_cs_dists = np.array(tmp_cs_dists)
    tmp_l2_dists = np.array(tmp_l2_dists)

    # values equal to +inf are left out of the summaries
    tmp_noninf_cs_dists = tmp_cs_dists[tmp_cs_dists != np.inf]
    tmp_noninf_l2_dists = tmp_l2_dists[tmp_l2_dists != np.inf]

    curr_cs_mean, curr_cs_std = np.mean(tmp_noninf_cs_dists), np.std(tmp_noninf_cs_dists)
    curr_l2_mean, curr_l2_std = np.mean(tmp_noninf_l2_dists), np.std(tmp_noninf_l2_dists)

    dims.append(j)
    cs_means.append(curr_cs_mean)
    cs_stds.append(curr_cs_std)
    l2_means.append(curr_l2_mean)
    l2_stds.append(curr_l2_std)

In [None]:
%matplotlib inline

plt.figure(figsize=(7, 5))
plt.errorbar(dims, cs_means, cs_stds, 
             c="k", linestyle='None', marker='o')

plt.xlabel("Dimension", size=20)
plt.ylabel("Cosine sim", size=20)

plt.xticks(size=16)
plt.yticks(size=16)

plt.ylim(0, 1)
plt.xlim(0, 100)
# plt.xscale("log")
plt.grid()
plt.show()

In [None]:
# Individual cosine similarities for the last dimension processed by the
# simulation loop (`x_used` / `tmp_cs_dists` are reset for every dimension).
plt.scatter(x_used, tmp_cs_dists, alpha=0.01, c='k')
plt.ylim(0, 1)