# Artificial and synthetic datasets

In this notebook, instead of synthesizing a *real* dataset, we first create an *artificial* dataset, synthesize it, and then analyze the outputs. This way, we can engineer the required properties into the artificial dataset.

In [None]:
# solve issue with autocomplete
%config Completer.use_jedi = False

%load_ext autoreload
%autoreload 2
%matplotlib inline

from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [None]:
import copy
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from privgem import tabular_ppgm
from privgem import tabular_metrics
from privgem import tabular_utils
from privgem import tabular_artificial

# Fix NumPy's global random state so that a fresh "Restart & Run All"
# reproduces the same numbers.
np.random.seed(seed=1364)

## Create an artificial dataset

In [None]:
# --- Size of the table
n_samples = 5000

# --- Classes: two classes with equal weights, one cluster each
n_classes = 2
class_weights = [0.5, 0.5]
n_clusters_per_class = 1

# --- Features: informative count equals the feature count (no redundant/repeated ones)
n_features = 5
n_informative = 5
n_redundant = 0
n_repeated = 0

# --- "Noise" controls
# NOTE(review): names mirror scikit-learn's make_classification; presumably they are
# forwarded to it by tabular_artificial.make_table -- confirm there.
flip_y = 0.2
class_sep = 1.0

# --- Categorical columns: how many there are, and the number of bins for each
n_categorical = 5
n_categorical_bins = [10, 5, 5, 5, 5]

In [None]:
# Collect every generator setting in one mapping, then build the table in a single call.
table_kwargs = dict(
    n_samples=n_samples,
    n_classes=n_classes,
    class_weights=class_weights,
    n_clusters_per_class=n_clusters_per_class,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_repeated=n_repeated,
    n_categorical=n_categorical,
    n_categorical_bins=n_categorical_bins,
    flip_y=flip_y,
    class_sep=class_sep,
)

# make_table hands back the feature table, the labels and the category description
# that is passed on to the pipeline later in the notebook.
X, y, categories = tabular_artificial.make_table(**table_kwargs)

## Utility of original/artificial dataset

In [None]:
# Split the column names of X into two groups: numerical ones and categorical ones.
(num_columns, cat_columns) = tabular_utils.extract_col_names_by_type(X)

In [None]:
# Downstream model: a random forest with default hyper-parameters.
rf_classifier = RandomForestClassifier()

# Build the pipeline from the column groups and the known categories.
# NOTE(review): the keyword is spelled `inp_classifer` in this call; check the privgem
# signature before "correcting" the spelling.
custom_pipe = tabular_metrics.create_pipeline(
    num_columns,
    cat_columns,
    categories=categories,
    inp_classifer=rf_classifier,
)

In [None]:
# Hold out 30% of the rows for testing; rows are shuffled before splitting.
test_size = 0.3

split_parts = train_test_split(X, y, test_size=test_size, shuffle=True)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Baseline: fit the pipeline on the original training rows, score it on the held-out rows.
orig_scores = tabular_metrics.performance_classification(
    X_train,
    y_train,
    X_test,
    y_test,
    model_imp=custom_pipe,
    pipe_classifier_name="classifier",
)
f1_orig, auc_orig, roc_auc_orig, features_orig = orig_scores

# One metric per line, scores rounded to three decimals.
print(
    f"F1:       {f1_orig:.3f}\n"
    f"AUC:      {auc_orig:.3f}\n"
    f"ROC-AUC:  {roc_auc_orig:.3f}\n"
    f"Features: {features_orig}"
)

## Synthesize the artificial data

In [None]:
# --- Settings of the sweep
num_iters = 1000
list_eps = [0.005, 0.01, 0.1, 1, 5]
delta = 1e-5
target_var = "label"

# prepare data: features and target together in one integer-typed table
# (kept for any later cell that refers to the full table `Xy`)
Xy = X.copy()
Xy[target_var] = y
Xy = Xy.astype("int")

# The generator is fitted on the TRAINING rows only. Fitting it on `Xy` would let it
# see the rows in X_test, so the classifiers trained on synthetic data would be scored
# on rows the generator had access to, while the baseline above was fitted on X_train
# alone -- the two numbers would not be comparable.
Xy_train = X_train.copy()
Xy_train[target_var] = y_train
Xy_train = Xy_train.astype("int")

# --- loop over epsilons
# One entry per epsilon is appended to each list, in the order of `list_eps`.
list_roc_auc = []
list_f1 = []
list_cosine = []
for eps in list_eps:
    print(f"--- EPS: {eps}")
    # train a PGM model for this (epsilon, delta) target
    pgm = tabular_ppgm(target_variable=target_var,
                       target_epsilon=eps,
                       target_delta=delta)
    pgm.train(Xy_train, iters=num_iters)
    # generate synthetic output, as many rows as the real training table
    synth_pd = pgm.generate(num_rows=len(Xy_train))

    # utility of synthetic data: train on synthetic rows, test on the real held-out rows
    Xpgm_train = synth_pd.drop(columns=[target_var]).astype("str")
    ypgm_train = synth_pd[target_var].to_list()

    f1_tmp, auc_tmp, roc_auc_tmp, features_tmp = \
        tabular_metrics.performance_classification(Xpgm_train, ypgm_train,
                                                   X_test, y_test,
                                                   model_imp=custom_pipe,
                                                   pipe_classifier_name="classifier")

    # cosine similarity between the features reported for the original run
    # and the features reported for this synthetic run
    cosine_sim_measure = \
        tabular_metrics.cosine_sim(features_orig, features_tmp)

    # collect results
    list_roc_auc.append(roc_auc_tmp)
    list_f1.append(f1_tmp)
    list_cosine.append(cosine_sim_measure)
    

In [None]:
%matplotlib inline

var2plot = list_roc_auc
orig_var = roc_auc_orig
ylabel = "ROC-AUC"

plt.figure(figsize=(7, 5))

plt.plot(list_eps, var2plot, 
         lw=3, marker="o", c="k")
plt.axhline(orig_var, ls="--", c="r")

plt.xlabel("Epsilon", size=20)
plt.ylabel(ylabel, size=20)
plt.xticks(size=16)
plt.yticks(size=16)
plt.grid()
plt.show()

In [None]:
%matplotlib inline

var2plot = list_f1
orig_var = f1_orig
ylabel = "F1"

plt.figure(figsize=(7, 5))

plt.plot(list_eps, var2plot, 
         lw=3, marker="o", c="k")
plt.axhline(orig_var, ls="--", c="r")

plt.xlabel("Epsilon", size=20)
plt.ylabel(ylabel, size=20)
plt.xticks(size=16)
plt.yticks(size=16)
plt.grid()
plt.show()

In [None]:
%matplotlib inline

var2plot = list_cosine
orig_var = 1
ylabel = "Cosine sim"

plt.figure(figsize=(7, 5))

plt.plot(list_eps, var2plot, 
         lw=3, marker="o", c="k")
plt.axhline(orig_var, ls="--", c="r")

plt.xlabel("Epsilon", size=20)
plt.ylabel(ylabel, size=20)
plt.xticks(size=16)
plt.yticks(size=16)
plt.grid()
plt.show()