In [None]:
!pip install "matplotlib>=3.2,<=3.7.3"
!pip install "ydata-profiling>4.4,<4.5"
!pip install tensorflow sdv

In [None]:
import pandas as pd
import numpy as np
from tensorflow import keras

## Ideas

If adversary does not work, predict -> predict_proba to get more information

In [2]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from pia_functions import data_train_test, get_distributed_adult_sets, generate_shadow_model_outputs

### Generate training data for shadow models

In [None]:
n_shadow_models=200
distributions=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [5]:
X_train, X_test, y_train, y_test, sensitive, sensitive_t = data_train_test()

In [None]:
distributed_datasets = get_distributed_adult_sets(distributions=distributions)

## Generate synthetic data for model output

In [6]:
output_size = 10000
metadata = SingleTableMetadata()
to_fit = pd.DataFrame(np.concatenate((X_train, X_test)), columns=[str(i) for i in range(79)])
#to_fit = to_fit.astype({i: 'int' for i in range(4,79)})
#to_fit["index"] = to_fit.index
#to_fit = to_fit.astype({"index": 'string'})
metadata.detect_from_dataframe(data=to_fit)

In [7]:
metadata.to_dict()

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'0': {'sdtype': 'numerical'},
  '1': {'sdtype': 'numerical'},
  '2': {'sdtype': 'numerical'},
  '3': {'sdtype': 'numerical'},
  '4': {'sdtype': 'categorical'},
  '5': {'sdtype': 'categorical'},
  '6': {'sdtype': 'categorical'},
  '7': {'sdtype': 'categorical'},
  '8': {'sdtype': 'categorical'},
  '9': {'sdtype': 'categorical'},
  '10': {'sdtype': 'categorical'},
  '11': {'sdtype': 'categorical'},
  '12': {'sdtype': 'categorical'},
  '13': {'sdtype': 'categorical'},
  '14': {'sdtype': 'categorical'},
  '15': {'sdtype': 'categorical'},
  '16': {'sdtype': 'categorical'},
  '17': {'sdtype': 'categorical'},
  '18': {'sdtype': 'categorical'},
  '19': {'sdtype': 'categorical'},
  '20': {'sdtype': 'categorical'},
  '21': {'sdtype': 'categorical'},
  '22': {'sdtype': 'categorical'},
  '23': {'sdtype': 'categorical'},
  '24': {'sdtype': 'categorical'},
  '25': {'sdtype': 'categorical'},
  '26': {'sdtype': 'categorical'},
  '27': {'sdtype'

In [2]:
syn_model = CTGANSynthesizer(metadata)
syn_model.fit(to_fit)

NameError: name 'CTGANSynthesizer' is not defined

In [1]:
sampled = syn_model.sample(num_rows=output_size)

NameError: name 'syn_model' is not defined

### Train shadow models and generate output

In [None]:
all_shadow_outputs = []
for ds in distributed_datasets:
    outputs = generate_shadow_model_outputs(ds, X_test, n_shadow_models=n_shadow_models)
    all_shadow_outputs.append(outputs)

### Save shadow model outputs to file

In [None]:
adv_df = pd.DataFrame(np.array(np.concatenate((all_shadow_outputs))))
adv_df["y"] = np.concatenate(([np.repeat(d, n_shadow_models) for d in distributions]))
adv_df.to_csv("data/shadow_model_outputs.csv")

### Shadow model outputs as training data for adversary

In [None]:
adv_df = pd.read_csv("data/shadow_model_outputs.csv")
adv_ddf_shuffled = adv_df.sample(frac=1, random_state=1).reset_index(drop=True)
adv_y = adv_ddf_shuffled["y"]
adv_X = adv_ddf_shuffled.drop(columns=["y"])

In [75]:
adv_X.shape

(1800, 9770)

### Create adversary

In [73]:
adversary = keras.Sequential()
adversary.add(keras.Input(shape=(adv_X.shape[1],)))
adversary.add(keras.layers.Dense(8, activation='relu'))
adversary.add(keras.layers.Dense(4, activation='relu'))
adversary.add(keras.layers.Dense(1))

adversary.compile(optimizer='adam', loss=keras.losses.MeanSquaredError())

### Train adversary

In [74]:
adversary.fit(adv_X, adv_y, validation_split=0.2, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f294a273df0>

# Test adversary

In [76]:
adv_df_test = pd.read_csv("data/shadow_model_outputs_test_set.csv")
adv_ddf_shuffled_test = adv_df_test.sample(frac=1, random_state=1).reset_index(drop=True)
adv_y_test = adv_ddf_shuffled_test["y"]
adv_X_test = adv_ddf_shuffled_test.drop(columns=["y"])

In [80]:
adversary.evaluate(adv_X_test, adv_y_test)#, adv_y_test)



0.11049496382474899

In [79]:
adv_y_test

0      0.9
1      0.8
2      0.4
3      0.2
4      0.1
      ... 
445    0.6
446    0.2
447    0.8
448    0.5
449    0.1
Name: y, Length: 450, dtype: float64

### Next steps
* Use adversary during training

### (Data set analysis)

In [None]:
import ydata_profiling

ydata_profiling.ProfileReport(X_test)

In [None]:
# RANDOM DS
random_dict = {}
for attr in ds.X_train:
    print(f'{round(X_train_pr[attr].min(),2)} {round(X_train_pr[attr].max(),2)}')
    random_attr = np.random.rand(random_length)*(X_train_pr[attr].max()+abs(X_train_pr[attr].min()))-abs(X_train_pr[attr].min())
    random_dict[attr] = random_attr

In [None]:
random_ds = pd.DataFrame(random_dict)