# PePR: Direct Generalized Membership Inference Attack
Attack a single target model with a single attack configuration.

## Prep Google Colab Environment
**Important: Restart the Runtime after this Cell!**
The restart is needed because of `pip install -e`.

In [1]:
# Fetch the PePR sources and install them in editable mode (this is why the
# runtime must be restarted afterwards).
# NOTE(review): `git clone` refuses to clone into an existing `ml-pepr/`
# directory, so this cell cannot simply be re-run in the same session.
!git clone https://github.com/hallojs/ml-pepr.git
%pip install -e ml-pepr
# NOTE(review): presumably required for generating the attack report — confirm.
# The version is not pinned.
%pip install pylatex

Cloning into 'ml-pepr'...
remote: Enumerating objects: 206, done.[K
remote: Counting objects: 100% (206/206), done.[K
remote: Compressing objects: 100% (136/136), done.[K
remote: Total 206 (delta 102), reused 171 (delta 68), pack-reused 0[K
Receiving objects: 100% (206/206), 268.27 KiB | 14.12 MiB/s, done.
Resolving deltas: 100% (102/102), done.
Obtaining file:///content/ml-pepr
Installing collected packages: pepr
  Running setup.py develop for pepr
Successfully installed pepr
Collecting pylatex
[?25l  Downloading https://files.pythonhosted.org/packages/8a/76/015a1d785221d9b0d2ad80759d892a6d9d0a8a05daffc52202311ea3d652/PyLaTeX-1.4.1.tar.gz (84kB)
[K     |████████████████████████████████| 92kB 6.0MB/s 
[?25hCollecting ordered-set
  Downloading https://files.pythonhosted.org/packages/f5/ab/8252360bfe965bba31ec05112b3067bd129ce4800d89e0b85613bc6044f6/ordered-set-4.0.2.tar.gz
Building wheels for collected packages: pylatex, ordered-set
  Building wheel for pylatex (setup.py) ... [?

## Imports

In [1]:
from pepr.privacy import gmia
from pepr.utilities import assign_record_ids_to_target_models

import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense

import numpy as np
import logging

  import pandas.util.testing as tm


## Setup Logging

In [2]:
# Shared line format for every handler configured in this cell.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%Y-%m-%d %H:%M:%S')


def _attach_handler(logger, handler, level, name):
    """Configure ``handler`` and attach it to ``logger`` exactly once.

    Parameters
    ----------
    logger : logging.Logger
        Logger that receives the handler.
    handler : logging.Handler
        Freshly created handler to configure and attach.
    level : int
        Minimum level the handler emits.
    name : str
        Tag identifying the handler. A handler that an earlier run of this
        cell attached under the same tag is removed and closed first, so
        re-running the cell neither duplicates log lines nor leaks open
        log files.
    """
    for stale in list(logger.handlers):
        if stale.get_name() == name:
            logger.removeHandler(stale)
            stale.close()
    handler.set_name(name)
    handler.setLevel(level)
    handler.setFormatter(formatter)
    logger.addHandler(handler)


# TensorFlow Logger: only errors, written to tf.log
logger_tf = tf.get_logger()
logger_tf.setLevel(logging.ERROR)
file_handler_tf = logging.FileHandler('tf.log')
_attach_handler(logger_tf, file_handler_tf, logging.ERROR, 'notebook_tf_file')

# PePR Logger: everything from DEBUG upwards, to a file and to the cell output
logger_pr = logging.getLogger('pepr.privacy.gmia')

file_handler_pr = logging.FileHandler('pepr.privacy.gmia.log')
_attach_handler(logger_pr, file_handler_pr, logging.DEBUG, 'notebook_pepr_file')

stream_handler_pr = logging.StreamHandler()
_attach_handler(logger_pr, stream_handler_pr, logging.DEBUG, 'notebook_pepr_stream')

## A Few Function Definitions

In [3]:
def create_model(input_shape, n_categories):
  """Build the CNN architecture shared by the target and reference models.

  Parameters
  ----------
  input_shape : tuple
      Input dimensions, handed to the first convolution layer.
  n_categories : int
      Number of categories for the prediction, i.e. the number of units in
      the final softmax layer.

  Returns
  -------
  tensorflow.keras.models.Sequential
      The assembled convolutional neural network (not compiled).
  """
  # Settings shared by both convolution layers and both pooling layers.
  conv_args = dict(kernel_size=(5, 5), strides=(1, 1), padding='same')
  pool_args = dict(pool_size=(2, 2), strides=(2, 2), padding='same')

  # Activations stay separate layers on purpose: the position of every layer
  # in the stack is part of the model's structure.
  layer_stack = [
      # first convolution layer + max pooling
      Conv2D(filters=32, input_shape=input_shape, **conv_args),
      Activation('relu'),
      MaxPooling2D(**pool_args),
      # second convolution layer + max pooling
      Conv2D(filters=64, **conv_args),
      Activation('relu'),
      MaxPooling2D(**pool_args),
      # fully connected layer with drop out
      Flatten(),
      Dense(1024),
      Activation('relu'),
      Dropout(rate=0.5),
      # classification layer
      Dense(n_categories),
      Activation('softmax'),
  ]

  model = Sequential()
  for layer in layer_stack:
    model.add(layer)

  return model

def create_compile_model():
  """Create and compile the CNN for 28x28x1 inputs and 10 classes.

  The model comes from ``create_model`` and is compiled with the Adam
  optimizer (learning rate 0.0001), categorical cross-entropy loss and the
  accuracy metric.

  Returns
  -------
  tensorflow.keras.models.Sequential
      The compiled model, ready for ``fit``.
  """
  input_shape = (28, 28, 1)
  number_classes = 10

  model = create_model(input_shape, number_classes)

  # Use `learning_rate`: the legacy alias `lr` is deprecated and is rejected
  # by newer Keras releases.
  optimizer = optimizers.Adam(learning_rate=0.0001)
  loss = 'categorical_crossentropy'
  metrics = ["accuracy"]
  model.compile(optimizer, loss=loss, metrics=metrics)

  return model

def load_fashion_mnist():
    """Load Fashion-MNIST and return it as one preprocessed dataset.

    The official training and test splits are stacked (training records
    first). Pixel values are scaled to the range [0, 1] and every image is
    reshaped to (28, 28, 1).

    Returns
    -------
    tuple
        (data, labels): ``data`` is a float32 array of images, ``labels`` is
        an int32 array of class ids in the same record order.
    """
    (train_images, train_targets), (test_images, test_targets) = \
        tf.keras.datasets.fashion_mnist.load_data()

    # Stack the two splits first, training records in front of test records.
    images = np.concatenate([train_images, test_images], axis=0)
    targets = np.concatenate([train_targets, test_targets], axis=0)

    # Normalize to [0, 1] and add the trailing channel axis.
    images = images.astype(np.float32) / 255
    images = images.reshape(images.shape[0], 28, 28, 1)

    return images, targets.astype(np.int32)

## Data Setup

In [4]:
# Load every Fashion-MNIST record (training and test split stacked) with its label.
data, labels = load_fashion_mnist()
# NOTE(review): the meaning of the three arguments is defined by pepr.utilities —
# confirm against its documentation. The result is not referenced anywhere in
# the visible remainder of this notebook.
records_per_target_model = assign_record_ids_to_target_models(20000, 100, 10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


## Train a Target Model

In [5]:
# Records 40000..49999 are the training set of the target model.
target_train_range = slice(40000, 50000)
target_train_labels = tf.keras.utils.to_categorical(labels[target_train_range], num_classes=10)

# Train silently for 50 epochs, then store the trained model on disk.
target_model = create_compile_model()
target_model.fit(
    data[target_train_range],
    target_train_labels,
    epochs=50,
    batch_size=50,
    verbose=0,
)
target_model.save('data/target_model')

## Run the Attack

In [6]:
# Attack hyper-parameters handed to gmia.DirectGmia.
# NOTE(review): the exact meaning of each key is defined by pepr.privacy.gmia —
# confirm against the library documentation.
attack_pars = {
    'number_classes': 10,
    'number_reference_models': 100,
    'reference_training_set_size': 10000,
    # Same model factory that was used to build the target model.
    'create_compile_model': create_compile_model,
    # Same epochs and batch size as the target model's fit() call.
    'reference_epochs': 50,
    'reference_batch_size': 50,
    'hlf_metric': 'cosine',
    # NOTE(review): presumably the index of the model layer whose output is used
    # as high-level features — confirm; it depends on the layer order in create_model.
    'hlf_layer_number': 10,
    'neighbor_threshold': 0.125,
    'probability_threshold': 0.1
}

# single target
data_conf = {
    # Records 0..39999: disjoint from the records the target model was trained on.
    'reference_indices': list(range(40000)),
    # Records 40000..49999: the slice the target model was trained on.
    'target_indices': list(range(40000, 50000)),
    # Records 40000..59999: the target's training records plus 10000 records it never saw.
    'evaluation_indices': list(range(40000, 60000)),
    # NOTE(review): presumably positions within the evaluation set that belong to
    # the (single) target model's training data — confirm.
    'record_indices_per_target': np.array([np.arange(10000)])
}

# One target model, hence a one-element list.
gmia_attack = gmia.DirectGmia('Playground GMIA', attack_pars, data, labels, data_conf, [target_model])

# Long-running: trains the reference models (one progress log line each).
gmia_attack.run(save_path='data')

# NOTE(review): the final cell zips a `gmia_report` directory, presumably written
# by this call — confirm.
gmia_attack.create_attack_report()

2020-12-14 09:56:58 - pepr.privacy.gmia - INFO - Create mapping of records to reference models.
2020-12-14 09:56:58 - pepr.privacy.gmia - INFO - Save mapping of records to reference models: data/records_per_reference_model.npy.
2020-12-14 09:56:58 - pepr.privacy.gmia - DEBUG - records_per_reference_model shape: (100, 10000)
2020-12-14 09:56:58 - pepr.privacy.gmia - INFO - Progress: Train reference model 1/100.
2020-12-14 09:57:25 - pepr.privacy.gmia - INFO - Progress: Train reference model 2/100.
2020-12-14 09:57:53 - pepr.privacy.gmia - INFO - Progress: Train reference model 3/100.
2020-12-14 09:58:21 - pepr.privacy.gmia - INFO - Progress: Train reference model 4/100.
2020-12-14 09:58:49 - pepr.privacy.gmia - INFO - Progress: Train reference model 5/100.
2020-12-14 09:59:17 - pepr.privacy.gmia - INFO - Progress: Train reference model 6/100.
2020-12-14 09:59:45 - pepr.privacy.gmia - INFO - Progress: Train reference model 7/100.
2020-12-14 10:00:14 - pepr.privacy.gmia - INFO - Progress:

Attack accuracy per target model:  [[1.0], [0.75], [0.7777777777777778], [0.8333333333333334], [0.8333333333333334], [0.8333333333333334], [0.8461538461538461], [0.8461538461538461], [0.8461538461538461], [0.8461538461538461], [0.8], [0.75], [0.75], [0.75], [0.75], [0.75], [0.75], [0.75], [0.7058823529411765], [0.7058823529411765], [0.7058823529411765], [0.7058823529411765], [0.7058823529411765], [0.7058823529411765], [0.7058823529411765], [0.6666666666666666], [0.6666666666666666], [0.6666666666666666], [0.6666666666666666], [0.6666666666666666], [0.6666666666666666], [0.6666666666666666], [0.6666666666666666], [0.631578947368421], [0.631578947368421], [0.631578947368421], [0.631578947368421], [0.6], [0.6], [0.6], [0.6], [0.6], [0.6], [0.6], [0.6], [0.6], [0.6], [0.6], [0.6], [0.5714285714285714], [0.5714285714285714], [0.5714285714285714], [0.5714285714285714], [0.5454545454545454], [0.5454545454545454], [0.5454545454545454], [0.5454545454545454], [0.5454545454545454], [0.52173913043

In [None]:
# Zip the report directory so that it can be downloaded from Google Colab.
!zip -r gmia_report.zip gmia_report