In [1]:
# imports and code needed for solutions
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F

from layers import MultiCategoryGumbelSoftmax
from private_db import query_aggregate, query_restricted
from sklearn.preprocessing import OneHotEncoder

# Seed every random number generator the notebook draws from so that
# Restart Kernel -> Run All reproduces the same outputs:
# NumPy supplies the query noise, PyTorch supplies the GAN sampling in train().
np.random.seed(1)
torch.manual_seed(1)

def show_heatmap(df):
    """Plot a weather-by-status heatmap of row counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``weather`` column and a ``status`` column.
    """
    crosstab = pd.crosstab(df.weather, df.status)
    # reindex (instead of .loc) fixes the display order AND tolerates levels
    # that never occur in df - e.g. samples from a generator that has not yet
    # learned to emit 'canceled' - by showing them as zeros rather than
    # raising KeyError.
    releveled = crosstab.reindex(
        index=['sunny', 'cloudy', 'rainy'],
        columns=['on time', 'delayed', 'canceled'],
        fill_value=0,
    )
    sns.heatmap(releveled, cmap="YlGnBu")
    plt.show()

# (weather, status, number of rows); rows are emitted in exactly this order
FLIGHT_COUNTS = [
    ('sunny', 'on time', 8000), ('sunny', 'delayed', 2000),
    ('cloudy', 'on time', 3000), ('cloudy', 'delayed', 5000), ('cloudy', 'canceled', 2000),
    ('rainy', 'on time', 1000), ('rainy', 'delayed', 3000), ('rainy', 'canceled', 6000),
]

df = pd.DataFrame(
    [(weather, status)
     for weather, status, n_rows in FLIGHT_COUNTS
     for _ in range(n_rows)],
    columns=['weather', 'status'],
)

# one-hot encode both categorical columns and hand the result to torch as float32
enc = OneHotEncoder().fit(df)
one_hot = enc.transform(df).toarray()
data = torch.tensor(one_hot, dtype=torch.float32)

# the multi category Gumbel softmax needs to know the dimensions of each output variable
output_dims = list(map(len, enc.categories_))

noise_dim = 8  # number of dimensions for noise input to generator
data_dim = data.size(1)  # number of dimensions of the data inputs
hidden_dim = 16  # number of dimensions for the hidden layers

In [6]:

def show_heatmap(df, save_path=None):
    """Plot a weather-by-status heatmap of row counts, optionally saving it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``weather`` column and a ``status`` column.
    save_path : str or None
        When given, the figure is also written to this path. Saving is opt-in
        so that later callers of ``show_heatmap(df)`` only display the plot
        instead of overwriting an image file on every call.
    """
    crosstab = pd.crosstab(df.weather, df.status)
    # reindex fixes the display order and shows levels missing from df as zeros
    # instead of raising KeyError
    releveled = crosstab.reindex(
        index=['sunny', 'cloudy', 'rainy'],
        columns=['on time', 'delayed', 'canceled'],
        fill_value=0,
    )
    # a fresh figure per call: repeated calls inside one cell would otherwise
    # keep drawing (and adding colorbars) onto the same current figure
    fig, ax = plt.subplots()
    sns.heatmap(releveled, cmap="YlGnBu", ax=ax)
    if save_path is not None:
        fig.savefig(save_path)
    plt.show()
    plt.close(fig)

show_heatmap(df, save_path='map.png')


In [5]:
# Peek at 10 randomly chosen rows; reset_index(drop=True) renumbers them 0-9
# and discards their original row labels.
df.sample(10).reset_index(drop=True)

Unnamed: 0,weather,status
0,sunny,delayed
1,cloudy,delayed
2,rainy,on time
3,sunny,on time
4,cloudy,delayed
5,sunny,on time
6,rainy,on time
7,rainy,canceled
8,cloudy,canceled
9,rainy,delayed


# Attack aggregation (query_aggregate)

We can find an individual's income by restricting the `WHERE` clause so that the average is taken over a single person:

In [None]:
# The WHERE clause is meant to match a single person, so the "average"
# returned is that person's income.
query_aggregate('SELECT AVG(income) FROM people WHERE age=99 and zip=60637')

# Attack aggregation (query_restricted)

We can use `SUM` or `AVG` to learn an individual's income. `SUM` is easier, but it is helpful to see how `AVG` works since frequently only averages are available. Note that it doesn't really matter exactly what restriction we use after `OR`, as long as it includes at least 50 people.

In [None]:
# using SUM
sum_large = query_restricted('SELECT SUM(income) FROM people WHERE (age=98 AND zip=60616) OR zip=60609')
sum_small = query_restricted('SELECT SUM(income) FROM people WHERE zip=60609')
sum_large - sum_small

In [None]:
# using AVG
count = query_restricted('SELECT COUNT(income) FROM people WHERE zip=60609') 
avg_large = query_restricted('SELECT AVG(income) FROM people WHERE (age=98 AND zip=60616) OR zip=60609')
avg_small = query_restricted('SELECT AVG(income) FROM people WHERE zip=60609')
(count+1)*avg_large - count*avg_small

In [None]:
# confirming the successful attack
query_aggregate('SELECT AVG(income) FROM people WHERE age=98 AND zip=60616')

# Gaussian mechanism

Note that in order to stop the `SUM` attack we need to add far more noise than to stop the `AVG` attack. It may make more sense to simply prohibit `SUM` queries.

In [None]:
def query_private(query_str, noise_scale=1000):
    """Answer an aggregate query with zero-mean Gaussian noise added.

    Parameters
    ----------
    query_str : str
        Query string, passed unchanged to ``query_aggregate``.
    noise_scale : float
        Standard deviation of the Gaussian noise. Defaults to 1000, the value
        that used to be hard-coded; pass a larger value to experiment with how
        much noise is needed to blunt a given attack.

    Returns
    -------
    The value returned by ``query_aggregate`` plus one noise draw from
    NumPy's global random number generator.
    """
    result = query_aggregate(query_str)
    noise = np.random.normal(0, noise_scale)
    return result + noise

First let's see that `AVG` queries are still relatively accurate:

In [None]:
# Noisy answer for the average income at age 30
query_private('SELECT AVG(income) FROM people WHERE age=30')

In [None]:
# Same query through query_restricted (no noise added here), for comparison with the noisy answer
query_restricted('SELECT AVG(income) FROM people WHERE age=30')

In [None]:
# Noisy answer for the average income at age 50
query_private('SELECT AVG(income) FROM people WHERE age=50')

In [None]:
# Same query through query_restricted (no noise added here), for comparison with the noisy answer
query_restricted('SELECT AVG(income) FROM people WHERE age=50')

Now let's try the `AVG` attack (assuming zip code counts are known exactly):

In [None]:
# The count is read exactly (query_aggregate), while both averages carry
# independent noise from query_private. Multiplying each noisy average by
# roughly `count` also multiplies its noise, so the difference is dominated
# by noise rather than by the target's income.
# NOTE(review): COUNT(*) also counts rows whose income is NULL, whereas
# AVG(income) skips them; the earlier AVG attack uses COUNT(income) -
# confirm income has no NULLs or align the two.
count = query_aggregate('SELECT COUNT(*) FROM people WHERE zip=60609') 
avg_large = query_private('SELECT AVG(income) FROM people WHERE (age=98 AND zip=60616) OR zip=60609')
avg_small = query_private('SELECT AVG(income) FROM people WHERE zip=60609')
(count+1)*avg_large - (count)*avg_small

Again note that the `SUM` attack is still largely successful:

In [None]:
# using SUM
sum_large = query_private('SELECT SUM(income) FROM people WHERE (age=98 AND zip=60616) OR zip=60609')
sum_small = query_private('SELECT SUM(income) FROM people WHERE zip=60609')
sum_large - sum_small

A more sophisticated approach would tune the noise up or down based on how many entries the `AVG` was operating over:

In [None]:
from private_db import query

def query_private_adaptive(query_str):
    """Answer a query with Gaussian noise whose scale shrinks with the row count.

    ``query`` is expected to return a ``(result, count)`` pair; the noise
    standard deviation is ``1e6 / count``, so aggregates over few rows are
    perturbed much more strongly than aggregates over many rows.
    """
    exact_value, n_rows = query(query_str)
    return exact_value + np.random.normal(loc=0, scale=1e6 / n_rows)

# Wasserstein GAN

We remove the `Sigmoid` layer from the discriminator, so that it outputs an unbounded score rather than a probability, and update the training losses accordingly.

In [None]:
# Generator: noise vector -> hidden layer -> multi-category Gumbel softmax output
generator_layers = [
    torch.nn.Linear(in_features=noise_dim, out_features=hidden_dim),
    torch.nn.ReLU(),
    MultiCategoryGumbelSoftmax(hidden_dim, output_dims),
]
generator = torch.nn.Sequential(*generator_layers)

# Discriminator: one-hot data row -> hidden layer -> a single raw score
discriminator_layers = [
    torch.nn.Linear(in_features=data_dim, out_features=hidden_dim),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(in_features=hidden_dim, out_features=1),
    ### CHANGED: removed Sigmoid ###
]
discriminator = torch.nn.Sequential(*discriminator_layers)

def train(data, generator, discriminator,
          epochs=20, n_discriminator=5, batch_size=128,
          learning_rate=1e-3):
    """Train the GAN with Wasserstein-style losses.

    Besides its arguments, the function reads the notebook globals
    ``noise_dim`` (noise size), ``enc`` and ``df`` (to decode samples back to
    a labelled data frame) and ``show_heatmap`` (per-epoch preview).

    Parameters
    ----------
    data : torch.Tensor
        Data for training
    generator : torch.nn.Sequential
        Generator network
    discriminator : torch.nn.Sequential
        Discriminator network
    epochs : int
        Number of iterations over the full data set for training
    n_discriminator : int
        Number of discriminator training iterations per generator update
    batch_size : int
        Number of training examples per inner iteration
    learning_rate : float
        Learning rate for training
    """
    # these solvers are optimizers for learning the parameters for our networks
    # we have one for each network (the generator and the discriminator)
    # RMSprop is one of many optimizers, and the choice is not terribly important here
    # The learning rate influences how large of a step the optimizer takes when it updates the parameters
    generator_solver = torch.optim.RMSprop(
        generator.parameters(), lr=learning_rate
    )

    discriminator_solver = torch.optim.RMSprop(
        discriminator.parameters(), lr=learning_rate
    )

    # Start from clean gradients: anything left on the networks by an earlier
    # (e.g. interrupted) run would otherwise be added to the first update.
    generator.zero_grad()
    discriminator.zero_grad()

    # There is a batch for each discriminator training iteration,
    # so each epoch is epoch_length iterations, and the total number of
    # iterations is the number of epochs times the length of each epoch.
    epoch_length = len(data) / (n_discriminator * batch_size)
    n_iters = int(epochs * epoch_length)

    # training loop
    for iteration in range(n_iters):
        for _ in range(n_discriminator):
            # Sample real data
            rand_perm = torch.randperm(data.size(0))
            real_sample = data[rand_perm[:batch_size]]

            # Sample fake data; detached because this step only updates the
            # discriminator, so backpropagating into the generator is wasted work
            noise = torch.randn(batch_size, noise_dim)
            fake_sample = generator(noise).detach()

            # Have the discriminator score the data
            discriminator_real = discriminator(real_sample)
            discriminator_fake = discriminator(fake_sample)

            # Calculate discriminator loss
            # Discriminator wants to assign a high score to real data
            # and a low score to fake data
            ### CHANGED: new loss function ###
            discriminator_loss = -(
                torch.mean(discriminator_real) -
                torch.mean(discriminator_fake)
            )

            discriminator_loss.backward() # backpropagate the loss through the discriminator network
            discriminator_solver.step() # update the discriminator network parameters using the optimizer

            # Reset the gradients
            discriminator.zero_grad()

        # Sample and score fake data
        noise = torch.randn(batch_size, noise_dim)
        fake_sample = generator(noise)
        discriminator_fake = discriminator(fake_sample)

        # Calculate generator loss
        # Generator wants discriminator to assign a high score to fake data
        ### CHANGED: new loss function ###
        generator_loss = -torch.mean(discriminator_fake)

        generator_loss.backward() # backpropagate the loss through the generator network
        generator_solver.step() # update the generator network parameters using the optimizer

        # Reset the gradients (the backward pass above also filled the
        # discriminator's gradients, which must not leak into its next update)
        generator.zero_grad()
        discriminator.zero_grad()

        # Show training losses and sample crosstabs after each epoch
        # (epoch_length is fractional, hence the int() around the remainder)
        if int(iteration % epoch_length) == 0:
            epoch = int(iteration / epoch_length)
            print('Epoch {}\n'
                  'Discriminator loss: {}; '
                  'Generator loss: {}'.format(epoch,
                                              discriminator_loss.data.numpy(),
                                              generator_loss.data.numpy()))
            # the preview is display-only, so no autograd graph is needed
            with torch.no_grad():
                noise = torch.randn(len(data), noise_dim) # noise for fake sample
                fake_sample = generator(noise)
            # convert back from one-hot encoding
            fake_df = pd.DataFrame(enc.inverse_transform(fake_sample.detach()))
            fake_df.columns = df.columns
            show_heatmap(fake_df)

In [None]:
# Train with the default hyperparameters
# (epochs=20, n_discriminator=5, batch_size=128, learning_rate=1e-3)
train(data, generator, discriminator)

# Differentially-private WGAN

In [None]:
# Fresh, untrained networks for the differentially-private run
# (same architecture as the plain Wasserstein GAN)
generator = torch.nn.Sequential(*(
    torch.nn.Linear(in_features=noise_dim, out_features=hidden_dim),
    torch.nn.ReLU(),
    MultiCategoryGumbelSoftmax(hidden_dim, output_dims),
))


discriminator = torch.nn.Sequential(*(
    torch.nn.Linear(in_features=data_dim, out_features=hidden_dim),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(in_features=hidden_dim, out_features=1),
))

def train(data, generator, discriminator,
          epochs=20, n_discriminator=5, batch_size=128,
          learning_rate=1e-3, sigma=0.01, weight_clip=0.1): ### CHANGED: add sigma and weight_clip parameters ###
    """Train the model with noisy discriminator gradients and weight clipping.

    Besides its arguments, the function reads the notebook globals
    ``noise_dim`` (noise size), ``enc`` and ``df`` (to decode samples back to
    a labelled data frame) and ``show_heatmap`` (per-epoch preview).

    Parameters
    ----------
    data : torch.Tensor
        Data for training
    generator : torch.nn.Sequential
        Generator network
    discriminator : torch.nn.Sequential
        Discriminator network
    epochs : int
        Number of iterations over the full data set for training
    n_discriminator : int
        Number of discriminator training iterations per generator update
    batch_size : int
        Number of training examples per inner iteration
    learning_rate : float
        Learning rate for training
    sigma : float
        Standard deviation of the Gaussian noise added to every entry of the
        discriminator gradients (for differential privacy)
    weight_clip : float
        Discriminator weights are clamped to [-weight_clip, weight_clip]
        after each discriminator update (for differential privacy)
    """
    # these solvers are optimizers for learning the parameters for our networks
    # we have one for each network (the generator and the discriminator)
    # RMSprop is one of many optimizers, and the choice is not terribly important here
    # The learning rate influences how large of a step the optimizer takes when it updates the parameters
    generator_solver = torch.optim.RMSprop(
        generator.parameters(), lr=learning_rate
    )
    discriminator_solver = torch.optim.RMSprop(
        discriminator.parameters(), lr=learning_rate
    )

    # Start from clean gradients: anything left on the networks by an earlier
    # (e.g. interrupted) run would otherwise be added to the first update.
    generator.zero_grad()
    discriminator.zero_grad()

    ### CHANGED: add hooks to introduce noise to gradient for differential privacy ###
    # randn_like(grad) draws an independent noise value for every gradient
    # entry. (Sizing the noise from the loop variable inside the lambda would
    # late-bind to the last parameter and broadcast one shared value over
    # every other gradient tensor.)
    noise_hooks = [
        parameter.register_hook(
            lambda grad: grad + sigma * torch.randn_like(grad)
        )
        for parameter in discriminator.parameters()
    ]

    try:
        # There is a batch for each discriminator training iteration,
        # so each epoch is epoch_length iterations, and the total number of
        # iterations is the number of epochs times the length of each epoch.
        epoch_length = len(data) / (n_discriminator * batch_size)
        n_iters = int(epochs * epoch_length)
        for iteration in range(n_iters):
            for _ in range(n_discriminator):
                # Sample real data
                rand_perm = torch.randperm(data.size(0))
                real_sample = data[rand_perm[:batch_size]]

                # Sample fake data; detached because this step only updates the
                # discriminator, so backpropagating into the generator is wasted work
                noise = torch.randn(batch_size, noise_dim)
                fake_sample = generator(noise).detach()

                # Have the discriminator score the data
                discriminator_real = discriminator(real_sample)
                discriminator_fake = discriminator(fake_sample)

                # Calculate discriminator loss
                # Discriminator wants to assign a high score to real data
                # and a low score to fake data
                discriminator_loss = -(
                    torch.mean(discriminator_real) -
                    torch.mean(discriminator_fake)
                )

                discriminator_loss.backward()
                discriminator_solver.step()

                ### CHANGED: add weight clipping for privacy guarantee ###
                for param in discriminator.parameters():
                    param.data.clamp_(-weight_clip, weight_clip)

                # Reset gradient
                discriminator.zero_grad()

            # Sample and score fake data
            noise = torch.randn(batch_size, noise_dim)
            fake_sample = generator(noise)
            discriminator_fake = discriminator(fake_sample)

            # Calculate generator loss
            # Generator wants discriminator to assign a high score to fake data
            generator_loss = -torch.mean(discriminator_fake)

            generator_loss.backward()
            generator_solver.step()

            # Reset gradient (the backward pass above also filled the
            # discriminator's gradients, which must not leak into its next update)
            generator.zero_grad()
            discriminator.zero_grad()

            # Show training losses and sample crosstabs after each epoch
            # (epoch_length is fractional, hence the int() around the remainder)
            if int(iteration % epoch_length) == 0:
                epoch = int(iteration / epoch_length)
                print('Epoch {}\n'
                      'Discriminator loss: {}; '
                      'Generator loss: {}'.format(epoch,
                                                  discriminator_loss.data.numpy(),
                                                  generator_loss.data.numpy()))
                # the preview is display-only, so no autograd graph is needed
                with torch.no_grad():
                    noise = torch.randn(len(data), noise_dim) # noise for fake sample
                    fake_sample = generator(noise)
                # convert back from one-hot encoding
                fake_df = pd.DataFrame(enc.inverse_transform(fake_sample.detach()))
                fake_df.columns = df.columns
                show_heatmap(fake_df)
    finally:
        # Hooks stay attached to the parameters until removed; without this,
        # calling train() again on the same discriminator would stack a second
        # set of hooks and add the noise twice.
        for hook in noise_hooks:
            hook.remove()

In [None]:
# Train with the default hyperparameters, including the privacy settings
# sigma=0.01 (gradient noise) and weight_clip=0.1
train(data, generator, discriminator)