<a href="https://colab.research.google.com/github/ivyclare/PrivateAI/blob/master/MNIST_PATE_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### PATE Analysis on MNIST

http://www.cleverhans.io/privacy/2018/04/29/privacy-and-machine-learning.html
Our PATE approach at providing differential privacy to machine learning is based on a simple intuition: if two different classifiers, trained on two different datasets with no training examples in common, agree on how to classify a new input example, then that decision does not reveal information about any single training example. The decision could have been made with or without any single training example, because both the model trained with that example and the model trained without that example reached the same conclusion.

====================

In order to train MNIST in a differentially private manner, we need 2 main components; private datasets (teachers) and public unlabelled dataset (student). MNIST is divided into train and test data. Hence, we'll have to create the teacher and student datasets ourselves. 

We will follow the steps below, to create a privacy preserving MNIST deep learning model:

- Create the teacher and student datasets
    - The training data is divided into non-overlapping subsets
- 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# import our libraries
import numpy as np
import pandas as pd
import torch
from torchvision import datasets, transforms
from torch.utils.data import Subset, DataLoader
from torch import nn, optim
import torch.nn.functional as F
import time, os
import math

### Step 1: Create Teacher and Student Datasets

In [3]:
# Load MNIST dataset

data_transforms = transforms.Compose([transforms.ToTensor(),
                                      transforms.Normalize((0.5,),(0.5,))
                                     ])
# train_data = datasets.MNIST(root=’data’, train=True, download=True, transform=transform)

trainset = datasets.MNIST(root='data', train=True, transform=data_transforms, download=True)

testset = datasets.MNIST(root='data', train=False, transform=data_transforms, download=True)



Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw
Processing...
Done!





In [0]:
len(trainset), len(testset)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [102]:
# TEACHERS
#divide train set between teachers and create dataloaders for valid and trainsets
num_teachers = 10
valid_per = 0.2 #20% for validation
batch_size = 32


def teacher_dataloaders(transet=trainset, num_teachers=num_teachers, batch_size=batch_size, valid_per = 0.3):
  trainloaders = []
  validloaders = []
  teacher_data_len = len(trainset) // num_teachers

  for i in range(num_teachers):
    # get particular subset of data
    indice = list(range(i*teacher_data_len, (i+1)*teacher_data_len))
    data_subset = Subset(trainset, indice)
    # split into train and validation set
    valid_size = int(len(data_subset) * valid_per)
    train_size = len(data_subset) - valid_size
    train_subset, valid_subset = torch.utils.data.random_split(data_subset, [train_size,valid_size])
    # print(len(train_subset))
    # print(len(valid_subset))

    #create data loaders
    trainloader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, num_workers=1)
    validloader = DataLoader(valid_subset, batch_size=batch_size, shuffle=False, num_workers=1)

    #add dataloaders to list
    trainloaders.append(trainloader)
    validloaders.append(validloader)
  
  return trainloaders, validloaders

trainloaders, validloaders = teacher_dataloaders()
len(trainloaders), len(validloaders)

(10, 10)

In [0]:
#  # STUDENT
#  testset
 
#  # split into training and validation
#  student_train

## Step 2: Train Teachers

In [0]:
# define model
class Net(nn.Module):
  def __init__(self):
    super().__init__()

    self.fc1 = nn.Linear(784, 256)
    self.fc2 = nn.Linear(256, 128)
    self.fc3 = nn.Linear(128, 64)
    self.fc4 = nn.Linear(64, 10)
    self.dropout = nn.Dropout(p=0.4)

  def forward(self, x):
    x = x.view(x.shape[0], -1)
    x = self.fc1(x)
    x = self.dropout(F.relu(self.fc2(x)))
    x = self.dropout(F.relu(self.fc3(x)))
    x = F.log_softmax(self.fc4(x), dim=1)

    return x

In [0]:
# training loop
def train(trainloader, validloader, model, optimizer, criterion, epochs, device):
  start = time.time()
  # trainloader = trainloaders[0]
  # validloader = validloaders[0]
  best_loss = math.inf
  train_results = []
  valid_results = []

  for epoch in range(epochs):   
    model.train()
    running_loss = 0.0
    running_corrects = 0
    valid_corrects = 0
    valid_loss = 0
    
    for images, labels in trainloader:
      images = images.to(device)
      labels = labels.to(device)
      optimizer.zero_grad()

      outputs = model(images)
      _, preds = torch.max(outputs, 1)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

      running_loss += loss.item()
      running_corrects += torch.sum(preds == labels.data)

      with torch.no_grad():
        model.eval()
        for images, labels in validloader:
          images = images.to(device)
          labels = labels.to(device)

          outputs = model(images)
          valid_loss += criterion(outputs, labels)

          ps = torch.exp(outputs)
          top_p, top_class = ps.topk(1, dim=1)
          equals = top_class == labels.view(*top_class.shape)
          valid_corrects += torch.mean(equals.type(torch.FloatTensor))

      #   # if(valid_loss < best_loss):
      #   #   best_loss = valid_loss
      
        train_loss = running_loss / len(trainloader)
        train_acc = running_corrects.double() / len(trainloader)
        train_results.append([train_loss,train_acc])

        valid_losss = valid_loss / len(validloader)
        valid_acc = valid_corrects / len(validloader)
        valid_results.append([valid_losss,valid_acc])

    print("Epoch: {}/{}".format(epoch, epochs))
    print('\tTrain Loss: {:.4f} Train Acc: {:.4f}'.format(train_loss, train_acc))
    print('\tValid Loss: {:.4f} Valid Acc: {:.4f}'.format(valid_losss, valid_acc))
  return model
  # return model, train_results, valid_results

In [43]:
model = Net()
model.to(device)
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters() , lr=0.001)
epochs = 10

cuda


In [98]:
teacher_models = []
for trainloader, validloader in zip(trainloaders, validloaders):
  teacher_model = train(trainloaders, validloaders, model, optimizer, criterion, epochs, device)
  teacher_models.append(teacher_model)

Epoch: 0/10
	Train Loss: 0.0688 Train Acc: 62.4667
	Valid Loss: 22.7243 Valid Acc: 69.5367


KeyboardInterrupt: ignored

## Step 3: Get Private Labels 

## Step 4: Add Laplacian Noise

## Step 5 Peform PATE Analysis

## Step 6: Train **Student**

# TRAIN MNIST NORMALLY