In [50]:
epochs = 5

In [51]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Example - Simple Vertically Partitioned Split Neural Network

- <b>Alice</b>
    - Has model Segment 1
    - Has the handwritten Images
- <b>Bob</b>
    - Has model Segment 2
    - Has the image Labels
    
Based on [SplitNN - Tutorial 3](https://github.com/OpenMined/PySyft/blob/master/examples/tutorials/advanced/split_neural_network/Tutorial%203%20-%20Folded%20Split%20Neural%20Network.ipynb) from Adam J Hall - Twitter: [@AJH4LL](https://twitter.com/AJH4LL) · GitHub:  [@H4LL](https://github.com/H4LL)

Authors:
- Pavlos Papadopoulos · GitHub:  [@pavlos-p](https://github.com/pavlos-p)
- Tom Titcombe · GitHub:  [@TTitcombe](https://github.com/TTitcombe)
- Robert Sandmann · GitHub: [@rsandmann](https://github.com/rsandmann)


In [52]:
class SplitNN:
    def __init__(self, models, optimizers):
        self.models = models
        self.optimizers = optimizers

        self.data = []
        self.remote_tensors = []

    def forward(self, x):
        data = []
        remote_tensors = []

        data.append(self.models[0](x))

        if data[-1].location == self.models[1].location:
            remote_tensors.append(data[-1].detach().requires_grad_())
        else:
            remote_tensors.append(
                data[-1].detach().move(self.models[1].location).requires_grad_()
            )

        i = 1
        while i < (len(models) - 1):
            data.append(self.models[i](remote_tensors[-1]))

            if data[-1].location == self.models[i + 1].location:
                remote_tensors.append(data[-1].detach().requires_grad_())
            else:
                remote_tensors.append(
                    data[-1].detach().move(self.models[i + 1].location).requires_grad_()
                )

            i += 1

        data.append(self.models[i](remote_tensors[-1]))

        self.data = data
        self.remote_tensors = remote_tensors

        return data[-1]

    def backward(self):
        for i in range(len(models) - 2, -1, -1):
            if self.remote_tensors[i].location == self.data[i].location:
                grads = self.remote_tensors[i].grad.copy()
            else:
                grads = self.remote_tensors[i].grad.copy().move(self.data[i].location)
    
            self.data[i].backward(grads)

    def zero_grads(self):
        for opt in self.optimizers:
            opt.zero_grad()

    def step(self):
        for opt in self.optimizers:
            opt.step()

In [None]:
!pip install openmined.psi
!pip install syft==0.2.9

In [54]:
import os
# change to path to PyVertical in drive:
os.chdir('/content/drive/MyDrive/LEO/UFRJ/IC/vertical/PyVertical')
import sys
sys.path.append('../')

import torch
from torchvision import datasets, transforms
from torch import nn, optim
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor

import syft as sy

import pandas as pd
import numpy as np

from torch.utils.data import DataLoader

from src.dataloader import VerticalDataLoader
from src.psi.util import Client, Server
from src.utils import add_ids

hook = sy.TorchHook(torch)



In [55]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from uuid import uuid4
from typing import List

class FraudDataset(Dataset):
  def __init__(self, file_out, sc):
    # x = file_out.iloc[1:284807, 1:30].values
    # y = file_out.iloc[1:284807, 30].values
    x = file_out.iloc[:, 1:30].values
    y = file_out.iloc[:, 30].values

    x_transform = sc.transform(x)

    self.data = torch.tensor(x_transform, dtype=torch.float32)
    self.targets = torch.tensor(y, dtype=torch.float32)

    self.ids = np.array([uuid4() for _ in range(len(file_out))])

  def __len__(self):
    if self.data is not None:
      return self.data.size(0)
    else:
      return len(self.targets)

  def __getitem__(self, idx):
    return self.data[idx], self.targets[idx]

  def get_ids(self) -> List[str]:
    return [str(id_) for id_ in self.ids]

  def sort_by_ids(self):
        ids = self.get_ids()
        sorted_idxs = np.argsort(ids)

        if self.data is not None:
            self.data = self.data[sorted_idxs]

        if self.targets is not None:
            self.targets = self.targets[sorted_idxs]

        self.ids = self.ids[sorted_idxs]

In [56]:
os.chdir('/content/drive/MyDrive/Colab Notebooks/data')
file_out = pd.read_csv("creditcard.csv")
x_train = file_out.iloc[1:10000, :]
x_sc = x_train.iloc[:,1:30].values
sc = StandardScaler()
sc.fit(x_sc)
x_test = file_out.iloc[10000:15000, :]
dataset = FraudDataset(x_train, sc)
testset = FraudDataset(x_test, sc)

# Batch data
dataloader = VerticalDataLoader(dataset, batch_size=128) # partition_dataset uses by default "remove_data=True, keep_order=False"


## Implement PSI and order the datasets accordingly

In [57]:
# Compute private set intersection
client_items = dataloader.dataloader1.dataset.get_ids()
server_items = dataloader.dataloader2.dataset.get_ids()

client = Client(client_items)
server = Server(server_items)

setup, response = server.process_request(client.request, len(client_items))
intersection = client.compute_intersection(setup, response)

# Order data
dataloader.drop_non_intersecting(intersection)
dataloader.sort_by_ids()

In [58]:
torch.manual_seed(0)

# Define our model segments

input_size = 29
hidden_sizes = [128, 128]
output_size = 1

models = [
    nn.Sequential(
        nn.Linear(input_size, hidden_sizes[0]),
        nn.ReLU(),
        nn.Linear(hidden_sizes[0], hidden_sizes[1]),
        nn.ReLU(),
    ),
    nn.Sequential(nn.Linear(hidden_sizes[1], output_size), nn.Sigmoid()),
]

# Create optimisers for each segment and link to them
optimizers = [
    optim.SGD(model.parameters(), lr=0.03,)
    for model in models
]

# create some workers
alice = sy.VirtualWorker(hook, id="alice")
bob = sy.VirtualWorker(hook, id="bob")

# Send Model Segments to model locations
model_locations = [alice, bob]
for model, location in zip(models, model_locations):
    model.send(location)

#Instantiate a SpliNN class with our distributed segments and their respective optimizers
splitNN = SplitNN(models, optimizers)

In [59]:
def train(x, target, splitNN):
    
    #1) Zero our grads
    splitNN.zero_grads()
    
    #2) Make a prediction
    pred = splitNN.forward(x)

    #3) Figure out how much we missed by
    criterion = nn.BCELoss()
    loss = criterion(pred, target)
    
    #4) Backprop the loss on the end layer
    loss.backward()
    
    #5) Feed Gradients backward through the nework
    splitNN.backward()
    
    #6) Change the weights
    splitNN.step()
    
    return loss, pred

In [60]:
for i in range(epochs):
    running_loss = 0
    correct_preds = 0
    total_preds = 0
    tp, fp, tn, fn = 0, 0, 0, 0
    j = 0
    for (data, label) in zip(dataloader.dataloader1.dataset.data, dataloader.dataloader2.dataset.targets):
        # Train a model
        data = data.send(models[0].location)
        # data = data.view(data.shape[0], -1)
        label = label.send(models[-1].location)
        label = label.view(1)

        # Call model
        loss, preds = train(data, label, splitNN)
        pred = round(float(preds.get()[0]))
        lab = float(label.get()[0])
        # Collect statistics
        tp += (pred and lab)
        fp += (pred and not lab)
        tn += (not pred and not lab)
        fn += (not pred and lab)
        percent = j / dataloader.dataloader2.dataset.targets.shape[0] * 100
        j += 1
        print(f'\r epoch {i}: {percent:.1f}% | tp: {tp}, tn: {tn}, fp: {fp}, fn: {fn}', end='')
        running_loss += loss.get()
    f1_score = tp / (tp + (fp + fn)/2)
    print(f" | training loss: {running_loss/len(dataloader):.3f} | f1 score: {f1_score:.3f}")

 epoch 0: 100.0% | tp: 31.0, tn: 9956, fp: 5, fn: 7.0
 epoch 0: training loss: 1.191 - f1 score: 0.838
 epoch 1: 100.0% | tp: 36.0, tn: 9959, fp: 2, fn: 2.0
 epoch 1: training loss: 0.204 - f1 score: 0.947
 epoch 2: 100.0% | tp: 36.0, tn: 9960, fp: 1, fn: 2.0
 epoch 2: training loss: 0.198 - f1 score: 0.960
 epoch 3: 100.0% | tp: 36.0, tn: 9960, fp: 1, fn: 2.0
 epoch 3: training loss: 0.146 - f1 score: 0.960
 epoch 4: 100.0% | tp: 36.0, tn: 9960, fp: 1, fn: 2.0
 epoch 4: training loss: 0.177 - f1 score: 0.960


In [65]:
from torch.utils.data import DataLoader

testloaderC = DataLoader(testset, batch_size=64, shuffle=True)

testloader = VerticalDataLoader(testset, batch_size=128)
client_items = testloader.dataloader1.dataset.get_ids()
server_items = testloader.dataloader2.dataset.get_ids()

client = Client(client_items)
server = Server(server_items)

setup, response = server.process_request(client.request, len(client_items))
intersection = client.compute_intersection(setup, response)

# Order data
testloader.drop_non_intersecting(intersection)
testloader.sort_by_ids()

In [66]:
tp, fp, tn, fn = 0, 0, 0, 0
for sample, label in zip(testloader.dataloader1.dataset.data, testloader.dataloader2.dataset.targets):
    sample = sample.send(models[0].location)
    # data = data.view(data.shape[0], -1)
    label = label.send(models[-1].location)
    label = label.view(1)
    # Call model
    splitNN.zero_grads()
    pred = splitNN.forward(sample)
    pred = round(float(pred.get()[0]))
    lab = float(label.get()[0])
    # Collect statistics
    tp += (pred and lab)
    fp += (pred and not lab)
    tn += (not pred and not lab)
    fn += (not pred and lab)
    print(f'\r tp: {tp}, fp: {fp}, tn: {tn}, fn: {fn}', end='')
f1_score = tp / (tp + (fp + fn)/2)
print(f'\n test f1 score: {f1_score:.3f}')

 tp: 18.0, fp: 22, tn: 4955, fn: 5.0
 test f1 score: 0.571
