In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import sys

import pandas as pd
import numpy as np

import time
import copy

import io

import torch

from tqdm.auto import tqdm

In [2]:
from datasets import load_original_dataset, load_deleted_dataset
from models import CNN

In [3]:
# Directory handed to load_deleted_dataset() as the data location.
DATA_DIR = 'Datasets/Features/'

# Number of passes fit() makes over the training indices.
EPOCHS = 5

# Values passed one at a time as `percentage` to load_deleted_dataset():
# 1, then every multiple of 10 from 10 to 90, then 99.
PERCENTAGES = [1, *range(10, 100, 10), 99]

In [4]:
# Put ./libraries/SISA/ on the import path. The membership check keeps the
# cell idempotent: re-running it no longer stacks duplicate sys.path entries.
_sisa_dir = os.path.abspath('./libraries/SISA/')
if _sisa_dir not in sys.path:
    sys.path.append(_sisa_dir)

In [5]:
# https://github.com/cleverhans-lab/machine-unlearning/blob/master/distribution.py#L7
# https://github.com/cleverhans-lab/machine-unlearning/blob/master/sisa.py#L14

class args:
    """Fixed option values read as ``args.<name>`` by mass(), get_partition() and fit()."""

    # Number of clusters get_partition() stops merging at.
    shards = 1
    # "<name>" or "<name>:<value>"; mass() understands "exponential" and "pareto".
    distribution = "exponential"
    # Step size given to the Adam optimizer in fit().
    learning_rate = 1e-3
    # Mini-batch size used by fit() for both training and evaluation.
    batch_size = 16

In [6]:
# https://github.com/cleverhans-lab/machine-unlearning/blob/master/distribution.py#L55

def mass(index):
    """Return the per-index weight under the distribution named in ``args.distribution``.

    ``args.distribution`` has the form ``"<name>"`` or ``"<name>:<value>"``.

    * ``exponential``: ``exp(-lbd * i) - exp(-lbd * (i + 1))``. ``lbd`` is the
      value after the colon, or ``-log(0.05) / len(index)`` when absent, which
      makes the weights of indices ``0 .. len(index) - 1`` sum to 0.95.
    * ``pareto``: ``a / (i + 1) ** (a + 1)``. ``a`` is the value after the
      colon, or ``1.16`` when absent.

    Args:
        index: NumPy array of point indices (``index.shape[0]`` is read when
            the exponential rate is not given explicitly).

    Returns:
        NumPy array of weights, one per entry of ``index``.

    Raises:
        ValueError: if the distribution name is neither ``exponential`` nor
            ``pareto`` (previously this fell through and returned ``None``,
            which only failed later inside the caller).
    """
    # Split once; `params` is empty when no ":<value>" suffix was supplied.
    name, *params = args.distribution.split(":")

    if name == "exponential":
        lbd = float(params[0]) if params else -np.log(0.05) / index.shape[0]
        return np.exp(-lbd * index) - np.exp(-lbd * (index + 1))

    if name == "pareto":
        a = float(params[0]) if params else 1.16
        return a / ((index + 1) ** (a + 1))

    raise ValueError(
        f"Unsupported distribution {args.distribution!r}; "
        "expected 'exponential' or 'pareto'"
    )

# https://github.com/cleverhans-lab/machine-unlearning/blob/master/distribution.py#L74

def get_partition(nb_train):
    """Greedily merge ``nb_train`` single-point clusters into fewer clusters.

    Every index in ``range(nb_train)`` starts as its own cluster with cost
    ``mass(index)``. Clusters are tracked in ``queue`` (one row per cluster:
    column 0 = cost, column 1 = number of points) and, in the same order, in
    ``partition`` (the member indices of each cluster). ``queue`` is split
    into a "top" part (rows before ``bottom_queue``) and a "bottom" part
    (rows from ``bottom_queue`` on).

    The loop performs ``nb_train - args.shards`` merges; each one removes the
    first two rows, sums them, and re-inserts the result, so the number of
    clusters drops by one per iteration.

    Args:
        nb_train: number of training points to partition.

    Returns:
        A list of 1-D NumPy arrays of point indices, one array per cluster
        that is left after the merges.
    """

    # Initialize queue and partition.
    weights = mass(np.arange(0, nb_train))
    # Ascending cost order; queue rows and partition entries share this order.
    indices = np.argsort(weights)
    queue = np.array([weights[indices], np.ones(weights.shape)]).transpose()
    partition = [np.array([index]) for index in indices]
    
    # Put all points in the top queue.
    bottom_queue = queue.shape[0]  # pylint: disable=unsubscriptable-object
    # Size threshold (in points): merged clusters with fewer points than `lim`
    # go back to the top queue, the others go to the bottom queue.
    # NOTE(review): the fraction is parsed from the same ":<value>" suffix of
    # args.distribution that mass() reads as its distribution parameter --
    # confirm that sharing one value for both purposes is intended.
    lim = (
        int(float(args.distribution.split(":")[1]) * nb_train)
        if len(args.distribution.split(":")) > 1
        else int(0.01 * nb_train)
    )
    
    for _ in range(nb_train - args.shards):
        # Fetch top 2 clusters and merge them.
        w1 = queue[0]
        w2 = queue[1]
    
        l1 = partition[0]
        l2 = partition[1]
    
        partition = partition[2:]
        queue = queue[2:]
        # The two removed rows are counted against the top queue.
        bottom_queue -= 2
    
        # Element-wise sum: costs add up and point counts add up.
        merged_weight = w1 + w2
    
        # If merged cluster is smaller in number of points than the limit, insert it in top queue.
        if merged_weight[1] < lim:
            # Top queue is ordered first by number of points (weight[1]) and second by cost (weight[0]).
            # [offset, limit) is the run of top-queue rows whose point count
            # equals that of the merged cluster.
            offset_array = np.where(queue[:bottom_queue, 1] >= merged_weight[1])
            limit_array = np.where(queue[:bottom_queue, 1] > merged_weight[1])
            offset = (
                offset_array[0][0]
                if offset_array[0].shape[0] > 0
                else bottom_queue
            )
            limit = (
                limit_array[0][0]
                if limit_array[0].shape[0] > 0
                else bottom_queue
            )
            # NOTE(review): np.where here yields indices relative to
            # queue[offset:limit], but `position` is later used as an absolute
            # row index into `queue` -- confirm whether `offset` should be added.
            position_array = np.where(
                queue[offset:limit][:, 0] >= merged_weight[0]
            )
            position = (
                position_array[0][0]
                if position_array[0].shape[0] > 0
                else bottom_queue
            )
            # The top queue grows by the row about to be inserted.
            bottom_queue += 1
    
        # Otherwise insert it in the bottom queue.
        else:
            # Bottom queue is ordered by cost only.
            # NOTE(review): as above, the index found is relative to
            # queue[bottom_queue:] while `position` is used as an absolute
            # index -- confirm whether `bottom_queue` should be added.
            position_array = np.where(
                queue[bottom_queue:][:, 0] >= merged_weight[0]
            )
            position = (
                position_array[0][0]
                if position_array[0].shape[0] > 0
                else queue.shape[0]
            )
    
        # Actual insertion.
        # The same `position` is applied to both structures so that row i of
        # `queue` keeps describing partition[i].
        queue = np.insert(queue, position, merged_weight, axis=0)
        partition = (
            partition[:position]
            + [np.concatenate((l1, l2))]
            + partition[position:]
        )

    return partition

In [7]:
def _batched_accuracy(model, dataset, batch_size, device):
    """Return the fraction of samples in ``dataset`` that ``model`` labels correctly.

    The dataset is read through ``dataset.tensors`` as an ``(inputs, labels)``
    pair and evaluated in slices of ``batch_size``. Correct predictions are
    counted over the whole dataset and divided by its size, so a short final
    batch does not get the same weight as a full one. Returns ``nan`` for an
    empty dataset.

    The caller is expected to have put the model in eval mode and to have
    disabled gradient tracking.
    """
    x, y = dataset.tensors
    total = x.shape[0]
    if total == 0:
        return float('nan')

    correct = 0
    for i in range(0, total, batch_size):
        output = model(x[i:i + batch_size].to(device))
        predicted = torch.argmax(output, dim=-1)
        correct += (predicted == y[i:i + batch_size].to(device)).sum().item()

    return correct / total


def fit(model, save_dir, train_set, test_set, forget_set):
    """Train ``model`` for ``EPOCHS`` epochs and evaluate it after each one.

    Training iterates, in order and without shuffling, over the indices in the
    first cluster returned by ``get_partition(len(train_set))``, using Adam
    and cross-entropy loss with ``args.batch_size`` and ``args.learning_rate``.
    Data is moved to whatever device the model's parameters live on. After
    every epoch the model's ``state_dict`` is written to
    ``<save_dir>/<epoch:03d>.pt`` (epochs numbered from 1).

    Args:
        model: a ``torch.nn.Module`` mapping an input batch to class logits.
        save_dir: directory for the per-epoch checkpoints; created if missing.
        train_set: dataset that supports indexing with a tensor of indices
            (returning ``(inputs, labels)``) and exposes ``.tensors``.
        test_set: dataset exposing ``.tensors`` as ``(inputs, labels)``.
        forget_set: dataset exposing ``.tensors`` as ``(inputs, labels)``.

    Returns:
        ``(train_times, train_accs, test_accs, forget_accs)``: four lists with
        one entry per epoch. Times are seconds spent in the training loop;
        accuracies are fractions in ``[0, 1]`` (``nan`` for an empty dataset).
    """
    os.makedirs(save_dir, exist_ok=True)

    # 1- Create a container with a specified number of shards:
    # https://github.com/cleverhans-lab/machine-unlearning/blob/master/example-scripts/purchase-sharding/README.txt#L5
    # https://github.com/cleverhans-lab/machine-unlearning/blob/master/example-scripts/purchase-sharding/init.sh#L16

    # run PLS-GAP algorithm to find a low cost split.
    nb_train = len(train_set)
    partition = get_partition(nb_train)

    # Only partition[0] is iterated; the loader yields batches of indices, not samples.
    # https://github.com/cleverhans-lab/machine-unlearning/blob/master/sharded.py#L37
    train_loader = torch.utils.data.DataLoader(
        np.array(partition[0]),
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False,
    )

    # https://github.com/cleverhans-lab/machine-unlearning/blob/master/sisa.py#L98
    criterion = torch.nn.CrossEntropyLoss()
    # https://github.com/cleverhans-lab/machine-unlearning/blob/master/sisa.py#L100
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # Follow the model's device instead of hard-coding CUDA, so the same code
    # also runs for a model that was left on the CPU.
    device = next(model.parameters()).device

    train_times = []
    train_accs, test_accs, forget_accs = [], [], []

    # https://github.com/cleverhans-lab/machine-unlearning/blob/master/sisa.py#L183
    for epoch in range(EPOCHS):

        # train
        start_time = time.perf_counter()
        model.train()

        for indices in train_loader:
            x, y = train_set[indices]

            optimizer.zero_grad()
            output = model(x.to(device))
            loss = criterion(output, y.to(device))
            loss.backward()
            optimizer.step()

        # CUDA kernels are launched asynchronously: wait for the queued work
        # to finish before reading the clock so it is included in the timing.
        if device.type == 'cuda':
            torch.cuda.synchronize(device)
        train_times.append(time.perf_counter() - start_time)

        # test
        model.eval()
        with torch.no_grad():
            train_accs.append(_batched_accuracy(model, train_set, args.batch_size, device))
            test_accs.append(_batched_accuracy(model, test_set, args.batch_size, device))
            forget_accs.append(_batched_accuracy(model, forget_set, args.batch_size, device))

        # save
        torch.save(model.state_dict(), os.path.join(save_dir, f'{(epoch+1):03d}.pt'))

    return train_times, train_accs, test_accs, forget_accs

In [8]:
# One DataFrame of per-epoch metrics is collected for every entry of PERCENTAGES.
results = []

for percentage in tqdm(PERCENTAGES):

    # Every run starts from the same stored initial weights.
    model = CNN().cuda()
    model.load_state_dict(torch.load('./weights/init.pt'))

    train_set, test_set, forget_set = load_deleted_dataset(DATA_DIR, percentage)

    train_times, train_accs, test_accs, forget_accs = fit(
        model,
        f'weights/SISA/{percentage}',
        train_set,
        test_set,
        forget_set,
    )

    # One row per epoch, tagged with the epoch number (from 1) and the percentage.
    df = pd.DataFrame(
        zip(train_times, train_accs, test_accs, forget_accs),
        columns=['train_time', 'train_acc', 'test_acc', 'forget_acc'],
    ).assign(epoch=range(1, EPOCHS+1), percentage=percentage)

    results.append(df)

  0%|          | 0/11 [00:00<?, ?it/s]

In [9]:
# Combine the per-run frames into one table indexed by (percentage, epoch).
# The isinstance guard keeps this cell re-runnable: after the first run
# `results` is already the combined DataFrame, which pd.concat would reject.
if isinstance(results, list):
    results = pd.concat(results).set_index(['percentage', 'epoch'])

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)
results.to_csv('results/SISA.csv')

results

Unnamed: 0_level_0,Unnamed: 1_level_0,train_time,train_acc,test_acc,forget_acc
percentage,epoch,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,12.227887,0.96923,0.9696,0.970395
1,2,11.319716,0.974835,0.9741,0.976974
1,3,11.359001,0.979464,0.9788,0.980263
1,4,11.324547,0.983538,0.9847,0.983553
1,5,11.355701,0.980592,0.9818,0.981908
10,1,10.590553,0.970981,0.9733,0.970833
10,2,10.380322,0.977056,0.979,0.975167
10,3,10.785652,0.979833,0.9806,0.977333
10,4,10.643745,0.982222,0.9807,0.9795
10,5,10.345651,0.982241,0.9821,0.981833
