In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import sys

import pandas as pd
import numpy as np

import time

from collections import deque

import torch

import copy

from tqdm.auto import tqdm

In [2]:
from datasets import load_original_dataset, load_deleted_dataset
from models import CNN

In [3]:
DATA_DIR = 'Datasets/Features/'
EPOCHS = 5
PERCENTAGES = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99]

In [4]:
# For MNIST dataset: python3 main.py --bz 16384 --epochs 20 --model Logistic_regression --dataset MNIST --wd 0.0001 --lr 0.1 0.05 --lrlen 10 10 --method deltagrad --period 5 --init 20 -m 2 --cached_size 20
is_GPU = True
device = 0
BATCH_SIZE = 1024
LR = 0.05
WD = 0.0001
INIT_EPOCHS = 1
PERIOD = 2
M = 2

In [5]:
sys.path.append(os.path.abspath('./libraries/DeltaGrad/src/'))

In [6]:
from utils import post_processing_gradien_para_list_all_epochs, append_gradient_list, init_model, get_model_para_shape_list, get_devectorized_parameters, get_all_vectorized_parameters1, compute_model_para_diff, compute_derivative_one_more_step
from main_delete import explicit_iters, compute_grad_final3, cal_approx_hessian_vec_prod0_3, compute_approx_hessian_vector_prod_with_prepared_terms1, prepare_hessian_vec_prod0_3

In [7]:
# https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L362
def update_para_final2(para, gradient_list, alpha):
    
    vec_para = get_all_vectorized_parameters1(para)
    
    vec_para -= alpha*gradient_list
        
    return vec_para

# https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/Models/DNN_single.py#L105
class DGCNN(CNN):
    def get_all_gradient(self):
        
        para_list = []
        
        for param in self.parameters():
            para_list.append(param.grad.clone())
            
        return para_list    

In [8]:
# https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L1176

In [9]:
# https://github.com/thuwuyinjun/DeltaGrad/blob/master/README.md?plain=1#L77
# update the model after the training phase with deltagrad

# https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main.py#L107

def fit(model, save_dir, train_set, test_set, forget_set):
    
    os.makedirs(save_dir, exist_ok=True)
    
    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/Models/Data_preparer.py#L325
    # replace softmax+nlloss with cross_entropy
    criterion = torch.nn.CrossEntropyLoss()
    
    # prepare model
    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L1133
    net = copy.deepcopy(model)
    
    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/Models/Data_preparer.py#L326
    optimizer = torch.optim.SGD(model.parameters(), lr=LR, weight_decay=WD)
    net_optimizer = torch.optim.SGD(net.parameters(), lr=LR, weight_decay=WD)

    train_batch_size = int(np.ceil(BATCH_SIZE * len(train_set) / (len(train_set) + len(forget_set))))
    forget_batch_size = int(np.ceil(BATCH_SIZE * len(forget_set) / (len(train_set) + len(forget_set))))

    num_steps = min(len(train_set) // train_batch_size, len(forget_set) // forget_batch_size)

    train_x, train_y = train_set.tensors[0], train_set.tensors[1]
    forget_x, forget_y = forget_set.tensors[0], forget_set.tensors[1]

    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L1202
    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L815

    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L819
    para = list(model.parameters())
    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L826
    full_shape_list, shape_list, total_shape_size = get_model_para_shape_list(model.parameters())

    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L836
    S_k_list = deque()
    Y_k_list = deque()
    
    train_times = list()
    
    train_accs, test_accs, forget_accs = list(), list(), list()
    
    for epoch in range(EPOCHS):    
        
        # train
        
        train_time = 0
        
        start_time = time.time()

        model.train()
        for i in range(num_steps):

            # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L890

            batch_remaining_X = train_x[train_batch_size*i:train_batch_size*(i+1)].cuda()
            batch_remaining_Y = train_y[train_batch_size*i:train_batch_size*(i+1)].cuda()

            # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L930

            batch_delta_X = forget_x[forget_batch_size*i:forget_batch_size*(i+1)].cuda()
            batch_delta_Y = forget_y[forget_batch_size*i:forget_batch_size*(i+1)].cuda()

            # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L895

            curr_matched_ids_size = batch_delta_X.shape[0]

            # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/model_train.py#L18
            
            net_optimizer.zero_grad()
            output = net(torch.concat([batch_remaining_X, batch_delta_X], dim=0))
            loss = criterion(output, torch.concat([batch_remaining_Y, batch_delta_Y], dim=0))
            loss.backward()
            net_optimizer.step()

            gradient_list = []
            para_list = []
            append_gradient_list(gradient_list, None, para_list, net, None, is_GPU, device)

            para_list_tensor, grad_list_tensor = post_processing_gradien_para_list_all_epochs(para_list, gradient_list)
            para_list_tensor, grad_list_tensor = para_list_tensor.cuda(), grad_list_tensor.cuda()
                
            # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L927

            if epoch < INIT_EPOCHS:

                para, _, init_hessian_para_prod, theta_k = explicit_iters(
                    batch_delta_X, batch_delta_Y, batch_remaining_X, batch_remaining_Y, 
                    curr_matched_ids_size, model, para, epoch, i, M+1, S_k_list, Y_k_list, LR, WD, 
                    grad_list_tensor, grad_list_tensor, 0, full_shape_list, shape_list, 
                    is_GPU, device, 
                    criterion, optimizer, None, None
                )

            else:
                
                '''use l-bfgs algorithm to evaluate the gradients'''

                # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L952
                
                init_model(model, para)

                compute_derivative_one_more_step(model, batch_delta_X, batch_delta_Y, criterion, optimizer)
                
                gradient_dual = model.get_all_gradient()
                
                with torch.no_grad():
                
                    vec_para_diff = torch.t((get_all_vectorized_parameters1(para) - para_list_tensor))
                    
                    if (epoch - INIT_EPOCHS) / PERIOD >= 1:
                        if (epoch - INIT_EPOCHS) % PERIOD == 0:
                            zero_mat_dim, curr_Y_k, curr_S_k, sigma_k, mat_prime = prepare_hessian_vec_prod0_3(list(S_k_list)[1:], list(Y_k_list)[1:], i, INIT_EPOCHS, M, is_GPU, device)
                            
                            mat = np.linalg.inv(mat_prime.cpu().numpy())
                            mat = torch.from_numpy(mat)
                            mat = mat.to(device)
                            
                        hessian_para_prod = compute_approx_hessian_vector_prod_with_prepared_terms1(zero_mat_dim, curr_Y_k, curr_S_k, sigma_k, mat, vec_para_diff, is_GPU, device)
                        
                    else:
                        '''S_k_list, Y_k_list, v_vec, k, is_GPU, device'''
                        hessian_para_prod, zero_mat_dim, curr_Y_k, curr_S_k, sigma_k, mat_prime = cal_approx_hessian_vec_prod0_3(list(S_k_list)[1:], list(Y_k_list)[1:], vec_para_diff, M, is_GPU, device)
                    
                    is_positive, final_gradient_list = compute_grad_final3(
                        get_all_vectorized_parameters1(para), torch.t(hessian_para_prod), 
                        get_all_vectorized_parameters1(gradient_dual), 
                        grad_list_tensor, para_list_tensor, 
                        batch_remaining_X.shape[0] + curr_matched_ids_size, curr_matched_ids_size, 
                        LR, WD, is_GPU, device
                    )
                        
                    vec_para = update_para_final2(para, final_gradient_list, LR)
                    
                    para = get_devectorized_parameters(vec_para, full_shape_list, shape_list)
                

        
        train_time += time.time() - start_time
            
        train_times.append(train_time)
        
        # test
            
        model.eval()
        with torch.no_grad():
            
            #
            
            x, y = train_set.tensors
            
            accs = list()
            
            for i in range(0, x.shape[0], BATCH_SIZE):
            
                output = model(x[i:i+BATCH_SIZE].cuda())

                predicted = torch.argmax(output.data, dim=-1)
                accs.append((predicted == y[i:i+BATCH_SIZE].cuda()).float().mean().detach().cpu().numpy())
            
            train_accs.append(np.mean(accs))
            
            #
            
            x, y = test_set.tensors
            
            accs = list()
            
            for i in range(0, x.shape[0], BATCH_SIZE):
            
                output = model(x[i:i+BATCH_SIZE].cuda())

                predicted = torch.argmax(output.data, dim=-1)
                accs.append((predicted == y[i:i+BATCH_SIZE].cuda()).float().mean().detach().cpu().numpy())
            
            test_accs.append(np.mean(accs))
            
            #

            x, y = forget_set.tensors
            
            accs = list()

            for i in range(0, x.shape[0], BATCH_SIZE):

                output = model(x[i:i+BATCH_SIZE].cuda())

                predicted = torch.argmax(output.data, dim=-1)
                accs.append((predicted == y[i:i+BATCH_SIZE].cuda()).float().mean().detach().cpu().numpy())

            forget_accs.append(np.mean(accs))
        
        # save
        torch.save(model.state_dict(), os.path.join(save_dir, f'{(epoch+1):03d}.pt'))

    return train_times, train_accs, test_accs, forget_accs

In [10]:
results = list()

for percentage in tqdm(PERCENTAGES):

    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L1124
    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/Models/DNN_single.py#L47
    # remove softmax
    model = DGCNN().cuda()
        
    # https://github.com/thuwuyinjun/DeltaGrad/blob/master/src/main_delete.py#L1133
    model.load_state_dict(torch.load('./weights/init.pt'))
    
    train_set, test_set, forget_set = load_deleted_dataset(DATA_DIR, percentage)
    
    train_times, train_accs, test_accs, forget_accs = fit(model, f'weights/DeltaGrad/{percentage}', train_set, test_set, forget_set)
    
    df = pd.DataFrame(zip(train_times, train_accs, test_accs, forget_accs), columns=['train_time', 'train_acc', 'test_acc', 'forget_acc'])
    df['epoch'] = range(1, EPOCHS+1)
    df['percentage'] = percentage
    
    results.append(df)

results = pd.concat(results).set_index(['percentage', 'epoch'])

results.to_csv(f'results/DeltaGrad.csv')

  0%|          | 0/11 [00:00<?, ?it/s]

In [11]:
results

Unnamed: 0_level_0,Unnamed: 1_level_0,train_time,train_acc,test_acc,forget_acc
percentage,epoch,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,2.79803,0.684852,0.692215,0.691667
1,2,2.019802,0.097077,0.098184,0.096667
1,3,1.956517,0.097077,0.098184,0.096667
1,4,1.979096,0.097077,0.098184,0.096667
1,5,1.321297,0.097077,0.098184,0.096667
10,1,2.231315,0.222672,0.220566,0.224216
10,2,2.166035,0.098714,0.098184,0.098991
10,3,2.136526,0.098714,0.098184,0.098991
10,4,2.113361,0.098714,0.098184,0.098991
10,5,1.438504,0.098714,0.098184,0.098991
