In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import pandas as pd
import numpy as np

import time

import torch

from tqdm.auto import tqdm

In [2]:
from datasets import load_original_dataset, load_deleted_dataset
from models import CNN

In [3]:
DATA_DIR = 'Datasets/Features/'
BATCH_SIZE = 32
EPOCHS = 5
PERCENTAGES = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99]

In [4]:
def fit(model, save_dir, train_set, test_set, forget_set=None):
    
    os.makedirs(save_dir, exist_ok=True)
    
    optimizer = torch.optim.Adam(model.parameters())
    error = torch.nn.CrossEntropyLoss()
    
    train_loader = torch.utils.data.DataLoader(train_set, batch_size = BATCH_SIZE, shuffle = True, drop_last=True)
    
    train_times = list()
    train_accs, test_accs, forget_accs = list(), list(), list()
    
    for epoch in range(EPOCHS):
        
        # train
        
        accs = list()
        
        train_time = 0
        
        model.train()
        
        start_time = time.time()
        
        for x, y in train_loader:
            optimizer.zero_grad()
            output = model(x.cuda())
            y = y.cuda()
            loss = error(output, y)
            loss.backward()
            optimizer.step()
            
            train_time += time.time() - start_time
            
            predicted = torch.argmax(output.data, dim=-1)
            accs.append((predicted == y).float().mean().detach().cpu().numpy())
            
            start_time = time.time()
            
        train_times.append(train_time)
        train_accs.append(np.mean(accs))
        
        # eval
            
        model.eval()
        with torch.no_grad():
            
            x, y = test_set.tensors
            
            accs = list()
            
            for i in range(0, x.shape[0], BATCH_SIZE):
            
                output = model(x[i:i+BATCH_SIZE].cuda())

                predicted = torch.argmax(output.data, dim=-1)
                accs.append((predicted == y[i:i+BATCH_SIZE].cuda()).float().mean().detach().cpu().numpy())
            
            test_accs.append(np.mean(accs))
            
            if forget_set is not None:

                x, y = forget_set.tensors
            
                accs = list()

                for i in range(0, x.shape[0], BATCH_SIZE):

                    output = model(x[i:i+BATCH_SIZE].cuda())

                    predicted = torch.argmax(output.data, dim=-1)
                    accs.append((predicted == y[i:i+BATCH_SIZE].cuda()).float().mean().detach().cpu().numpy())

                forget_accs.append(np.mean(accs))
        
        # save
        torch.save(model.state_dict(), os.path.join(save_dir, f'{(epoch+1):03d}.pt'))
        

    return train_times, train_accs, test_accs, forget_accs

# origin train

In [5]:
train_set, test_set = load_original_dataset(DATA_DIR)

In [6]:
model = CNN().cuda()

In [7]:
# save
torch.save(model.state_dict(), 'weights/init.pt')

In [8]:
train_times, train_accs, test_accs, _ = fit(model, 'weights/original', train_set, test_set)

In [9]:
df = pd.DataFrame(zip(train_times, train_accs, test_accs), columns=['train_time', 'train_acc', 'test_acc'])
df['epoch'] = range(1, EPOCHS+1)
df.set_index('epoch', inplace=True)

df.to_csv('results/original.csv')

df

Unnamed: 0_level_0,train_time,train_acc,test_acc
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,12.413962,0.868583,0.973143
2,11.08528,0.944733,0.980931
3,10.715224,0.954433,0.989317
4,11.184714,0.957367,0.986821
5,11.101995,0.959433,0.985723


# naive retrain

In [10]:
results = list()

for percentage in tqdm(PERCENTAGES):
    
    model = CNN().cuda()
        
    model.load_state_dict(torch.load('./weights/init.pt'))
    
    train_set, test_set, forget_set = load_deleted_dataset(DATA_DIR, percentage)
    
    train_times, train_accs, test_accs, forget_accs = fit(model, f'weights/naive_retrain/{percentage}', train_set, test_set, forget_set)
    
    df = pd.DataFrame(zip(train_times, train_accs, test_accs, forget_accs), columns=['train_time', 'train_acc', 'test_acc', 'forget_acc'])
    df['epoch'] = range(1, EPOCHS+1)
    df['percentage'] = percentage
    
    results.append(df)

  0%|          | 0/11 [00:00<?, ?it/s]

In [11]:
results = pd.concat(results).set_index(['percentage', 'epoch'])

results.to_csv('results/naive_retrain.csv')

results

Unnamed: 0_level_0,Unnamed: 1_level_0,train_time,train_acc,test_acc,forget_acc
percentage,epoch,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,11.065428,0.857018,0.972943,0.972039
1,2,11.427742,0.940531,0.983327,0.983553
1,3,11.147157,0.950313,0.985723,0.980263
1,4,12.055764,0.952418,0.985923,0.983553
1,5,10.988493,0.958025,0.985623,0.981908
10,1,9.890369,0.854957,0.973942,0.976396
10,2,9.809762,0.939964,0.98123,0.97889
10,3,9.869044,0.950022,0.981929,0.979721
10,4,9.747755,0.953523,0.986122,0.984874
10,5,9.809713,0.957302,0.98772,0.985539
