# Common routine for CIFAR10 (preparation)

In [39]:
import torch
import torchvision
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Batch size (1024 // 4 == 256), shared by both data loaders below.
B = 1024//4

# Convert images to tensors, then normalise every RGB channel with mean 0.5 and std 0.5.
tf = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# CIFAR-10 training split (downloaded into the current directory if missing), shuffled per epoch.
trainset = torchvision.datasets.CIFAR10(root=".", train=True, download=True, transform=tf)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=B, shuffle=True, num_workers=4)

# CIFAR-10 test split, iterated in its stored order.
testset = torchvision.datasets.CIFAR10(root=".", train=False, download=True, transform=tf)
testloader = torch.utils.data.DataLoader(testset, batch_size=B, shuffle=False, num_workers=4)

class Net(torch.nn.Module): # modified from https://www.kaggle.com/code/faressayah/cifar-10-images-classification-using-cnns-88
    """Six-convolution CNN classifier whose forward pass also returns the loss.

    The network has three stages of two 3x3 convolutions (32, 64 and 128
    output channels). Every convolution is followed by ReLU and then batch
    normalisation; every stage ends with 2x2 max pooling and dropout. The
    flattened features go through a 64-unit hidden layer (ReLU + dropout) and
    a final 10-way linear layer.

    Args:
        p: Dropout probability shared by all four dropout layers. The default
            of 0.0 disables dropout, matching the previously hard-coded value.
    """
    def __init__(self, p: float = 0.0):
        super().__init__()
        self.pool = torch.nn.MaxPool2d(2, 2)
        self.dropout1 = torch.nn.Dropout(p)
        self.dropout2 = torch.nn.Dropout(p)
        self.dropout3 = torch.nn.Dropout(p)
        self.dropout4 = torch.nn.Dropout(p)
        # All convolutions use kernel 3, stride 1, padding 1, so they keep the spatial size.
        self.conv1 = torch.nn.Conv2d( 3, 32, 3, 1, 1); self.batch_norm1 = torch.nn.BatchNorm2d(32)
        self.conv2 = torch.nn.Conv2d(32, 32, 3, 1, 1); self.batch_norm2 = torch.nn.BatchNorm2d(32)
        self.conv3 = torch.nn.Conv2d(32, 64, 3, 1, 1); self.batch_norm3 = torch.nn.BatchNorm2d(64)
        self.conv4 = torch.nn.Conv2d(64, 64, 3, 1, 1); self.batch_norm4 = torch.nn.BatchNorm2d(64)
        self.conv5 = torch.nn.Conv2d(64, 128,3, 1, 1); self.batch_norm5 = torch.nn.BatchNorm2d(128)
        self.conv6 = torch.nn.Conv2d(128,128,3, 1, 1); self.batch_norm6 = torch.nn.BatchNorm2d(128)
        # 128 channels * 4 * 4: three 2x2 poolings must leave a 4x4 map, i.e. 32x32 inputs.
        self.fc1 = torch.nn.Linear(128 * 4 * 4, 64)
        self.fc2 = torch.nn.Linear(64, 10)
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input, target):
        """Run the network and return ``(loss, logits)``.

        Args:
            input: Batch of images; the layer sizes require shape
                ``(N, 3, 32, 32)``.
            target: Class indices for the batch, passed to the cross-entropy
                criterion together with the logits.

        Returns:
            A 2-tuple ``(loss, logits)`` where ``loss`` is the cross-entropy
            of ``logits`` against ``target`` and ``logits`` has shape
            ``(N, 10)``.
        """
        relu = torch.nn.functional.relu
        # Stage 1: 3 -> 32 -> 32 channels, then halve the spatial size.
        x = self.batch_norm1(relu(self.conv1(input)))
        x = self.pool(self.batch_norm2(relu(self.conv2(x))))
        x = self.dropout1(x)
        # Stage 2: 32 -> 64 -> 64 channels, then halve the spatial size.
        x = self.batch_norm3(relu(self.conv3(x)))
        x = self.pool(self.batch_norm4(relu(self.conv4(x))))
        x = self.dropout2(x)
        # Stage 3: 64 -> 128 -> 128 channels, then halve the spatial size.
        x = self.batch_norm5(relu(self.conv5(x)))
        x = self.pool(self.batch_norm6(relu(self.conv6(x))))
        x = self.dropout3(x)
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = relu(self.fc1(x))
        x = self.dropout4(x)
        logits = self.fc2(x)
        # Use the module's own criterion instead of a second, functional copy of the same loss.
        return self.criterion(logits, target), logits

def compute_metrics(logits):
    """Compute top-1 accuracy against the labels of the global ``testset``.

    Args:
        logits: Indexable object whose element 0 holds the per-sample class
            scores (one row per sample, one column per class).

    Returns:
        A dict with the single key ``'accuracy'``: the fraction of rows whose
        highest-scoring column equals the matching entry of
        ``testset.targets``.
    """
    scores = torch.tensor(logits[0])
    # Labels always come from the global test set, not from the argument.
    labels = torch.tensor(testset.targets)
    predicted = torch.max(scores, 1).indices
    n_correct = (predicted == labels).sum().item()
    return {'accuracy': n_correct / len(testset.targets)}

def collator(X):
    """Collate ``(image, label)`` samples into a keyword batch.

    Args:
        X: Sequence of samples; element 0 of each sample is an image tensor
            and element 1 is its integer class label.

    Returns:
        A dict with ``'input'`` (the images stacked along a new leading batch
        dimension) and ``'target'`` (a 1-D ``torch.int64`` tensor of labels).
    """
    # Build the labels as int64 directly; going through the float32
    # ``torch.Tensor`` constructor rounds integers above 2**24.
    labels = torch.tensor([sample[1] for sample in X], dtype=torch.int64)
    images = torch.stack([sample[0] for sample in X])
    return {'input': images, 'target': labels}

Files already downloaded and verified
Files already downloaded and verified


# Normal Model

In [3]:
# Normal Models (hyperparameters tuned)

# Best dropout probability p searched (number of epochs fixed at 10)
# p=0: 85.39%  85.15% again
# p=0.1: 85.81%
# p=0.14: 86.56% best!
# p=0.15: 86.14% 
# p=1/6: 86.09%
# p=0.18: 86.54%
# p=0.2: 86.25%
# p=0.225: 85.81%
# p=0.25: 85.79%
# p=0.4: 84.43%

## Best max number of epochs searched
# 74.2%  5 ep # Max_N_ep 512
# 72.4%  3 ep # Max_N_ep 256
# 70.5%  4 ep # Max_N_ep 128
# 74.0%  6 ep # Max_N_ep 64
# 79.9% 13 ep # Max_N_ep 32
# 80.5%  7 ep # Max_N_ep 16
# 83.9% 11 ep # Max_N_ep 12
# 85.4% 10 ep # Max_N_ep 10    ->   BEST
# 85.1%  8 ep # Max_N_ep 8
# 84.5%  6 ep # Max_N_ep 6
# 82.8%  4 ep # Max_N_ep 4
# 74.1%  2 ep # Max_N_ep 2
# 61.2%  1 ep # Max_N_ep 1

## Best weight decay searched
# 18.0% 90.19/4 sec # WD 16
# 78.7% 216.0/10sec # WD 4 unstable x
# 65.4% 87.13/4 sec # WD 1
# 85.7% 208.3/10sec # WD 4/5 fp16
# 85.8% 217.6/10sec # WD 3/4 slightly unstable x
# 85.6% 209.3/10sec # WD 2/3 fp16
# 85.4% 220.6/10sec # WD 1/2  ->   FIX to 1/2
# 82.9% 177.6/8 sec # WD 1/3
# 84.2% 195.5/9 sec # WD 1/4
# 83.4% 173.9/8 sec # WD 1/6
# 84.2% 194.5/9 sec # WD 1/4/2
# 81.9% 154.2/7 sec # WD 1/4/4
# 81.3% 153.6/7 sec # WD 1/4/4/4
# 82.3% 176.5/8 sec # WD 1/4/4/4/4/4
# 82.2% 154.0/7 sec # WD 0   weight_decay=0
# 82.0% 150.3/7 sec # no WD specified

## Best learning rate searched
# 79.6% 175.8/8 sec # lr20e-3
# 81.4% 176.4/8 sec # lr14e-3
# 81.5% 177.8/8 sec # lr9e-3
# 82.2% 154.3/7 sec # lr7e-3 again   ->   FIX to 7e-3
# 82.3% 177.8/8 sec # lr5e-3
# 81.7% 150.8/7 sec # lr3e-3
# 80.7% 106.2/5 sec # lr1e-3
# 79.2% 150.7/7 sec # lr0.5e-3
# 75.2% 152.4/7 sec # lr0.25e-3
# 71.5% 220.6/10 sec # lr0.1e-3

## Best number of filters searched
# 79.8% 165.1/8 sec 0.4G # 16-32-64 x

## Best number of neurons in hidden layer searched  ## default: Ep10, WD1/4/4/4/4, lr7e-3, 32-64-128
#x10.0% for 4 neurons 130.1/6 sec 0.7G
#x80.4% for 8 neurons 174.2/8 sec 0.7G or 10.0%
# 81.8% for 16 neurons 176.7/8 sec 0.7G
# 81.6% for 32 neurons 153.7/7 sec
# 82.1% for 64 neurons 175.8/8 sec   ->   FIX to 64 neurons
# 80.9% for 128 neurons ?sec
# 81.8% for 256 neurons ?sec
# 82.3% for 512 neurons ?sec
# 81.8% for 1024 neurons 172/8 sec
# 82.3% for 2048 neurons 183/9 sec
# 80.9% for 4096 neurons 183.8/9 sec

In [34]:
# p=1/6
model = Net().to(device)

# Single 10-epoch run; evaluate, log and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir='./test',
    num_train_epochs=10,
    learning_rate=7e-3,
    weight_decay=1/2,
    lr_scheduler_type="linear",
    fp16=True,
    per_device_train_batch_size=B,
    per_device_eval_batch_size=B,
    eval_strategy='epoch',
    logging_strategy="epoch",
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=testset,
    data_collator=collator,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric has not improved for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)],
)
# Set by hand; presumably needed because Net.forward takes `target` rather than
# a `labels` argument - verify against the installed transformers version.
trainer.can_return_loss = True
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.5639,1.27338,0.5348
2,1.0802,0.901626,0.6868
3,0.8497,0.866347,0.703
4,0.7288,0.796626,0.7207
5,0.6353,0.647749,0.782
6,0.5674,0.617481,0.7896
7,0.4966,0.551259,0.8121
8,0.4201,0.486332,0.8356
9,0.3361,0.453437,0.8454
10,0.2494,0.419907,0.8609


TrainOutput(global_step=1960, training_loss=0.6927466820697395, metrics={'train_runtime': 196.6962, 'train_samples_per_second': 2541.991, 'train_steps_per_second': 9.965, 'total_flos': 0.0, 'train_loss': 0.6927466820697395, 'epoch': 10.0})

In [40]:
# p=0
model = Net().to(device)

# Single 10-epoch run; evaluate, log and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir='./test',
    num_train_epochs=10,
    learning_rate=7e-3,
    weight_decay=1/2,
    lr_scheduler_type="linear",
    fp16=True,
    per_device_train_batch_size=B,
    per_device_eval_batch_size=B,
    eval_strategy='epoch',
    logging_strategy="epoch",
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=testset,
    data_collator=collator,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric has not improved for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)],
)
# Set by hand; presumably needed because Net.forward takes `target` rather than
# a `labels` argument - verify against the installed transformers version.
trainer.can_return_loss = True
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.5216,1.167548,0.5703
2,0.9051,0.905943,0.6863
3,0.6968,0.866236,0.6969
4,0.5858,0.77848,0.7348
5,0.493,0.725188,0.7549
6,0.4128,0.607915,0.7976
7,0.3291,0.54116,0.8178
8,0.2284,0.500386,0.8378
9,0.1277,0.507056,0.844
10,0.0391,0.499806,0.8515


TrainOutput(global_step=1960, training_loss=0.5339338682135757, metrics={'train_runtime': 205.7295, 'train_samples_per_second': 2430.376, 'train_steps_per_second': 9.527, 'total_flos': 0.0, 'train_loss': 0.5339338682135757, 'epoch': 10.0})

# Forgetter (without the last drop for now; try the last drop in future work)

In [None]:
## Forgetter with l=7ep fixed
# p=0:   87.05% @10th
# p=0.1: 87.51% @12th(max)
# p=0.14: 87.51% @7th
# p=0.15: 87.34%@6th
# p=1/6:    again   87.81@11th            87.96%@12th(max) without data augmentation
# p=0.18: 87.70% @12th(max)
# p=0.2: 87.68@12th(max)
# p=0.225: 87.79@8th
# p=0.25: 87.30% @7th
# p=0.4: 85.58% @10th

## p=0
# l= 1ep 84.49% @21st/36
# l= 3ep 86.40% @9th
# l= 5ep 86.71% @11th
# l= 7ep: 87.05% @10th
# l= 9ep: 86.92% @6th
# l=11ep: 86.88% @4th
# l=13ep: 86.88% @8th
# l=15ep: 86.62% @2nd
# l=17ep: 85.90% @4th
# l=19ep: 86.96% @5th

In [18]:
# p=1/6: 87.96%@12th without data augmentation
model = Net().to(device)
torch.cuda.empty_cache()

# Train the same model in 12 consecutive 7-epoch runs. Each run builds a new
# TrainingArguments/Trainer pair, so the training setup is recreated every time.
for i in range(4 * 3):
    training_args = TrainingArguments(
        output_dir='./test',
        num_train_epochs=7,
        learning_rate=7e-3,
        weight_decay=1/2,
        lr_scheduler_type="linear",
        fp16=True,
        per_device_train_batch_size=B,
        per_device_eval_batch_size=B,
        eval_strategy='epoch',
        logging_strategy="epoch",
        save_strategy='epoch',
        load_best_model_at_end=True,
        report_to='none',
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=trainset,
        eval_dataset=testset,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )
    # Set by hand; presumably needed because Net.forward takes `target` rather
    # than a `labels` argument - verify against the installed transformers version.
    trainer.can_return_loss = True
    trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.5523,1.305877,0.5359
2,1.0241,0.926753,0.6784
3,0.8062,0.720201,0.7501
4,0.6783,0.650388,0.7775
5,0.5695,0.556564,0.8116
6,0.4565,0.509839,0.8299
7,0.3484,0.444927,0.853


Epoch,Training Loss,Validation Loss,Accuracy
1,0.735,0.761184,0.7391
2,0.6561,0.767646,0.746
3,0.5962,0.616969,0.7863
4,0.5336,0.648708,0.7818
5,0.4554,0.543349,0.8166
6,0.3688,0.47786,0.8433
7,0.2642,0.412849,0.8632


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7306,0.763553,0.7409
2,0.6421,0.769258,0.7355
3,0.5784,0.661447,0.7795
4,0.5086,0.623779,0.7878
5,0.4377,0.556988,0.8147
6,0.3517,0.453311,0.8497
7,0.246,0.402951,0.866


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6976,0.946135,0.6884
2,0.6202,0.753916,0.7407
3,0.553,0.653013,0.782
4,0.4944,0.645014,0.7849
5,0.4249,0.511852,0.8287
6,0.34,0.418717,0.8615
7,0.2378,0.392263,0.8711


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6935,0.850486,0.7116
2,0.6092,0.669669,0.7674
3,0.5538,0.729484,0.7647
4,0.4987,0.598517,0.7943
5,0.4204,0.48808,0.8338
6,0.3344,0.443566,0.8524
7,0.2364,0.387291,0.874


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6905,0.896907,0.6938
2,0.6113,0.673291,0.7672
3,0.5533,0.630646,0.7856
4,0.4877,0.643037,0.7805
5,0.419,0.517796,0.8299
6,0.3319,0.437361,0.8518
7,0.2329,0.377234,0.8726


Epoch,Training Loss,Validation Loss,Accuracy
1,0.691,0.756502,0.7389
2,0.6003,0.7374,0.7515
3,0.55,0.61999,0.7887
4,0.4865,0.558711,0.811
5,0.4056,0.498798,0.8333
6,0.3255,0.418786,0.8601
7,0.2251,0.382255,0.876


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6713,0.827838,0.7252
2,0.6084,0.848556,0.7111
3,0.547,0.71186,0.7584
4,0.484,0.60269,0.8015
5,0.4067,0.522167,0.8284
6,0.3211,0.414746,0.8618
7,0.2228,0.37991,0.8777


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6648,0.890706,0.6909
2,0.597,0.925404,0.7007
3,0.5432,0.621693,0.7878
4,0.486,0.661027,0.7796
5,0.4069,0.48084,0.8362
6,0.3204,0.438217,0.8542
7,0.2228,0.38763,0.8733


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6807,0.922429,0.6861
2,0.6015,0.776766,0.7372
3,0.5406,0.839467,0.7295
4,0.4822,0.691438,0.7733
5,0.4029,0.558677,0.8172
6,0.325,0.41335,0.8611
7,0.2214,0.383404,0.8734


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6798,0.738306,0.7484
2,0.6044,0.959916,0.6922
3,0.529,0.665175,0.7803
4,0.481,0.575141,0.8037
5,0.4089,0.531933,0.8187
6,0.3196,0.479334,0.8414
7,0.2199,0.381543,0.8752


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6892,0.929882,0.7056
2,0.6015,0.814252,0.7267
3,0.5383,0.696872,0.7596
4,0.4758,0.564464,0.8107
5,0.4046,0.532331,0.8289
6,0.3166,0.412615,0.8594
7,0.2171,0.375071,0.8796


In [38]:
# p=0: 86.66%
model = Net().to(device)
torch.cuda.empty_cache()

# Train the same model in 12 consecutive 7-epoch runs. Each run builds a new
# TrainingArguments/Trainer pair, so the training setup is recreated every time.
for i in range(4 * 3):
    training_args = TrainingArguments(
        output_dir='./test',
        num_train_epochs=7,
        learning_rate=7e-3,
        weight_decay=1/2,
        lr_scheduler_type="linear",
        fp16=True,
        per_device_train_batch_size=B,
        per_device_eval_batch_size=B,
        eval_strategy='epoch',
        logging_strategy="epoch",
        save_strategy='epoch',
        load_best_model_at_end=True,
        report_to='none',
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=trainset,
        eval_dataset=testset,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )
    # Set by hand; presumably needed because Net.forward takes `target` rather
    # than a `labels` argument - verify against the installed transformers version.
    trainer.can_return_loss = True
    trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.4731,1.237151,0.5668
2,0.9152,1.007223,0.6523
3,0.693,0.759306,0.7322
4,0.5572,0.716968,0.7556
5,0.4415,0.577269,0.8025
6,0.3111,0.506118,0.8286
7,0.1617,0.492942,0.8403


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6237,0.8095,0.7212
2,0.5541,0.754241,0.7445
3,0.4842,0.683308,0.7665
4,0.4123,0.661162,0.7719
5,0.3203,0.587605,0.8074
6,0.2044,0.492132,0.8441
7,0.078,0.473661,0.8549


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6136,0.999666,0.6841
2,0.5352,0.716151,0.7633
3,0.4451,0.678258,0.776
4,0.3781,0.66488,0.7846
5,0.2914,0.538364,0.822
6,0.1808,0.482828,0.8413
7,0.0655,0.467647,0.8562


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6048,0.749729,0.7414
2,0.5094,0.674041,0.7732
3,0.4396,0.771994,0.7375
4,0.365,0.656201,0.7878
5,0.2718,0.517912,0.8254
6,0.1639,0.485493,0.8477
7,0.0581,0.463024,0.8635


Epoch,Training Loss,Validation Loss,Accuracy
1,0.572,0.689065,0.7678
2,0.4993,0.65806,0.776
3,0.4253,0.641545,0.7848
4,0.3531,0.587337,0.8078
5,0.2671,0.56897,0.8149
6,0.1595,0.48132,0.8478
7,0.0555,0.451046,0.8648


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5575,0.736368,0.7564
2,0.4916,0.633143,0.7849
3,0.4231,0.657486,0.7755
4,0.347,0.61097,0.8038
5,0.2569,0.498411,0.8336
6,0.1545,0.493884,0.8465
7,0.0529,0.462189,0.8628


Epoch,Training Loss,Validation Loss,Accuracy
1,0.582,0.71717,0.7572
2,0.5001,0.681173,0.7662
3,0.4262,0.692941,0.77
4,0.3508,0.620752,0.7975
5,0.2573,0.532296,0.8246
6,0.1508,0.457005,0.8562
7,0.0495,0.46454,0.8633


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5538,0.679087,0.7642
2,0.4796,0.623457,0.7849
3,0.4111,0.682461,0.7708
4,0.3368,0.597549,0.8038
5,0.2466,0.517347,0.8332
6,0.1481,0.484265,0.8476
7,0.0503,0.468071,0.8612


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5647,0.92138,0.689
2,0.4845,0.769248,0.7421
3,0.4175,0.692958,0.7686
4,0.3379,0.554973,0.8133
5,0.2516,0.550261,0.8174
6,0.1463,0.487148,0.8509
7,0.0472,0.457577,0.8666


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5532,0.935497,0.6852
2,0.4846,0.809507,0.7376
3,0.415,0.772614,0.7444
4,0.3357,0.600651,0.7997
5,0.2496,0.550885,0.8229
6,0.1419,0.493474,0.847
7,0.048,0.460799,0.8656


Epoch,Training Loss,Validation Loss,Accuracy
1,0.567,0.788326,0.7271
2,0.4789,0.796516,0.7258
3,0.4067,0.77204,0.7327
4,0.3324,0.557793,0.811
5,0.248,0.56535,0.8186
6,0.1419,0.517967,0.8443
7,0.0495,0.458845,0.8659


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5817,0.77361,0.7332
2,0.4896,0.661982,0.7794
3,0.4118,0.748099,0.7483
4,0.337,0.640041,0.7839
5,0.2524,0.643277,0.804
6,0.1477,0.486543,0.8512
7,0.0503,0.465223,0.8624
