# 1. Baseline

## Libraries

In [1]:
"""
!pip install torch
!pip install easydict
!pip install torch
!pip install torchvision
!pip install opencv-python
"""

'\n!pip install torch\n!pip install easydict\n!pip install torch\n!pip install torchvision\n!pip install opencv-python\n'

In [2]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Get the CUDA version
    cuda_version = torch.version.cuda
    print(f"CUDA version: {cuda_version}")
else:
    print("CUDA is not available. Install CUDA and try again.")

CUDA version: 12.1


In [None]:

import os
from pathlib import Path
from tqdm import tqdm # used for visualising the training part
from easydict import EasyDict as edict

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.utils as vutils
from torchvision import transforms as trans

from data.ms1m import get_train_loader
from data.lfw import LFW

from backbone.arcfacenet import SEResNet_IR
from margin.ArcMarginProduct import ArcMarginProduct

from util.utils import save_checkpoint, test


## Configuration

In [None]:
conf = edict()
conf.train_root = "./dataset/MS1M"
conf.lfw_test_root = "./dataset/lfw_aligned_112"
conf.lfw_file_list = "./dataset/lfw_pair.txt"

conf.mode = "se_ir" #  model mode. ir based on resnet , se_ir uses se blocks in addition to the ir blocks
conf.depth = 50 # we can use 50, 100 or 152. its inside the arcfacenet.py file. increase for better result
conf.margin_type ="ArcFace"
conf.feature_dim = 512
conf.scale_size = 32.0
conf.batch_size = 48 # if the memory of your video card is less than 6 gb you can use it as 16 we use multiples of 16. optimal value = 96
conf.lr = 0.005 # learning rate, if we decrease the batch_size we need to decrease the lr at the same time, optimal value = 0.01 
conf.milestones = [8,10,12] 
conf.total_epoch = 14

conf.save_folder = "./saved"
conf.save_dir = os.path.join(conf.save_folder, conf.mode +"_" + str(conf.depth)) # ./saved/se_ir_50
conf.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
conf.num_workers = 4 # we enter how many workers we want to use to load the gpu data
conf.pin_memory = True # allows us to get faster results during training with gpu

In [5]:
os.makedirs(conf.save_dir, exist_ok = True)

## Data Loader

In [None]:
transform = trans.Compose([  #  if there is more than one transformation, it combines them
    trans.ToTensor(), # converts variables to tensor, from range [0,255] to [0.0,1.0]
    trans.Normalize(mean = (0.5,0.5,0.5), std = (0.5,0.5,0.5)) # rgb value. optimal value = 0.5
])

trainloader , class_num = get_train_loader(conf) #  inside the ms1m.py file

In [None]:
print("number of id :", class_num) # 200 different person's photos

number of id : 200


In [8]:
print(trainloader.dataset)

Dataset ImageFolder
    Number of datapoints: 29148
    Root location: ./dataset/MS1M
    StandardTransform
Transform: Compose(
               RandomHorizontalFlip(p=0.5)
               ToTensor()
               Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
           )


In [None]:
lfwdataset = LFW(conf.lfw_test_root, conf.lfw_file_list, transform = transform)  # apply the transform transformation to the images with the conf.lfw_file_list label in conf.lfw_test_root

lfwloader = torch.utils.data.DataLoader(lfwdataset, batch_size = 128, num_workers = conf.num_workers) # load the data

# Model

In [10]:
print(conf.device)

cuda:0


In [None]:
net = SEResNet_IR(conf.depth, feature_dim = conf.feature_dim, mode = conf.mode).to(conf.device) # model
margin = ArcMarginProduct(conf.feature_dim, class_num).to(conf.device) # we use ArcFace, ArcFace implements the loss function.


In [12]:
print(net)

SEResNet_IR(
  (input_layer): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): PReLU(num_parameters=64)
  )
  (output_layer): Sequential(
    (0): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Dropout(p=0.4, inplace=False)
    (2): Flatten()
    (3): Linear(in_features=25088, out_features=512, bias=True)
    (4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (body): Sequential(
    (0): BottleNeck_IR_SE(
      (shortcut_layer): MaxPool2d(kernel_size=1, stride=2, padding=0, dilation=1, ceil_mode=False)
      (res_layer): Sequential(
        (0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (2): BatchNorm2d(64, eps=1e-05, moment

In [13]:
criterion = nn.CrossEntropyLoss()

In [None]:
optimizer = optim.SGD([
    {"params": net.parameters(), "weight_decay":5e-4}, # wight_decay used to prevent overfitting
    {"params": margin.parameters(), "weight_decay":5e-4}
], lr = conf.lr, momentum = 0.9, nesterov = True)

In [15]:
print(optimizer, flush  = True)

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.005
    maximize: False
    momentum: 0.9
    nesterov: True
    weight_decay: 0.0005

Parameter Group 1
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.005
    maximize: False
    momentum: 0.9
    nesterov: True
    weight_decay: 0.0005
)


In [16]:
def schedule_lr():
    for params in optimizer.param_groups:
        params["lr"]/=10
    print(optimizer)

## Train

In [None]:
best_acc = 0 
for epoch in range(1, conf.total_epoch+1):
    net.train()

    print("epoch {}/{}".format(epoch, conf.total_epoch), flush = True)

    if epoch == conf.milestones[0]: #8.epoch
        schedule_lr()
    if epoch == conf.milestones[1]: #10.epoch
        schedule_lr()
    if epoch == conf.milestones[2]: #12.epoch
        schedule_lr()
# when you run the above again, the lr will drop again, not from the initial value but from the lowest value, so you need to reload the model and the optimizer
    for data in tqdm(trainloader): 
        img, label = data[0].to(conf.device), data[1].to(conf.device)
        optimizer.zero_grad() # We reset the gradients of the optimizer. We reset the errors in the Backward phase so that they do not overlap

        logits = net(img) # we send the images to the model and output them, we have a vector of (96,512)
        output = margin(logits,label)
        total_loss = criterion(output, label)
        total_loss.backward()
        optimizer.step()

    #test
    net.eval()
    lfw_acc = test(conf, net,lfwdataset, lfwloader)
    print("\nLFW: {:.4f} | train_loss: {:.4f}\n".format(lfw_acc, total_loss.item()))
    is_best = lfw_acc > best_acc
    best_acc = max(lfw_acc, best_acc)
    save_checkpoint({
        "epoch":epoch,
        "net_state_dict": net.state_dict(),
        "margin_state_dict":margin.state_dict(),
        "best_acc": best_acc
    }, is_best,checkpoint = conf.save_dir)

epoch 1/14


100%|██████████| 608/608 [03:59<00:00,  2.53it/s]



LFW: 0.7770 | train_loss: 14.3787

best model saved

epoch 2/14


100%|██████████| 608/608 [04:31<00:00,  2.24it/s]



LFW: 0.7910 | train_loss: 11.4155

best model saved

epoch 3/14


100%|██████████| 608/608 [04:31<00:00,  2.24it/s]



LFW: 0.8322 | train_loss: 7.6531

best model saved

epoch 4/14


100%|██████████| 608/608 [03:48<00:00,  2.67it/s]



LFW: 0.8385 | train_loss: 6.7124

best model saved

epoch 5/14


100%|██████████| 608/608 [04:12<00:00,  2.41it/s]



LFW: 0.8395 | train_loss: 3.4643

best model saved

epoch 6/14


100%|██████████| 608/608 [05:06<00:00,  1.98it/s]



LFW: 0.8512 | train_loss: 5.3998

best model saved

epoch 7/14


100%|██████████| 608/608 [04:08<00:00,  2.44it/s]



LFW: 0.8583 | train_loss: 1.7992

best model saved

epoch 8/14
SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.0005
    maximize: False
    momentum: 0.9
    nesterov: True
    weight_decay: 0.0005

Parameter Group 1
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.0005
    maximize: False
    momentum: 0.9
    nesterov: True
    weight_decay: 0.0005
)


100%|██████████| 608/608 [04:09<00:00,  2.44it/s]



LFW: 0.8635 | train_loss: 1.9196

best model saved

epoch 9/14


100%|██████████| 608/608 [04:08<00:00,  2.44it/s]



LFW: 0.8662 | train_loss: 1.6392

best model saved

epoch 10/14
SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 5e-05
    maximize: False
    momentum: 0.9
    nesterov: True
    weight_decay: 0.0005

Parameter Group 1
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 5e-05
    maximize: False
    momentum: 0.9
    nesterov: True
    weight_decay: 0.0005
)


100%|██████████| 608/608 [04:08<00:00,  2.45it/s]



LFW: 0.8623 | train_loss: 0.8672

epoch 11/14


100%|██████████| 608/608 [04:08<00:00,  2.45it/s]



LFW: 0.8635 | train_loss: 1.4873

epoch 12/14
SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 5e-06
    maximize: False
    momentum: 0.9
    nesterov: True
    weight_decay: 0.0005

Parameter Group 1
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 5e-06
    maximize: False
    momentum: 0.9
    nesterov: True
    weight_decay: 0.0005
)


100%|██████████| 608/608 [04:08<00:00,  2.44it/s]



LFW: 0.8638 | train_loss: 1.8121

epoch 13/14


100%|██████████| 608/608 [04:08<00:00,  2.44it/s]



LFW: 0.8638 | train_loss: 2.2188

epoch 14/14


100%|██████████| 608/608 [04:09<00:00,  2.44it/s]



LFW: 0.8637 | train_loss: 1.3310



In [None]:
"""
SOTA: The state of the art (CASIA)

1. download all MS1M dataset
2. use conf.mode = "ir"
3. use conf.depth = "100"
4. use conf.total_epoch = 20
5. use conf.milestones = [12,16,18]


"""