<a href="https://colab.research.google.com/github/kode-git/ViT-emotion-recognition/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 5. Loading the final dataset

In [6]:
!pip install timm
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import timm
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

Collecting timm
  Downloading timm-0.5.4-py3-none-any.whl (431 kB)
[?25l[K     |▊                               | 10 kB 16.2 MB/s eta 0:00:01[K     |█▌                              | 20 kB 15.3 MB/s eta 0:00:01[K     |██▎                             | 30 kB 9.6 MB/s eta 0:00:01[K     |███                             | 40 kB 8.9 MB/s eta 0:00:01[K     |███▉                            | 51 kB 4.5 MB/s eta 0:00:01[K     |████▋                           | 61 kB 5.3 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 5.5 MB/s eta 0:00:01[K     |██████                          | 81 kB 5.7 MB/s eta 0:00:01[K     |██████▉                         | 92 kB 6.3 MB/s eta 0:00:01[K     |███████▋                        | 102 kB 5.1 MB/s eta 0:00:01[K     |████████▍                       | 112 kB 5.1 MB/s eta 0:00:01[K     |█████████▏                      | 122 kB 5.1 MB/s eta 0:00:01[K     |█████████▉                      | 133 kB 5.1 MB/s eta 0:00:01[K     |█

In [None]:
# Input pipeline configuration.
# Training images are randomly cropped and flipped (augmentation);
# validation images are only resized. Both splits are converted to
# tensors and normalised with the same per-channel mean / std.
input_size = (224, 224)
batch_size = 10
data_dir = ""

# Per-channel mean and standard deviation shared by both splits.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(input_size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

val_transform = transforms.Compose([
    transforms.Resize(input_size),
    transforms.CenterCrop(input_size),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

data_transforms = {'train': train_transform, 'val': val_transform}

print("Initializing Datasets and Dataloaders...")

# One ImageFolder per split, read from <data_dir>/train and <data_dir>/val.
image_datasets = {
    split: datasets.ImageFolder(os.path.join(data_dir, split), data_transforms[split])
    for split in ['train', 'val']
}

# One shuffled DataLoader per split, keyed like the datasets above.
dataloaders_dict = {
    split: torch.utils.data.DataLoader(
        image_datasets[split],
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
    )
    for split in ['train', 'val']
}

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Loading the pretrained model
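The Vision Transformer (ViT) used here follows the model proposed in [An Image is Worth 16x16 Words](https://arxiv.org/abs/2010.11929). In that paper the input sequence can be formed either from raw image patches or, in the hybrid variant, from the feature maps of a CNN (LeCun et al., 1989); in the hybrid model, the patch embedding projection is applied to patches extracted from a CNN feature map. The `vit_base_patch16_224` model loaded below is the plain (non-hybrid) variant: its patch embedding is a 16×16 convolution applied directly to the RGB image.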

In [44]:
# Number of target classes; the classification head is replaced with a layer
# of this output size in a later cell.
NUM_CLASSES = 8
# Build the 'vit_base_patch16_224' architecture through timm and request its
# pretrained weights (pretrained=True).
model = timm.create_model('vit_base_patch16_224', pretrained=True)

Showing the model structure 

In [29]:
# eval() switches the module to evaluation mode and returns the module itself,
# so leaving it as the cell's last expression displays the architecture.
model.eval()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU()
        (drop1): Dropout(p=0.0, inplace=False)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,),

To fine-tune the model, its last layer has to be replaced: the line below swaps the pretrained classification head for a new linear layer with 8 outputs, one per class.

In [30]:
# Replace the pretrained classification head with a freshly initialised linear
# layer that has NUM_CLASSES outputs. The input width is read from the head
# being replaced rather than hard-coded, so the cell keeps working if a
# backbone with a different embedding size is loaded (and it stays idempotent
# when re-run, since the new head has the same in_features).
model.head = nn.Linear(model.head.in_features, NUM_CLASSES)
# eval() returns the model itself, so the updated architecture is displayed.
model.eval()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU()
        (drop1): Dropout(p=0.0, inplace=False)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,),

## Trainer function
To train the model, we define a trainer function. Its main purpose is to handle the training and validation phases of every epoch.

In [37]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
    """Train ``model`` and restore the weights of its best validation epoch.

    Every epoch runs a 'train' pass (gradients enabled, optimizer stepped)
    followed by a 'val' pass (gradients disabled). After the last epoch the
    weights that reached the highest validation accuracy are loaded back
    into the model.

    Args:
        model: ``torch.nn.Module`` to train; it is updated in place.
        dataloaders: mapping with the keys ``'train'`` and ``'val'``. Each
            value yields ``(inputs, labels)`` batches and exposes a
            ``.dataset`` attribute supporting ``len()``.
        criterion: callable ``criterion(outputs, labels)`` returning a
            scalar loss tensor.
        optimizer: optimizer that updates the model's parameters.
        num_epochs: number of epochs to run.
        is_inception: unused; kept so existing calls keep working.

    Returns:
        ``(model, val_acc_history)`` where ``val_acc_history`` holds one
        0-dim double tensor (validation accuracy) per epoch.
    """
    since = time.time()

    # Move batches to the device the model already lives on instead of
    # relying on a notebook-global ``device`` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)

                # Gradients only need clearing when a backward pass follows.
                if is_training:
                    optimizer.zero_grad()

                # Track history only in the training phase.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # Predicted class = index of the largest output score.
                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # statistics: the loss is weighted by the batch size so that
                # dividing by the dataset size below yields a per-sample mean.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels)

            num_samples = len(dataloaders[phase].dataset)
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects.double() / num_samples

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'val':
                # Snapshot the weights whenever validation accuracy improves.
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                val_acc_history.append(epoch_acc)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    # Same 4-decimal precision as the per-epoch accuracy line above.
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history

## Optimizer
Now that the model structure is correct, the final step for finetuning and feature extracting is to create an optimizer that only updates the desired parameters.

In [40]:
# Detect if we have a GPU available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Send the model to the selected device
model_ft = model.to(device)
feature_extract=False

# Gather the parameters to be optimized/updated in this run. When finetuning,
# every parameter has requires_grad set and is therefore updated; with the
# feature-extract method only the parameters that still require gradients
# (i.e. the newly initialised ones) are kept.
# A list is used instead of the raw parameters() generator: the generator is
# exhausted once the optimizer consumes it, which would leave an empty
# iterable behind if the optimizer line were re-run on its own.
params_to_update = [param for param in model_ft.parameters() if param.requires_grad]
print("Params to learn:")

for name, param in model_ft.named_parameters():
    if param.requires_grad:
        print("\t",name)

# Stochastic gradient descent with momentum over the selected parameters.
optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

Params to learn:
	 cls_token
	 pos_embed
	 patch_embed.proj.weight
	 patch_embed.proj.bias
	 blocks.0.norm1.weight
	 blocks.0.norm1.bias
	 blocks.0.attn.qkv.weight
	 blocks.0.attn.qkv.bias
	 blocks.0.attn.proj.weight
	 blocks.0.attn.proj.bias
	 blocks.0.norm2.weight
	 blocks.0.norm2.bias
	 blocks.0.mlp.fc1.weight
	 blocks.0.mlp.fc1.bias
	 blocks.0.mlp.fc2.weight
	 blocks.0.mlp.fc2.bias
	 blocks.1.norm1.weight
	 blocks.1.norm1.bias
	 blocks.1.attn.qkv.weight
	 blocks.1.attn.qkv.bias
	 blocks.1.attn.proj.weight
	 blocks.1.attn.proj.bias
	 blocks.1.norm2.weight
	 blocks.1.norm2.bias
	 blocks.1.mlp.fc1.weight
	 blocks.1.mlp.fc1.bias
	 blocks.1.mlp.fc2.weight
	 blocks.1.mlp.fc2.bias
	 blocks.2.norm1.weight
	 blocks.2.norm1.bias
	 blocks.2.attn.qkv.weight
	 blocks.2.attn.qkv.bias
	 blocks.2.attn.proj.weight
	 blocks.2.attn.proj.bias
	 blocks.2.norm2.weight
	 blocks.2.norm2.bias
	 blocks.2.mlp.fc1.weight
	 blocks.2.mlp.fc1.bias
	 blocks.2.mlp.fc2.weight
	 blocks.2.mlp.fc2.bias
	 blocks.3.norm

## Training phase
The loss function is set to categorical cross-entropy, which matches the output shape of the model (one score per class).

In [None]:
# Loss function and number of training epochs.
criterion = nn.CrossEntropyLoss()
num_epochs = 10

# Run the train/validation loop. train_model returns the model together with
# the validation accuracy recorded after each epoch.
model_ft, hist = train_model(
    model=model_ft,
    dataloaders=dataloaders_dict,
    criterion=criterion,
    optimizer=optimizer_ft,
    num_epochs=num_epochs,
    is_inception=False,
)

# Persist the trained weights (state_dict only) for the inference phase.
trained_weights = model_ft.state_dict()
torch.save(trained_weights, "/content/data")