<a href="https://colab.research.google.com/github/kongwanbianjinyu/Deep-Learning-Tutorial/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive under /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
## Standard libraries
import os
import numpy as np
import random
import math
import json
from functools import partial

## Imports for plotting
import matplotlib.pyplot as plt
plt.set_cmap('cividis')
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgb
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
import seaborn as sns
sns.reset_orig()

## tqdm for loading bars
from tqdm.notebook import tqdm

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

## Torchvision
import torchvision
from torchvision.datasets import CIFAR100
from torchvision import transforms

# PyTorch Lightning
try:
    import pytorch_lightning as pl
except ModuleNotFoundError: # Google Colab does not have PyTorch Lightning installed by default. Hence, we do it here if necessary
    !pip install --quiet pytorch-lightning>=1.4
    import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

# Path to the folder where the datasets are/should be downloaded (CIFAR100 is loaded from here below).
# NOTE(review): the path is relative, so it resolves against the current working directory
# (presumably /content on Colab, next to the Drive mount) - confirm.
DATASET_PATH = "drive/MyDrive/deep learning tutorial/data"
# Path to the folder where the pretrained models (and cached ResNet features) are saved
CHECKPOINT_PATH = "drive/MyDrive/deep learning tutorial/saved_models/transformer"

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)


Global seed set to 42


Device: cuda:0


In [3]:
import urllib.request
from urllib.error import HTTPError
# GitHub location that hosts the pretrained checkpoints of this tutorial
base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial6/"
# Checkpoint files that should be present locally
pretrained_files = ["ReverseTask.ckpt", "SetAnomalyTask.ckpt"]

# Make sure the checkpoint folder exists before writing into it
os.makedirs(CHECKPOINT_PATH, exist_ok=True)

# Download every checkpoint that is not on disk yet; existing files are left untouched.
for file_name in pretrained_files:
    file_path = os.path.join(CHECKPOINT_PATH, file_name)
    # A file name containing a sub-folder needs that folder created first
    if "/" in file_name:
        os.makedirs(file_path.rsplit("/",1)[0], exist_ok=True)
    if os.path.isfile(file_path):
        continue
    file_url = base_url + file_name
    print(f"Downloading {file_url}...")
    try:
        urllib.request.urlretrieve(file_url, file_path)
    except HTTPError as e:
        # Best effort: report the failure and carry on with the next file
        print("Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", e)

# Attention

In [4]:
def scaled_dot_product_attention(q,k,v, mask = None):
  """Compute softmax(q @ k^T / sqrt(d)) @ v over the last two dimensions.

  Args:
    q, k, v: tensors whose last two dimensions are [seq_len, hidden_dim]
      (called below with shape [batch, num_heads, seq_len, hidden_dim]).
    mask: optional tensor broadcastable to the logits [..., seq_len, seq_len].
      Positions where ``mask == 0`` are suppressed; non-zero positions are kept.

  Returns:
    (values, attention): the attention-weighted values and the attention
    weights, whose rows sum to one.
  """
  d_k = q.shape[-1]
  # Pairwise query/key similarity, scaled by sqrt(d_k): [..., seq_len, seq_len]
  scores = torch.matmul(q, k.transpose(-2, -1))
  scores = scores / math.sqrt(d_k)
  if mask is not None:
      # A very negative logit becomes a ~0 weight after the softmax
      scores = scores.masked_fill(mask == 0, -9e15)
  # Normalise every query row into a distribution over the keys
  weights = F.softmax(scores, dim = -1)
  return torch.matmul(weights, v), weights


In [5]:
# Smoke test of scaled_dot_product_attention on tiny random tensors.
# The seed is reset first so the printed values are reproducible.
batch, head_num, seq_len, d_k = 2,2,3, 2
pl.seed_everything(42)
# q, k, v: [batch, head_num, seq_len, d_k]
q = torch.randn(batch,head_num, seq_len, d_k)
k = torch.randn(batch,head_num, seq_len, d_k)
v = torch.randn(batch,head_num, seq_len, d_k)
# No mask is passed, so every position attends to every other position
values, attention = scaled_dot_product_attention(q, k, v)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("Values\n", values)
print("Attention\n", attention)

Global seed set to 42


Q
 tensor([[[[ 1.9269,  1.4873],
          [ 0.9007, -2.1055],
          [ 0.6784, -1.2345]],

         [[-0.0431, -1.6047],
          [ 0.3559, -0.6866],
          [-0.4934,  0.2415]]],


        [[[-1.1109,  0.0915],
          [-2.3169, -0.2168],
          [-0.3097, -0.3957]],

         [[ 0.8034, -0.6216],
          [-0.5920, -0.0631],
          [-0.8286,  0.3309]]]])
K
 tensor([[[[ 0.0349,  0.3211],
          [ 1.5736, -0.8455],
          [ 1.3123,  0.6872]],

         [[-1.0892, -0.3553],
          [ 1.4451,  0.8564],
          [ 2.2181,  0.5232]]],


        [[[ 0.3466, -0.1973],
          [-1.0546,  1.2780],
          [-0.1722,  0.5238]],

         [[ 0.0566,  0.4263],
          [ 0.5750, -0.6417],
          [-2.2064, -0.7508]]]])
V
 tensor([[[[ 0.0109, -0.3387],
          [-1.3407, -0.5854],
          [ 0.5362,  0.5246]],

         [[ 1.1412,  0.0516],
          [-0.6788,  0.5743],
          [ 0.1877, -0.3576]]],


        [[[-0.3165,  0.5886],
          [-0.8905,  0.4098],
   

# Multi-Head Attention

In [6]:
class MultiheadAttention(nn.Module):
  """Multi-head self-attention with a fused Q/K/V projection.

  Args:
    input_dim: size of the last dimension of the input ``x``.
    embed_dim: size of the output features; must be divisible by ``num_heads``.
    num_heads: number of attention heads (each of size ``embed_dim // num_heads``).
  """

  def __init__(self, input_dim, embed_dim, num_heads):
    super().__init__()
    assert embed_dim % num_heads == 0, "embeding dim must num_heads modulo 0"

    # embed_dim is output dim
    self.embed_dim = embed_dim
    # head_dim is the vector dim for each head
    self.head_dim = embed_dim // num_heads

    self.num_heads = num_heads

    # One fused linear layer produces W_Q, W_K and W_V for all heads at once
    self.qkv_proj = nn.Linear(input_dim, 3*embed_dim)
    self.o_proj = nn.Linear(embed_dim, embed_dim)

    # Apply the Xavier / zero-bias initialisation. The method existed before but was
    # never invoked, so the layers silently kept nn.Linear's default initialisation.
    self.init_parameters()

  def init_parameters(self):
    """Xavier-uniform weights and zero biases for both projections."""
    nn.init.xavier_uniform_(self.qkv_proj.weight)
    self.qkv_proj.bias.data.fill_(0)

    nn.init.xavier_uniform_(self.o_proj.weight)
    self.o_proj.bias.data.fill_(0)

  def forward(self,x, mask = None, return_attention = False):
    """Apply self-attention to ``x``.

    Args:
      x: tensor of shape [batch_size, seq_len, input_dim].
      mask: optional mask forwarded unchanged to scaled_dot_product_attention.
      return_attention: if True, also return the attention weights.

    Returns:
      output of shape [batch_size, seq_len, embed_dim]; with
      ``return_attention=True`` a tuple ``(output, attention)`` where attention
      has shape [batch_size, num_heads, seq_len, seq_len].
    """
    batch_size, seq_len, input_dim = x.size()
    # qkv [batch_size, seq_len, 3*embed_dim]
    qkv = self.qkv_proj(x)

    # Group the features per head; each head owns a contiguous [q | k | v] slice
    qkv = qkv.reshape(batch_size, seq_len, self.num_heads, 3*self.head_dim)
    # [batch_size, num_heads, seq_len, 3*head_dim]
    qkv = qkv.permute(0,2,1,3)
    # q, k, v: [batch_size, num_heads, seq_len, head_dim]
    q, k, v = qkv.chunk(3, dim = -1)

    # values    = [batch_size, num_heads, seq_len, head_dim]
    # attention = [batch_size, num_heads, seq_len, seq_len]
    values, attention = scaled_dot_product_attention(q,k,v, mask = mask)

    # Bring the heads back next to the feature dimension and merge them
    # values = [batch_size, seq_len, num_heads, head_dim]
    values = values.permute(0,2,1,3)
    values = values.reshape(batch_size, seq_len, self.embed_dim)
    # output = [batch_size, seq_len, embed_dim]
    output = self.o_proj(values)

    if return_attention:
      return output, attention
    else:
      return output

  


In [7]:
# Shape check for MultiheadAttention on a small random batch.
batch_size, seq_len, input_dim = 2,3,2
pl.seed_everything(42)
x = torch.randn(batch_size, seq_len, input_dim)

# embed_dim=20 split over 5 heads gives head_dim=4
mua = MultiheadAttention(input_dim = 2,embed_dim = 20,num_heads = 5)
output, attention = mua(x,return_attention = True)
print(f"output size: {output.size()}, attentin size: {attention.size()}")

Global seed set to 42


output size: torch.Size([2, 3, 20]), attentin size: torch.Size([2, 5, 3, 3])


# Transformer Encoder

In [8]:
class TransformerEncoderBlock(nn.Module):
  """One post-norm encoder block: self-attention and a feed-forward net, each
  wrapped in dropout, a residual connection and a LayerNorm.

  ``embed_dim`` has to equal ``input_dim``, because the attention output is
  added to the block input ``x``.

  Args:
    input_dim: feature size of the block input (also used for both LayerNorms).
    embed_dim: output size of the attention layer and of the feed-forward net.
    num_heads: number of attention heads.
    dim_feedforward: hidden size of the feed-forward net.
    dropout: dropout probability used in the feed-forward net and on both residual branches.
  """

  def __init__(self, input_dim, embed_dim, num_heads, dim_feedforward, dropout = 0.0):
    super().__init__()

    # Self-attention sub-layer
    self.self_attn = MultiheadAttention(input_dim = input_dim, embed_dim = embed_dim, num_heads = num_heads)

    # Position-wise feed-forward sub-layer
    ffn_layers = [
        nn.Linear(embed_dim, dim_feedforward),
        nn.Dropout(dropout),
        nn.ReLU(inplace = True),
        nn.Linear(dim_feedforward, embed_dim),
    ]
    self.linear_net = nn.Sequential(*ffn_layers)

    # One normalisation per sub-layer, plus the dropout shared by both residual branches
    self.norm1 = nn.LayerNorm(input_dim)
    self.norm2 = nn.LayerNorm(input_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, mask = None):
    """Run the block on ``x`` ([batch, seq_len, input_dim]); ``mask`` goes to the attention layer."""
    # Attention branch: residual add, then LayerNorm
    x = self.norm1(x + self.dropout(self.self_attn(x, mask = mask)))
    # Feed-forward branch: residual add, then LayerNorm
    x = self.norm2(x + self.dropout(self.linear_net(x)))
    return x


In [9]:
class TransformerEncoder(nn.Module):
  """Stack of ``num_layers`` TransformerEncoderBlock modules applied in sequence.

  Args:
    num_layers: number of encoder blocks.
    **block_args: keyword arguments passed unchanged to every TransformerEncoderBlock.
  """

  def __init__(self, num_layers, **block_args):
    super().__init__()
    self.layers = nn.ModuleList([TransformerEncoderBlock(**block_args) for _ in range(num_layers)])

  def forward(self, x, mask = None):
    """Pass ``x`` through all blocks, giving the same ``mask`` to each of them."""
    for l in self.layers:
      x = l(x, mask = mask)
    return x

  def get_attention(self, x, mask = None):
    """Return the attention weights of every block as a list with one entry per layer.

    Each layer's attention is computed on the input that layer actually receives,
    i.e. on the output of the preceding layers.
    """
    attentions = []
    for l in self.layers:
      _,attention = l.self_attn(x, mask = mask, return_attention = True)
      attentions.append(attention)
      x = l(x, mask = mask)
    # The return sits after the loop: placed inside it, only the first layer was reported.
    return attentions

# Positional Encoding

In [10]:
class PositionEncoding(nn.Module):
  """Fixed sinusoidal positional encoding that is added to the input features.

  Args:
    d_model: feature size of the inputs (even or odd).
    max_len: longest sequence length for which encodings are precomputed.
  """

  def __init__(self, d_model, max_len = 5000):
    super().__init__()

    # Create matrix of [SeqLen, HiddenDim] representing the positional encoding for max_len inputs
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    # Even feature indices carry the sine, odd indices the cosine
    pe[:, 0::2] = torch.sin(position * div_term)
    # For an odd d_model there is one odd column fewer than there are frequencies,
    # so the frequency vector is trimmed (this is a no-op for an even d_model).
    pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
    # Leading batch dimension for broadcasting: [1, max_len, d_model]
    pe = pe.unsqueeze(0)

    # Buffer (moves with the module across devices) that is not stored in the state_dict
    self.register_buffer('pe', pe, persistent=False)

  def forward(self, x):
    """Add the encodings of the first ``x.size(1)`` positions to ``x`` ([batch, seq_len, d_model])."""
    x = x + self.pe[:,:x.size(1)]
    return x

    

# Learning-Rate Warm-up


In [11]:
class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):
    """Cosine learning-rate decay with a linear warm-up at the start.

    Args:
        optimizer: wrapped optimizer.
        warmup: number of scheduler steps over which the factor ramps up linearly
            (0 disables the warm-up).
        max_iters: number of scheduler steps after which the cosine reaches zero.
    """

    def __init__(self, optimizer, warmup, max_iters):
        # Set before super().__init__(), which already calls get_lr() once
        self.warmup = warmup
        self.max_num_iters = max_iters
        super().__init__(optimizer)

    def get_lr(self):
        """Scale every base learning rate by the factor of the current step."""
        lr_factor = self.get_lr_factor(epoch=self.last_epoch)
        return [base_lr * lr_factor for base_lr in self.base_lrs]

    def get_lr_factor(self, epoch):
        """Return the multiplicative factor for step ``epoch``."""
        lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters))
        # Guard against warmup == 0, which used to divide by zero at step 0
        if self.warmup > 0 and epoch <= self.warmup:
            lr_factor *= epoch * 1.0 / self.warmup
        return lr_factor

# PyTorch Lightning

In [12]:
class Transformer(pl.LightningModule):
  """Base Lightning module: input projection, optional positional encoding,
  Transformer encoder and a per-position output head.

  Subclasses provide the task-specific training/validation/test steps.

  Args:
    input_dim: feature size of the raw inputs.
    model_dim: feature size used inside the encoder.
    num_layers: number of encoder blocks.
    num_heads: number of attention heads per block.
    num_classes: size of the per-position output.
    lr: learning rate of the Adam optimizer.
    warmup: number of warm-up steps of the learning-rate schedule.
    max_iters: total number of scheduler steps.
    dropout: dropout probability inside the encoder and the output head.
    input_dropout: dropout probability applied to the raw inputs.
  """

  def __init__(self, input_dim, model_dim, num_layers, num_heads, num_classes, lr, warmup, max_iters, dropout = 0.0, input_dropout = 0.0):
    super().__init__()
    # Store all constructor arguments in self.hparams
    self.save_hyperparameters()
    self._create_model()

  def _create_model(self):
    """Build the sub-modules from the stored hyperparameters."""
    hp = self.hparams

    # Input projection: input_dim -> model_dim
    self.input_net = nn.Sequential(
        nn.Dropout(hp.input_dropout),
        nn.Linear(hp.input_dim, hp.model_dim),
    )

    self.position_encoding = PositionEncoding(d_model = hp.model_dim)

    # The encoder works entirely in model_dim; the feed-forward layer is twice as wide
    encoder_args = dict(
        input_dim = hp.model_dim,
        embed_dim = hp.model_dim,
        num_heads = hp.num_heads,
        dim_feedforward = 2*hp.model_dim,
        dropout = hp.dropout,
    )
    self.transformer = TransformerEncoder(num_layers = hp.num_layers, **encoder_args)

    # Per-position output head: model_dim -> num_classes
    self.output_net = nn.Sequential(
        nn.Linear(hp.model_dim, hp.model_dim),
        nn.LayerNorm(hp.model_dim),
        nn.ReLU(inplace = True),
        nn.Dropout(hp.dropout),
        nn.Linear(hp.model_dim, hp.num_classes),
    )

  def forward(self, x, mask = None, add_positional_encoding = True):
    """Map ``x`` ([batch_size, seq_len, input_dim]) to per-position outputs.

    ``mask`` is handed to the encoder; the positional encoding is skipped
    when ``add_positional_encoding`` is False.
    """
    hidden = self.input_net(x)
    if add_positional_encoding:
      hidden = self.position_encoding(hidden)
    hidden = self.transformer(hidden, mask = mask)
    return self.output_net(hidden)

  def configure_optimizers(self):
    """Adam with a cosine warm-up schedule that is stepped after every batch."""
    optimizer = optim.Adam(self.parameters(), lr = self.hparams.lr)
    scheduler = CosineWarmupScheduler(optimizer, warmup = self.hparams.warmup, max_iters = self.hparams.max_iters)
    scheduler_config = {'scheduler': scheduler, 'interval':'step'}
    return [optimizer], [scheduler_config]

  # The three steps below are task-specific and must be overridden by subclasses.
  def training_step(self, batch, batch_idx):
    raise NotImplementedError

  def validation_step(self, batch, batch_idx):
    raise NotImplementedError

  def test_step(self, batch, batch_idx):
    raise NotImplementedError


# Task

## Task 1: Sequence to Sequence

### Dataset 

In [13]:
class ReverseDataset(data.Dataset):
  """Random integer sequences paired with their reversed copy as the label.

  Args:
    num_classes: values are drawn uniformly from 0 .. num_classes-1.
    seq_len: length of every sequence.
    size: number of sequences in the dataset.
  """

  def __init__(self, num_classes, seq_len, size):
    super().__init__()
    self.num_classes, self.seq_len, self.size = num_classes, seq_len, size

    # All sequences are drawn once, up front: [size, seq_len]
    self.data = torch.randint(0, self.num_classes, (self.size, self.seq_len))

  def __len__(self):
    return self.size

  def __getitem__(self, idx):
    """Return (sequence, reversed sequence) for sample ``idx``."""
    sequence = self.data[idx]
    return sequence, sequence.flip(0)

In [14]:
# partial() pre-binds num_classes and seq_len, so only `size` differs between the splits
dataset = partial(ReverseDataset, num_classes = 10, seq_len = 16)

# Each split gets its own, independently sampled ReverseDataset
train_loader = data.DataLoader(dataset(size = 50000), batch_size = 128, shuffle = True, drop_last = True, pin_memory = True)
val_loader = data.DataLoader(dataset(size = 1000), batch_size = 128)
test_loader = data.DataLoader(dataset(size = 10000), batch_size = 128)

# Show one sample: the label is the input read back to front
thedata, label = train_loader.dataset[0]
print(f"data  = {thedata},\nlabel = {label}")


data  = tensor([2, 7, 6, 9, 6, 0, 0, 5, 0, 4, 3, 9, 8, 6, 7, 9]),
label = tensor([9, 7, 6, 8, 9, 3, 4, 0, 5, 0, 0, 6, 9, 6, 7, 2])


### PyTorch Lightning module for the sequence-reversal task

In [15]:
class ReverseTransformer(Transformer):
  """Transformer trained to output its input sequence in reverse order."""

  def _calculate_loss(self,batch, mode = "train"):
    """Compute and log cross-entropy loss and token accuracy for one batch.

    ``batch`` is a pair (inputs, targets), both of shape [batch_size, seq_len];
    ``mode`` prefixes the logged metric names.
    """
    tokens, targets = batch
    # One-hot encode the integer tokens: [batch_size, seq_len, num_classes]
    one_hot = F.one_hot(tokens, num_classes = self.hparams.num_classes).float()
    # logits: [batch_size, seq_len, num_classes]
    logits = self.forward(one_hot, add_positional_encoding=True)

    # Flatten batch and sequence so every position counts as one classification
    flat_logits = logits.view(-1, logits.size(-1))
    loss = F.cross_entropy(flat_logits, targets.view(-1))

    hits = logits.argmax(dim = -1) == targets
    acc = hits.float().mean()

    self.log(f"{mode}_loss",loss)
    self.log(f"{mode}_acc",acc)
    return loss, acc

  def training_step(self,batch, batch_idx):
    return self._calculate_loss(batch)[0]

  def validation_step(self,batch, batch_idx):
    # Metrics are logged inside _calculate_loss; nothing is returned
    self._calculate_loss(batch, mode = "val")

  def test_step(self,batch, batch_idx):
    self._calculate_loss(batch, mode = "test")

  


### Trainer

In [16]:
def train_reverse_transformer(**kwargs):
  """Train the reverse-sequence model, or load it if a pretrained checkpoint exists.

  Args:
    **kwargs: hyperparameters passed to ReverseTransformer (``max_iters`` is
      derived from the trainer and the training loader).

  Returns:
    (model, result): the model moved to ``device`` and a dict with the keys
    "test_acc" and "val_acc".
  """
  # Create a PyTorch Lightning trainer that keeps the checkpoint with the best val_acc
  root_dir = os.path.join(CHECKPOINT_PATH, "ReverseTask")
  os.makedirs(root_dir, exist_ok=True)
  trainer = pl.Trainer(default_root_dir=root_dir,
                      callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")],
                      gpus=1 if str(device).startswith("cuda") else 0,
                      max_epochs=10,
                      gradient_clip_val=5,
                      progress_bar_refresh_rate=1)
  trainer.logger._default_hp_metric = None # Optional logging argument that we don't need

  # Check whether pretrained model exists. If yes, load it and skip training
  pretrained_filename = os.path.join(CHECKPOINT_PATH, "ReverseTask.ckpt")
  if os.path.isfile(pretrained_filename):
      print("Found pretrained model, loading...")
      model = ReverseTransformer.load_from_checkpoint(pretrained_filename)
  else:
      model = ReverseTransformer(max_iters=trainer.max_epochs*len(train_loader), **kwargs)
      trainer.fit(model, train_loader, val_loader)
      # Evaluate the best checkpoint rather than the weights of the last epoch
      # (this mirrors what train_anomaly does after fitting).
      model = ReverseTransformer.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

  # Test best model on validation and test set
  # (trainer.test runs test_step, so both results are reported under "test_acc")
  val_result = trainer.test(model, val_loader, verbose=False)
  test_result = trainer.test(model, test_loader, verbose=False)
  result = {"test_acc": test_result[0]["test_acc"], "val_acc": val_result[0]["test_acc"]}

  model = model.to(device)
  return model, result

In [17]:
# Inputs are one-hot encoded, so input_dim equals the number of classes.
# A single layer with a single head is used for this task.
reverse_model, reverse_result = train_reverse_transformer(input_dim=train_loader.dataset.num_classes,
                                              model_dim=32,
                                              num_heads=1,
                                              num_classes=train_loader.dataset.num_classes,
                                              num_layers=1,
                                              dropout=0.0,
                                              lr=5e-4,
                                              warmup=50)

  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Found pretrained model, loading...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

In [18]:
# Accuracies are stored as fractions; report them as percentages
print(f"Val accuracy:  {(100.0 * reverse_result['val_acc']):4.2f}%")
print(f"Test accuracy:  {(100.0 * reverse_result['test_acc']):4.2f}%")

Val accuracy:  100.00%
Test accuracy:  100.00%


## Task 2: Set Anomaly Detection

### Dataset

In [19]:
# Load the raw training images (no transform) to compute per-channel statistics
train_set = CIFAR100(root = DATASET_PATH, train = True, download = True)

print(train_set.data.shape)
# Scale pixels to [0, 1], then average over images, height and width -> one value per channel
mean = (train_set.data / 255.0).mean(axis = (0,1,2))
print(mean)

std = (train_set.data / 255.0).std(axis = (0,1,2))
print(std)

Files already downloaded and verified
(50000, 32, 32, 3)
[0.50707516 0.48654887 0.44091784]
[0.26733429 0.25643846 0.27615047]


In [20]:
# Channel statistics as [1, 3, 1, 1] tensors (not referenced again in the cells shown here)
data_mean = torch.from_numpy(np.array(mean)).view(1,3,1,1)
data_std = torch.from_numpy(np.array(std)).view(1,3,1,1)

# Resize to 224x224 and normalize with the CIFAR100 `mean`/`std` computed in the previous cell.
# NOTE(review): the ImageNet-pretrained ResNet used below presumably expects ImageNet
# statistics - confirm that normalizing with the CIFAR100 statistics is intended.
transform = transforms.Compose([transforms.Resize((224,224)),
                                transforms.ToTensor(),
                                transforms.Normalize(mean, std)
                                ])
# Loading the training dataset.
train_set = CIFAR100(root=DATASET_PATH, train=True, transform=transform, download=True)

# Loading the test set
test_set = CIFAR100(root=DATASET_PATH, train=False, transform=transform, download=True)

Files already downloaded and verified
Files already downloaded and verified


### Pretrained ResNet

In [21]:
# `os` is already imported at the top of the notebook; this re-import is harmless.
import os
# Point TORCH_HOME at the checkpoint folder (presumably so the pretrained weights are cached there - confirm)
os.environ["TORCH_HOME"] = CHECKPOINT_PATH
# NOTE: the recorded output below warns that the `pretrained` argument is deprecated since torchvision 0.13.
pretrained_model = torchvision.models.resnet34(pretrained=True)

# Replace the classification head with an empty Sequential so the model returns features.
# NOTE(review): `fc` is the head whose removal yields the 512-d features seen below;
# the `classifier` assignment looks like a catch-all for other architectures - confirm.
pretrained_model.fc = nn.Sequential()
pretrained_model.classifier = nn.Sequential()
# To GPU
pretrained_model = pretrained_model.to(device)

# Only eval, no gradient required
pretrained_model.eval()
for p in pretrained_model.parameters():
    p.requires_grad = False

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


In [22]:
# Pull a single batch to check the tensor shapes produced by the transform
data_loader = data.DataLoader(train_set, batch_size = 128, shuffle = False, drop_last = False, num_workers = 4)
X,y = next(iter(data_loader))
print(X.shape)
print(len(y))

  cpuset_checked))


torch.Size([128, 3, 224, 224])
128


In [23]:
@torch.no_grad()
def extract_features(dataset, save_file):
  """Return the pretrained-model features of ``dataset``, cached in ``save_file``.

  If ``save_file`` exists it is loaded and returned. Otherwise every image is
  passed through ``pretrained_model`` in dataset order, and the stacked
  features are moved to the CPU, saved to ``save_file`` and returned.
  """
  # Cache hit: nothing to compute
  if os.path.isfile(save_file):
    return torch.load(save_file)

  loader = data.DataLoader(dataset, batch_size = 128, shuffle = False, drop_last = False, num_workers = 4)
  # One feature tensor per batch of images ([128, 3, 224, 224] with the transform above)
  batch_feats = [pretrained_model(imgs.to(device)) for imgs, _ in tqdm(loader)]
  # Stack to [img_total_num, feature_dim], detach and move to the CPU before saving
  features = torch.cat(batch_feats, dim = 0).detach().cpu()
  torch.save(features, save_file)
  return features

# Features are cached next to the checkpoints, so re-running the cell only loads the files
train_feat_file = os.path.join(CHECKPOINT_PATH, "train_set_features.tar")
train_set_feats = extract_features(train_set, train_feat_file)

test_feat_file = os.path.join(CHECKPOINT_PATH, "test_set_features.tar")
test_feats = extract_features(test_set, test_feat_file)



In [24]:
# One feature vector per image
print(train_set_feats.shape)
print(test_feats.shape)

torch.Size([50000, 512])
torch.Size([10000, 512])


In [25]:
# Small illustration of torch.argsort, which is used below to group image indices by label
l = torch.tensor([2,2,3,1,4,6])
print(l)
# argsort returns the indices that would sort the tensor
print(torch.argsort(l))

tensor([2, 2, 3, 1, 4, 6])
tensor([3, 0, 1, 2, 4, 5])


In [26]:
## Split the training set into train + val with the same number of validation images for every class
# Get labels from train set
labels = train_set.targets

# Get indices of images per class
labels = torch.LongTensor(labels)
num_labels = labels.max()+1
# Sorting by label and reshaping puts the image indices of one class in each row;
# the reshape requires every class to have the same number of images.
sorted_indices = torch.argsort(labels).reshape(num_labels, -1) # [classes, num_imgs per class]
print(sorted_indices.shape) # [100, 500]

# Determine number of validation images per class (one tenth of each class)
num_val_exmps = sorted_indices.shape[1] // 10

# Get image indices for validation and training: the first columns go to validation
val_indices   = sorted_indices[:,:num_val_exmps].reshape(-1)
train_indices = sorted_indices[:,num_val_exmps:].reshape(-1)

# Group corresponding image features and labels
train_feats, train_labels = train_set_feats[train_indices], labels[train_indices]
val_feats,   val_labels   = train_set_feats[val_indices],   labels[val_indices]

torch.Size([100, 500])


### Anomaly Dataset

In [27]:
class SetAnomalyDataset(data.Dataset):
    """Sets of image features in which the last element comes from a different class.

    Every item consists of ``set_size - 1`` features of one class followed by a
    single "anomaly" feature, namely the image the item index refers to.
    """

    def __init__(self, img_feats, labels, set_size=10, train=True):
        """
        Inputs:
            img_feats - Tensor of shape [num_imgs, img_dim] with the image features.
            labels - Tensor of shape [num_imgs] with the class label of every image.
                     Every class must have the same number of images.
            set_size - Total number of elements per set, including the anomaly.
            train - If True, a fresh set is sampled on every __getitem__ call;
                    otherwise the sets are generated once with a fixed seed.
        """
        super().__init__()
        self.img_feats = img_feats
        self.labels = labels
        # Stored as the number of non-anomalous images per set
        self.set_size = set_size-1
        self.train = train

        # Row c holds the indices of all images with label c
        self.num_labels = labels.max()+1
        self.img_idx_by_label = torch.argsort(self.labels).reshape(self.num_labels, -1)

        if not train:
            self.test_sets = self._create_test_sets()


    def _create_test_sets(self):
        """Pre-generate one fixed set per image; seeded so it is the same on every run."""
        np.random.seed(42)
        per_image_sets = [self.sample_img_set(self.labels[i]) for i in range(self.img_feats.shape[0])]
        return torch.stack(per_image_sets, dim=0)


    def sample_img_set(self, anomaly_label):
        """
        Sample the indices of ``self.set_size`` distinct images that all share one
        class, which is guaranteed to differ from ``anomaly_label``.
        """
        # Draw from num_labels-1 classes and shift past anomaly_label to skip it
        set_label = np.random.randint(self.num_labels-1)
        if set_label >= anomaly_label:
            set_label += 1

        # Pick distinct positions within that class, then map them to image indices
        imgs_per_class = self.img_idx_by_label.shape[1]
        chosen = np.random.choice(imgs_per_class, size=self.set_size, replace=False)
        return self.img_idx_by_label[set_label, chosen]


    def __len__(self):
        return self.img_feats.shape[0]


    def __getitem__(self, idx):
        """Return (img_set, indices, label); ``label`` is the position of the anomaly in the set."""
        if self.train:
            # Training: a new random set on every access
            img_indices = self.sample_img_set(self.labels[idx])
        else:
            # Evaluation: the pre-generated set of this image
            img_indices = self.test_sets[idx]

        # The anomaly (image idx itself) is always appended as the last element
        anomaly = self.img_feats[idx]
        img_set = torch.cat([self.img_feats[img_indices], anomaly[None]], dim=0)
        indices = torch.cat([img_indices, torch.LongTensor([idx])], dim=0)
        label = img_set.shape[0]-1

        # The image indices are returned as well, for visualization purposes
        return img_set, indices, label

In [28]:
# Number of images per set, including the anomaly
SET_SIZE = 10
test_labels = torch.LongTensor(test_set.targets)

# Training sets are resampled on every access; validation/test sets are fixed at construction
train_anom_dataset = SetAnomalyDataset(train_feats, train_labels, set_size=SET_SIZE, train=True)
val_anom_dataset   = SetAnomalyDataset(val_feats,   val_labels,   set_size=SET_SIZE, train=False)
test_anom_dataset  = SetAnomalyDataset(test_feats,  test_labels,  set_size=SET_SIZE, train=False)

# NOTE(review): the training dataset samples with NumPy's global RNG while the loader uses
# num_workers=4; worker processes may start from identical NumPy RNG state - confirm whether
# a worker_init_fn is needed.
train_anom_loader = data.DataLoader(train_anom_dataset, batch_size=64, shuffle=True,  drop_last=True,  num_workers=4, pin_memory=True)
val_anom_loader   = data.DataLoader(val_anom_dataset,   batch_size=64, shuffle=False, drop_last=False, num_workers=4)
test_anom_loader  = data.DataLoader(test_anom_dataset,  batch_size=64, shuffle=False, drop_last=False, num_workers=4)

  cpuset_checked))


In [29]:
# Inspect one training batch: sets of features plus the position of the anomaly in each set
img_set, indices, label = next(iter(train_anom_loader))
print(img_set.shape)
print(label.shape)

  cpuset_checked))


torch.Size([64, 10, 512])
torch.Size([64])


In [30]:
class AnomalyPredictor(Transformer):
    """Transformer that scores every element of a set; the highest score marks the anomaly."""

    def _calculate_loss(self, batch, mode="train"):
        """Compute and log loss and accuracy for a batch of (img_sets, indices, labels)."""
        img_sets, _indices, labels = batch
        # A set has no order, so no positional encoding is added
        scores = self.forward(img_sets, add_positional_encoding=False)
        # Drop the trailing output dimension -> [batch_size, set_size]
        scores = scores.squeeze(dim=-1)
        # Softmax / cross-entropy over the elements of the set
        loss = F.cross_entropy(scores, labels)
        hits = scores.argmax(dim=-1) == labels
        acc = hits.float().mean()
        self.log(f"{mode}_loss", loss)
        # Accuracy is aggregated per epoch instead of being logged per step
        self.log(f"{mode}_acc", acc, on_step=False, on_epoch=True)
        return loss, acc

    def training_step(self, batch, batch_idx):
        return self._calculate_loss(batch, mode="train")[0]

    def validation_step(self, batch, batch_idx):
        # Metrics are logged inside _calculate_loss; nothing is returned
        self._calculate_loss(batch, mode="val")

    def test_step(self, batch, batch_idx):
        self._calculate_loss(batch, mode="test")

In [31]:
def train_anomaly(**kwargs):
    """Train the set-anomaly model, or load it if a pretrained checkpoint exists.

    ``kwargs`` are the hyperparameters passed to AnomalyPredictor (``max_iters``
    is derived from the trainer and the training loader).

    Returns (model, result): the model moved to ``device`` and a dict with the
    keys "test_acc", "val_acc" and "train_acc".
    """
    # Trainer that keeps the checkpoint with the highest validation accuracy
    root_dir = os.path.join(CHECKPOINT_PATH, "SetAnomalyTask")
    os.makedirs(root_dir, exist_ok=True)
    best_val_acc_ckpt = ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")
    on_gpu = str(device).startswith("cuda")
    trainer = pl.Trainer(default_root_dir=root_dir,
                         callbacks=[best_val_acc_ckpt],
                         gpus=1 if on_gpu else 0,
                         max_epochs=100,
                         gradient_clip_val=2,
                         progress_bar_refresh_rate=1)
    trainer.logger._default_hp_metric = None # Optional logging argument that we don't need

    # A pretrained checkpoint short-circuits training
    pretrained_filename = os.path.join(CHECKPOINT_PATH, "SetAnomalyTask.ckpt")
    if os.path.isfile(pretrained_filename):
        print("Found pretrained model, loading...")
        model = AnomalyPredictor.load_from_checkpoint(pretrained_filename)
    else:
        total_steps = trainer.max_epochs*len(train_anom_loader)
        model = AnomalyPredictor(max_iters=total_steps, **kwargs)
        trainer.fit(model, train_anom_loader, val_anom_loader)
        # Continue with the best checkpoint, not with the weights of the last epoch
        model = AnomalyPredictor.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # Evaluate on all three splits; trainer.test always reports under "test_acc"
    train_result = trainer.test(model, train_anom_loader, verbose=False)
    val_result = trainer.test(model, val_anom_loader, verbose=False)
    test_result = trainer.test(model, test_anom_loader, verbose=False)
    result = {
        "test_acc": test_result[0]["test_acc"],
        "val_acc": val_result[0]["test_acc"],
        "train_acc": train_result[0]["test_acc"],
    }

    model = model.to(device)
    return model, result

In [32]:
# input_dim is the size of the cached image features; num_classes=1 yields one score per set element
anomaly_model, anomaly_result = train_anomaly(input_dim=train_anom_dataset.img_feats.shape[-1],
                                              model_dim=256,
                                              num_heads=4,
                                              num_classes=1,
                                              num_layers=4,
                                              dropout=0.1,
                                              input_dropout=0.1,
                                              lr=5e-4,
                                              warmup=100)

  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Found pretrained model, loading...


Missing logger folder: drive/MyDrive/deep learning tutorial/saved_models/transformer/SetAnomalyTask/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

  cpuset_checked))
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

In [35]:
# Accuracies are stored as fractions; report them as percentages
print(f"Val accuracy:  {(100.0 * anomaly_result['val_acc']):4.2f}%")
print(f"Test accuracy: {(100.0 * anomaly_result['test_acc']):4.2f}%")

Val accuracy:  95.70%
Test accuracy: 94.18%
