In [6]:
import torch
import torchvision
from pathlib import Path
from torchvision import transforms
from torch import nn

In [7]:
!pip install torchinfo



In [8]:
from torchinfo import summary

In [9]:
from helper_functions import download_data

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [11]:
transformer_encoder_layer = nn.TransformerEncoderLayer(d_model = 768,
                                                       nhead = 12,
                                                       dim_feedforward=3072,
                                                       dropout=0.1,
                                                       activation="gelu",
                                                       batch_first=True,
                                                       norm_first=True)

In [14]:
class PatchEmbedding(nn.Module):
  def __init__(self,
               in_channels: int = 3,
               patch_size: int = 16,
               embedding_dim: int = 768):
    super().__init__()

    self.patch_size = patch_size

    self.patcher = nn.Conv2d(in_channels=in_channels,
                             out_channels=embedding_dim,
                             kernel_size=patch_size,
                             stride=patch_size,
                             padding=0)

    self.flatten = nn.Flatten(start_dim=2,
                              end_dim=3)

  def forward(self, x):
    image_resolution = x.shape[-1]
    assert image_resolution % self.patch_size == 0, f"Image size must be divisible by patch_size. Image size: {image_resolution}, patch size: {self.patch_size}"

    x_patched = self.patcher(x)
    x_flattened = self.flatten(x_patched)

    return x_flattened.permute(0, 2, 1)

In [15]:
class ViT(nn.Module):
  def __init__(self,
               img_size: int = 224,
               in_channels: int = 3,
               patch_size: int = 16,
               num_transformer_layers: int = 12,
               embedding_dim: int = 768,
               mlp_size: int = 3072,
               num_heads: int = 12,
               attn_dropout: float = 0.0,
               mlp_dropout: float = 0.1,
               embedding_dropout: float = 0.1,
               num_classes: int = 3):
    super().__init__()

    assert img_size % patch_size == 0, f"Image size must be divisible by patch size, image size: {img_size}, patch size: {patch_size}"

    self.num_patches = (img_size * img_size) // patch_size**2

    self.class_embedding = nn.Parameter(data = torch.randn(1, 1, embedding_dim),
                                        requires_grad=True)

    self.position_embedding = nn.Parameter(data=torch.randn(1,
                                                            self.num_patches+1,
                                                            embedding_dim),
                                           requires_grad=True)

    self.embedding_dropout = nn.Dropout(p=embedding_dropout)

    # patch embedding layer
    self.patch_embedding = PatchEmbedding(in_channels=in_channels,
                                          patch_size=patch_size,
                                          embedding_dim=embedding_dim)

    self.transformer_encoder = nn.Sequential(*[transformer_encoder_layer for _ in range(num_transformer_layers)])

    self.classifier = nn.Sequential(nn.LayerNorm(normalized_shape=embedding_dim),
                                    nn.Linear(in_features=embedding_dim,
                                              out_features=num_classes))

  def forward(self, x):
    batch_size = x.shape[0]

    class_token = self.class_embedding.expand(batch_size, -1, -1)

    x = self.patch_embedding(x)

    x = torch.cat((class_token, x), dim=1)

    x = self.position_embedding + x

    x = self.embedding_dropout(x)

    x = self.transformer_encoder(x)

    x = self.classifer(x[:, 0])

    return x


In [16]:
vit = ViT()

In [19]:
%%writefile vit.py
import torch
from torch import nn

transformer_encoder_layer = nn.TransformerEncoderLayer(d_model = 768,
                                                       nhead = 12,
                                                       dim_feedforward=3072,
                                                       dropout=0.1,
                                                       activation="gelu",
                                                       batch_first=True,
                                                       norm_first=True)

class PatchEmbedding(nn.Module):
  def __init__(self,
               in_channels: int = 3,
               patch_size: int = 16,
               embedding_dim: int = 768):
    super().__init__()

    self.patch_size = patch_size

    self.patcher = nn.Conv2d(in_channels=in_channels,
                             out_channels=embedding_dim,
                             kernel_size=patch_size,
                             stride=patch_size,
                             padding=0)

    self.flatten = nn.Flatten(start_dim=2,
                              end_dim=3)

  def forward(self, x):
    image_resolution = x.shape[-1]
    assert image_resolution % self.patch_size == 0, f"Image size must be divisible by patch_size. Image size: {image_resolution}, patch size: {self.patch_size}"

    x_patched = self.patcher(x)
    x_flattened = self.flatten(x_patched)

    return x_flattened.permute(0, 2, 1)

class ViT(nn.Module):
  def __init__(self,
               img_size: int = 224,
               in_channels: int = 3,
               patch_size: int = 16,
               num_transformer_layers: int = 12,
               embedding_dim: int = 768,
               mlp_size: int = 3072,
               num_heads: int = 12,
               attn_dropout: float = 0.0,
               mlp_dropout: float = 0.1,
               embedding_dropout: float = 0.1,
               num_classes: int = 3):
    super().__init__()

    assert img_size % patch_size == 0, f"Image size must be divisible by patch size, image size: {img_size}, patch size: {patch_size}"

    self.num_patches = (img_size * img_size) // patch_size**2

    self.class_embedding = nn.Parameter(data = torch.randn(1, 1, embedding_dim),
                                        requires_grad=True)

    self.position_embedding = nn.Parameter(data=torch.randn(1,
                                                            self.num_patches+1,
                                                            embedding_dim),
                                           requires_grad=True)

    self.embedding_dropout = nn.Dropout(p=embedding_dropout)

    # patch embedding layer
    self.patch_embedding = PatchEmbedding(in_channels=in_channels,
                                          patch_size=patch_size,
                                          embedding_dim=embedding_dim)

    self.transformer_encoder = nn.Sequential(*[transformer_encoder_layer for _ in range(num_transformer_layers)])

    self.classifier = nn.Sequential(nn.LayerNorm(normalized_shape=embedding_dim),
                                    nn.Linear(in_features=embedding_dim,
                                              out_features=num_classes))

  def forward(self, x):
    batch_size = x.shape[0]

    class_token = self.class_embedding.expand(batch_size, -1, -1)

    x = self.patch_embedding(x)

    x = torch.cat((class_token, x), dim=1)

    x = self.position_embedding + x

    x = self.embedding_dropout(x)

    x = self.transformer_encoder(x)

    x = self.classifer(x[:, 0])

    return x


Overwriting vit.py


In [20]:
from vit import ViT as ViT_import

In [21]:
vit = ViT_import()

In [23]:
from get_data import download_data

url = "https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip"

images_path = download_data(url,
                            target_directory="pizza_steak_sushi_20")

train_dir = images_path / "train"
test_dir = images_path / "test"

Creating data/pizza_steak_sushi_20
Downloading pizza_steak_sushi_20_percent.zip...
Unzipping pizza_steak_sushi_20_percent.zip...


In [25]:
pretrained_vit_weights = torchvision.models.ViT_B_16_Weights.DEFAULT
pretrained_vit_transforms = pretrained_vit_weights.transforms()

pretrained_vit = torchvision.models.vit_b_16(weights=pretrained_vit_weights).to(device)

In [26]:
for parameter in pretrained_vit.parameters():
  parameter.requires_grad = False

In [27]:
import data_setup

BATCH_SIZE = 32

train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(train_dir,
                                                                               test_dir,
                                                                               pretrained_vit_transforms,
                                                                               pretrained_vit_transforms,
                                                                               BATCH_SIZE)


In [29]:
pretrained_vit.heads = nn.Linear(in_features=768, out_features=len(class_names))

pretrained_vit

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [30]:
summary(model=pretrained_vit,
        input_size=(1, 3, 224, 224),
        col_names=("input_size", "output_size", "num_params", "trainable"),
        col_width=20,
        row_settings=["var_names"])

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
VisionTransformer (VisionTransformer)                        [1, 3, 224, 224]     [1, 3]               768                  Partial
├─Conv2d (conv_proj)                                         [1, 3, 224, 224]     [1, 768, 14, 14]     (590,592)            False
├─Encoder (encoder)                                          [1, 197, 768]        [1, 197, 768]        151,296              False
│    └─Dropout (dropout)                                     [1, 197, 768]        [1, 197, 768]        --                   --
│    └─Sequential (layers)                                   [1, 197, 768]        [1, 197, 768]        --                   False
│    │    └─EncoderBlock (encoder_layer_0)                   [1, 197, 768]        [1, 197, 768]        (7,087,872)          False
│    │    └─EncoderBlock (encoder_layer_1)                   [1, 197, 768]        [1, 1

In [31]:
import engine

EPOCHS = 10

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(pretrained_vit.parameters(),
                             lr=0.001)

pretrained_vit_results = engine.train(pretrained_vit,
                                      train_dataloader,
                                      test_dataloader,
                                      optimizer,
                                      loss_fn,
                                      EPOCHS,
                                      device)

  0%|          | 0/10 [00:00<?, ?it/s]

Epochs: 0 | Train Loss: 0.726 | Train Acc: 71.04% | Test Loss: 0.268 | Test Acc: 94.38%
Epochs: 1 | Train Loss: 0.248 | Train Acc: 93.54% | Test Loss: 0.153 | Test Acc: 95.97%
Epochs: 2 | Train Loss: 0.169 | Train Acc: 95.42% | Test Loss: 0.108 | Test Acc: 98.12%
Epochs: 3 | Train Loss: 0.139 | Train Acc: 96.25% | Test Loss: 0.091 | Test Acc: 98.12%
Epochs: 4 | Train Loss: 0.133 | Train Acc: 97.08% | Test Loss: 0.082 | Test Acc: 98.12%
Epochs: 5 | Train Loss: 0.117 | Train Acc: 97.08% | Test Loss: 0.077 | Test Acc: 98.75%
Epochs: 6 | Train Loss: 0.088 | Train Acc: 97.29% | Test Loss: 0.074 | Test Acc: 98.12%
Epochs: 7 | Train Loss: 0.080 | Train Acc: 97.71% | Test Loss: 0.067 | Test Acc: 98.12%
Epochs: 8 | Train Loss: 0.070 | Train Acc: 98.33% | Test Loss: 0.063 | Test Acc: 98.75%
Epochs: 9 | Train Loss: 0.063 | Train Acc: 98.75% | Test Loss: 0.060 | Test Acc: 98.12%


In [39]:
swag_vit_weights = torchvision.models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1
swag_vit_transforms = swag_vit_weights.transforms()

swag_vit = torchvision.models.vit_b_16(weights=swag_vit_weights).to(device)

for parameter in swag_vit.parameters():
  parameter.requires_grad = False

In [40]:
swag_train_dataloader, swag_test_dataloader, class_names = data_setup.create_dataloaders(train_dir,
                                                                                         test_dir,
                                                                                         swag_vit_transforms,
                                                                                         swag_vit_transforms,
                                                                                         BATCH_SIZE)

In [41]:
summary(model=swag_vit,
        input_size=(1, 3, 384, 384),
        col_names=("input_size", "output_size", "num_params", "trainable"),
        col_width=20,
        row_settings=["var_names"])

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
VisionTransformer (VisionTransformer)                        [1, 3, 384, 384]     [1, 1000]            768                  False
├─Conv2d (conv_proj)                                         [1, 3, 384, 384]     [1, 768, 24, 24]     (590,592)            False
├─Encoder (encoder)                                          [1, 577, 768]        [1, 577, 768]        443,136              False
│    └─Dropout (dropout)                                     [1, 577, 768]        [1, 577, 768]        --                   --
│    └─Sequential (layers)                                   [1, 577, 768]        [1, 577, 768]        --                   False
│    │    └─EncoderBlock (encoder_layer_0)                   [1, 577, 768]        [1, 577, 768]        (7,087,872)          False
│    │    └─EncoderBlock (encoder_layer_1)                   [1, 577, 768]        [1, 577

In [42]:
swag_vit.heads = nn.Linear(in_features=768, out_features=len(class_names))

In [43]:
summary(model=swag_vit,
        input_size=(1, 3, 384, 384),
        col_names=("input_size", "output_size", "num_params", "trainable"),
        col_width=20,
        row_settings=["var_names"])

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
VisionTransformer (VisionTransformer)                        [1, 3, 384, 384]     [1, 3]               768                  Partial
├─Conv2d (conv_proj)                                         [1, 3, 384, 384]     [1, 768, 24, 24]     (590,592)            False
├─Encoder (encoder)                                          [1, 577, 768]        [1, 577, 768]        443,136              False
│    └─Dropout (dropout)                                     [1, 577, 768]        [1, 577, 768]        --                   --
│    └─Sequential (layers)                                   [1, 577, 768]        [1, 577, 768]        --                   False
│    │    └─EncoderBlock (encoder_layer_0)                   [1, 577, 768]        [1, 577, 768]        (7,087,872)          False
│    │    └─EncoderBlock (encoder_layer_1)                   [1, 577, 768]        [1, 5

In [45]:
swag_optimizer = torch.optim.Adam(swag_vit.parameters(),
                             lr=0.001)

engine.train(swag_vit,
             swag_train_dataloader,
             swag_test_dataloader,
             swag_optimizer,
             loss_fn,
             EPOCHS,
             device)

  0%|          | 0/10 [00:00<?, ?it/s]

Epochs: 0 | Train Loss: 0.379 | Train Acc: 87.08% | Test Loss: 0.040 | Test Acc: 100.00%
Epochs: 1 | Train Loss: 0.061 | Train Acc: 98.12% | Test Loss: 0.012 | Test Acc: 100.00%
Epochs: 2 | Train Loss: 0.038 | Train Acc: 98.54% | Test Loss: 0.008 | Test Acc: 100.00%
Epochs: 3 | Train Loss: 0.026 | Train Acc: 99.38% | Test Loss: 0.007 | Test Acc: 100.00%
Epochs: 4 | Train Loss: 0.019 | Train Acc: 99.79% | Test Loss: 0.007 | Test Acc: 100.00%
Epochs: 5 | Train Loss: 0.014 | Train Acc: 99.79% | Test Loss: 0.006 | Test Acc: 100.00%
Epochs: 6 | Train Loss: 0.011 | Train Acc: 99.79% | Test Loss: 0.006 | Test Acc: 100.00%
Epochs: 7 | Train Loss: 0.009 | Train Acc: 100.00% | Test Loss: 0.005 | Test Acc: 100.00%
Epochs: 8 | Train Loss: 0.008 | Train Acc: 100.00% | Test Loss: 0.005 | Test Acc: 100.00%
Epochs: 9 | Train Loss: 0.007 | Train Acc: 100.00% | Test Loss: 0.004 | Test Acc: 100.00%


{'train_loss': [0.3791725266414384,
  0.060845239673896385,
  0.03773432287077109,
  0.02574083979707211,
  0.01920426142460201,
  0.014374229571937272,
  0.01129259168325613,
  0.009201703205083808,
  0.007669865623271713,
  0.006519828321567426],
 'train_acc': [0.8708333333333333,
  0.98125,
  0.9854166666666667,
  0.99375,
  0.9979166666666667,
  0.9979166666666667,
  0.9979166666666667,
  1.0,
  1.0,
  1.0],
 'test_loss': [0.040079341270029545,
  0.012161548167932778,
  0.008372983278241009,
  0.0073016483045648785,
  0.006946713998331688,
  0.006307534044026397,
  0.005544812249718234,
  0.005006687316927128,
  0.004512662460911088,
  0.0041628509236034],
 'test_acc': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}

In [50]:
!cd / && find -name "*pth" -type "f"


./sys/devices/pci0000:00/0000:00:03.0/virtio0/host0/target0:0:1/0:0:1:0/queue_depth
./sys/fs/cgroup/cgroup.max.depth
./sys/module/nvme/parameters/io_queue_depth
./sys/module/dm_mod/parameters/dm_mq_queue_depth
./proc/sys/kernel/max_lock_depth
./proc/driver/nvidia/suspend_depth
find: ‘./proc/53/task/53/net’: Invalid argument
find: ‘./proc/53/net’: Invalid argument
find: ‘./proc/26910’: No such file or directory
./var/colab/cgroup/jupyter-children/cgroup.max.depth
./root/.cache/torch/hub/checkpoints/vit_b_16_swag-9ac1b537.pth
./root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
./usr/local/lib/python3.10/dist-packages/google_colab-1.0.0-py3.10-nspkg.pth
./usr/local/lib/python3.10/dist-packages/google_api_core-2.11.1-py3.9-nspkg.pth
./usr/local/lib/python3.10/dist-packages/google_auth-2.17.3-py3.9-nspkg.pth
./usr/local/lib/python3.10/dist-packages/google_cloud_language-2.9.1-py3.9-nspkg.pth
./usr/local/lib/python3.10/dist-packages/matplotlib-3.7.1-py3.10-nspkg.pth
./usr/local/lib/pyt