In [1]:
!pip3 install triton torchinfo utils
!pip install -U git+https://github.com/sustcsonglin/flash-linear-attention

Collecting triton
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting utils
  Downloading utils-1.0.2.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Building wheels for collected packages: utils
  Building wheel for utils (setup.py) ... [?25l[?25hdone
  Created wheel for utils: filename=utils-1.0.2-py2.py3-none-any.whl size=13906 sha256=72245c113f8405134841d74b8b7092f0391c1046df08f76a6fcc4a60e0b41de6
  Stored in directory: /root/.cache/pip/wheels/b8/39/f5/9d0ca31dba85773ececf0a7f5469f18810e1c8a8ed9da28ca7
Successfully built utils
Installing collecte

In [2]:
#TODO: 11/11 merge GLA into LeViT_impl and then test model

import torch
from einops import rearrange
import triton
import triton.language as tl

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

import itertools
import utils
import timm

from fla.ops.gla import fused_chunk_gla, chunk_gla, fused_recurrent_gla

from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split, ConcatDataset

from tqdm import tqdm

from torchinfo import summary

#import os
#os.environ['TRITON_DISABLE_BF16'] = '1'

FLOPS_COUNTER = 0

# Implementation of LeViT

In [3]:
class ConvNorm(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=1):
        super(ConvNorm, self).__init__()
        self.linear = nn.Conv2d(
            in_channels, out_channels, kernel_size=kernel_size,
            stride=stride, padding=padding, bias=False
        )
        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        x = self.linear(x)
        x = self.bn(x)
        return x

In [4]:
class Stem16(nn.Module):
    def __init__(self):
        super(Stem16, self).__init__()
        self.conv1 = ConvNorm(3, 32)
        self.act1 = nn.Hardswish()
        self.conv2 = ConvNorm(32, 64)
        self.act2 = nn.Hardswish()
        self.conv3 = ConvNorm(64, 128)
        self.act3 = nn.Hardswish()
        self.conv4 = ConvNorm(128, 256)

    def forward(self, x):
        x = self.act1(self.conv1(x))
        x = self.act2(self.conv2(x))
        x = self.act3(self.conv3(x))
        x = self.conv4(x)
        return x

In [5]:
class LinearNorm(nn.Module):
    def __init__(self, in_features, out_features):
        super(LinearNorm, self).__init__()
        self.linear = nn.Linear(in_features, out_features, bias=False)
        self.bn = nn.BatchNorm1d(out_features)

    def forward(self, x):

        if x.dim() == 3:
            B, N, C = x.shape
            x = x.reshape(B * N, C)
            x = self.bn(self.linear(x))
            x = x.reshape(B, N, -1)
        else:
            x = self.bn(self.linear(x))
        return x

In [6]:
class Attention(nn.Module):
    def __init__(self, dim, num_heads, attn_ratio=2):
        super(Attention, self).__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        inner_dim = head_dim * num_heads * 3
        self.qkv = LinearNorm(dim, inner_dim)

        self.proj = nn.Sequential(
            nn.Hardswish(),
            LinearNorm(dim, dim)
        )

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x)
        qkv = qkv.view(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj(x)

## GLA (Gated Linear Attention) Module

In [7]:
class GatedLinearAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed_dim = config.dim
        self.num_heads = config.num_heads

        self.gate_fn = nn.functional.silu
        assert config.use_gk and not config.use_gv, "Only use_gk is supported for simplicity."

        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim//2, bias=False)
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim//2, bias=False)
        self.k_gate =  nn.Sequential(nn.Linear(self.embed_dim, 16, bias=False), nn.Linear(16, self.embed_dim // 2))

        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.g_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)

        self.head_dim = self.embed_dim // self.num_heads
        self.key_dim = self.embed_dim // self.num_heads
        self.scaling = self.key_dim ** -0.5
        self.group_norm = nn.LayerNorm(self.head_dim, eps=1e-5, elementwise_affine=False)

        self.post_init()

    def post_init(self):
        nn.init.xavier_uniform_(self.q_proj.weight, gain=2 ** -2.5)
        nn.init.xavier_uniform_(self.k_proj.weight, gain=2 ** -2.5)
        if isinstance(self.k_gate, nn.Sequential):
            nn.init.xavier_uniform_(self.k_gate[0].weight, gain=2 ** -2.5)
            nn.init.xavier_uniform_(self.k_gate[1].weight, gain=2 ** -2.5)
        else:
            nn.init.xavier_uniform_(self.k_gate.weight, gain=2 ** -2.5)

    def forward(self, x, hidden_states=None):
        q = self.q_proj(x)
        k = self.k_proj(x) * self.scaling
        k_gate = self.k_gate(x)
        v = self.v_proj(x)
        g = self.g_proj(x)

        output, new_hidden_states = self.gated_linear_attention(q, k, v, k_gate, hidden_states=hidden_states)
        output = self.gate_fn(g) * output
        output = self.out_proj(output)
        return output#, new_hidden_states # this needs to be tensor, not tuple


    def gated_linear_attention(self, q, k, v, gk, normalizer=16, hidden_states=None):
        q = rearrange(q, 'b l (h d) -> b h l d', h = self.num_heads).contiguous()
        k = rearrange(k, 'b l (h d) -> b h l d', h = self.num_heads).contiguous()
        v = rearrange(v, 'b l (h d) -> b h l d', h = self.num_heads).contiguous()
        gk = rearrange(gk, 'b l (h d) -> b h l d', h = self.num_heads).contiguous()
        gk = F.logsigmoid(gk) / normalizer

        # for storing original dtype
        original_dtype = q.dtype

        if self.training:
            # cast inputs to float32 if needed
            if q.dtype == torch.bfloat16:
                q, k, v, gk = q.float(), k.float(), v.float(), gk.float()
            o, new_hidden_states = fused_chunk_gla(q, k, v, gk, initial_state=hidden_states, output_final_state=True)
            # cast back to origianl dtype if needed
            if o.dtype != original_dtype:
              o = o.type(original_dtype)

        else:
            o = fused_recurrent_gla(q, k, v, gk)

            new_hidden_states = None

        if isinstance(o, tuple):
          o = o[0]

        o = self.group_norm(o)
        o = rearrange(o, 'b h l d -> b l (h d)')
        return o, new_hidden_states

In [8]:
# define seperate config object for GLA input
class Config:
    def __init__(self, dim, num_heads, use_gk=True, use_gv=False):
        self.dim = dim
        self.num_heads = num_heads
        self.use_gk = use_gk
        self.use_gv = use_gv

In [9]:
class LevitMlp(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super(LevitMlp, self).__init__()
        self.ln1 = LinearNorm(in_features, hidden_features)
        self.act = nn.Hardswish()
        self.drop = nn.Dropout(p=0.0, inplace=False)
        self.ln2 = LinearNorm(hidden_features, out_features)

    def forward(self, x):
        x = self.ln1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.ln2(x)
        return x

In [10]:
class LevitBlock(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=2): # hidden state from GLA
        super(LevitBlock, self).__init__()
        #self.attn = Attention(dim, num_heads) # -> GLA, hidden state updates
        self.attn = GatedLinearAttention(Config(dim, num_heads))
        self.drop_path1 = nn.Identity()
        self.mlp = LevitMlp(dim, dim * mlp_ratio, dim)
        self.drop_path2 = nn.Identity()

    def forward(self, x):
        x = x + self.drop_path1(self.attn(x))
        x = x + self.drop_path2(self.mlp(x))
        return x

In [11]:
class AttentionDownsample(nn.Module):
    def __init__(self, dim, out_dim, num_heads, attn_ratio=2):
        super(AttentionDownsample, self).__init__()
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5
        inner_dim = dim * attn_ratio * num_heads
        self.kv = LinearNorm(dim, inner_dim)

        self.q = nn.Sequential(
            nn.Conv2d(dim, dim, kernel_size=2, stride=2),
            nn.Flatten(start_dim=1)
        )

        self.proj = nn.Sequential(
            nn.Hardswish(),
            LinearNorm(dim, out_dim)
        )

    def forward(self, x):
        B, N, C = x.shape
        H = W = int(N ** 0.5)
        x = x.reshape(B, C, H, W)

        kv = self.kv(x.flatten(2).transpose(1, 2))
        q = self.q(x)

        q = q.reshape(B, -1, C)
        x = self.proj(q)
        return x

In [12]:
class LevitDownsample(nn.Module):
    def __init__(self, dim, out_dim, num_heads, attn_ratio=2):
        super(LevitDownsample, self).__init__()
        self.attn_downsample = AttentionDownsample(dim, out_dim, num_heads, attn_ratio)
        self.mlp = LevitMlp(out_dim, out_dim * attn_ratio, out_dim)
        self.drop_path = nn.Identity()

    def forward(self, x):
        x = self.attn_downsample(x)
        x = self.drop_path(self.mlp(x))
        return x

In [13]:
class NormLinear(nn.Module):
    def __init__(self, in_features, out_features, dropout_prob=0.0):
        super(NormLinear, self).__init__()
        self.bn = nn.BatchNorm1d(in_features)
        self.drop = nn.Dropout(p=dropout_prob, inplace=False)
        self.linear = nn.Linear(in_features, out_features, bias=True)

    def forward(self, x):
        x = self.bn(x)
        x = self.drop(x)
        x = self.linear(x)
        return x


In [14]:
class LevitStage(nn.Module):
    def __init__(self, dim, out_dim, num_heads, num_blocks, downsample=True):
        super(LevitStage, self).__init__()
        self.downsample = LevitDownsample(dim, out_dim, num_heads) if downsample else nn.Identity()
        self.blocks = nn.Sequential(*[LevitBlock(out_dim, num_heads) for _ in range(num_blocks)])

    def forward(self, x):
        x = self.downsample(x)
        x = self.blocks(x)
        return x

In [15]:
class GLALeViT(nn.Module):
    def __init__(self, num_classes=37):
        super(GLALeViT, self).__init__()

        self.stem = Stem16()

        self.stages = nn.Sequential(
            LevitStage(dim=256, out_dim=256, num_heads=4, num_blocks=3, downsample=False),
            LevitStage(dim=256, out_dim=384, num_heads=6, num_blocks=3, downsample=True),
            LevitStage(dim=384, out_dim=512, num_heads=8, num_blocks=2, downsample=True)
        )

        self.head = NormLinear(in_features=512, out_features=num_classes, dropout_prob=0.0)
        self.head_dist = NormLinear(in_features=512, out_features=num_classes, dropout_prob=0.0)

    def forward(self, x):
        x = self.stem(x)
        B, C, H, W = x.shape
        x = x.view(B, C, -1).transpose(1, 2)
        x = self.stages(x)
        out = self.head(x.mean(dim=1))
        out_dist = self.head_dist(x.mean(dim=1))
        return out

In [16]:
ResNet50 = timm.create_model('resnet50', pretrained=True)
for param in ResNet50.parameters():
    param.requires_grad = False
ResNet50.fc = nn.Identity()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

In [17]:
class LauncherModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.resnet = ResNet50
        self.fc = nn.Linear(2048, 3 * 56 * 56)

        self.upsample = nn.ConvTranspose2d(3, 3, kernel_size=4, stride=4, padding=0)

        self.levit = GLALeViT()

    def forward(self, x):
        x = self.resnet(x)  # (32, 2048)
        x = self.fc(x)  # (32, 9408)
        x = x.view(x.size(0), 3, 56, 56)  # (32, 3, 56, 56)
        x = self.upsample(x)  # (32, 3, 224, 224)
        x = self.levit(x)
        return x

In [18]:
model = LauncherModel()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

batch_size = 32
learning_rate = 0.001
num_epochs = 50

In [19]:
model.to(device)

LauncherModel(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act1): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act1): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (drop_block): Identity()
        (act2): ReLU(inplace=True)
        (aa): Identity()
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05,

In [20]:
from torchinfo import summary
print(summary(model, input_size=(32, 3, 224, 224)))

Layer (type:depth-idx)                                       Output Shape              Param #
LauncherModel                                                [32, 37]                  --
├─ResNet: 1-1                                                [32, 2048]                --
│    └─Conv2d: 2-1                                           [32, 64, 112, 112]        (9,408)
│    └─BatchNorm2d: 2-2                                      [32, 64, 112, 112]        (128)
│    └─ReLU: 2-3                                             [32, 64, 112, 112]        --
│    └─MaxPool2d: 2-4                                        [32, 64, 56, 56]          --
│    └─Sequential: 2-5                                       [32, 256, 56, 56]         --
│    │    └─Bottleneck: 3-1                                  [32, 256, 56, 56]         (75,008)
│    │    └─Bottleneck: 3-2                                  [32, 256, 56, 56]         (70,400)
│    │    └─Bottleneck: 3-3                                  [32, 256, 56, 

In [21]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [22]:
trainval_data = datasets.OxfordIIITPet(root="data", split="trainval", target_types="category", download=True, transform=transform)
test_data = datasets.OxfordIIITPet(root="data", split="test", target_types="category", download=True, transform=transform)
combined_data = ConcatDataset([trainval_data, test_data])

train_size = int(0.7 * len(combined_data))
val_size = int(0.15 * len(combined_data))
test_size = len(combined_data) - train_size - val_size
train_data, val_data, test_data = random_split(combined_data, [train_size, val_size, test_size])

Downloading https://thor.robots.ox.ac.uk/pets/images.tar.gz to data/oxford-iiit-pet/images.tar.gz


100%|██████████| 792M/792M [00:21<00:00, 37.2MB/s]


Extracting data/oxford-iiit-pet/images.tar.gz to data/oxford-iiit-pet
Downloading https://thor.robots.ox.ac.uk/pets/annotations.tar.gz to data/oxford-iiit-pet/annotations.tar.gz


100%|██████████| 19.2M/19.2M [00:01<00:00, 16.2MB/s]


Extracting data/oxford-iiit-pet/annotations.tar.gz to data/oxford-iiit-pet


In [23]:
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Train set size: 5144
Validation set size: 1102
Test set size: 1103


In [24]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [25]:
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in tqdm(train_loader, desc="Training"):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(train_loader)
    accuracy = 100 * correct / total
    print(f"Train Loss: {epoch_loss:.4f}, Train Accuracy: {accuracy:.2f}%")

In [26]:
def evaluate(model, data_loader, criterion, device, phase="Validation"):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc=f"{phase}"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(data_loader)
    accuracy = 100 * correct / total
    print(f"{phase} Loss: {epoch_loss:.4f}, {phase} Accuracy: {accuracy:.2f}%")

In [27]:
def measure_inference_time(model, data_loader, device):
    model.eval()
    times = []

    with torch.no_grad():
        for inputs, _ in data_loader:
            inputs = inputs.to(device)
            start_time = torch.cuda.Event(enable_timing=True)
            end_time = torch.cuda.Event(enable_timing=True)

            start_time.record()
            _ = model(inputs)  # inference 수행
            end_time.record()

            # 시간 측정
            torch.cuda.synchronize()  # CUDA에서 모든 커널이 완료될 때까지 대기
            elapsed_time = start_time.elapsed_time(end_time)  # 밀리초 단위로 반환
            times.append(elapsed_time)

    # 통계량 계산
    times_np = np.array(times)
    total_inferences = len(times_np)
    avg_time = np.mean(times_np)
    std_dev = np.std(times_np)
    max_time = np.max(times_np)
    min_time = np.min(times_np)

    # 결과 출력
    print(f"Inference Time Measurement Results:")
    print(f"Total Inferences: {total_inferences}")
    print(f"Average Time: {avg_time:.2f} ms")
    print(f"Standard Deviation: {std_dev:.2f} ms")
    print(f"Maximum Time: {max_time:.2f} ms")
    print(f"Minimum Time: {min_time:.2f} ms")

    return times

In [28]:
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    train(model, train_loader, criterion, optimizer, device)
    evaluate(model, val_loader, criterion, device, phase="Validation")


Epoch 1/50


Training: 100%|██████████| 161/161 [00:57<00:00,  2.81it/s]


Train Loss: 3.2799, Train Accuracy: 6.84%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.05it/s]


Validation Loss: 34.7455, Validation Accuracy: 3.27%

Epoch 2/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.61it/s]


Train Loss: 2.3531, Train Accuracy: 19.01%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.67it/s]


Validation Loss: 43.3360, Validation Accuracy: 2.27%

Epoch 3/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.66it/s]


Train Loss: 1.8434, Train Accuracy: 34.27%


Validation: 100%|██████████| 35/35 [00:05<00:00,  5.85it/s]


Validation Loss: 43.1318, Validation Accuracy: 2.54%

Epoch 4/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 1.4569, Train Accuracy: 46.99%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.77it/s]


Validation Loss: 9.7438, Validation Accuracy: 10.44%

Epoch 5/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 1.2547, Train Accuracy: 54.72%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.75it/s]


Validation Loss: 3.5905, Validation Accuracy: 21.51%

Epoch 6/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 1.1465, Train Accuracy: 58.51%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.77it/s]


Validation Loss: 8.7159, Validation Accuracy: 5.81%

Epoch 7/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.65it/s]


Train Loss: 1.1463, Train Accuracy: 58.61%


Validation: 100%|██████████| 35/35 [00:05<00:00,  5.84it/s]


Validation Loss: 6.4988, Validation Accuracy: 15.34%

Epoch 8/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.67it/s]


Train Loss: 1.1460, Train Accuracy: 58.57%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.73it/s]


Validation Loss: 1.1895, Validation Accuracy: 58.35%

Epoch 9/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.65it/s]


Train Loss: 1.0181, Train Accuracy: 62.97%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.78it/s]


Validation Loss: 1.6585, Validation Accuracy: 47.10%

Epoch 10/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.65it/s]


Train Loss: 0.9724, Train Accuracy: 65.03%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.78it/s]


Validation Loss: 3.8947, Validation Accuracy: 26.86%

Epoch 11/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.65it/s]


Train Loss: 0.9078, Train Accuracy: 67.67%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.72it/s]


Validation Loss: 1.7423, Validation Accuracy: 48.64%

Epoch 12/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 0.9262, Train Accuracy: 67.28%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.68it/s]


Validation Loss: 1.5574, Validation Accuracy: 52.18%

Epoch 13/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.62it/s]


Train Loss: 0.9202, Train Accuracy: 67.71%


Validation: 100%|██████████| 35/35 [00:05<00:00,  5.83it/s]


Validation Loss: 1.5223, Validation Accuracy: 51.36%

Epoch 14/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 0.8141, Train Accuracy: 70.88%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.72it/s]


Validation Loss: 9.4874, Validation Accuracy: 10.53%

Epoch 15/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.65it/s]


Train Loss: 2.0151, Train Accuracy: 54.88%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.77it/s]


Validation Loss: 39.9461, Validation Accuracy: 2.90%

Epoch 16/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 1.4130, Train Accuracy: 49.67%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.70it/s]


Validation Loss: 1.3277, Validation Accuracy: 54.81%

Epoch 17/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 1.0833, Train Accuracy: 61.57%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.80it/s]


Validation Loss: 1.5208, Validation Accuracy: 52.90%

Epoch 18/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.65it/s]


Train Loss: 0.9534, Train Accuracy: 67.50%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.79it/s]


Validation Loss: 2.0049, Validation Accuracy: 42.56%

Epoch 19/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.65it/s]


Train Loss: 0.7726, Train Accuracy: 73.68%


Validation: 100%|██████████| 35/35 [00:05<00:00,  5.84it/s]


Validation Loss: 2.9804, Validation Accuracy: 35.39%

Epoch 20/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.60it/s]


Train Loss: 0.7196, Train Accuracy: 75.86%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.71it/s]


Validation Loss: 2.4602, Validation Accuracy: 40.20%

Epoch 21/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.66it/s]


Train Loss: 0.6844, Train Accuracy: 76.11%


Validation: 100%|██████████| 35/35 [00:05<00:00,  5.84it/s]


Validation Loss: 3.7119, Validation Accuracy: 31.76%

Epoch 22/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.65it/s]


Train Loss: 0.6067, Train Accuracy: 79.00%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.74it/s]


Validation Loss: 2.9464, Validation Accuracy: 41.29%

Epoch 23/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 0.5578, Train Accuracy: 80.33%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.76it/s]


Validation Loss: 2.7101, Validation Accuracy: 44.56%

Epoch 24/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.66it/s]


Train Loss: 0.5586, Train Accuracy: 80.17%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.76it/s]


Validation Loss: 5.7854, Validation Accuracy: 19.69%

Epoch 25/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.68it/s]


Train Loss: 0.5383, Train Accuracy: 81.03%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.82it/s]


Validation Loss: 2.7685, Validation Accuracy: 43.28%

Epoch 26/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 0.5431, Train Accuracy: 81.12%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.70it/s]


Validation Loss: 1.4956, Validation Accuracy: 58.26%

Epoch 27/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 0.5187, Train Accuracy: 81.82%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.80it/s]


Validation Loss: 2.3505, Validation Accuracy: 42.47%

Epoch 28/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 0.5278, Train Accuracy: 81.73%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.74it/s]


Validation Loss: 1.0137, Validation Accuracy: 68.97%

Epoch 29/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 0.4610, Train Accuracy: 83.51%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.83it/s]


Validation Loss: 1.4777, Validation Accuracy: 62.34%

Epoch 30/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.62it/s]


Train Loss: 0.4947, Train Accuracy: 83.42%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.70it/s]


Validation Loss: 2.2530, Validation Accuracy: 50.09%

Epoch 31/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 0.4840, Train Accuracy: 82.25%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.82it/s]


Validation Loss: 8.0298, Validation Accuracy: 16.06%

Epoch 32/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 0.4330, Train Accuracy: 84.51%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.72it/s]


Validation Loss: 1.6204, Validation Accuracy: 58.98%

Epoch 33/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 0.4289, Train Accuracy: 85.03%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.79it/s]


Validation Loss: 2.0940, Validation Accuracy: 51.81%

Epoch 34/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.61it/s]


Train Loss: 0.4161, Train Accuracy: 85.93%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.77it/s]


Validation Loss: 1.4548, Validation Accuracy: 61.71%

Epoch 35/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.62it/s]


Train Loss: 0.4311, Train Accuracy: 84.88%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.82it/s]


Validation Loss: 0.8678, Validation Accuracy: 75.14%

Epoch 36/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 0.4104, Train Accuracy: 85.85%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.78it/s]


Validation Loss: 0.9988, Validation Accuracy: 72.78%

Epoch 37/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.62it/s]


Train Loss: 0.3841, Train Accuracy: 86.68%


Validation: 100%|██████████| 35/35 [00:05<00:00,  5.85it/s]


Validation Loss: 1.6274, Validation Accuracy: 60.34%

Epoch 38/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 0.3780, Train Accuracy: 86.82%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.75it/s]


Validation Loss: 4.3117, Validation Accuracy: 30.49%

Epoch 39/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 0.3864, Train Accuracy: 86.68%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.81it/s]


Validation Loss: 5.6317, Validation Accuracy: 25.77%

Epoch 40/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.67it/s]


Train Loss: 0.4458, Train Accuracy: 84.78%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.77it/s]


Validation Loss: 1.9432, Validation Accuracy: 55.35%

Epoch 41/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.65it/s]


Train Loss: 0.4100, Train Accuracy: 85.75%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.81it/s]


Validation Loss: 1.6259, Validation Accuracy: 62.16%

Epoch 42/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.63it/s]


Train Loss: 0.3397, Train Accuracy: 88.24%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.70it/s]


Validation Loss: 2.7373, Validation Accuracy: 49.46%

Epoch 43/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 0.3559, Train Accuracy: 87.44%


Validation: 100%|██████████| 35/35 [00:05<00:00,  5.84it/s]


Validation Loss: 4.3290, Validation Accuracy: 36.48%

Epoch 44/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 0.3646, Train Accuracy: 87.31%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.72it/s]


Validation Loss: 38.8127, Validation Accuracy: 6.08%

Epoch 45/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.62it/s]


Train Loss: 0.3813, Train Accuracy: 86.94%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.83it/s]


Validation Loss: 2.1127, Validation Accuracy: 53.18%

Epoch 46/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.66it/s]


Train Loss: 0.3036, Train Accuracy: 89.72%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.73it/s]


Validation Loss: 3.4757, Validation Accuracy: 42.20%

Epoch 47/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 0.3122, Train Accuracy: 89.64%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.82it/s]


Validation Loss: 1.2266, Validation Accuracy: 70.87%

Epoch 48/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 0.2570, Train Accuracy: 90.92%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.72it/s]


Validation Loss: 1.9570, Validation Accuracy: 58.89%

Epoch 49/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.64it/s]


Train Loss: 0.2209, Train Accuracy: 92.34%


Validation: 100%|██████████| 35/35 [00:05<00:00,  5.85it/s]


Validation Loss: 1.0322, Validation Accuracy: 75.32%

Epoch 50/50


Training: 100%|██████████| 161/161 [00:34<00:00,  4.61it/s]


Train Loss: 0.2402, Train Accuracy: 91.62%


Validation: 100%|██████████| 35/35 [00:06<00:00,  5.71it/s]

Validation Loss: 2.8954, Validation Accuracy: 48.28%





In [29]:
print("\nFinal Test Evaluation")
evaluate(model, test_loader, criterion, device, phase="Test")


Final Test Evaluation


Test: 100%|██████████| 35/35 [00:06<00:00,  5.03it/s]

Test Loss: 2.8617, Test Accuracy: 49.32%





In [30]:
times = measure_inference_time(model, test_loader, device)

Inference Time Measurement Results:
Total Inferences: 35
Average Time: 24.53 ms
Standard Deviation: 0.57 ms
Maximum Time: 24.67 ms
Minimum Time: 21.20 ms
