In [38]:
import torch
import torch.nn as nn
from torchvision import transforms
from torch.optim import AdamW
from torch import optim

In [None]:
class PatchEmbedding(nn.Module):
    def __init__(self, img_size=32, patch_size=7, in_channels=1, embed_dim=64):
        super().__init__()
        self.num_patches = (img_size // patch_size) ** 2
        self.embed_dim = embed_dim
        self.patch_size = patch_size

        # Conv2d instead of Flattening Manually
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

        # Classification token (Trainable parameter)
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))

        # Learnable Positional Embedding
        self.pos_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim) * 0.02)

    def forward(self, x):
        B = x.shape[0]
        x = self.proj(x)  # (B, embed_dim, num_patches_sqrt, num_patches_sqrt)
        x = x.flatten(2).transpose(1, 2)  # Shape: (B, num_patches, embed_dim)

        cls_tokens = self.cls_token.expand(B, -1, -1)  # (B, 1, embed_dim)
        x = torch.cat([cls_tokens, x], dim=1)  # (B, num_patches+1, embed_dim)
        x = x + self.pos_embedding
        return x


In [56]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.layer_norm2 = nn.LayerNorm(embed_dim)
        self.dropout1=nn.Dropout(dropout)

        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Linear(embed_dim * 4, embed_dim),
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        x_residual = x
        x = self.layer_norm1(x)
        x_attn, _ = self.attn(x, x, x, need_weights=False)
        x = x_residual + self.dropout1(x_attn)  # Add Residual Connection

        x_residual = x
        x = self.layer_norm2(x)
        x_mlp = self.mlp(x)
        x = x_residual + x_mlp  # Add Residual Connection

        return x


In [57]:
class VIT(nn.Module):
    def __init__(self, img_size=32, patch_size=7, in_channels=1, embed_dim=64, num_layers=4, num_classes=10, dropout=0.1):
        super().__init__()
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)

        self.encoder_blocks = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads=8, dropout=dropout) for _ in range(num_layers)
        ])

        self.layer_norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.mlp_head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.patch_embed(x)  # Convert image to patches

        for layer in self.encoder_blocks:
            x = layer(x)
        
        cls_token = x[:, 0, :]  # Extract classification token
        cls_token = self.layer_norm(cls_token)
        cls_token = self.dropout(cls_token)
        logits = self.mlp_head(cls_token)  # Output class probabilities

        return logits


root/
1.    class_1/
        img1.jpg
        img2.jpg
        ...
2.    class_2/
        img3.jpg
        img4.jpg
        ...
    ...


In [58]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [59]:
# Define Transformations
transform = transforms.Compose([
    transforms.Resize((32, 32)),  # Resize for easier patch division
    transforms.ToTensor(),        # Convert to tensor
    transforms.Normalize((0.5,), (0.5,))  # Normalize
])

# Load MNIST dataset
train_dataset = datasets.MNIST(root="./data", train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root="./data", train=False, download=True, transform=transform)

# Create Dataloaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [67]:
device="cuda" if torch.cuda.is_available() else "cpu"

In [68]:
model = VIT().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

In [69]:
def train(model, train_loader, criterion, optimizer, scheduler, num_epochs=10, clip_grad=1.0):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        correct, total = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}, Accuracy: {100 * correct / total:.2f}%")


In [71]:
train(model, train_loader, criterion, optimizer, scheduler, num_epochs=10)

Epoch [1/10], Loss: 2.3211, Accuracy: 10.44%
Epoch [2/10], Loss: 2.3073, Accuracy: 10.54%
Epoch [3/10], Loss: 2.3049, Accuracy: 10.57%
Epoch [4/10], Loss: 2.3038, Accuracy: 10.67%
Epoch [5/10], Loss: 2.3027, Accuracy: 10.99%
Epoch [6/10], Loss: 2.3025, Accuracy: 10.90%
Epoch [7/10], Loss: 2.3024, Accuracy: 10.88%
Epoch [8/10], Loss: 2.3020, Accuracy: 10.97%
Epoch [9/10], Loss: 2.3021, Accuracy: 11.06%
Epoch [10/10], Loss: 2.3019, Accuracy: 11.06%


In [None]:
import torch.nn as nn


class C3D(nn.Module):
    """
    The C3D network as described in [1].
    """

    def __init__(self):
        super(C3D, self).__init__()

        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, 487)

        self.dropout = nn.Dropout(p=0.5)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

    def forward(self, x):

        h = self.relu(self.conv1(x))
        h = self.pool1(h)

        h = self.relu(self.conv2(h))
        h = self.pool2(h)

        h = self.relu(self.conv3a(h))
        h = self.relu(self.conv3b(h))
        h = self.pool3(h)

        h = self.relu(self.conv4a(h))
        h = self.relu(self.conv4b(h))
        h = self.pool4(h)

        h = self.relu(self.conv5a(h))
        h = self.relu(self.conv5b(h))
        h = self.pool5(h)

        h = h.view(-1, 8192)
        h = self.relu(self.fc6(h))
        h = self.dropout(h)
        h = self.relu(self.fc7(h))
        h = self.dropout(h)

        logits = self.fc8(h)
        probs = self.softmax(logits)

        return probs
