# Vision Transformer model for Human motion database (HMDB)

[https://www.kaggle.com/datasets/avigoen/hmdb-human-activity-recognition/data](https://)

In [None]:
import os
import cv2
import torch
import random
import numpy as np
from PIL import Image
from tqdm import tqdm
from tqdm.notebook import tqdm
from torchvision import transforms
from transformers import get_cosine_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import ViTForImageClassification, ViTFeatureExtractor

## Data loading and pre-processing:

In [None]:
dataset_path = '/kaggle/input/hmdb-human-activity-recognition/HMDB_dataset'
output_frames_path = '/kaggle/working/hmdb_frames'

os.makedirs(output_frames_path, exist_ok=True)

frame_interval = 10
image_size = 224                # - as expected by ViT

data_transforms = transforms.Compose([
    transforms.RandomResizedCrop(image_size, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [None]:
for action_class in tqdm(os.listdir(dataset_path), desc="Processing classes"):
    class_path = os.path.join(dataset_path, action_class)
    if not os.path.isdir(class_path):
        continue

    class_output_path = os.path.join(output_frames_path, action_class)
    os.makedirs(class_output_path, exist_ok=True)

    for video_file in os.listdir(class_path):
        video_path = os.path.join(class_path, video_file)

        cap = cv2.VideoCapture(video_path)
        frame_count = 0
        saved_frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                frame_resized = cv2.resize(frame, (image_size, image_size))
                frame_pil = Image.fromarray(cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB))

                frame_augmented = data_transforms(frame_pil)

                save_path = os.path.join(class_output_path, f"{video_file}_{saved_frame_count}.jpg")
                frame_augmented_pil = transforms.ToPILImage()(frame_augmented)
                frame_augmented_pil.save(save_path)

                saved_frame_count += 1

            frame_count += 1

        cap.release()

print("Frame extraction and augmentation complete.")

Processing classes:   0%|          | 0/51 [00:00<?, ?it/s]

Frame extraction and augmentation complete.


## Loading vision transformer model:

In [None]:
num_classes = 51

model_name = "google/vit-base-patch16-224-in21k"
model = ViTForImageClassification.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

model.classifier = torch.nn.Linear(model.classifier.in_features, num_classes)
print(model)

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe



## Setting Up Training Configurations:

In [None]:
random.seed(42)

batch_size = 16
num_epochs = 25
learning_rate = 0.001

data_path = '/kaggle/working/hmdb_frames'

In [None]:
class HMDBDataset(Dataset):
    def __init__(self, data_path, feature_extractor, transform=None):
        self.transform = transform
        self.feature_extractor = feature_extractor
        self.images = []
        self.labels = []

        for label, action_class in enumerate(os.listdir(data_path)):
            class_dir = os.path.join(data_path, action_class)
            for img_name in os.listdir(class_dir):
                self.images.append(os.path.join(class_dir, img_name))
                self.labels.append(label)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image_path = self.images[idx]
        label = self.labels[idx]

        image = Image.open(image_path).convert('RGB')
        image = self.feature_extractor(images=image, return_tensors="pt")["pixel_values"].squeeze()

        if self.transform:
            image = self.transform(image)

        return image, label

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
])

val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
])

full_dataset = HMDBDataset(data_path, feature_extractor, transform=train_transform)

train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

val_dataset.dataset.transform = val_transform

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * num_epochs
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=total_steps)

criterion = torch.nn.CrossEntropyLoss()

print(f"Training on device: {device}")
print(f"Batch size: {batch_size}, Number of epochs: {num_epochs}, Learning rate: {learning_rate}")
print(f"Training set size: {train_size}, Validation set size: {val_size}")

Training on device: cuda
Batch size: 16, Number of epochs: 25, Learning rate: 0.001
Training set size: 52716, Validation set size: 13179


## Check pointing and Early Stopping:

In [None]:
patience = 3
best_val_accuracy = 0
epochs_without_improvement = 0
checkpoint_path = "/kaggle/working/best_model.pth"

In [None]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    model.train()
    train_loss = 0
    correct_preds = 0
    total_preds = 0
    for images, labels in tqdm(train_loader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        logits = outputs.logits
        loss = criterion(logits, labels)
        train_loss += loss.item() * images.size(0)

        loss.backward()
        optimizer.step()

        _, predicted = torch.max(logits, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)

    avg_train_loss = train_loss / len(train_loader.dataset)
    train_accuracy = correct_preds / total_preds
    print(f"Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

    model.eval()
    val_loss = 0
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            logits = outputs.logits
            loss = criterion(logits, labels)
            val_loss += loss.item() * images.size(0)

            _, predicted = torch.max(logits, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)

    avg_val_loss = val_loss / len(val_loader.dataset)
    val_accuracy = correct_preds / total_preds
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        epochs_without_improvement = 0
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Checkpoint saved: Model improved with validation accuracy of {val_accuracy:.4f}")
    else:
        epochs_without_improvement += 1
        print(f"No improvement for {epochs_without_improvement} epoch(s)")

    if epochs_without_improvement >= patience:
        print("Early stopping triggered.")
        break

    scheduler.step()

Epoch 1/25


100%|██████████| 3295/3295 [20:04<00:00,  2.74it/s]


Training Loss: 3.9392, Training Accuracy: 0.0305
Validation Loss: 3.9376, Validation Accuracy: 0.0311
Checkpoint saved: Model improved with validation accuracy of 0.0311
Epoch 2/25


100%|██████████| 3295/3295 [20:05<00:00,  2.73it/s]


Training Loss: 2.8896, Training Accuracy: 0.3525
Validation Loss: 2.1942, Validation Accuracy: 0.5304
Checkpoint saved: Model improved with validation accuracy of 0.5304
Epoch 3/25


100%|██████████| 3295/3295 [20:06<00:00,  2.73it/s]


Training Loss: 1.4822, Training Accuracy: 0.6617
Validation Loss: 1.0945, Validation Accuracy: 0.7345
Checkpoint saved: Model improved with validation accuracy of 0.7345
Epoch 4/25


100%|██████████| 3295/3295 [20:06<00:00,  2.73it/s]


Training Loss: 0.6948, Training Accuracy: 0.8236
Validation Loss: 0.6270, Validation Accuracy: 0.8355
Checkpoint saved: Model improved with validation accuracy of 0.8355
Epoch 5/25


100%|██████████| 3295/3295 [20:04<00:00,  2.74it/s]


Training Loss: 0.3599, Training Accuracy: 0.9030
Validation Loss: 0.5416, Validation Accuracy: 0.8518
Checkpoint saved: Model improved with validation accuracy of 0.8518
Epoch 6/25


100%|██████████| 3295/3295 [20:04<00:00,  2.73it/s]


Training Loss: 0.2413, Training Accuracy: 0.9303
Validation Loss: 0.5326, Validation Accuracy: 0.8570
Checkpoint saved: Model improved with validation accuracy of 0.8570
Epoch 7/25


100%|██████████| 3295/3295 [20:06<00:00,  2.73it/s]


Training Loss: 0.1845, Training Accuracy: 0.9465
Validation Loss: 0.4768, Validation Accuracy: 0.8768
Checkpoint saved: Model improved with validation accuracy of 0.8768
Epoch 8/25


100%|██████████| 3295/3295 [20:06<00:00,  2.73it/s]


Training Loss: 0.1731, Training Accuracy: 0.9490
Validation Loss: 0.4478, Validation Accuracy: 0.8856
Checkpoint saved: Model improved with validation accuracy of 0.8856
Epoch 9/25


100%|██████████| 3295/3295 [20:12<00:00,  2.72it/s]


Training Loss: 0.1584, Training Accuracy: 0.9534
Validation Loss: 0.4587, Validation Accuracy: 0.8794
No improvement for 1 epoch(s)
Epoch 10/25


100%|██████████| 3295/3295 [20:26<00:00,  2.69it/s]


Training Loss: 0.1512, Training Accuracy: 0.9544
Validation Loss: 0.4752, Validation Accuracy: 0.8768
No improvement for 2 epoch(s)
Epoch 11/25


100%|██████████| 3295/3295 [20:29<00:00,  2.68it/s]


Training Loss: 0.1476, Training Accuracy: 0.9567
Validation Loss: 0.5144, Validation Accuracy: 0.8723
No improvement for 3 epoch(s)
Early stopping triggered.


#### Saving trained model:

In [None]:
final_model_path = "/kaggle/working/final_trained_model.pth"

torch.save(model.state_dict(), final_model_path)
print(f"Model saved to {final_model_path}")

Model saved to /kaggle/working/final_trained_model.pth


## Model evaluation:

In [None]:
total_size = len(full_dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(full_dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
model.load_state_dict(torch.load(checkpoint_path))
model.eval()  # Set the model to evaluation mode

test_loss = 0
correct_preds = 0
total_preds = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        logits = outputs.logits
        loss = criterion(logits, labels)
        test_loss += loss.item() * images.size(0)

        _, predicted = torch.max(logits, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)

avg_test_loss = test_loss / len(test_loader.dataset)
test_accuracy = correct_preds / total_preds

print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

  model.load_state_dict(torch.load(checkpoint_path))


Test Loss: 0.1687, Test Accuracy: 0.9533
