Step 1: Import libraries

Install the dependencies with the following command:

pip install opencv-python torch torchvision

PyTorch supports CUDA 11.x and 12.0-12.2. If the code below reports that it is using device "cpu", check your CUDA version with the following command:

nvidia-smi



In [2]:
# Import necessary libraries
import torch
import torchvision
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import os
import cv2
from pathlib import Path
import time

# Pick the compute device: CUDA when PyTorch can use it, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cpu


Step 2: Pre-Process the data

Here we pre-process the data so that the model receives good input ("garbage in, garbage out").

Pre-processing steps:
- Cropping the video to the size specified by the "box" values
- Converting the video to frames (temporary)



In [8]:
# Extract one frame from every video and save it as a resized JPEG
# Paths
data_dir = Path("dataset/dataset-top-15")
output_dir = Path("processed_dataset")

# Parameters
target_frame = 5  # 1-based index of the frame taken from each video
img_size = (224, 224)  # (width, height) passed to cv2.resize; 224x224 for ResNet50
categories = ['train', 'val', 'test']

# Walk <data_dir>/<category>/<class>/<video> once, creating the mirrored
# output directory for each class right before its videos are processed.
for category in categories:
    for class_dir in sorted((data_dir / category).iterdir()):
        # Stray files next to the class folders (e.g. .DS_Store) are not classes
        if not class_dir.is_dir():
            continue

        output_class_dir = output_dir / category / class_dir.name
        output_class_dir.mkdir(parents=True, exist_ok=True)

        for video_file in sorted(class_dir.iterdir()):
            if not video_file.is_file():
                continue

            cap = cv2.VideoCapture(str(video_file))
            try:
                # Read sequentially until the target frame; the last successful
                # read is the frame we keep.
                frame = None
                for _ in range(target_frame):
                    success, frame = cap.read()
                    if not success:
                        frame = None
                        break
            finally:
                # Always free the decoder, even if reading raised
                cap.release()

            if frame is None:
                # Unreadable file or fewer than target_frame frames: report it
                # instead of silently producing no image for this sample.
                print(f"Skipping {video_file}: could not read frame {target_frame}")
                continue

            # Resize frame and save it as an image named after the video
            frame = cv2.resize(frame, img_size)
            output_path = output_class_dir / f"{video_file.stem}.jpg"
            if not cv2.imwrite(str(output_path), frame):
                print(f"Failed to write {output_path}")

print("Frame extraction complete.")

Frame extraction complete.


Step 3: Pre-Processing+

Define the dataset class, the image transformations, and the data loaders for the train, validation, and test splits.

In [9]:
# Define PyTorch Dataset
class ASLDataset(Dataset):
    """Image dataset laid out as ``<data_dir>/<class_name>/<image file>``.

    Class indices are assigned by sorting the class directory names, so the
    mapping in ``class_to_idx`` is stable across runs.

    Args:
        data_dir: Root directory containing one sub-directory per class.
        transform: Optional callable applied to each loaded RGB PIL image.

    Attributes:
        image_paths: Paths of all samples, grouped by class and sorted by name.
        labels: Class index of each entry in ``image_paths``.
        class_to_idx: Mapping from class directory name to integer label.
    """

    def __init__(self, data_dir, transform=None):
        self.data_dir = Path(data_dir)
        self.transform = transform
        self.image_paths = []
        self.labels = []

        # Only directories are classes; a stray file in data_dir would otherwise
        # be given a label and then crash the per-class listing below.
        class_names = sorted(
            entry.name for entry in self.data_dir.iterdir() if entry.is_dir()
        )
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(class_names)}

        for class_name, class_idx in self.class_to_idx.items():
            class_dir = self.data_dir / class_name
            # iterdir() order is filesystem dependent; sort for reproducibility
            for img_path in sorted(class_dir.iterdir()):
                if not img_path.is_file():
                    continue
                self.image_paths.append(img_path)
                self.labels.append(class_idx)

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is converted to RGB and passed through ``transform`` when
        one was supplied.
        """
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        # Context manager closes the underlying file as soon as the pixel data
        # has been copied by convert().
        with Image.open(img_path) as raw:
            image = raw.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        return image, label

# Image pipeline shared by all splits: resize, convert to tensor, normalize
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        # Per-channel normalization constants, for ResNet
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# One dataset per split, built in train / val / test order
train_dataset, val_dataset, test_dataset = (
    ASLDataset(output_dir / split, transform=transform)
    for split in ("train", "val", "test")
)

# Only the training loader shuffles; validation and test keep a fixed order
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=32, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
)

print("Data preprocessing and loading complete.")

Data preprocessing and loading complete.


Step 4: Load the Pre-trained model

Here we load the model into the "model" variable, which is used later in Steps 5 and 6.
If we want to switch to a different model, use a different activation function, etc., this is where we make the change.



In [6]:
# Load Pre-trained ResNet50 Model
# Name the checkpoint explicitly instead of the deprecated boolean `weights=True`;
# IMAGENET1K_V1 is the checkpoint the legacy boolean resolves to.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
num_classes = len(train_dataset.class_to_idx)  # Dynamically set num_classes
# Replace the final fully connected layer so the output size matches our classes
model.fc = torch.nn.Linear(model.fc.in_features, num_classes)
model = model.to(device)

# Print the model architecture
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

Step 5: Fine-tune the Model

- Freeze the base layers (Optional) (We don't do this)
- Define Optimizer & Loss function
- Training Loop

In [10]:
# Fine-tuning the ResNet50 Model
import torch.optim as optim
import torch.nn as nn

# Learning rate: single source of truth, passed to the optimizer below
lr = 0.001

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
# All parameters are optimized (no layers are frozen)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Training and Validation Loop
def _run_epoch(model, loader, criterion, run_device, phase, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    The model is updated only when ``optimizer`` is given; otherwise the pass
    runs in eval mode with gradients disabled.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            # Weight the batch-mean loss by batch size so the epoch mean is
            # correct even when the last batch is smaller.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError(f"The '{phase}' dataloader yielded no samples.")
    return loss_sum / total, correct / total


def train_model(model, dataloaders, criterion, optimizer, num_epochs=10):
    """Train ``model`` and report loss/accuracy on train and val every epoch.

    Args:
        model: Network to optimize; batches are moved to the device its
            parameters live on.
        dataloaders: Mapping with ``'train'`` and ``'val'`` iterables that
            yield ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of full train + validation passes.

    Returns:
        The same ``model`` object, holding the weights from the last epoch.

    Raises:
        ValueError: If the train or val dataloader yields no samples.
    """
    # Follow the model's own device rather than relying on a module-level global
    try:
        run_device = next(model.parameters()).device
    except StopIteration:
        run_device = torch.device("cpu")

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        print("-" * 10)

        # Training phase
        epoch_loss, epoch_acc = _run_epoch(
            model, dataloaders['train'], criterion, run_device, 'train', optimizer
        )
        print(f"Train Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")

        # Validation phase
        val_epoch_loss, val_epoch_acc = _run_epoch(
            model, dataloaders['val'], criterion, run_device, 'val'
        )
        print(f"Val Loss: {val_epoch_loss:.4f} Acc: {val_epoch_acc:.4f}")

    return model

# Bundle the two loaders under the keys train_model looks up
dataloaders = dict(train=train_loader, val=val_loader)

# Run the fine-tuning loop; the returned model replaces the previous binding
model = train_model(
    model,
    dataloaders,
    criterion,
    optimizer,
    num_epochs=10,
)

# Persist only the learned parameters (state dict), not the whole module
torch.save(model.state_dict(), "resnet50_asl_finetuned.pth")
print("Model fine-tuning complete and saved.")

Epoch 1/10
----------
Train Loss: 3.4321 Acc: 0.0860
Val Loss: 741.9008 Acc: 0.0733
Epoch 2/10
----------
Train Loss: 2.8595 Acc: 0.0841
Val Loss: 2.7692 Acc: 0.0467
Epoch 3/10
----------
Train Loss: 2.7265 Acc: 0.0804
Val Loss: 2.8487 Acc: 0.0533
Epoch 4/10
----------
Train Loss: 2.7309 Acc: 0.1047
Val Loss: 2.7881 Acc: 0.0400
Epoch 5/10
----------
Train Loss: 2.6842 Acc: 0.1065
Val Loss: 2.7332 Acc: 0.0667
Epoch 6/10
----------
Train Loss: 2.6381 Acc: 0.1402
Val Loss: 2.6938 Acc: 0.0867
Epoch 7/10
----------
Train Loss: 2.6004 Acc: 0.1271
Val Loss: 5.8511 Acc: 0.0667
Epoch 8/10
----------
Train Loss: 2.4971 Acc: 0.1421
Val Loss: 2.8520 Acc: 0.0867
Epoch 9/10
----------
Train Loss: 2.3714 Acc: 0.2168
Val Loss: 2.7166 Acc: 0.1067
Epoch 10/10
----------
Train Loss: 2.2279 Acc: 0.2729
Val Loss: 11.3834 Acc: 0.0467
Model fine-tuning complete and saved.


Step 6: Test the model

The `torch.load(...)` line below determines which saved model is evaluated, so be sure to check that the number in ...finetuned_XX.pth is correct.

In [13]:
# Load the fine-tuned model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict needs.
model.load_state_dict(
    torch.load(
        "models/resnet50_asl_finetuned_01.pth",
        map_location=device,
        weights_only=True,
    )
)
model = model.to(device)
model.eval()  # Set model to evaluation mode

# Evaluation loop
def evaluate_model(model, test_loader):
    """Measure classification accuracy of ``model`` over ``test_loader``.

    Prints the total evaluation time, the average time per sample, and the
    accuracy. The model's train/eval mode is not changed here, so the caller
    should call ``model.eval()`` beforehand.

    Args:
        model: Network to evaluate; batches are moved to the device its
            parameters live on.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Fraction of correctly classified samples, or ``0.0`` when the loader
        yields no samples.
    """
    # Follow the model's own device rather than relying on a module-level global
    try:
        run_device = next(model.parameters()).device
    except StopIteration:
        run_device = torch.device("cpu")

    correct = 0
    total = 0

    start_time = time.perf_counter()
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)
            outputs = model(inputs)

            # Get predictions
            _, preds = torch.max(outputs, 1)

            # Update metrics
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    elapsed = time.perf_counter() - start_time
    print(f"Time taken for evaluation: {elapsed:.2f} seconds")

    # Check for an empty loader before any division by `total`, so the
    # per-image average cannot raise ahead of the empty-folder message.
    if total == 0:
        print("Division by zero occurred. Check if the test folder is empty.")
        accuracy = 0.0
    else:
        print(f"average time taken for each image: {elapsed / total:.2f} seconds")
        accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

# Evaluate the model on the test split loader; the returned accuracy is the
# fraction of correctly classified samples
test_accuracy = evaluate_model(model, test_loader)


  model.load_state_dict(torch.load("models/resnet50_asl_finetuned_01.pth"))


Time taken for evaluation: 4.90 seconds
average time taken for each image: 0.07 seconds
Test Accuracy: 0.1067
