In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np
from PIL import Image
import os
import random


In [6]:
def load_data(data_dir):
    """Load every class folder under ``data_dir`` as 64x64 grayscale arrays.

    ``data_dir`` must contain one sub-directory per entry of ``classes``
    (the letters A-Z plus 'del', 'nothing' and 'space'). Every file found in
    a class directory is opened as an image, converted to single-channel
    grayscale (PIL mode 'L') and resized to 64x64 pixels.

    Args:
        data_dir: Path of the directory holding the per-class sub-directories.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays. ``images`` stacks the
        resized grayscale images in load order; ``labels[i]`` is the index in
        ``classes`` of the folder that ``images[i]`` came from.

    Raises:
        FileNotFoundError: If a class sub-directory does not exist
            (raised by ``os.listdir``).
    """
    classes = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space']
    images = []
    labels = []

    for label, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # dataset order identical across runs and machines.
        for img_name in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, img_name)
            # TODO: confirm against the paper that grayscale input is required.
            # The context manager releases the file handle as soon as the
            # pixel data has been converted, instead of waiting for GC.
            with Image.open(img_path) as img:
                image = img.convert('L').resize((64, 64))
            images.append(np.array(image))
            labels.append(label)

    images = np.array(images)
    labels = np.array(labels)

    return images, labels


In [7]:
def split_data(images, labels, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=None):
    """Randomly partition ``images``/``labels`` into train, validation and test sets.

    The sample indices are shuffled once; the first ``int(train_ratio * n)``
    go to the training set, the next ``int(val_ratio * n)`` to the validation
    set, and all remaining indices to the test set (so the test set absorbs
    any rounding remainder).

    Args:
        images: Array of samples, indexable with a list of integer indices.
        labels: Array of labels, aligned with ``images`` along the first axis.
        train_ratio: Fraction of samples used for training.
        val_ratio: Fraction of samples used for validation.
        test_ratio: Fraction of samples used for testing.
        seed: Optional seed for a private shuffling RNG. When ``None``
            (the default) the module-level ``random`` state is used.

    Returns:
        ``(train_images, train_labels, val_images, val_labels,
        test_images, test_labels)``.

    Raises:
        AssertionError: If the ratios do not sum to 1 (within floating-point
            tolerance) or ``images`` and ``labels`` differ in length.
    """
    # Compare with a tolerance: exact equality rejects valid inputs such as
    # 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999 in floating point.
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "Ratios must sum to 1"
    assert len(images) == len(labels), "images and labels must have the same length"

    total_size = len(labels)
    indices = list(range(total_size))
    if seed is None:
        random.shuffle(indices)
    else:
        # A dedicated generator gives a reproducible split without touching
        # the global random state.
        random.Random(seed).shuffle(indices)

    train_split = int(train_ratio * total_size)
    val_split = int(val_ratio * total_size)

    train_indices = indices[:train_split]
    val_indices = indices[train_split:train_split + val_split]
    test_indices = indices[train_split + val_split:]

    train_images = images[train_indices]
    train_labels = labels[train_indices]
    val_images = images[val_indices]
    val_labels = labels[val_indices]
    test_images = images[test_indices]
    test_labels = labels[test_indices]

    return train_images, train_labels, val_images, val_labels, test_images, test_labels


In [8]:
# Root folder of the training images (one sub-folder per class).
data_dir = '../data/mnist_asl_alphabet_train'

images, labels = load_data(data_dir)

# split_data shuffles with Python's `random` module; seed it first so the
# train/val/test partition is the same on every run (for a given load order).
random.seed(42)
train_images, train_labels, val_images, val_labels, test_images, test_labels = split_data(images, labels)

# Convert to tensors: images become float32 with a channel axis inserted at
# dim 1 (as Conv2d expects) and are scaled from [0, 255] to [0, 1];
# labels become int64 class indices for CrossEntropyLoss.
train_images = torch.tensor(train_images, dtype=torch.float32).unsqueeze(1) / 255.0
train_labels = torch.tensor(train_labels, dtype=torch.long)

val_images = torch.tensor(val_images, dtype=torch.float32).unsqueeze(1) / 255.0
val_labels = torch.tensor(val_labels, dtype=torch.long)

test_images = torch.tensor(test_images, dtype=torch.float32).unsqueeze(1) / 255.0
test_labels = torch.tensor(test_labels, dtype=torch.long)


In [None]:
class ASLModel(nn.Module):
    """Small CNN that maps single-channel images to 29 class logits.

    Architecture: two stages of (3x3 same-padded convolution -> ReLU ->
    2x2 max-pool) with 32 and 64 channels, followed by a 512-unit hidden
    linear layer and a 29-way output layer. The flatten step is sized for
    64x64 inputs: two poolings leave a 64-channel 16x16 feature map.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as they are applied; the
        # attribute names double as the state_dict key prefixes.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(in_features=64 * 16 * 16, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=29)  # 29 classes

    def forward(self, x):
        """Return raw (un-normalised) class scores for the batch ``x``."""
        features = self.pool(self.conv1(x).relu())
        features = self.pool(self.conv2(features).relu())
        # Collapse channels and spatial dims into one vector per sample.
        flat = features.view(-1, self.fc1.in_features)
        hidden = self.fc1(flat).relu()
        return self.fc2(hidden)


In [None]:
# Seed PyTorch's RNG before the model is built so that weight initialisation
# and the per-epoch torch.randperm shuffles are repeatable across runs.
torch.manual_seed(42)

# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ASLModel().to(device)

# CrossEntropyLoss takes raw logits and integer class indices.
loss_fn = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters

batch_size = 32

num_epochs = 5

In [None]:
for epoch in range(num_epochs):
    model.train()
    num_train = train_images.size(0)
    # Draw a fresh sample order every epoch.
    permutation = torch.randperm(num_train)
    running_loss = 0.0

    # Loop over all batches, in steps of batch_size
    for i in range(0, num_train, batch_size):

        # Slice this batch's sample indices out of the shuffled order
        indices = permutation[i:i + batch_size]

        # Gather the batch and move it to the training device
        batch_images = train_images[indices].to(device)
        batch_labels = train_labels[indices].to(device)

        outputs = model(batch_images)
        loss = loss_fn(outputs, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is a per-batch mean; weight it by the batch size so the epoch
        # figure is a true per-sample average even when the last batch is short.
        running_loss += loss.item() * batch_images.size(0)

    epoch_loss = running_loss / num_train
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    # Validation: evaluated in mini-batches so the whole split never has to
    # fit through the network (and onto the device) in one forward pass.
    model.eval()
    num_val = val_images.size(0)
    val_running_loss = 0.0
    with torch.no_grad():
        for start in range(0, num_val, batch_size):
            val_batch_images = val_images[start:start + batch_size].to(device)
            val_batch_labels = val_labels[start:start + batch_size].to(device)
            val_batch_loss = loss_fn(model(val_batch_images), val_batch_labels)
            val_running_loss += val_batch_loss.item() * val_batch_images.size(0)

    if num_val > 0:
        val_loss = val_running_loss / num_val
        print(f'Validation Loss: {val_loss:.4f}')


In [None]:
# Testing model with test dataset
def evaluate_model(test_images, test_labels, *, eval_batch_size=256):
    """Score the notebook-level ``model`` on a labelled image set.

    Predictions are computed in mini-batches so that the whole set never has
    to pass through the network in a single forward call.

    Args:
        test_images: Tensor of input images, batched along dim 0.
        test_labels: Tensor of integer class labels aligned with ``test_images``.
        eval_batch_size: Number of images per forward pass (keyword-only).

    Returns:
        Tuple ``(accuracy, precision, recall)``; precision and recall are
        macro-averaged over classes.
    """
    model.eval()

    batch_preds = []
    with torch.no_grad():
        for start in range(0, test_images.size(0), eval_batch_size):
            batch_outputs = model(test_images[start:start + eval_batch_size].to(device))
            # The predicted class is the index of the largest logit per row.
            _, batch_pred = torch.max(batch_outputs, 1)
            batch_preds.append(batch_pred.cpu())

    if batch_preds:
        preds = torch.cat(batch_preds).numpy()
    else:
        preds = torch.empty(0, dtype=torch.long).numpy()
    # Move the labels to host memory once instead of once per metric.
    targets = test_labels.cpu().numpy()

    accuracy = accuracy_score(targets, preds)
    precision = precision_score(targets, preds, average='macro')
    recall = recall_score(targets, preds, average='macro')

    return accuracy, precision, recall


In [None]:
# Evaluate the model on the held-out test split.
# evaluate_model takes only the images and labels (it reads the notebook-level
# `model` itself), so passing `model` as a first argument raises a TypeError.
accuracy, precision, recall = evaluate_model(test_images, test_labels)
print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')

In [None]:
# Persist only the learned parameters; reload them later with
# ASLModel().load_state_dict(torch.load("model.pth")).
torch.save(model.state_dict(), "model.pth")