In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import random
from collections import defaultdict

# Reaching this line means every import above resolved without raising.
print("Imports Successful")

In [None]:
# Preprocessing shared by every image this notebook loads: resize to 192x192,
# reduce to a single grayscale channel, convert to a tensor, then normalize that
# one channel with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(size=(192, 192)),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)

# Training images, read through ImageFolder with the pipeline above applied to
# each sample.
dataset = ImageFolder('./dataset/asl_alphabet_train', transform=transform)

In [None]:
# Number of samples currently in the dataset (in notebook order this runs
# before the per-class subsampling further down).
print(len(dataset))

In [None]:
# Samples per sign
samples_per_class = 100
# Fixed seed so the same subset is drawn on every Restart & Run All.
sampling_seed = 42

# Shuffle a copy with a dedicated, seeded RNG: the draw is reproducible and
# dataset.samples is never left in a shuffled state that disagrees with
# dataset.targets.
shuffled_samples = list(dataset.samples)
random.Random(sampling_seed).shuffle(shuffled_samples)

# Number of samples kept so far for each class index.
count_dict = defaultdict(int)

filtered_samples = []
filtered_targets = []

# Keep the first `samples_per_class` shuffled entries of every class; classes
# with fewer entries than that keep all of them.
for sample_path, target in shuffled_samples:
    if count_dict[target] < samples_per_class:
        filtered_samples.append((sample_path, target))
        filtered_targets.append(target)
        count_dict[target] += 1

dataset.samples = filtered_samples
dataset.targets = filtered_targets
# ImageFolder exposes `imgs` as an alias of `samples`; rebind it so it does not
# keep pointing at the old, unfiltered list.
dataset.imgs = dataset.samples

# Sanity checks: per-class counts, and three views of the new dataset size that
# should all agree.
print(count_dict)
print(len(dataset.targets))
print(len(dataset.samples))
print(len(dataset))

In [None]:
# Define the CNN architecture
class Signlingo(nn.Module):
    """Small CNN classifier for single-channel 192x192 images.

    Three 3x3 convolutions (16, 32 and 64 output channels, padding 1 so the
    spatial size is preserved) are each followed by ReLU and a 2x2 max-pool.
    The flattened result passes through a 512-unit hidden layer and a final
    linear layer that emits one raw logit per class.

    Args:
        num_classes: Number of output classes, i.e. the width of the final
            linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.MaxPool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        # Three 2x2 poolings shrink a 192x192 input to 24x24 (192 / 2 / 2 / 2),
        # with 64 channels coming out of conv3. Any other input size would not
        # match this layer.
        self.fc1 = nn.Linear(24*24*64, 512)
        # Size the output from the argument instead of a hard-coded 29 so the
        # network follows whatever class count the caller passes.
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return raw class logits for a batch of images.

        Args:
            x: Batch tensor fed to conv1, so it needs one input channel; the
                fc1 size above implies a 192x192 spatial size.

        Returns:
            Tensor with one row per input image and `num_classes` columns.
            No softmax is applied.
        """
        x = self.MaxPool(torch.relu(self.conv1(x)))
        x = self.MaxPool(torch.relu(self.conv2(x)))
        x = self.MaxPool(torch.relu(self.conv3(x)))
        # Flatten everything except the batch dimension for the linear layers.
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [None]:
# 80/20 train/test split with a fixed seed.
# NOTE(review): train_test_split indexes the dataset item by item, so this
# appears to load and transform every image up front and hold the results in
# memory. Confirm that is acceptable before raising the per-class sample count.
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)
print(len(train_set))

# Seed torch so weight initialisation and DataLoader shuffling repeat across
# runs.
torch.manual_seed(42)

# Data Loaders
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False)

# Model
model = Signlingo(num_classes=len(dataset.classes))

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)

# Train
num_epochs = 8
for epoch in range(num_epochs):
    print(epoch)
    for data, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Test accuracy after every epoch, computed without gradient tracking.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, labels in test_loader:
            outputs = model(data)
            predicted = torch.argmax(outputs, dim=1)
            total += labels.size(0)
            # .item() keeps the running count a plain Python int, so the
            # printed accuracy is a number rather than a tensor repr.
            correct += (predicted == labels).sum().item()
    model.train()

    accuracy = correct / total
    print('Test Accuracy:' + str(accuracy))

# Saves the whole model object rather than only its state_dict, so loading it
# back needs the Signlingo class definition to be available.
torch.save(model, 'model.pth')

In [None]:
# See what we are getting wrong
# Pure inference: switch to eval mode and disable autograd so no computation
# graph is built for each batch.
model.eval()
with torch.no_grad():
    for data, labels in test_loader:
        outputs = model(data)
        # Ground-truth class indices for the batch.
        print(labels)
        # Highest logit per sample together with the predicted class index.
        print(torch.max(outputs, 1))

In [None]:
# Testing our own photos
# NOTE(review): the labels printed here are indices ImageFolder assigns for this
# folder. They presumably line up with the training indices only if both roots
# contain the same class sub-directories - confirm by comparing
# dataset2.class_to_idx with the training dataset's mapping.
dataset2 = ImageFolder(root='./dataset/asl_alphabet_test', transform=transform)
test_loader2 = DataLoader(dataset2, batch_size=32, shuffle=False)

# Pure inference: switch to eval mode and disable autograd so no computation
# graph is built for each batch.
model.eval()
with torch.no_grad():
    for data, labels in test_loader2:
        outputs = model(data)
        # Class indices derived from the test folder layout.
        print(labels)
        # Highest logit per sample together with the predicted class index.
        print(torch.max(outputs, 1))