In [18]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch, torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
from torch import nn
from torchvision.datasets import ImageFolder
from tqdm import tqdm

In [2]:
# !pip install git+https://github.com/davidbau/baukit

In [3]:
from baukit import show

## Cutting the frame into 48x48

In [4]:
# # Setting up the directory to transfer the data to
# directory = 'SignImage48x48'
# if not os.path.exists(directory):
#     os.mkdir(directory)
# if not os.path.exists(f'{directory}/blank'):
#     os.mkdir(f'{directory}/blank')

In [5]:
# # range(65, 91) covers the ASCII codes of the uppercase letters A to Z (chr(65) == 'A', chr(90) == 'Z')
# for i in range(65, 91):
#     letter = chr(i)
#     if not os.path.exists(f'{directory}/{letter}'):
#         os.mkdir(f'{directory}/{letter}')

In [6]:
# cap = cv2.VideoCapture(0)
# while True:
#     _, frame = cap.read()
#     count = {}

## ASL CNN MODEL

In [7]:
class ASL_Model(nn.Module):
    """CNN classifier for single-channel (grayscale) hand-sign images.

    Four 3x3 convolutions, each followed by ReLU and 2x2 max-pooling, feed a
    stack of fully connected layers regularised with dropout.

    The network returns raw class scores (logits). That is the input
    ``nn.CrossEntropyLoss`` expects; call ``F.softmax(logits, dim=1)`` on the
    output when probabilities are needed.

    Args:
        num_classes: Number of target classes, i.e. the width of the output layer.
    """

    def __init__(self, num_classes):
        super(ASL_Model, self).__init__()
        self.conv1 = nn.Conv2d(1, 128, kernel_size=3)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.dropout = nn.Dropout(p=0.4)
        self.conv2 = nn.Conv2d(128, 256, kernel_size=3)
        self.conv3 = nn.Conv2d(256, 512, kernel_size=3)
        self.conv4 = nn.Conv2d(512, 512, kernel_size=3)

        # Collapse each of the 512 feature maps to a single value so the first
        # linear layer always receives 512 features. For a 48x48 input the maps
        # are already 1x1 here (48 -> 46 -> 23 -> 21 -> 10 -> 8 -> 4 -> 2 -> 1),
        # so this is a no-op in that case.
        self.global_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(512, 512)  # in_features = channels produced by conv4
        self.dropout1 = nn.Dropout(p=0.4)
        self.fc2 = nn.Linear(512, 64)
        self.dropout2 = nn.Dropout(p=0.2)
        self.fc3 = nn.Linear(64, 256)
        self.dropout3 = nn.Dropout(p=0.3)
        self.fc4 = nn.Linear(256, 64)
        self.dropout4 = nn.Dropout(p=0.2)
        self.fc5 = nn.Linear(64, 256)
        self.dropout5 = nn.Dropout(p=0.3)
        self.fc6 = nn.Linear(256, num_classes)  # one score per class

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: Float tensor of shape ``(batch, 1, height, width)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised scores.
        """
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = self.dropout(x)

        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = self.dropout(x)

        x = F.relu(self.conv3(x))
        x = self.pool(x)

        x = F.relu(self.conv4(x))
        x = self.pool(x)
        x = self.dropout(x)

        x = self.global_pool(x)
        x = self.flatten(x)

        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = F.relu(self.fc3(x))
        x = self.dropout3(x)
        x = F.relu(self.fc4(x))
        x = self.dropout4(x)
        x = F.relu(self.fc5(x))
        x = self.dropout5(x)

        # Return logits: the loss applies log-softmax itself, so normalising
        # here as well would squash the gradients.
        return self.fc6(x)


## Loading the ASL Dataset

In [8]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
# Root directory handed to ImageFolder when the training set is built.
# NOTE(review): relative path — presumably resolved against the notebook's working directory; confirm.
train_path = "archive/asl_alphabet_train/asl_alphabet_train"

In [10]:
# Preprocessing applied to every image as it is loaded: reduce to a single
# grayscale channel, resize to 48x48 pixels, then convert to a tensor.
preprocess = transforms.Compose(
    [
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((48, 48)),
        transforms.ToTensor(),
    ]
)
train_set = ImageFolder(train_path, transform=preprocess)

print("Number of images in the training set =", len(train_set))

Number of images in the training set = 87000


In [11]:
# Inspect one sample: indexing the dataset yields an (image tensor, integer label) pair.
idx = 14400
item = train_set[idx]
print(f"{idx}th item is a pair", item)

14400th item is a pair (tensor([[[0.1412, 0.1569, 0.1647,  ..., 0.4549, 0.4627, 0.3922],
         [0.1647, 0.1922, 0.1961,  ..., 0.6706, 0.6824, 0.5804],
         [0.2078, 0.2667, 0.2588,  ..., 0.6118, 0.6235, 0.5490],
         ...,
         [0.3529, 0.5176, 0.5216,  ..., 0.5412, 0.5373, 0.4588],
         [0.3569, 0.5216, 0.5216,  ..., 0.5412, 0.5333, 0.4588],
         [0.3137, 0.4471, 0.4510,  ..., 0.4667, 0.4588, 0.3961]]]), 4)


In [12]:
# Batching options for training: shuffled mini-batches of 128 images, loaded
# by two worker processes into pinned host memory.
loader_options = dict(batch_size=128, shuffle=True, num_workers=2, pin_memory=True)
train_loader = torch.utils.data.DataLoader(train_set, **loader_options)

In [13]:
def train_model(model, train_loader, loss_fn, optimizer):
    """Train ``model`` for one full pass over ``train_loader``.

    Args:
        model: The ``nn.Module`` to optimise. Each batch is moved to the device
            that holds the model's parameters before the forward pass.
        train_loader: Iterable yielding ``(images, labels)`` batches.
        loss_fn: Callable ``loss_fn(predictions, labels)`` returning a scalar
            loss tensor.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the mean of the per-batch losses and
        the fraction of processed samples whose arg-max prediction matched the
        label.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()

    # Follow the model's own device instead of a notebook-level global, so the
    # function cannot disagree with where the model actually lives.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    train_loss = []
    correct_predictions = 0
    samples_seen = 0

    for images, labels in train_loader:
        # forward pass
        images, labels = images.to(target_device), labels.to(target_device)
        predicted = model(images)
        loss = loss_fn(predicted, labels)
        correct_predictions += (predicted.argmax(dim=1) == labels).sum().item()
        # Count what was actually processed: with drop_last or a sub-sampling
        # sampler this differs from len(train_loader.dataset).
        samples_seen += labels.size(0)

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

    if samples_seen == 0:
        raise ValueError("train_loader yielded no batches; nothing to train on")

    return np.mean(train_loss), correct_predictions / samples_seen

In [14]:
# The network is assembled from two layer groups but registered as one flat
# nn.Sequential, in the same order as before, so module indices are unchanged.

# Convolutional feature extractor: four conv -> ReLU -> 2x2 max-pool stages
# (dropout after the first stage only), then global average pooling.
feature_layers = [
    nn.Conv2d(1, 128, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Dropout(0.4),
    nn.Conv2d(128, 256, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(256, 512, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(512, 512, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.AdaptiveAvgPool2d((1, 1)),  # one value per feature map -> 512 features
    nn.Flatten(),
]

# Fully connected head: (in_features, out_features, dropout probability) per hidden layer.
hidden_spec = [(512, 512, 0.4), (512, 64, 0.2), (64, 256, 0.3), (256, 64, 0.2), (64, 256, 0.3)]
classifier_layers = []
for n_in, n_out, drop_p in hidden_spec:
    classifier_layers += [nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(drop_p)]
# Output layer: one score per entry in train_set.classes.
classifier_layers.append(nn.Linear(256, len(train_set.classes)))

model = nn.Sequential(*feature_layers, *classifier_layers).to(device)

In [None]:
# Uncomment to train the ASL_Model class instead of the Sequential model:
# model = ASL_Model(len(train_set.classes)).to(device)

# Training hyper-parameters.
epochs = 10
learning_rate = 0.001
weight_decay = 0.001
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, weight_decay = weight_decay)
training_losses = []  # mean training loss of each epoch, in order


for epoch in tqdm(range(epochs)):
    train_loss, train_acc = train_model(model, train_loader, loss_fn, optimizer)
    training_losses.append(train_loss)
    # train_loss is already the epoch mean returned by train_model, so it is
    # formatted directly instead of being averaged a second time.
    print(f"epoch {epoch+1}/{epochs} | train loss={train_loss:.4f}, {train_acc=:.4f}")