In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader

In [2]:
class ASL_First_CNN(nn.Module):
    """Small CNN mapping an RGB image batch to ``num_classes`` class logits.

    ``self.conv`` downsamples the input with three strided convolutions and a
    max-pool (the shape comments below assume a 3x200x200 input) and ends in an
    adaptive average pool to 3x3, so ``self.fc`` always receives 128*3*3
    features per image.

    The position of each layer inside ``self.conv`` / ``self.fc`` determines
    the ``state_dict`` keys (``conv.0.weight``, ``fc.8.weight``, ...); keep the
    order unchanged so previously saved weights still load.

    Args:
        num_classes: Width of the final linear layer. Defaults to 24, the
            size the output layer was originally hard-coded to.
    """

    def __init__(self, num_classes=24):
        super().__init__()
        self.conv = nn.Sequential(
            # 3*200*200 --> 64*100*100
            nn.Conv2d(3, 64, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            # 64*100*100 --> 64*50*50
            nn.MaxPool2d(kernel_size=2),

            # 64*50*50 --> 64*25*25
            nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),

            # 64*25*25 --> 128*13*13
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            # 128*13*13 --> 128*3*3
            nn.AdaptiveAvgPool2d((3, 3))
        )

        self.fc = nn.Sequential(
            nn.Flatten(),

            nn.Dropout(0.4),
            nn.Linear(128 * 3 * 3, 1024),
            nn.ReLU(inplace=True),

            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),

            nn.Dropout(0.2),
            # No softmax layer: forward() returns raw logits.
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Run the convolutional stack and the classifier head.

        Args:
            x: Image batch with 3 channels, i.e. shape ``(N, 3, H, W)``.

        Returns:
            Tensor of shape ``(N, num_classes)`` holding unnormalised logits.
        """
        features = self.conv(x)
        return self.fc(features)

# model = ASL_First_CNN().to(device)

In [3]:
import torch
import torch.nn as nn
import torchvision.transforms as T
import cv2
from torchvision.models import mobilenet_v2
from PIL import Image
import mediapipe as mp

# Inference device: the first CUDA GPU when torch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Per-channel normalisation applied to the tensor produced by ToTensor().
normalize = T.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Preprocessing used at prediction time: resize, centre-crop to 200,
# convert to a tensor, then normalise.
test_transform = T.Compose([T.Resize(256), T.CenterCrop(200), T.ToTensor(), normalize])

def load_model(model_path):
    """Create an ``ASL_First_CNN``, load its weights and return it in eval mode.

    Args:
        model_path: Path of a saved ``state_dict`` that ``torch.load`` can read.

    Returns:
        The network, moved to ``device`` and switched to evaluation mode.
    """
    # mobilenet_v2(num_classes=24).to(device)
    net = ASL_First_CNN()
    net = net.to(device)
    # NOTE(review): depending on the torch version, torch.load may unpickle
    # arbitrary objects -- only load checkpoint files from a trusted source.
    weights = torch.load(model_path, map_location=device)
    net.load_state_dict(weights)
    return net.eval()

# J and Z are signed with a movement rather than a static hand shape, so they
# are left out; the remaining 24 letters are listed in alphabetical order.
index_to_letter = [
    chr(code)
    for code in range(ord('A'), ord('Z') + 1)
    if chr(code) != 'J' and chr(code) != 'Z'
]

def predict(model, image):
    """Classify a single BGR image and return the predicted letter.

    Args:
        model: Network that is called on one preprocessed image batch; its
            output is reduced with a max over dimension 1.
        image: BGR image array (it is converted to RGB here before being
            wrapped in a PIL image).

    Returns:
        The entry of ``index_to_letter`` at the highest-scoring class index.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    batch = test_transform(Image.fromarray(rgb)).unsqueeze(0).to(device)

    # Inference only: disable autograd so no graph is recorded for every
    # frame, instead of detaching afterwards through the legacy ``.data``.
    with torch.no_grad():
        output = model(batch)
    _, predicted = torch.max(output, 1)

    return index_to_letter[predicted.item()]

model_path = 'asl_first_cnn.pth'
# model_path = 'asl_cnn_first.pth'
model = load_model(model_path)

# Hands detection
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5)

# Probe camera indices 0..5 and keep the first device that opens.
cap = None
for index in range(6):
    candidate = cv2.VideoCapture(index)
    if candidate.isOpened():
        cap = candidate
        break
    # Release handles that failed to open before trying the next index.
    candidate.release()

if cap is None:
    hands.close()
    # Raise instead of calling exit(): inside a notebook exit() terminates the
    # kernel rather than just stopping this cell.
    raise RuntimeError("No camera available.")

print(f"Using camera with index: {index}")

# try/finally guarantees the camera, the MediaPipe graph and the preview
# window are released even when the loop is stopped by an exception or a
# kernel interrupt.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # The hand detector is given an RGB copy; `frame` stays BGR for drawing.
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        results = hands.process(image)

        if results.multi_hand_landmarks:
            h, w, _ = frame.shape
            for hand_landmarks in results.multi_hand_landmarks:
                # Bounding box of all landmarks, scaled to pixel coordinates.
                x_min, x_max, y_min, y_max = w, 0, h, 0
                for lm in hand_landmarks.landmark:
                    x, y = int(lm.x * w), int(lm.y * h)
                    x_min, x_max = min(x_min, x), max(x_max, x)
                    y_min, y_max = min(y_min, y), max(y_max, y)

                # Pad the box by 20 px on each side, clamped to the frame.
                x_min = max(0, x_min - 20)
                x_max = min(w, x_max + 20)
                y_min = max(0, y_min - 20)
                y_max = min(h, y_max + 20)

                hand_image = frame[y_min:y_max, x_min:x_max]

                symbol = predict(model, hand_image)
                print(f'Predicted Symbol: {symbol}')

                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                # Keep the label inside the frame when the box touches the top edge.
                label_y = max(y_min - 10, 30)
                cv2.putText(frame, symbol, (x_min, label_y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        cv2.imshow('Frame', frame)

        # Press 'q' in the preview window to stop.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


Using camera with index: 0




Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: C
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: C
Predicted Symbol: C
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: C
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
Predicted Symbol: B
