In [None]:
# Install necessary libraries
!pip uninstall -y mediapipe
!pip install mediapipe
!pip install mediapipe kaggle
!pip install --upgrade numpy
!pip install --upgrade torch torchvision

import os
import cv2
import glob
import json
import zipfile
import shutil
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm  # For progress bars

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from torchvision.datasets import ImageFolder
from PIL import Image

import mediapipe as mp

# Choose the compute device: CUDA when available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# MediaPipe Hands detector shared by the preprocessing helpers below
# (static-image mode, at most one hand, detection confidence threshold 0.3)
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.3)

In [None]:
# Set up Kaggle API
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download and unzip the dataset
print("Downloading dataset...")
!kaggle datasets download -d prathumarikeri/indian-sign-language-isl
print("Unzipping dataset...")
!unzip -q indian-sign-language-isl.zip

# Define dataset paths
DATASET_PATH = 'Indian'
PROCESSED_PATH = 'Processed_Dataset'

# Clean up old processed data if it exists
if os.path.exists(PROCESSED_PATH):
    shutil.rmtree(PROCESSED_PATH)

os.makedirs(PROCESSED_PATH, exist_ok=True)
print("Setup complete.")

In [None]:

def get_hand_bbox(image, padding=30):
    """
    Find the hand in a BGR image and return a padded bounding box.

    Uses the module-level MediaPipe ``hands`` detector.

    Args:
        image: Image array in BGR channel order (as produced by ``cv2.imread``),
            or ``None`` when the file could not be read.
        padding: Margin in pixels added on every side of the landmark extent.

    Returns:
        ``(x_min, y_min, x_max, y_max)`` in pixel coordinates, clipped to the
        image bounds, or ``None`` when the image is missing/empty, no hand is
        detected, or the clipped box has no area.
    """
    # An unreadable file reaches us as None (or an empty array); bail out here
    # instead of letting the colour conversion below raise.
    if image is None or getattr(image, "size", 0) == 0:
        return None

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)

    if not results.multi_hand_landmarks:
        return None  # No hand found

    # Only the first detected hand is used
    landmarks = results.multi_hand_landmarks[0].landmark

    # Only height and width are needed; slicing avoids depending on the channel axis
    h, w = image.shape[:2]

    # Landmark coordinates are scaled by width/height to get pixel positions
    xs = [lm.x for lm in landmarks]
    ys = [lm.y for lm in landmarks]

    # Pad the landmark extent, then clip to the image so the box is always sliceable
    x_min_px = max(0, int(min(xs) * w) - padding)
    y_min_px = max(0, int(min(ys) * h) - padding)
    x_max_px = min(w, int(max(xs) * w) + padding)
    y_max_px = min(h, int(max(ys) * h) + padding)

    # Reject degenerate boxes (zero or negative width/height after clipping)
    if x_min_px >= x_max_px or y_min_px >= y_max_px:
        return None

    return (x_min_px, y_min_px, x_max_px, y_max_px)

def preprocess_dataset(source_path, dest_path, train_split=0.8, seed=42):
    """
    Crop the hand out of every image in the dataset and split into train/val.

    ``source_path`` is expected to contain one sub-directory per class holding
    ``*.jpg`` images. For each class the images are sorted, shuffled with a
    seeded RNG and split, so the split is reproducible and does not depend on
    the arbitrary order in which the file system lists files. Crops are written
    to ``dest_path/train/<class>`` and ``dest_path/val/<class>``. Images that
    cannot be read or in which no hand is found are skipped.

    Args:
        source_path: Root directory of the raw dataset.
        dest_path: Root directory that receives the ``train``/``val`` folders.
        train_split: Fraction of each class's images assigned to training.
        seed: Seed for the shuffle applied before splitting.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    import random  # local import so the function does not rely on another cell

    # Create train and val directories
    train_dir = os.path.join(dest_path, 'train')
    val_dir = os.path.join(dest_path, 'val')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)

    # Class names are the immediate sub-directories, sorted for a stable order
    class_names = sorted(
        d for d in os.listdir(source_path)
        if os.path.isdir(os.path.join(source_path, d))
    )
    print(f"Found classes: {class_names}")

    rng = random.Random(seed)

    for class_name in class_names:
        class_path = os.path.join(source_path, class_name)

        # Sort first so the seeded shuffle starts from the same order on every machine
        images = sorted(glob.glob(os.path.join(class_path, '*.jpg')))
        rng.shuffle(images)

        # Create class-specific train/val dirs
        train_class_dir = os.path.join(train_dir, class_name)
        val_class_dir = os.path.join(val_dir, class_name)
        os.makedirs(train_class_dir, exist_ok=True)
        os.makedirs(val_class_dir, exist_ok=True)

        # Split images
        split_idx = int(len(images) * train_split)
        train_images = images[:split_idx]
        val_images = images[split_idx:]

        print(f"Processing Class {class_name}: {len(train_images)} train, {len(val_images)} val")

        saved_train = _crop_and_save(train_images, train_class_dir, f"Train {class_name}")
        saved_val = _crop_and_save(val_images, val_class_dir, f"Val {class_name}")

        # Make skipped images visible: they silently shrink the dataset otherwise
        print(f"  Saved {saved_train}/{len(train_images)} train and "
              f"{saved_val}/{len(val_images)} val crops")


def _crop_and_save(image_paths, out_dir, desc):
    """Crop the hand from each image and write it to out_dir; return how many were saved."""
    saved = 0
    for img_path in tqdm(image_paths, desc=desc):
        image = cv2.imread(img_path)
        if image is None:
            # Unreadable/corrupt file: skip it instead of aborting the whole run
            continue

        bbox = get_hand_bbox(image)
        if not bbox:
            continue

        x1, y1, x2, y2 = bbox
        cropped_hand = image[y1:y2, x1:x2]
        if cropped_hand.size > 0:
            save_path = os.path.join(out_dir, os.path.basename(img_path))
            cv2.imwrite(save_path, cropped_hand)
            saved += 1
    return saved

# Crop every raw image to its hand region and build the train/val folders
print("Starting dataset preprocessing...")
preprocess_dataset(source_path=DATASET_PATH, dest_path=PROCESSED_PATH)
print("Preprocessing finished.")

In [None]:
# --- STEP 4: STRONGER AUGMENTATION ---
INPUT_SIZE = 224
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training pipeline: random crop/zoom, flip, rotation and colour jitter,
# followed by tensor conversion and normalisation.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(INPUT_SIZE, scale=(0.6, 1.0)),  # varies zoom/position
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(30),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),  # lighting variation
    transforms.ToTensor(),
    normalize,
])

# Validation pipeline: deterministic resize only, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
    transforms.ToTensor(),
    normalize,
])

data_transforms = {'train': train_transform, 'val': val_transform}

# One ImageFolder per split, each reading from its sub-directory of PROCESSED_PATH
train_dataset, val_dataset = (
    ImageFolder(os.path.join(PROCESSED_PATH, split), data_transforms[split])
    for split in ('train', 'val')
)

# Batched loaders: only the training loader shuffles
BATCH_SIZE = 32
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
)

# Class list as discovered by ImageFolder from the training directory
class_names = train_dataset.classes
num_classes = len(class_names)
print(f"Classes: {class_names}")

In [None]:
# --- STEP 5: FINE-TUNING RESNET ---
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)

# 1. First, freeze everything
for param in model.parameters():
    param.requires_grad = False

# 2. UNFREEZE the last block (Layer 4) so it can adapt to hand-specific features
for param in model.layer4.parameters():
    param.requires_grad = True

# Replace the classifier head; the new nn.Linear is created after the freeze
# loop above, so its parameters are trainable.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, num_classes)

model = model.to(device)

# Use a lower learning rate for the pretrained block than for the new head
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam([
    {'params': model.layer4.parameters(), 'lr': 1e-4},  # Lower LR for feature layers
    {'params': model.fc.parameters(), 'lr': 1e-3}       # Higher LR for classifier
])

NUM_EPOCHS = 20
best_val_acc = 0.0
model_path = "best_gesture_model_resnet.pth"


def run_epoch(model, loader, criterion, optimizer=None, desc=None):
    """
    Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given the model is put in train mode and updated
    after every batch; otherwise the model is put in eval mode and gradients
    are disabled. Batches are moved to the module-level ``device``.

    Args:
        model: Network to train or evaluate.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer to step, or ``None`` for evaluation.
        desc: Label for the progress bar.

    Returns:
        Tuple of Python floats: sample-weighted mean loss and accuracy.

    Raises:
        ValueError: If the loader yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)

    # Plain Python numbers: no `.double()` on an int when the loader is empty,
    # and the returned metrics are ordinary floats.
    total_loss = 0.0
    total_correct = 0
    total_seen = 0

    with torch.set_grad_enabled(is_training):
        for inputs, labels in tqdm(loader, desc=desc):
            inputs = inputs.to(device)
            labels = labels.to(device)

            if is_training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            batch_size = inputs.size(0)
            # The criterion returns a batch mean; re-weight by batch size so a
            # smaller final batch does not skew the epoch average.
            total_loss += loss.item() * batch_size
            total_correct += (outputs.argmax(dim=1) == labels).sum().item()
            total_seen += batch_size

    if total_seen == 0:
        raise ValueError("Data loader produced no samples; check that the dataset is not empty.")

    return total_loss / total_seen, total_correct / total_seen


print("Starting fine-tuning...")
for epoch in range(NUM_EPOCHS):
    epoch_train_loss, epoch_train_acc = run_epoch(
        model, train_loader, criterion, optimizer,
        desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Train]",
    )
    epoch_val_loss, epoch_val_acc = run_epoch(
        model, val_loader, criterion,
        desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Val]",
    )

    print(f"Epoch {epoch+1} - Train Acc: {epoch_train_acc:.4f} | Val Acc: {epoch_val_acc:.4f}")

    # `>=` guarantees a checkpoint is written on the first epoch even at 0 accuracy
    if epoch_val_acc >= best_val_acc:
        best_val_acc = epoch_val_acc
        torch.save(model.state_dict(), model_path)

print("Training finished.")

In [None]:
# --- Colab Webcam Snippet (Fixed Imports) ---
from IPython.display import display, Javascript, Image as IPImage, clear_output
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
import time

# Set device again so this cell can also be run on its own
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Load Model ---
# Ensure this path matches where your model is saved
model_path = "best_gesture_model_resnet.pth"

# Read the checkpoint first: the size of its classifier layer tells us how many
# classes the saved model has, so we do not have to guess.
try:
    state_dict = torch.load(model_path, map_location=device)
except FileNotFoundError:
    state_dict = None

if state_dict is not None:
    num_classes = state_dict["fc.weight"].shape[0]
    try:
        if len(class_names) != num_classes:
            print(f"Warning: checkpoint has {num_classes} classes but class_names has "
                  f"{len(class_names)} entries; displayed labels may be wrong.")
    except NameError:
        print("Warning: class_names not found. Predictions will be shown as class indices.")
else:
    # No checkpoint available: fall back to class_names, then to a fixed guess
    try:
        num_classes = len(class_names)
    except NameError:
        print("Warning: class_names not found. Assuming 10 classes (0-9). Change if needed.")
        num_classes = 10  # Default fall back

# Re-create the model architecture with a head of the right size
model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, num_classes)

if state_dict is not None:
    model.load_state_dict(state_dict)
    print("Model loaded successfully.")
else:
    print("Error: Model file not found. Please make sure you ran the training step!")
    # The cell keeps going so the webcam demo can still be exercised, but say so loudly
    print("Warning: continuing with an untrained model; predictions will be meaningless.")

model = model.to(device)
model.eval()

# Define transforms for live input
# (Must match validation transforms)
INPUT_SIZE = 224
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
live_transform = transforms.Compose([
    transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
    transforms.ToTensor(),
    normalize
])

def js_to_image(js_reply):
    """
    Decode a base64 data-URL string received from Javascript into an OpenCV image.

    The text after the first comma is treated as the base64 payload.
    """
    encoded_payload = js_reply.split(',')[1]
    pixel_buffer = np.frombuffer(b64decode(encoded_payload), dtype=np.uint8)
    return cv2.imdecode(pixel_buffer, flags=cv2.IMREAD_COLOR)

def video_stream():
    """
    Inject the browser-side webcam helpers into the notebook output.

    Displays a Javascript snippet that defines ``window.google_colab_notebook``
    with four functions:

    * ``start()`` - builds the video/label elements, requests the camera and
      returns a Promise that resolves to ``true`` once the video metadata has
      loaded, or to ``false`` when the camera could not be opened.
    * ``stop()`` - stops the camera tracks and removes the elements.
    * ``capture()`` - resolves to a JPEG data URL (640x480) of the current
      frame, or to ``null`` before ``start()`` / after ``stop()``.
    * ``update_label(text)`` - sets the text shown under the video.
    """
    js = Javascript('''
    var video = null;
    var div = null;
    var stream = null;
    var captureCanvas = null;
    var labelElement = null;
    var shutdown = false;

    function startVideo() {
      shutdown = false;
      div = document.createElement('div');
      document.body.appendChild(div);

      video = document.createElement('video');
      video.style.display = 'block';
      div.appendChild(video);

      labelElement = document.createElement('div');
      labelElement.innerText = 'Initializing...';
      labelElement.style.color = 'green';
      labelElement.style.fontSize = '20px';
      labelElement.style.fontWeight = 'bold';
      div.appendChild(labelElement);

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640;
      captureCanvas.height = 480;

      // Return the promise so the caller can wait until the camera is ready
      // instead of capturing blank frames while it is still starting up.
      return navigator.mediaDevices.getUserMedia({video: true})
        .then(function(s) {
          // Only ever store the resolved MediaStream, never the pending promise,
          // so stopVideo() can always call getTracks() on it.
          stream = s;
          return new Promise(function(resolve) {
            video.onloadedmetadata = function() { resolve(true); };
            video.srcObject = s;
            video.play();
          });
        })
        .catch(function(err) {
          // Permission denied or no camera: mark as shut down so capture()
          // returns null and the caller's loop can end cleanly.
          shutdown = true;
          updateLabel('Camera error: ' + err);
          return false;
        });
    }

    function stopVideo() {
      shutdown = true;
      if (stream) {
        stream.getTracks().forEach(function(track) { track.stop(); });
        stream = null;
      }
      if (video) {
        video.srcObject = null;
      }
      if (div) {
        div.remove();
        div = null;
      }
    }

    function captureFrame() {
      if (shutdown || !captureCanvas || !video) {
        return Promise.resolve(null);
      }

      captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
      var data = captureCanvas.toDataURL('image/jpeg', 0.8);
      return Promise.resolve(data);
    }

    function updateLabel(text) {
        if (labelElement) {
            labelElement.innerText = text;
        }
    }

    window.google_colab_notebook = {
      start: startVideo,
      stop: stopVideo,
      capture: captureFrame,
      update_label: updateLabel
    };
    ''')
    display(js)

# --- Main Loop ---
import json  # used to escape the label text before it is embedded in Javascript

# 1. Init MediaPipe (video mode with tracking) before touching the camera, so a
#    failure here does not leave the webcam running
import mediapipe as mp
mp_hands = mp.solutions.hands
live_hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

# 2. Inject Javascript
video_stream()

try:
    # 3. Start Webcam (inside the try so the cleanup below always runs)
    eval_js('google_colab_notebook.start()')
    print("Webcam started...")

    while True:
        # Capture frame; a falsy reply means the browser side has shut down
        js_reply = eval_js('google_colab_notebook.capture()')
        if not js_reply:
            break

        frame = js_to_image(js_reply)
        frame = cv2.flip(frame, 1)  # horizontal flip (mirror view)

        # MediaPipe
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = live_hands.process(frame_rgb)

        prediction_text = "No Hand Detected"

        if results.multi_hand_landmarks:
            hand_landmarks = results.multi_hand_landmarks[0]

            # BBox from the landmark extent, scaled to pixels
            h, w, _ = frame.shape
            x_coords = [lm.x for lm in hand_landmarks.landmark]
            y_coords = [lm.y for lm in hand_landmarks.landmark]

            # Padding
            # NOTE(review): preprocessing crops with padding=30 while this uses 20;
            # confirm whether the two should match.
            pad = 20
            x_min = int(min(x_coords) * w) - pad
            y_min = int(min(y_coords) * h) - pad
            x_max = int(max(x_coords) * w) + pad
            y_max = int(max(y_coords) * h) + pad

            # Clip to image boundaries
            x_min = max(0, x_min)
            y_min = max(0, y_min)
            x_max = min(w, x_max)
            y_max = min(h, y_max)

            # Ignore tiny boxes
            if x_max - x_min > 10 and y_max - y_min > 10:
                # Crop and convert to the RGB PIL image the transform expects
                cropped = frame[y_min:y_max, x_min:x_max]
                cropped_rgb = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
                pil_img = Image.fromarray(cropped_rgb)

                # Predict
                input_tensor = live_transform(pil_img).unsqueeze(0).to(device)

                with torch.no_grad():
                    outputs = model(input_tensor)
                    _, predicted = torch.max(outputs, 1)

                # Get class name if available, otherwise show the raw index
                idx = predicted.item()
                if 'class_names' in globals() and idx < len(class_names):
                    label = class_names[idx]
                else:
                    label = str(idx)

                prediction_text = f"Prediction: {label}"

        # Update label in browser. json.dumps yields a quoted, escaped string
        # literal, so quotes or backslashes in a class name cannot break the
        # generated Javascript.
        eval_js(f'google_colab_notebook.update_label({json.dumps(prediction_text)})')

except KeyboardInterrupt:
    print("Stopped.")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Release the detector even if stopping the browser-side stream fails
    try:
        eval_js('google_colab_notebook.stop()')
    finally:
        live_hands.close()