In [None]:
%pip install onnxruntime opencv-python numpy

In [None]:
# Diagnostic cell: report which OpenCV installation gets imported and print
# its build configuration.
import cv2
print("cv2 module path:", cv2.__file__)
print(cv2.getBuildInformation())

In [None]:
import cv2
import numpy as np
import onnxruntime as ort
import string


def load_labels():
    """Return the ASL class names as a list.

    The list holds the 26 uppercase letters plus a "NOTHING" entry placed
    between "N" and "O", which keeps the whole list in lexicographic order
    (27 names in total).
    """
    letters = list(string.ascii_uppercase)
    # Everything from "O" onwards is shifted one slot by the extra class.
    cut = letters.index("O")
    return [*letters[:cut], "NOTHING", *letters[cut:]]


def preprocess(img: np.ndarray, input_shape):
    """
    Turn a BGR image into a float32 NCHW batch of one.

    Steps: resize to the model's spatial size, convert BGR→RGB, scale pixel
    values to [0, 1], reorder HWC → CHW and prepend a batch dimension.

    Args:
        img: Image in OpenCV's BGR, HWC layout.
        input_shape: 4-element model input shape, e.g. [1, 3, 224, 224].
            Only the last two entries (height, width) are used. An entry
            that is not an integer is treated as a dynamic axis and the
            image keeps its own size along that axis.

    Returns:
        np.ndarray of dtype float32 and shape [1, 3, h, w].
    """
    # Batch and channel entries are irrelevant for resizing.
    _, _, h, w = input_shape

    # ONNX Runtime may report dynamic axes as strings (symbolic names) or
    # None; cv2.resize needs concrete integers, so fall back to the size
    # the frame already has.
    src_h, src_w = img.shape[:2]
    target_h = int(h) if isinstance(h, (int, np.integer)) else src_h
    target_w = int(w) if isinstance(w, (int, np.integer)) else src_w

    # cv2.resize takes the target size as (width, height).
    img_resized = cv2.resize(img, (target_w, target_h))
    img_rgb = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    chw = np.transpose(img_rgb, (2, 0, 1))
    return np.expand_dims(chw, axis=0)  # [1,3,h,w]


def softmax(x: np.ndarray, axis: int = 1):
    """
    Numerically stable softmax along one axis.

    Args:
        x: Array of raw scores (logits).
        axis: Axis to normalize over. Defaults to 1, the class axis of a
            [batch, classes] array; pass 0 or -1 for a 1-D score vector.

    Returns:
        Array with the same shape as `x` whose values along `axis` are
        non-negative and sum to 1.
    """
    # Subtracting the per-slice maximum does not change the result but
    # keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)


def _draw_top_predictions(frame, probs, labels, top_k=3, min_prob=0.05):
    """Overlay the `top_k` most probable classes on `frame` (modified in place).

    Args:
        frame: Image to draw on.
        probs: 1-D array of class probabilities.
        labels: Class names indexed by class id; ids beyond the list are
            shown as "Class <id>".
        top_k: Maximum number of predictions to show.
        min_prob: Predictions at or below this probability are hidden.
    """
    top_indices = np.argsort(probs)[::-1][:top_k]

    y_offset = 30
    for idx in top_indices:
        prob = float(probs[idx])
        if prob > min_prob:
            label_text = labels[idx] if idx < len(labels) else f"Class {idx}"
            text = f"{label_text}: {prob*100:.1f}%"
            cv2.putText(
                frame,
                text,
                (10, y_offset),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.8,
                (0, 255, 0),
                2,
            )
            # Only advance for lines actually drawn so the overlay has no gaps.
            y_offset += 30


def main():
    """Run live ASL classification on the default webcam until 'q' is pressed.

    Loads the ONNX model on the CPU provider, then for every captured frame
    preprocesses it, runs inference, and shows the frame annotated with up to
    three predictions above 5% probability. The webcam and the display window
    are released even if the loop exits through an exception or an interrupt.
    """
    # Model path - adjust this to your model location
    model_path = "./models/asl-v2.onnx"

    # Load class names
    labels = load_labels()

    # Create ONNX runtime session
    print("Loading ONNX model...")
    sess = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])

    # Grab I/O metadata
    input_meta = sess.get_inputs()[0]
    input_name = input_meta.name
    input_shape = input_meta.shape  # [batch,channel,height,width]
    print(f"Model expects input shape: {input_shape}")

    # Open webcam
    print("Opening webcam...")
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Could not open webcam")
        cap.release()
        return

    print("Press 'q' to quit")

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Failed to capture frame")
                break

            # Draw on a copy so the captured frame itself stays untouched.
            display_frame = frame.copy()

            input_tensor = preprocess(frame, input_shape)

            raw_outputs = sess.run(None, {input_name: input_tensor})
            logits = raw_outputs[0]  # expected shape [1, NC]
            probs = softmax(logits)

            _draw_top_predictions(display_frame, probs[0], labels)

            cv2.imshow("ASL Detection", display_frame)

            # waitKey also services the GUI event loop; mask to the low byte
            # before comparing against the key code.
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always hand the camera back, including on errors and interrupts.
        cap.release()
        cv2.destroyAllWindows()


# Launch the webcam demo only when this code runs as the main program,
# not when it is imported.
if __name__ == "__main__":
    main()