In [1]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision.io import read_image
import torchvision.transforms as transforms
import torchvision.transforms.functional as fn
import torchvision.ops as ops

from datetime import datetime
from torch.utils.tensorboard import SummaryWriter

from PIL import Image

import pandas as pd
import numpy as np
import matplotlib.pyplot as mt
from torch.autograd import Variable


torch.autograd.set_detect_anomaly(True)
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(device)

cuda


Loading the data into dataframes

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Replacing class labels with numbers

In [3]:
train_path = '/content/drive/My Drive/ai-ml-round3/train/'
val_path = '/content/drive/My Drive/ai-ml-round3/valid/'
test_path = '/content/drive/My Drive/ai-ml-round3/test/'
train_set = pd.read_csv(train_path + '_annotations.csv')
val_set = pd.read_csv(val_path + '_annotations.csv')
test_set = pd.read_csv(test_path + '_annotations.csv')


train_set['class'].replace({'Rock': 1, 'Paper': 2, 'Scissors': 3}, inplace= True)
val_set['class'].replace({'Rock': 1, 'Paper': 2, 'Scissors': 3}, inplace= True)
test_set['class'].replace({'Rock': 1, 'Paper': 2, 'Scissors': 3}, inplace= True)

Adding all the blank images missing from the annotations files. I've decided to use 0,0,0,0 as the bounding box coordinates for no image, not sure if that's a good idea or not. (After training and checking the accuracy, I noticed that it liked predicting None class way too much, so i removed the data again) However, I've removed images with duplicate entries, so that the model doesnt get confused, it isn't complex enough to detect two hands at the same time :(

In [None]:
# from os import listdir



# for images in os.listdir(train_path):

#   # check if the image ends with png
#   if (images.endswith(".jpg")):

#     if(images not in train_set.values):
#       dic = pd.DataFrame({'filename':images, 'width': 640, 'height': 640, 'class':0, 'xmin':0, 'ymin':0, 'xmax':0,'ymax':0}, index=[0])
#       train_set = pd.concat([train_set,dic], ignore_index = True)

# for images in os.listdir(val_path):
#   # check if the image ends with png
#   if (images.endswith(".jpg")):
#     if(images not in val_set.filename.values):
#       dic = pd.DataFrame({'filename':images, 'width': 640, 'height': 640, 'class':0, 'xmin':0, 'ymin':0, 'xmax':0,'ymax':0}, index=[0])
#       val_set = pd.concat([val_set,dic], ignore_index = True)

# for images in os.listdir(test_path):
#   # check if the image ends with png
#   if (images.endswith(".jpg")):
#     if(images not in test_set.filename.values):
#       dic = pd.DataFrame({'filename':images, 'width': 640, 'height': 640, 'class':0, 'xmin':0, 'ymin':0, 'xmax':0,'ymax':0}, index=[0])
#       test_set = pd.concat([test_set,dic], ignore_index = True)


Dropped images with multiple hands to reduce the complexity of the task

In [4]:
train_set.drop_duplicates(subset = ['filename'], keep = False, inplace = True)
val_set.drop_duplicates(subset = ['filename'], keep = False, inplace = True)
test_set.drop_duplicates(subset = ['filename'], keep = False, inplace = True)

train_set.tail()
# train_set.drop(['Unnamed: 0'], axis = 1, inplace = True)
# val_set.drop([ 'Unnamed: 0'], axis = 1, inplace = True)
# test_set.drop([ 'Unnamed: 0'], axis = 1, inplace = True)
train_set.to_csv(train_path + '_annotations.csv', index = False)
val_set.to_csv(val_path + '_annotations.csv', index = False)
test_set.to_csv(test_path + '_annotations.csv', index = False)


In [5]:
train_set.head()

Unnamed: 0.1,Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,0,zoom_tests_RockPaperScissors-mohamed_mp4-67_jp...,640,640,1,12,0,141,250
1,1,egohands-public-1625070649053_png_jpg.rf.000a5...,640,640,1,153,407,485,637
2,2,zoom_tests_RockPaperScissors-mohamed_mp4-111_j...,640,640,2,0,227,233,640
3,5,egohands-public-1626201626932_png_jpg.rf.001f7...,640,640,3,18,240,277,458
4,8,IMG_7043_MOV-71_jpg.rf.0059209d24d383f7ac8f635...,640,640,1,255,317,609,566


Defining the Dataset class. Its purpose is to define a way to get the image and its labels for the DataLoader object to form batches with.

In [5]:
class ImageDataset(Dataset):
    def __init__(self, dir, dataset, transform=None, target_transform=None):
        self.dataset = dataset
        self.img_dir = dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.dataset.axes[0])

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.dataset.iloc[idx, self.dataset.columns.get_loc("filename")])
        img = Image.open(img_path)

        transform = transforms.Compose([
            transforms.ToTensor(),

        ])
        image = transform(img)
        image = fn.resize(image, [200])
        nplabel = self.dataset.iloc[idx, 4:8].values
        class_label = int(self.dataset.iloc[idx, 3])

        box_label = torch.from_numpy(nplabel.astype(float)) /640

        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, box_label, class_label

Defining the object detection model
The purpose of this is to predict a bounding box around the hand, which will be classified by the classifier model
The model is comprised of 2 sets of 3 convolutional layers with maxpooling, and then three fully connected layers

In [6]:
class Object_Detect(nn.Module):
  def __init__(self, ):
    super(Object_Detect, self).__init__()

    self.conv1_1 = nn.Conv2d(in_channels= 3, out_channels= 10, kernel_size= (3,3), padding = 1)
    nn.init.xavier_uniform_(self.conv1_1.weight)
    self.conv1_2 = nn.Conv2d(in_channels= 10, out_channels= 15,kernel_size= (3,3), padding = 1)
    nn.init.xavier_uniform_(self.conv1_2.weight)
    self.conv1_3 = nn.Conv2d(in_channels= 15, out_channels= 20, kernel_size= (3,3), padding = 1)
    nn.init.xavier_uniform_(self.conv1_3.weight)
    self.pool1 = nn.MaxPool2d(kernel_size=2)

    self.conv2_1 = nn.Conv2d(in_channels= 20, out_channels= 30, kernel_size= (3,3), padding= 1)
    torch.nn.init.xavier_uniform_(self.conv2_1.weight)
    self.conv2_2 = nn.Conv2d(in_channels= 30, out_channels= 30, kernel_size= (3,3), padding = 1)
    torch.nn.init.xavier_uniform_(self.conv2_2.weight)
    self.conv2_3 = nn.Conv2d(in_channels= 30, out_channels= 35, kernel_size= (3,3), padding = 1)
    torch.nn.init.xavier_uniform_(self.conv2_3.weight)
    self.pool2 = nn.MaxPool2d(kernel_size=2)

    self.flatten = nn.Flatten()


    self.bboxfc1 = nn.Linear(87500,128)
    self.bboxfc2 = nn.Linear(128,64)
    self.bboxfc3 = nn.Linear(64,32)
    self.bboxfc4 = nn.Linear(32,4)

    self.classfc1 = nn.Linear(87500,512)
    self.dropout1 = nn.Dropout(0.2)
    self.classfc2 = nn.Linear(512,512)
    self.dropout2 = nn.Dropout(0.2)
    self.classfc3 = nn.Linear(512, 4)



  def forward(self, x):
    out = F.relu(self.conv1_1(x))
    out = F.relu(self.conv1_2(out))
    out = F.relu(self.conv1_3(out))
    out = self.pool1(out)

    out = F.relu(self.conv2_1(out))
    out = F.relu(self.conv2_2(out))
    out = F.relu(self.conv2_3(out))
    out = self.pool2(out)

    out = self.flatten(out)

    reg_out = F.relu(self.bboxfc1(out))
    reg_out = F.relu(self.bboxfc2(reg_out))
    reg_out = F.relu(self.bboxfc3(reg_out))
    reg_out = self.bboxfc4(reg_out)

    class_out = F.relu(self.classfc1(out))
    class_out = self.dropout1(class_out)
    class_out = F.relu(self.classfc2(class_out))
    class_out = self.dropout2(class_out)
    class_out = F.softmax(self.classfc3(class_out))



    return reg_out, class_out


Function to train the model for one epoch and report the loss regularly

In [10]:


def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.
    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(train_loader):
        # Every data instance is an input + label pair
        inputs, labels, class_label = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        class_label =class_label.long().to(device)
        # Zero your gradients for every batch!
        object_optim.zero_grad()

        # Make predictions for this batch
        box_output, class_output = detection_model(inputs)


        # Compute the loss and its gradients
        loss =  class_l(class_output, class_label) + box_l(box_output, labels.to(torch.float32))
        loss.backward()

        # Adjust learning weights
        object_optim.step()

        # Gather data and report
        running_loss += loss.item()
        print(i)
        if i % 50 == 49:
            last_loss = running_loss / 50 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss




Training loop. The model is trained for an epoch and then tested against the validation images. The model with the lowest loss is saved for later use by writing its state_dict to my drive


In [8]:
#defining the models
detection_model = Object_Detect().to(device)

#added this to use the saved state from previous runs
detection_model.load_state_dict(torch.load('/content/drive/My Drive/ai-ml-round3/model_state_dict',map_location=device))

<All keys matched successfully>

In [12]:
learning_rate = 0.001
num_epochs = 30
batch_size = 32


#defining loss
box_l = nn.MSELoss()
class_l = nn.CrossEntropyLoss()


#optimisers
object_optim = optim.Adam(detection_model.parameters(), lr= learning_rate)


# Load dataset using DataLoader
training_data = ImageDataset(dir = train_path,dataset=train_set)
train_loader = DataLoader(training_data, batch_size=batch_size, shuffle=True)

validation_data = ImageDataset(dir = val_path,dataset=val_set)
val_loader = DataLoader(validation_data, batch_size=batch_size, shuffle=True)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))

epoch_number = 0

EPOCHS = 30

best_vloss = 1.3

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    detection_model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer)


    running_vloss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    detection_model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, vdata in enumerate(val_loader):
            vinputs, vlabels, class_label = vdata
            vinputs = vinputs.to(device)
            vlabels = vlabels.to(device)
            class_label = class_label.to(device)
            vboutput, vcoutput = detection_model(vinputs)
            vloss = class_l(vcoutput, class_label) +  box_l(vboutput,vlabels.to(torch.float32))
            running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = '/content/drive/My Drive/ai-ml-round3/model_state_dict'
        torch.save(detection_model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:


KeyboardInterrupt: 

In [None]:
# model_path = '/content/drive/My Drive/ai-ml-round3/model_state_dict'
# torch.save(detection_model.state_dict(), model_path)

Testing the data on the test images, just to see how the model is performing

In [13]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def show_test_output(image, box_pred, class_pred):
  xmin, ymin, xmax, ymax = box_pred[0]*200
  print(box_pred[0]*200)
  xmin = round(xmin.item())
  ymin = round(ymin.item())
  xmax = round(xmax.item())
  ymax = round(ymax.item())
  pred = class_pred[0].detach()
  print("class:",np.argmax(pred), " confidence:", max(pred))

  plt.imshow(image[0].permute(1,2,0))
  ax = plt.gca()
  rect = patches.Rectangle((xmin,ymin),xmax-xmin,ymax-ymin,linewidth=1,edgecolor='r',facecolor='none')
  ax.add_patch(rect)

  plt.show()

In [14]:
test_data = ImageDataset(dir = test_path, dataset = test_set)
test_loader = DataLoader(test_data, batch_size = 1, shuffle = True)

for idx, data in enumerate(test_loader):
  image, label, class_label = data
  image = image.to(device)
  # label = label.to(device)
  # class_label = class_label.to(device)
  print(image.shape)
  boxo,classo = detection_model(image)
  boxo = boxo.to(torch.device('cpu'))
  classo = classo.to(torch.device('cpu'))
  show_test_output(image.to(torch.device('cpu')), boxo, classo)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Configuring the model to run with webcam input. This is the first step to making it run on live video feed

In [15]:
# import dependencies
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time
import matplotlib.pyplot as plt
from PIL import Image as Im


#defining the models
detection_model = Object_Detect().to(device)

#added this to use the saved state from previous runs
detection_model.load_state_dict(torch.load('/content/drive/My Drive/ai-ml-round3/model_state_dict',map_location=device))


# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Params:
          js_reply: JavaScript object containing image from webcam
  Returns:
          img: OpenCV BGR image
  """
  # decode base64 image
  image_bytes = b64decode(js_reply.split(',')[1])
  # convert bytes to numpy array
  jpg_as_np = np.frombuffer(image_bytes, dtype=np.uint8)
  # decode numpy array into OpenCV BGR image
  img = cv2.imdecode(jpg_as_np, flags=1)

  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


  return img

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def bbox_to_bytes(bbox_array):
  """
  Params:
          bbox_array: Numpy array (pixels) containing rectangle to overlay on video stream.
  Returns:
        bytes: Base64 image byte string
  """
  # convert array into PIL image
  bbox_PIL = Im.fromarray(bbox_array, 'RGBA')
  iobuf = io.BytesIO()
  # format bbox into png for return
  bbox_PIL.save(iobuf, format='png')
  # format return string
  bbox_bytes = 'data:image/png;base64,{}'.format((str(b64encode(iobuf.getvalue()), 'utf-8')))

  return bbox_bytes

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [16]:
def take_photo(filename='photo.jpg', quality=0.8):
  js = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
  display(js)

  # get photo data
  data = eval_js('takePhoto({})'.format(quality))
  # get OpenCV format image
  img= js_to_image(data)
  img = cv2.resize(img, (200,200), interpolation = cv2.INTER_AREA)
  input = transforms.ToTensor()(img)
  input = torch.stack([input])

  input = input.to(device)
  # get classes and bounding box coordinates
  boxo, classo = detection_model(input)
  classo = classo.detach()
  boxo = boxo.to(torch.device('cpu'))
  classo = classo.to(torch.device('cpu'))
  # draw face bounding box on image
  print(np.argmax(classo[0]),max(classo[0]))
  for (x1,y1,x2,y2) in boxo:
    x1,x2 = int(x1*200),int(x2*200)
    y1,y2 = int(y1*200),int(y2*200)
    img = cv2.rectangle(img,(x1,y1),(x2,y2),(255,0,0),2)
  # # save image
  cv2.imwrite(filename, img)

  return filename

In [None]:
try:
  filename = take_photo('photo.jpg')
  print('Saved to {}'.format(filename))

  # Show the image which was just taken.
  display(Image(filename))
except Exception as err:
  # Errors will be thrown if the user does not have a webcam or if they do not
  # grant the page permission to access it.
  print(str(err))

Configuring the model to run with a video stream

In [17]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;

    var pendingResolve = null;
    var shutdown = false;

    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }

    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 200, 200);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }

    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);

      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);

      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);

      const instruction = document.createElement('div');
      instruction.innerHTML =
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };

      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 200; //video.videoWidth;
      captureCanvas.height = 200; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);

      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();

      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }

      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }

      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;

      return {'create': preShow - preCreate,
              'show': preCapture - preShow,
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  display(js)

def video_frame(label, bbox):
  data = eval_js('stream_frame("{}", "{}")'.format(label, bbox))
  return data

In [19]:
#defining the models
detection_model = Object_Detect().to(device)

#added this to use the saved state from previous runs
detection_model.load_state_dict(torch.load('/content/drive/My Drive/ai-ml-round3/model_state_dict',map_location=device))

# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialze bounding box to empty
bbox = ''
count = 0
while True:
    js_reply = video_frame(label_html, bbox)
    if not js_reply:
        break

    # convert JS response to OpenCV Image
    img = js_to_image(js_reply["img"])

    # # create transparent overlay for bounding box
    bbox_array = np.zeros([200,200,4], dtype=np.uint8)

    # # get face region coordinates
    # faces = face_cascade.detectMultiScale(gray)
    # # get face bounding box for overlay
    # for (x,y,w,h) in faces:
    #   bbox_array = cv2.rectangle(bbox_array,(x,y),(x+w,y+h),(255,0,0),2)

    img = cv2.resize(img, (200,200), interpolation = cv2.INTER_AREA)
    input = transforms.ToTensor()(img)
    input = torch.stack([input])

    input = input.to(device)
    # get classes and bounding box coordinates
    boxo, classo = detection_model(input)
    classo = classo.detach()
    boxo = boxo.to(torch.device('cpu'))
    classo = classo.to(torch.device('cpu'))
    # draw face bounding box on image
    dclass = np.argmax(classo[0]).item()
    color = (0,0,0)
    keys = {0:'None',1:'Rock', 2:'Paper',3:'Scissors'}
    op = '{cl}, {pred:0.2f}'.format(cl = keys[dclass], pred = max(classo[0]).item())
    if(dclass == 1):
      color = (255,0,0)
    elif(dclass == 2):
      color = (0,255,0)
    elif(dclass == 3):
      color = (0,0,255)
    for (x1,y1,x2,y2) in boxo:
      x1,x2 = int(x1*200),int(x2*200)
      y1,y2 = int(y1*200),int(y2*200)
      bbox_array = cv2.rectangle(bbox_array,(x1,y1),(x2,y2),color,2)
      bbox_array = cv2.putText(bbox_array, op, (10,10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255),1,cv2.LINE_AA)
    bbox_array[:,:,3] = (bbox_array.max(axis = 2) > 0 ).astype(int) * 255
    # convert overlay of bbox into bytes
    bbox_bytes = bbox_to_bytes(bbox_array)
    # update bbox so next frame gets new overlay
    bbox = bbox_bytes

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In the start, I had attempted to create two neural networks, one to determine a bounding box around the hand, and the other to classify a cropped image of the bounding box as rock paper or scissors. However, this idea fell apart very quickly when i realised small bounding box innacuracies would lead to many mistakes in classification.





I changed up the approach to be similar to that used in vgg, resnet and yolo, where a CNN acts as the backbone to the network, and then splits into two neural networks, tasked with regression and classification.

I tried different loss functions and settled on Cross Entropy for classification and Mean Square Error for bounding box prediction. I had experimented with using Box IOU as a metric, but pytorch was giving me an error wrt the gradient calculation that I wasnt able to fix.

I also attempted to include the images without any hands in them, but this skewed my result and messed up the accuracy of the model a lot. I suspect that its due to there being images of hands, which arent rock, paper or scissors; getting trained to find the bounding box as (0,0),(0,0). This could be fixed by annotating the classless images properly for bounding box at least.