In [None]:
import cv2
import numpy as np
import onnxruntime as ort

from collections import deque
import onnx
import urllib.request

In [None]:


# Download the label map: one class name per line. The position of a name in
# `labels` is what the inference loop indexes with the model's argmax.
url = "https://raw.githubusercontent.com/deepmind/kinetics-i3d/master/data/label_map.txt"

# The context manager closes the HTTP response even if read/decode fails, and
# the timeout makes an unreachable host raise instead of hanging the kernel.
with urllib.request.urlopen(url, timeout=30) as response:
    lines = response.read().decode("utf-8").splitlines()

labels = [line.strip() for line in lines]
print(labels[0:5])

['abseiling', 'air drumming', 'answering questions', 'applauding', 'applying cream']


In [12]:
# Read the ONNX graph from disk and list every metadata key/value pair stored
# in it.
# NOTE(review): this path is not the file the inference session is created
# from - confirm that inspecting a different model here is intended.
metadata_model_path = "models/resnet_mixed-resnet-mixed-convolution-float.onnx"
model = onnx.load(metadata_model_path)
for entry in model.metadata_props:
    print(f"{entry.key}: {entry.value}")

In [20]:
# Create the ONNX Runtime session that the rest of the notebook runs
# inference with.
# Alternative model files (point `model_path` at one of these to use it):
#   models/resnet_mixed-resnet-mixed-convolution-float.onnx
#   models/resnet_2plus1d-resnet-2plus1d-float.onnx
model_path = "models/video_mae.onnx"
session = ort.InferenceSession(model_path)

In [21]:
# Print the name, shape and element type of every input the model declares.
# The loop variable is deliberately not called `input`: at notebook scope that
# would rebind the builtin `input()` for every cell executed afterwards.
for model_input in session.get_inputs():
    print("Name:", model_input.name)
    print("Shape:", model_input.shape)
    print("Type:", model_input.type)

Name: video
Shape: [1, 3, 16, 224, 224]
Type: tensor(float)


In [22]:
def preprocess_frame(frame, size=(224, 224)):
    """Resize one frame and convert it to a channels-first float32 array.

    Args:
        frame: Image array with three axes, channels last (the transpose
            below requires exactly that layout).
        size: ``(width, height)`` passed to ``cv2.resize``. The default
            matches the 224x224 spatial size of the model input printed by
            the inspection cell; pass a different size for a model with
            another input resolution.

    Returns:
        float32 array of shape ``(channels, size[1], size[0])`` whose values
        are the resized pixel values divided by 255.

    NOTE(review): frames from ``cv2.VideoCapture`` are BGR and no channel
    swap or mean/std normalisation is applied here - confirm this matches
    the preprocessing the model was exported with.
    """
    resized = cv2.resize(frame, size)
    scaled = resized.astype(np.float32) / 255.0
    # Channels last -> channels first: [H, W, C] -> [C, H, W].
    return np.transpose(scaled, (2, 0, 1))

# Live activity recognition on the default webcam.
#
# Each captured frame is preprocessed and appended to `frame_buffer`. Once
# CLIP_LENGTH frames have accumulated they are stacked into a single clip,
# run through the model, and the predicted label is drawn on the preview for
# the next `display_duration` frames. Press 'q' in the preview window to stop.

CLIP_LENGTH = 16         # frames per clip; third axis of the model input shape
PREDICTION_HISTORY = 20  # number of most recent predictions to keep

# `prediction_buffer` was never defined in this notebook, so on a fresh kernel
# the loop raised NameError at the first prediction.
# NOTE(review): maxlen=20 is an assumption based on the 20 entries in the
# saved output of this cell - confirm the intended history length.
prediction_buffer = deque(maxlen=PREDICTION_HISTORY)

frame_buffer = []

cap = cv2.VideoCapture(0)

last_activity = ""
activity_counter = 0
display_duration = 30  # Number of frames to keep the text

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # No frame delivered (camera unavailable or stream ended).
            break

        frame_buffer.append(preprocess_frame(frame))

        if len(frame_buffer) == CLIP_LENGTH:
            # [C, H, W] frames -> [C, T, H, W] -> [1, C, T, H, W]
            video = np.stack(frame_buffer, axis=1)
            video = np.expand_dims(video, axis=0)

            outputs = session.run(None, {"video": video.astype(np.float32)})
            prediction = np.argmax(outputs[0])
            last_activity = labels[prediction]
            activity_counter = display_duration
            prediction_buffer.append(last_activity)
            # Start collecting the next, non-overlapping clip.
            frame_buffer = []

        # Show text if counter > 0
        if activity_counter > 0:
            cv2.putText(frame, f"Activity: {last_activity}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
            activity_counter -= 1

        cv2.imshow("Activity Recognition", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the preview window, including when the
    # loop raises or the kernel is interrupted; otherwise the device stays
    # locked until the kernel is restarted.
    cap.release()
    cv2.destroyAllWindows()

print("Prediction buffer:", list(prediction_buffer))

Prediction buffer: ['cracking neck', 'whistling', 'whistling', 'whistling', 'whistling', 'whistling', 'eating chips', 'answering questions', 'whistling', 'trimming or shaving beard', 'whistling', 'whistling', 'whistling', 'cracking neck', 'cracking neck', 'cracking neck', 'playing harmonica', 'playing harmonica', 'sign language interpreting', 'cracking neck']
