In [1]:
# IMPORT ALL PACKAGES
import cv2
import numpy as np
import torch
import json
import time
import wget

from IPython.display import display, clear_output
from ipywidgets import Image

from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)



In [2]:
# LOAD PRETRAINED MODEL

# Inference device; switch to "cuda" to run the model on a GPU
device = "cpu"

# Name of the pretrained architecture to pull from the PyTorchVideo hub repo
model_name = "slow_r50"

# Fetch the network together with its pretrained weights
model = torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)

# Put the network in eval mode and place it on the chosen device
model = model.eval().to(device)

Using cache found in C:\Users\nyok/.cache\torch\hub\facebookresearch_pytorchvideo_main


In [3]:
# LOAD KINETICS-400 LABELS
# The JSON file maps each class name to its integer class id. It is downloaded
# only when it is not already on disk, so the cell runs on a fresh checkout
# without fetching the file again on every execution.
kinetics_labels_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
kinetics_labels_path = "kinetics_classnames.json"

try:
    with open(kinetics_labels_path, "r") as f:
        kinetics_classnames = json.load(f)
except FileNotFoundError:
    # First run: fetch the label file, then read it
    wget.download(kinetics_labels_url, kinetics_labels_path)
    with open(kinetics_labels_path, "r") as f:
        kinetics_classnames = json.load(f)

# Raw class names exactly as stored in the file (some keep their surrounding quotes)
labels = [line.strip() for line in kinetics_classnames]

# Create an id to label name mapping, dropping the embedded double quotes
kinetics_id_to_classname = {
    class_id: str(class_name).replace('"', "")
    for class_name, class_id in kinetics_classnames.items()
}

print(labels[0:10], np.shape(labels))

['"sharpening knives"', '"eating ice cream"', '"cutting nails"', '"changing wheel"', '"bench pressing"', 'deadlifting', '"eating carrots"', 'marching', '"throwing discus"', '"playing flute"'] (400,)


In [4]:
# Preprocessing settings. Note that these values and the transform built from
# them are specific to the slow_R50 model.

# Temporal sampling
num_frames = 8
sampling_rate = 8
frames_per_second = 16

# Spatial size
side_size = 256
crop_size = 256

# Per-channel normalization statistics (applied after scaling by 1/255)
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]

# Steps applied, in order, to the tensor stored under the "video" key
video_steps = [
    UniformTemporalSubsample(num_frames),
    Lambda(lambda x: x/255.0),
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size=(crop_size, crop_size)),
]
transform = ApplyTransformToKey(key="video", transform=Compose(video_steps))

# The duration of the input clip is also specific to the model.
clip_duration = (num_frames * sampling_rate) / frames_per_second

In [5]:
# REAL-TIME ACTION RECOGNITION FROM THE WEBCAM
# Repeatedly grab a short clip from the webcam, classify it with the model and
# show the last frame of the clip annotated with the top predictions.


def _read_clip(capture, clip_len, frame_size):
    """Read `clip_len` consecutive frames from `capture`.

    Each frame is converted from BGR to RGB and resized to
    `frame_size` x `frame_size`.

    Returns:
        The list of frames, or None as soon as a frame cannot be read.
    """
    clip = []
    for _ in range(clip_len):
        ret, img = capture.read()
        if not ret:
            print("Error: Could not read frame.")
            return None
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        clip.append(cv2.resize(img, (frame_size, frame_size)))
    return clip


def _top_class_names(clip, post_act, k=5):
    """Classify one clip and return the names of its `k` highest-scoring classes.

    Args:
        clip: list of frames as produced by `_read_clip`.
        post_act: activation applied to the raw model output before ranking.
        k: number of class names to return.
    """
    # Stack frames as (T, H, W, C) and reorder to (C, T, H, W)
    video_tensor = torch.tensor(np.array(clip)).permute(3, 0, 1, 2).float()

    # Apply the transform to normalize the input
    video_data = transform({"video": video_tensor})

    # Move the inputs to the desired device
    inputs = video_data["video"].to(device)

    # Pass the input through the model with a leading batch dimension
    with torch.no_grad():
        preds = model(inputs[None, ...])

    # Convert to class probabilities and keep the k best class ids
    pred_classes = post_act(preds).topk(k=k).indices[0]

    # Map the predicted classes to the label names
    return [kinetics_id_to_classname[int(i)] for i in pred_classes]


capture = cv2.VideoCapture(0)
if not capture.isOpened():
    capture.release()
    # Raise instead of calling exit(): exit() is meant to end the interpreter
    # session, whereas an exception stops only this cell and reports the reason.
    raise RuntimeError("Error: Could not open webcam.")

# Softmax over the class dimension; built once rather than on every iteration
post_act = torch.nn.Softmax(dim=1)

# Start timing before the first iteration so the first reading is meaningful;
# perf_counter() is a monotonic clock meant for measuring intervals.
prev_time = time.perf_counter()

try:
    while True:
        frames = _read_clip(capture, num_frames, side_size)
        if frames is None:
            break

        # Rate at which annotated frames are shown: one per loop iteration
        current_time = time.perf_counter()
        elapsed = current_time - prev_time
        fps = 1 / elapsed if elapsed > 0 else 0.0  # guard against a zero interval
        prev_time = current_time

        pred_class_names = _top_class_names(frames, post_act)

        # Display the prediction on the last frame of the clip
        img_bgr = cv2.cvtColor(frames[-1], cv2.COLOR_RGB2BGR)
        for i, class_name in enumerate(pred_class_names):
            cv2.putText(img_bgr, f"{class_name}", (20, 40 + i * 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        cv2.putText(img_bgr, f"FPS: {int(fps)}", (20, 150), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 3)

        # Show the video stream with predictions
        cv2.imshow('Slow Model Action Recognition', img_bgr)

        # Press 'Esc' to exit
        if cv2.waitKey(30) & 0xFF == 27:
            break

finally:
    # Always free the camera and close the window, even on error or interrupt
    capture.release()
    cv2.destroyAllWindows()

NameError: name 'current_time' is not defined