In [None]:
%pip install opencv-python
%pip install numpy
%pip install torch torchvision
%pip install wget
%pip install pytorchvideo
%pip install json

In [1]:
# IMPORT ALL THE PACKAGES

import cv2
import numpy as np
import torch
import json
import time
import wget

from IPython.display import display, clear_output
from ipywidgets import Image

from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)



In [2]:
# DOWNLOAD THE MODEL

# Target device for inference; switch to "cuda" to run on a GPU
device = "cpu"

# Name of the pretrained architecture requested from torch.hub
model_name = "slowfast_r50"

# Fetch the pretrained weights, put the network in eval mode,
# then place it on the chosen device
model = (
    torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)
    .eval()
    .to(device)
)

Using cache found in C:\Users\nyok/.cache\torch\hub\facebookresearch_pytorchvideo_main


In [3]:
# LOAD THE KINETICS-400 LABELS (downloading the file first if it is missing)

kinetics_labels_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
kinetics_labels_path = "kinetics_classnames.json"

# On a fresh checkout the label file does not exist yet; fetch it once so the
# notebook survives Restart & Run All, then reuse the local copy afterwards.
try:
    labels_file = open(kinetics_labels_path, "r")
except FileNotFoundError:
    wget.download(kinetics_labels_url, kinetics_labels_path)
    labels_file = open(kinetics_labels_path, "r")

# Only the parsing needs the open file handle
with labels_file as f:
    kinetics_classnames = json.load(f)

# Raw label strings (the JSON keys), exactly as stored in the file
labels = [name.strip() for name in kinetics_classnames]

# Create an id to label name mapping, dropping the literal double quotes
# that some keys carry in the JSON file
kinetics_id_to_classname = {
    class_id: str(name).replace('"', "")
    for name, class_id in kinetics_classnames.items()
}

print(labels[0:10], np.shape(labels))

['"sharpening knives"', '"eating ice cream"', '"cutting nails"', '"changing wheel"', '"bench pressing"', 'deadlifting', '"eating carrots"', 'marching', '"throwing discus"', '"playing flute"'] (400,)


In [4]:
# TRANSFORM PARAMETER
# Spatial size: passed to ShortSideScale and used as the cv2.resize target
side_size = 256
# Per-channel normalisation statistics handed to NormalizeVideo
mean = [0.45] * 3
std = [0.225] * 3
# Number of frames per clip (subsample target and read-loop length)
num_frames = 32
# Not referenced by the other cells visible in this notebook
sampling_rate = 2
frames_per_second = 16
# Passed to PackPathway as its alpha argument
slowfast_alpha = 4

# TRANSFORMING THE VIDEO 
class PackPathway(torch.nn.Module):
    """Split a clip into the (slow, fast) pair of inputs.

    The fast pathway is the input tensor itself. The slow pathway keeps
    ``frames.shape[1] // alpha`` entries along dimension 1, picked at evenly
    spaced positions between the first and the last index.
    """

    def __init__(self, alpha=4):
        super().__init__()
        # Ratio between the number of fast-pathway and slow-pathway frames
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        """Return ``(slow_pathway, fast_pathway)`` for ``frames``."""
        total = frames.shape[1]
        # Evenly spaced positions over [0, total - 1], truncated to integers
        keep = torch.linspace(0, total - 1, total // self.alpha).long()
        slow_pathway = frames.index_select(1, keep)
        return slow_pathway, frames

# Preprocessing steps, applied in this order to the clip tensor
_video_steps = [
    UniformTemporalSubsample(num_frames),
    Lambda(lambda clip: clip / 255.0),  # rescale pixel values by 1/255
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    PackPathway(slowfast_alpha),  # final step yields (slow, fast)
]

# Run the composed steps on the "video" entry of the input dict
transform = ApplyTransformToKey(key="video", transform=Compose(_video_steps))

In [5]:
# MAIN PRE-PROCESSING FUNCTION

def process_video(use_webcam=False, video_path=None):
    """Run action recognition on a webcam stream or a video file.

    Frames are read in clips of ``num_frames``. Each clip is converted to a
    tensor, passed through ``transform`` and ``model``, and the names of the
    five highest-scoring classes are drawn on the last frame of the clip,
    which is shown in an OpenCV window. The loop stops when the source cannot
    deliver a full clip or when Esc is pressed.

    Args:
        use_webcam: If True, read from camera index 0. Takes precedence over
            ``video_path``.
        video_path: Path of a video file to read when ``use_webcam`` is False.

    Returns:
        None. Errors opening or reading the source are reported with ``print``.
    """
    if use_webcam:
        capture = cv2.VideoCapture(0)
    elif video_path:
        capture = cv2.VideoCapture(video_path)
    else:
        print("Error: Please specify either use_webcam=True or provide a video_path.")
        return

    # Built once: the softmax layer does not change between clips
    post_act = torch.nn.Softmax(dim=1)

    try:
        # Checked inside the try block so the capture object is released
        # even when the source failed to open
        if not capture.isOpened():
            print("Error: Could not open video source.")
            return

        while True:
            # Collect one clip worth of RGB frames
            frames = []
            for _ in range(num_frames):
                ret, img = capture.read()
                if not ret:
                    print("Error: Could not read frame.")
                    break
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, (side_size, side_size))
                frames.append(img)

            # An incomplete clip means the source is exhausted or failed
            if len(frames) < num_frames:
                break

            # Stack to (T, H, W, C) and reorder to (C, T, H, W)
            video_tensor = torch.from_numpy(np.stack(frames)).permute(3, 0, 1, 2).float()

            # Apply the transform to normalize the input
            video_data = transform({"video": video_tensor})

            # Add a batch dimension and move both pathways to the model's
            # device; without this a model on "cuda" receives CPU tensors
            inputs = [pathway.unsqueeze(0).to(device) for pathway in video_data["video"]]

            # Forward pass and class probabilities, without tracking gradients
            with torch.no_grad():
                preds = post_act(model(inputs))

            # Ids of the five most probable classes for the single batch item
            top_ids = preds.topk(k=5).indices[0].tolist()

            # Map the predicted classes to the label names
            pred_class_names = [kinetics_id_to_classname[int(i)] for i in top_ids]

            # Draw the predictions on the last frame of the clip
            img_bgr = cv2.cvtColor(frames[-1], cv2.COLOR_RGB2BGR)
            for row, class_name in enumerate(pred_class_names):
                cv2.putText(
                    img_bgr,
                    f"{class_name}",
                    (20, 40 + row * 20),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6,
                    (0, 255, 0),
                    2,
                )

            # Show the video stream with predictions
            cv2.imshow('SlowFast R50 Action Recognition', img_bgr)

            # Press 'Esc' to exit
            if (cv2.waitKey(30) & 0xFF) == 27:
                break

    finally:
        capture.release()
        cv2.destroyAllWindows()

In [6]:
# RUNNING THE MODEL WITH OR WITHOUT A WEBCAM
# For webcam (reads from camera index 0; press Esc in the preview window to stop):
process_video(use_webcam=True)

# For video file:
# process_video(video_path="C:/Users/nyok/Desktop/OpenCV/Videos/diving.MP4")


KeyboardInterrupt

