In [2]:
# IMPORT ALL PACKAGES

import json
import time
from collections import deque

import cv2
import numpy as np
import torch
import wget

from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)



In [3]:
# LOAD THE PRETRAINED ACTION-RECOGNITION MODEL

# Name of the X3D variant to fetch from the PyTorchVideo hub; the same name is
# used further down to look up the matching preprocessing parameters.
model_name = "x3d_xs"

# Device on which inference runs
device = "cpu"

# Fetch the architecture with its pretrained weights, switch it to eval mode and
# place it on the chosen device in one chain.
model_recognition = (
    torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)
    .eval()
    .to(device)
)

Using cache found in C:\Users\nyok/.cache\torch\hub\facebookresearch_pytorchvideo_main


In [4]:
# LOAD THE KINETICS-400 LABELS

# One-time download of the class-name file (uncomment if it is not on disk yet):
# url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
# wget.download(url, 'kinetics_classnames.json')

with open("kinetics_classnames.json", "r") as f:
    kinetics_classnames = json.load(f)

# Class names exactly as they appear as keys in the JSON file
labels = [name.strip() for name in kinetics_classnames]

# Inverted mapping: class id -> class name with embedded double quotes removed
kinetics_id_to_classname = {
    class_id: str(name).replace('"', "")
    for name, class_id in kinetics_classnames.items()
}

print(labels[0:10], np.shape(labels))

['"sharpening knives"', '"eating ice cream"', '"cutting nails"', '"changing wheel"', '"bench pressing"', 'deadlifting', '"eating carrots"', 'marching', '"throwing discus"', '"playing flute"'] (400,)


In [5]:
# NORMALIZATION CONSTANTS
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
frames_per_second = 6

# PREPROCESSING PARAMETERS FOR EACH SUPPORTED X3D VARIANT
model_transform_params = {
    "x3d_xs": dict(side_size=182, crop_size=182, num_frames=4, sampling_rate=12),
    "x3d_s": dict(side_size=182, crop_size=182, num_frames=13, sampling_rate=6),
    "x3d_m": dict(side_size=256, crop_size=256, num_frames=16, sampling_rate=5),
}

# Parameters of the variant selected through `model_name`
transform_params = model_transform_params[model_name]

# Preprocessing applied to the "video" entry of the input dict, in this order:
# temporal subsample -> scale to [0, 1] -> mean/std normalize -> resize -> center crop
transform = ApplyTransformToKey(
    key="video",
    transform=Compose([
        UniformTemporalSubsample(transform_params["num_frames"]),
        Lambda(lambda clip: clip / 255.0),
        NormalizeVideo(tuple(mean), tuple(std)),
        ShortSideScale(size=transform_params["side_size"]),
        CenterCropVideo(crop_size=(transform_params["crop_size"],) * 2),
    ]),
)

# UniformTemporalSubsample : Reduces the number of frames to the required num_frames by uniformly sampling.
# Lambda(lambda x: x/255.0): Normalizes pixel values to the range [0, 1].
# NormalizeVideo           : Applies mean and standard deviation normalization using the provided values.
# ShortSideScale           : Resizes the video frames so that the shorter side is of length side_size.
# CenterCropVideo          : Crops the center crop_size×crop_size region from each frame.

In [6]:
# MAIN PROCESSING FUNCTION

def process_videosss(use_webcam=False, video_path=None):
    """Run X3D action recognition on a webcam or video file and show the top-5 labels.

    Every second frame read from the source is converted to RGB and pushed into a
    sliding window holding the most recent ``transform_params["num_frames"]``
    frames. Once the window is full, every newly kept frame triggers one forward
    pass of ``model_recognition``; the five highest-scoring class names and the
    processing rate are drawn on the frame, which is displayed in an OpenCV window
    until the source is exhausted or Esc is pressed.

    Args:
        use_webcam: When true, read from camera index 0 (takes precedence over
            ``video_path``).
        video_path: Path of the video file to read when ``use_webcam`` is false.

    Returns:
        None. Problems with the video source are reported with ``print``.
    """
    # Initialize webcam or video file
    if use_webcam:
        capture = cv2.VideoCapture(0)
    elif video_path:
        capture = cv2.VideoCapture(video_path)
    else:
        print("Error: Provide video path or function for using webcam")
        return

    if not capture.isOpened():
        print("Error: Could not open video source.")
        capture.release()
        return

    num_frames = transform_params["num_frames"]
    # Bounded sliding window. A plain list kept every frame ever read, so memory
    # grew without limit on long videos or a live webcam.
    frame_buffer = deque(maxlen=num_frames)
    post_act = torch.nn.Softmax(dim=1)
    prev_time = 0
    frame_count = 0

    try:
        while True:
            ret, img = capture.read()
            if not ret:
                if not use_webcam:
                    print("End of video file.")
                else:
                    print("Error: Could not read frame from webcam.")
                break

            frame_count += 1

            # Process only every second frame to reduce the inference load
            if frame_count % 2 != 0:
                continue

            # Convert color format of OpenCV (BGR) to the RGB order fed to the model
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            current_time = time.time()

            # Add frame to the window and wait until it is full
            frame_buffer.append(img)
            if len(frame_buffer) < num_frames:
                continue

            # Stack the window to (T, H, W, C) and reorder it to (C, T, H, W)
            clip = torch.tensor(np.array(list(frame_buffer))).permute(3, 0, 1, 2).float()
            video_data = {"video": clip}

            # Apply the transform to normalize the input for the model
            video_data = transform(video_data)

            # Move the inputs to the desired device
            inputs = video_data["video"].to(device)

            # Pass the input through the model (a batch dimension is added in front)
            with torch.no_grad():
                preds = model_recognition(inputs[None, ...])

            # Apply softmax to get class probabilities and keep the five best ids
            preds = post_act(preds)
            pred_classes = preds.topk(k=5).indices[0]

            # Map the predicted classes to the label names
            pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]

            # Convert back from RGB to BGR for display with OpenCV
            img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

            # Processing rate between two consecutive predictions; guarded so that
            # a zero time difference cannot raise ZeroDivisionError
            elapsed = current_time - prev_time
            fps = 1 / elapsed if elapsed > 0 else 0
            prev_time = current_time

            # Display the prediction on the video frame
            for i, class_name in enumerate(pred_class_names):
                cv2.putText(img_bgr, f"{class_name}", (20, 40 + i * 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            cv2.putText(img_bgr, f"FPS: {float(fps)}", (20, 150), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 3)

            # Show the video stream with predictions
            cv2.imshow('X3D-S Action Recognition', img_bgr)

            # Press 'Esc' to exit
            if cv2.waitKey(30) & 0xFF == 27:
                break

    finally:
        capture.release()
        cv2.destroyAllWindows()


In [15]:
def draw_enhanced_roi(frame, roi):
    """Overlay green corner brackets and an "ROI" caption on ``frame``.

    Args:
        frame: Image handed to the OpenCV drawing calls.
        roi: Region bounds indexed as ``[y_start, y_end, x_start, x_end]``.

    Returns:
        The same ``frame`` object that was passed in.
    """
    y_start, y_end, x_start, x_end = roi[0], roi[1], roi[2], roi[3]
    green = (0, 200, 0)
    inset = 3   # brackets sit 3 px inside the region
    arm = 97    # arm length, so each arm ends 100 px from the region edge

    left, right = x_start + inset, x_end - inset
    top, bottom = y_start + inset, y_end - inset

    # (corner x, corner y, horizontal direction, vertical direction)
    corners = (
        (left, top, 1, 1),
        (right, bottom, -1, -1),
        (right, top, -1, 1),
        (left, bottom, 1, -1),
    )
    for corner_x, corner_y, dir_x, dir_y in corners:
        # Vertical arm first, then the horizontal arm of the same corner
        cv2.line(frame, (corner_x, corner_y), (corner_x, corner_y + dir_y * arm), green, 2)
        cv2.line(frame, (corner_x, corner_y), (corner_x + dir_x * arm, corner_y), green, 2)

    # Caption at the bottom-left corner: black copy offset by one pixel first,
    # then the green text on top of it
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.putText(frame, "ROI", (left - 1, bottom + 1), font, 0.5, (0, 0, 0), 2)
    cv2.putText(frame, "ROI", (left, bottom), font, 0.5, green, 2)

    return frame

In [16]:
# MAIN PROCESSING FUNCTION
def _center_crop(img: np.ndarray):
    """Cut the largest centered square out of a frame.

    Args:
        img: Frame whose first two axes are height and width.

    Returns:
        ``(crop, roi)`` where ``crop`` is the square region of ``img`` and ``roi``
        is ``[y_start, y_end, x_start, x_end]`` in pixel coordinates of ``img``.
    """
    img_h, img_w = img.shape[:2]
    min_dim = min(img_h, img_w)
    start_x = (img_w - min_dim) // 2
    start_y = (img_h - min_dim) // 2
    roi = [start_y, start_y + min_dim, start_x, start_x + min_dim]
    return img[start_y:start_y + min_dim, start_x:start_x + min_dim, ...], roi


def process_videos(use_webcam=False, video_path=None):
    """Run X3D action recognition on the center-square ROI of a webcam or video file.

    Every second frame is converted to RGB and center-cropped to a square; the
    crops form a sliding window of ``transform_params["num_frames"]`` frames. Once
    the window is full, every newly kept frame triggers one forward pass of
    ``model_recognition``. The top-5 class names, the processing rate and the ROI
    brackets are drawn on the original (uncropped) frame, which is displayed until
    the source is exhausted or Esc is pressed.

    Args:
        use_webcam: When true, read from camera index 0 (takes precedence over
            ``video_path``).
        video_path: Path of the video file to read when ``use_webcam`` is false.

    Returns:
        None. Problems with the video source are reported with ``print``.
    """
    # Initialize webcam or video file
    if use_webcam:
        capture = cv2.VideoCapture(0)
    elif video_path:
        capture = cv2.VideoCapture(video_path)
    else:
        print("Error: Provide video path or function for using webcam")
        return
    if not capture.isOpened():
        print("Error: Could not open video source.")
        capture.release()
        return

    num_frames = transform_params["num_frames"]
    # Bounded sliding window instead of a list that keeps every frame ever read
    frame_buffer = deque(maxlen=num_frames)
    post_act = torch.nn.Softmax(dim=1)
    prev_time = 0
    frame_count = 0

    try:
        while True:
            ret, img = capture.read()
            if not ret:
                if not use_webcam:
                    print("End of video file.")
                else:
                    print("Error: Could not read frame from webcam.")
                break

            frame_count += 1

            # Process only every second frame to reduce the inference load
            if frame_count % 2 != 0:
                continue

            # Convert color format of OpenCV (BGR) to the RGB order fed to the model
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # Use the centered square region as the input for prediction
            img_crop, roi = _center_crop(img_rgb)

            current_time = time.time()

            # Add the crop to the window and wait until it is full
            frame_buffer.append(img_crop)
            if len(frame_buffer) < num_frames:
                continue

            # Stack the window to (T, H, W, C) and reorder it to (C, T, H, W)
            clip = torch.tensor(np.array(list(frame_buffer))).permute(3, 0, 1, 2).float()
            video_data = {"video": clip}

            # Apply the transform to normalize the input for the model
            video_data = transform(video_data)

            # Move the inputs to the desired device
            inputs = video_data["video"].to(device)

            # Use the X3D classifier explicitly: the bare name `model` is not
            # defined by the cells above this one and is bound elsewhere in the
            # notebook to the Faster R-CNN detector.
            with torch.no_grad():
                preds = model_recognition(inputs[None, ...])

            # Apply softmax to get class probabilities and keep the five best ids
            preds = post_act(preds)
            pred_classes = preds.topk(k=5).indices[0]

            # Map the predicted classes to the label names
            pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]

            # Processing rate between two consecutive predictions; guarded so that
            # a zero time difference cannot raise ZeroDivisionError
            elapsed = current_time - prev_time
            fps = 1 / elapsed if elapsed > 0 else 0
            prev_time = current_time

            # Display the prediction on the original BGR frame
            for i, class_name in enumerate(pred_class_names):
                cv2.putText(img, f"{class_name}", (20, 40 + i * 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            cv2.putText(img, f"FPS: {float(fps):.2f}", (20, 150), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 3)

            # Mark the region that was fed to the model
            img = draw_enhanced_roi(img, roi)

            # Show the video stream with predictions
            cv2.imshow('X3D-S Action Recognition', img)

            # Press 'Esc' to exit
            if cv2.waitKey(30) & 0xFF == 27:
                break
    finally:
        capture.release()
        cv2.destroyAllWindows()

# Usage examples:
# For webcam:
# process_videos(use_webcam=True)

# For video file:
# process_videos(video_path="path/to/your/video.mp4")

In [10]:

import cv2
import torch
from PIL import Image, ImageDraw
from torchvision.models.detection import fasterrcnn_mobilenet_v3_large_320_fpn, FasterRCNN_MobileNet_V3_Large_320_FPN_Weights
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import pil_to_tensor, to_pil_image
import numpy as np
import time

# Step 1: Initialize the Faster R-CNN detector with the best available weights.
# It gets its own name so it cannot be confused with the action-recognition
# model (`model_recognition`).
weights = FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT
model_detection = fasterrcnn_mobilenet_v3_large_320_fpn(weights=weights, box_score_thresh=0.9)
model_detection.eval()

# Backward-compatible alias: code in this notebook refers to the detector both
# as `model` and as `model_detection`.
model = model_detection

# Step 2: Initialize the inference transforms that belong to these weights
preprocess = weights.transforms()

# MAIN PROCESSING FUNCTION
def process_vide(use_webcam=False, video_path=None, verbose=False):
    """Detect people with Faster R-CNN and classify the action with X3D on a video stream.

    Every second frame is resized to 224x224 and passed through the detector (the
    module-level ``model``); boxes with label 1 and a score above 0.7 are drawn on
    a copy of the resized frame. That annotated frame joins a sliding window of
    ``transform_params["num_frames"]`` frames which, once full, is classified by
    ``model_recognition``. The annotated frame with the top-5 class names and the
    processing rate is displayed until the source ends or Esc is pressed.

    Args:
        use_webcam: When true, read from camera index 0 (takes precedence over
            ``video_path``).
        video_path: Path of the video file to read when ``use_webcam`` is false.
        verbose: When true, print tensor shapes and detection counts for every
            processed frame (debugging aid; off by default).

    Returns:
        None. Problems are reported with ``print``.
    """
    # Initialize webcam or video file
    if use_webcam:
        capture = cv2.VideoCapture(0)
    elif video_path:
        capture = cv2.VideoCapture(video_path)
    else:
        print("Error: Provide video path or function for using webcam")
        return
    if not capture.isOpened():
        print("Error: Could not open video source.")
        capture.release()
        return

    output_size = (224, 224)
    num_frames = transform_params["num_frames"]
    # Bounded sliding window instead of a list that keeps every frame ever read
    frame_buffer = deque(maxlen=num_frames)
    post_act = torch.nn.Softmax(dim=1)
    prev_time = 0
    frame_count = 0
    log = print if verbose else (lambda *args, **kwargs: None)

    try:
        while True:
            ret, img = capture.read()
            if not ret:
                print("Error: Could not read frame.")
                break

            frame_count += 1

            # Process only every second frame to reduce the inference load
            if frame_count % 2 != 0:
                continue

            resized_frame = cv2.resize(img, output_size)
            # Convert OpenCV BGR frame to RGB
            rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)

            # (H, W, C) uint8 image -> (C, H, W) float tensor scaled to [0, 1]
            img_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).float() / 255.0
            log(f"Shape of img_tensor: {img_tensor.shape}")

            # Apply inference preprocessing transforms; the detector takes a list of images
            batch = [preprocess(img_tensor)]
            log(f"Shape of batch[0]: {batch[0].shape}")

            # Object detection with the Faster R-CNN model bound to `model`
            try:
                with torch.no_grad():
                    prediction = model(batch)[0]

                log(f"Keys in prediction: {prediction.keys()}")
                log(f"Shape of prediction['boxes']: {prediction['boxes'].shape}")

                # Keep only persons (class 1 in COCO dataset)
                person_mask = prediction["labels"] == 1
                person_boxes = prediction["boxes"][person_mask]
                person_scores = prediction["scores"][person_mask]
                log(f"Number of detected persons: {len(person_boxes)}")
            except RuntimeError as e:
                print(f"RuntimeError occurred: {e}")
                print(f"Error occurred at frame {frame_count}")
                break

            # Draw bounding boxes on a copy of the resized frame
            result_frame = resized_frame.copy()
            for box, score in zip(person_boxes, person_scores):
                if score > 0.7:  # Adjust confidence threshold as needed
                    x1, y1, x2, y2 = box.int().tolist()
                    cv2.rectangle(result_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(result_frame, f"Person {score:.2f}", (x1, y1 - 10),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Add the annotated frame (as RGB) to the window.
            # NOTE(review): the classifier therefore sees the drawn boxes; confirm
            # this is intended rather than feeding the clean `rgb_frame`.
            frame_buffer.append(cv2.cvtColor(result_frame, cv2.COLOR_BGR2RGB))
            if len(frame_buffer) >= num_frames:
                try:
                    # Stack the window to (T, H, W, C) and reorder it to (C, T, H, W).
                    # Values stay in 0..255 here because `transform` already
                    # divides by 255; scaling twice would corrupt the input.
                    clip = torch.tensor(np.array(list(frame_buffer))).permute(3, 0, 1, 2).float()
                    video_data = {"video": clip}
                    log(f"Shape of video_data['video']: {video_data['video'].shape}")

                    # Apply the transform to normalize the input for the model
                    video_data = transform(video_data)

                    # Move the inputs to the desired device
                    inputs = video_data["video"].to(device)
                    log(f"Shape of inputs: {inputs.shape}")

                    # Classify with the X3D model, not with the detector `model`
                    with torch.no_grad():
                        preds = model_recognition(inputs.unsqueeze(0))  # Add batch dimension

                    # Apply softmax to get class probabilities and keep the five best ids
                    preds = post_act(preds)
                    pred_classes = preds.topk(k=5).indices[0]

                    # Map the predicted classes to the label names
                    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]

                    # Display the prediction on the video frame
                    for i, class_name in enumerate(pred_class_names):
                        cv2.putText(result_frame, f"{class_name}", (20, 40 + i * 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

                except RuntimeError as e:
                    print(f"RuntimeError in action recognition: {e}")
                    print(f"Error occurred at frame {frame_count}")

            current_time = time.time()

            # Calculate FPS (frames per second); guarded against a zero time difference
            elapsed = current_time - prev_time
            fps = 1 / elapsed if elapsed > 0 else 0
            prev_time = current_time

            # Draw on and show the annotated frame; showing the raw `img` would
            # discard every box and label drawn above.
            cv2.putText(result_frame, f"FPS: {float(fps):.2f}", (20, 150), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 3)

            # Show the video stream with predictions
            cv2.imshow('X3D-S Action Recognition', result_frame)

            # Press 'Esc' to exit
            if cv2.waitKey(30) & 0xFF == 27:
                break
    finally:
        capture.release()
        cv2.destroyAllWindows()

# Usage examples:
# For webcam:
# process_vide(use_webcam=True)

# For video file:
# process_vide(video_path="path/to/your/video.mp4")

In [6]:
import cv2
import torch
import numpy as np
import time
import queue
import threading
from collections import deque

class VideoStreamProcessor:
    """Threaded capture -> recognition -> display pipeline for X3D action recognition.

    ``start()`` opens the video source and launches three threads that communicate
    through bounded queues:

    * capture: reads frames, keeps every second one and converts it to RGB;
    * process: keeps a sliding window of ``transform_params["num_frames"]`` frames,
      runs ``model_recognition`` on it and draws the top-5 class names;
    * display: shows each annotated frame with the display rate and watches for Esc.

    ``stopped`` is the shared shutdown flag: ``stop()`` or any worker sets it, and
    every worker loop checks it.
    """

    def __init__(self, use_webcam=False, video_path=None, queue_size=128):
        """Store the source selection and create the queues; nothing is opened yet.

        Args:
            use_webcam: When true, ``start()`` opens camera index 0.
            video_path: Video file opened by ``start()`` when ``use_webcam`` is false.
            queue_size: Maximum number of items held by each inter-thread queue.
        """
        self.use_webcam = use_webcam
        self.video_path = video_path
        self.capture = None

        # Queues for thread communication
        self.frame_queue = queue.Queue(maxsize=queue_size)
        self.result_queue = queue.Queue(maxsize=queue_size)
        self.frame_buffer = deque(maxlen=transform_params["num_frames"])

        # Threading control
        self.stopped = False
        self.lock = threading.Lock()

        # Created by start(); None until then so that stop() is always safe to call
        self.capture_thread = None
        self.process_thread = None
        self.display_thread = None

    def start(self):
        """Open the video source and launch the three worker threads.

        Returns:
            This instance, to allow call chaining.

        Raises:
            RuntimeError: If the video source cannot be opened.
        """
        # Initialize capture
        self.capture = cv2.VideoCapture(0 if self.use_webcam else self.video_path)
        if not self.capture.isOpened():
            raise RuntimeError("Could not open video source")

        # Start threads
        self.capture_thread = threading.Thread(target=self._capture_frames)
        self.process_thread = threading.Thread(target=self._process_frames)
        self.display_thread = threading.Thread(target=self._display_frames)

        self.capture_thread.start()
        self.process_thread.start()
        self.display_thread.start()
        return self

    def stop(self):
        """Signal all workers to finish, wait for them and release the resources.

        Safe to call even if ``start()`` was never called or failed.
        """
        self.stopped = True
        # Wait for the threads that were actually created
        for worker in (self.capture_thread, self.process_thread, self.display_thread):
            if worker is not None and worker.is_alive():
                worker.join()

        if self.capture is not None:
            self.capture.release()
        cv2.destroyAllWindows()

    def _put_unless_stopped(self, target_queue, item):
        """Queue ``item``, retrying while the queue is full; give up once stopped.

        Returns:
            True if the item was queued, False if the pipeline stopped first.
        """
        while not self.stopped:
            try:
                target_queue.put(item, timeout=0.1)
                return True
            except queue.Full:
                continue
        return False

    def _capture_frames(self):
        """Read frames from the source and queue every second one (as RGB)."""
        frame_count = 0
        try:
            while not self.stopped:
                if self.frame_queue.full():
                    time.sleep(0.1)  # Prevent busy-waiting
                    continue

                ret, frame = self.capture.read()
                if not ret:
                    if not self.use_webcam:
                        print("End of video file.")
                    else:
                        print("Error: Could not read frame from webcam.")
                    break

                frame_count += 1
                if frame_count % 2 == 0:  # Skip every other frame
                    # Convert BGR to RGB
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    self._put_unless_stopped(self.frame_queue, (frame_rgb, frame_count, time.time()))
        finally:
            # Reached at end of stream and on any error, so the other workers
            # never keep running on their own.
            self.stopped = True

    def _process_frames(self):
        """Classify the sliding window for each queued frame and queue the annotated frame."""
        post_act = torch.nn.Softmax(dim=1)
        try:
            while not self.stopped:
                try:
                    frame_data = self.frame_queue.get(timeout=1.0)
                except queue.Empty:
                    continue

                frame_rgb, frame_count, timestamp = frame_data

                # Add to frame buffer (the deque drops the oldest frame by itself)
                self.frame_buffer.append(frame_rgb)

                if len(self.frame_buffer) >= transform_params["num_frames"]:
                    # Stack the window to (T, H, W, C) and reorder it to (C, T, H, W)
                    video_frames = list(self.frame_buffer)
                    video_tensor = torch.tensor(np.array(video_frames)).permute(3, 0, 1, 2).float()

                    # Apply transforms
                    video_data = transform({"video": video_tensor})

                    # Move to device and get predictions
                    inputs = video_data["video"].to(device)

                    # Use the X3D classifier explicitly; the bare name `model`
                    # is bound elsewhere in the notebook to the Faster R-CNN
                    # detector (or is undefined on a fresh kernel).
                    with torch.no_grad():
                        preds = model_recognition(inputs[None, ...])

                    # Get top predictions
                    preds = post_act(preds)
                    pred_classes = preds.topk(k=5).indices[0]
                    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]

                    # Convert back to BGR for display
                    frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)

                    # Add predictions to frame
                    for i, class_name in enumerate(pred_class_names):
                        cv2.putText(frame_bgr,
                                    f"{class_name}",
                                    (20, 40 + i * 20),
                                    cv2.FONT_HERSHEY_SIMPLEX,
                                    0.6,
                                    (0, 255, 0),
                                    2)

                    # Does not block forever if the display thread has already exited
                    self._put_unless_stopped(self.result_queue, (frame_bgr, timestamp))

                self.frame_queue.task_done()
        finally:
            self.stopped = True

    def _display_frames(self):
        """Show annotated frames with the display rate until stopped or Esc is pressed.

        NOTE(review): this calls ``cv2.imshow``/``cv2.waitKey`` from a worker thread;
        confirm that the target platform's GUI backend supports that.
        """
        prev_time = time.time()
        try:
            while not self.stopped:
                try:
                    frame_bgr, timestamp = self.result_queue.get(timeout=1.0)
                except queue.Empty:
                    continue
                self.result_queue.task_done()

                # Calculate and display FPS; guarded against a zero time difference
                current_time = time.time()
                elapsed = current_time - prev_time
                fps = 1 / elapsed if elapsed > 0 else 0
                prev_time = current_time

                cv2.putText(frame_bgr,
                            f"FPS: {float(fps):.2f}",
                            (20, 150),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.7,
                            (0, 255, 0),
                            3)

                cv2.imshow('X3D-S Action Recognition', frame_bgr)

                # Press 'Esc' to exit
                if cv2.waitKey(1) & 0xFF == 27:
                    break
        finally:
            self.stopped = True

def process_video(use_webcam=False, video_path=None):
    """Run the threaded recognition pipeline until it stops or the user interrupts it.

    Args:
        use_webcam: When true, read from camera index 0.
        video_path: Path of the video file to read when ``use_webcam`` is false.

    Returns:
        None. A missing or unopenable source is reported with ``print``.
    """
    if not use_webcam and not video_path:
        print("Error: Provide video path or function for using webcam")
        return

    # Built outside the try block so the cleanup below never refers to an unbound name
    processor = VideoStreamProcessor(use_webcam=use_webcam,
                                     video_path=video_path)
    try:
        # Start video processing pipeline
        processor.start()
    except RuntimeError as exc:
        # No worker thread exists yet, so only the capture handle needs cleanup
        print(f"Error: {exc}")
        if processor.capture is not None:
            processor.capture.release()
        return

    try:
        # Idle until a worker sets the stop flag (Esc pressed or end of stream)
        while not processor.stopped:
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("Interrupted by user")
    finally:
        processor.stop()

# Example usage
# NOTE(review): when this is executed as a notebook cell, __name__ is normally
# "__main__", so the guard passes and the webcam pipeline starts as soon as the
# cell runs - confirm that is intended.
if __name__ == "__main__":
    process_video(use_webcam=True)  # For webcam
    # process_video(video_path="path/to/video.mp4")  # For video file

In [17]:
# RUN THE MODEL ON A WEBCAM OR ON A VIDEO FILE
# For webcam:
# process_videosss(use_webcam=True)

# For video file:
# NOTE(review): absolute, machine-specific path - adjust it before running elsewhere.
process_videosss(video_path="C:/Users/nyok/Desktop/OpenCV/Videos/diving.MP4")

In [None]:
# SAVED VERSION (original note: "SIMPAN", Indonesian for "save")

# MAIN PROCESSING FUNCTION
def process_video(use_webcam=False, video_path=None):
    """Run detection plus X3D action recognition on a webcam or video file.

    Every frame is resized to 480x480 and annotated by ``detect``. The annotated
    frames form a sliding window of ``transform_params["num_frames"]`` frames
    which, once full, is preprocessed with ``transform`` and classified by
    ``recognize``. The class names, the instantaneous rate and the average rate
    are drawn on the annotated frame, which is displayed until the source ends or
    Esc is pressed.

    Args:
        use_webcam: When true, read from camera index 0 (takes precedence over
            ``video_path``).
        video_path: Path of the video file to read when ``use_webcam`` is false.

    Returns:
        None. Problems are reported with ``print``.
    """
    # Initialize webcam or video file
    if use_webcam:
        capture = cv2.VideoCapture(0)
    elif video_path:
        capture = cv2.VideoCapture(video_path)
    else:
        print("Error: Provide video path or function for using webcam")
        return
    if not capture.isOpened():
        print("Error: Could not open video source.")
        capture.release()
        return

    input_size = (480, 480)
    num_frames = transform_params["num_frames"]
    # Bounded sliding window instead of a list that keeps every frame ever read
    frame_buffer = deque(maxlen=num_frames)
    frame_count = 0
    prev_time = 0
    start_time = time.time()

    try:
        while True:
            ret, img = capture.read()
            if not ret:
                print("Error: Could not read frame.")
                break

            # Every frame is processed in this version (no frame skipping)
            frame_count += 1

            # Resize image based on input size
            resized_frame = cv2.resize(img, input_size)

            # Call detection function
            result_frame = detect(resized_frame)

            # Add the annotated frame (as RGB) to the window
            frame_buffer.append(cv2.cvtColor(result_frame, cv2.COLOR_BGR2RGB))
            if len(frame_buffer) >= num_frames:
                try:
                    # Stack the window to (T, H, W, C) and reorder it to (C, T, H, W)
                    clip = torch.tensor(np.array(list(frame_buffer))).permute(3, 0, 1, 2).float()
                    video_data = {"video": clip}

                    print(f"Shape of video_data['video']: {video_data['video'].shape}")

                    # Apply the transform to normalize the input for the model
                    video_data = transform(video_data)

                    predict_class = recognize(video_data)

                    # Display the prediction on the video frame
                    for i, class_name in enumerate(predict_class):
                        cv2.putText(result_frame, f"{class_name}", (20, 40 + i * 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 153, 0), 2)

                except RuntimeError as e:
                    print(f"RuntimeError in action recognition: {e}")
                    print(f"Error occurred at frame {frame_count}")

            current_time = time.time()
            total_time = current_time - start_time  # Total elapsed time since the start

            # Calculate FPS (instantaneous and average)
            elapsed = current_time - prev_time
            fps = 1 / elapsed if elapsed > 0 else 0
            average_fps = frame_count / total_time if total_time > 0 else 0
            prev_time = current_time

            cv2.putText(result_frame, f"FPS: {float(fps):.2f}", (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
            cv2.putText(result_frame, f"Average FPS: {float(average_fps):.2f}", (20, 140), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            # Show the video stream with predictions
            cv2.imshow('X3D-S Action Recognition', result_frame)

            # Press 'Esc' to exit
            if cv2.waitKey(30) & 0xFF == 27:
                break
    finally:
        capture.release()
        cv2.destroyAllWindows()

        
        
        
def recognize(video_data):
    """Classify a preprocessed clip and return the names of its three best classes.

    Args:
        video_data: Dict whose ``"video"`` entry is the clip tensor to classify
            (without a batch dimension; one is added here).

    Returns:
        List of three class names looked up in ``kinetics_id_to_classname``,
        ordered from highest to lowest score.
    """
    # Place the clip on the inference device
    clip = video_data["video"].to(device)

    print(f"Shape of inputs: {clip.shape}")

    # Forward pass on a batch of one, without tracking gradients
    with torch.no_grad():
        scores = model_recognition(clip.unsqueeze(0))

    # Turn the scores into probabilities and pick the three most likely class ids
    probabilities = torch.nn.Softmax(dim=1)(scores)
    best_ids = probabilities.topk(k=3).indices[0]

    # Translate class ids into readable names
    return [kinetics_id_to_classname[int(class_id)] for class_id in best_ids]



# DETECTION FUNCTION
def detect(resized_frame):
    """Run the object detector on a BGR frame and return the frame with boxes drawn.

    Args:
        resized_frame: Frame in OpenCV BGR channel order.

    Returns:
        A new BGR image with every returned bounding box outlined in red.

    NOTE(review): relies on the module-level names ``model_detection`` and
    ``preprocess``; confirm both are defined before this function is called.
    """
    # BGR array -> RGB PIL image -> (C, H, W) tensor
    rgb_array = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
    image_chw = pil_to_tensor(Image.fromarray(rgb_array))

    # The detector takes a list of preprocessed images; only one is passed,
    # so only the first result is used
    with torch.no_grad():
        detections = model_detection([preprocess(image_chw)])[0]

    # Outline every detected box on the unprocessed image tensor
    annotated = draw_bounding_boxes(image_chw, boxes=detections["boxes"], colors="red", width=2)

    # Tensor -> PIL image -> RGB array -> BGR array for OpenCV display
    annotated_rgb = np.array(to_pil_image(annotated.detach()))
    return cv2.cvtColor(annotated_rgb, cv2.COLOR_RGB2BGR)