In [8]:
import cv2
import numpy as np
import os

Here we are taking each frame for one every second in the video and removing the similar frames using structural similarity.
I tried with histogram similarity also but structural is working well.

In [9]:
#Extracting Frames form
def extract_frames_per_second(video_path, output_folder):
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))  # Get the FPS of the video
    frame_count = 0
    extracted_count = 0
    frames = []
    while True:
        success, frame = cap.read()
        if not success:
            break
        if frame_count % fps == 0:
            frames.append(frame)
            cv2.imwrite(f'{output_folder}/frame_{extracted_count:04d}.png', frame)
            extracted_count += 1

        frame_count += 1

    cap.release()
    print(f"Extracted {extracted_count} frames (1 frame per second).")
    return frames


In [10]:
frames = extract_frames_per_second("data/video.mp4","data/frames")

Extracted 60 frames (1 frame per second).


In [11]:
net = cv2.dnn.readNet("yolo_config/yolov3.weights", "yolo_config/yolov3.cfg")
print("YOLO model loaded successfully.")

# Get layer names
layer_names = net.getLayerNames()
print("Layer names:", layer_names)

classes = []
with open("yolo_config/coco.names", "r") as f:
    classes = [line.strip() for line in f.readlines()]

# Get output layer names
output_layers_indices = net.getUnconnectedOutLayers()
output_layers = [layer_names[i - 1] for i in output_layers_indices]
print("Output layers:", output_layers)

YOLO model loaded successfully.
Layer names: ('conv_0', 'bn_0', 'leaky_1', 'conv_1', 'bn_1', 'leaky_2', 'conv_2', 'bn_2', 'leaky_3', 'conv_3', 'bn_3', 'leaky_4', 'shortcut_4', 'conv_5', 'bn_5', 'leaky_6', 'conv_6', 'bn_6', 'leaky_7', 'conv_7', 'bn_7', 'leaky_8', 'shortcut_8', 'conv_9', 'bn_9', 'leaky_10', 'conv_10', 'bn_10', 'leaky_11', 'shortcut_11', 'conv_12', 'bn_12', 'leaky_13', 'conv_13', 'bn_13', 'leaky_14', 'conv_14', 'bn_14', 'leaky_15', 'shortcut_15', 'conv_16', 'bn_16', 'leaky_17', 'conv_17', 'bn_17', 'leaky_18', 'shortcut_18', 'conv_19', 'bn_19', 'leaky_20', 'conv_20', 'bn_20', 'leaky_21', 'shortcut_21', 'conv_22', 'bn_22', 'leaky_23', 'conv_23', 'bn_23', 'leaky_24', 'shortcut_24', 'conv_25', 'bn_25', 'leaky_26', 'conv_26', 'bn_26', 'leaky_27', 'shortcut_27', 'conv_28', 'bn_28', 'leaky_29', 'conv_29', 'bn_29', 'leaky_30', 'shortcut_30', 'conv_31', 'bn_31', 'leaky_32', 'conv_32', 'bn_32', 'leaky_33', 'shortcut_33', 'conv_34', 'bn_34', 'leaky_35', 'conv_35', 'bn_35', 'leaky_36

In [12]:

output_directory = 'data/annotated_frames'
os.makedirs(output_directory, exist_ok=True)

# Dictionary to store labels and confidence scores for each frame
frames_info = {}

for i, frame in enumerate(frames):
    height, width, channels = frame.shape

    # Detecting objects
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    # Initialize dictionary for this frame
    frame_info = {}

    # Showing informations on the screen
    class_ids = []
    confidences = []
    boxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5:
                # Object detected
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)

                # Rectangle coordinates
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)

                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)

                # Add label and confidence to the frame info dictionary
                label = str(classes[class_id])
                frame_info[label] = confidence

    # Draw bounding boxes and labels on the frame
    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
    font = cv2.FONT_HERSHEY_PLAIN
    for i in range(len(boxes)):
        if i in indexes:
            x, y, w, h = boxes[i]
            label = str(classes[class_ids[i]])
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.putText(frame, label, (x, y + 30), font, 3, (0, 255, 0), 3)

    # Save the annotated frame
    annotated_frame_path = os.path.join(output_directory, f"annotated_frame_{i}.jpg")
    cv2.imwrite(annotated_frame_path, frame)

    # Store frame info in the frames_info dictionary
    for k in frame_info.keys():
        if k in frames_info:
            frames_info[k] = max(frames_info[k], frame_info[k])
        else:
            frames_info[k] = [frame_info[k]]

# Print frames_info dictionary
print(frames_info)


{'person': 0.99741817, 'umbrella': [0.9143889], 'bus': [0.6156007], 'aeroplane': 0.89126605, 'car': 0.99696904, 'surfboard': [0.6609388], 'truck': 0.9346713, 'sheep': 0.6979648, 'tie': [0.7132201], 'horse': [0.9321146]}


In [13]:
# sort the dictionary by confidence score
sorted_frames_info = dict(sorted(frames_info.items(), key=lambda item: item[1], reverse=True))
print(sorted_frames_info)

{'person': 0.99741817, 'car': 0.99696904, 'truck': 0.9346713, 'horse': [0.9321146], 'umbrella': [0.9143889], 'aeroplane': 0.89126605, 'tie': [0.7132201], 'sheep': 0.6979648, 'surfboard': [0.6609388], 'bus': [0.6156007]}
