## SIFT Feature Extraction

This code extracts features from videos using the SIFT feature extractor and saves the extracted descriptors, video IDs, and timestamps to two separate `.npz` files, one for the query videos and one for the reference videos, as required for the competition submission.

For each video frame, the code stores the SIFT descriptors together with the frame's timestamp. Because the number of descriptors varies from frame to frame, each frame's descriptor matrix is zero-padded to the largest per-frame descriptor count found in that video, so that all frames of a video have the same dimensions. The padded matrices are then concatenated into a single NumPy array per video.

In [None]:
import os
import numpy as np
import cv2

# Set paths to query and reference videos
QUERY_PATH = 'path/to/query/videos'
REFERENCE_PATH = 'path/to/reference/videos'

# Set paths to output npz files
QUERY_OUT_PATH = 'path/query_descriptors.npz'
REFERENCE_OUT_PATH = 'path/reference_descriptors.npz'

# Initialize feature extractor.
# SIFT is exposed as cv2.SIFT_create() in the main OpenCV module since 4.4;
# cv2.xfeatures2d only exists in opencv-contrib builds, so it is used purely
# as a fallback for older installations.
if hasattr(cv2, 'SIFT_create'):
    feature_extractor = cv2.SIFT_create()
else:
    feature_extractor = cv2.xfeatures2d.SIFT_create()

# Initialize lists to store video ids, descriptors, and timestamps.
# Each list holds one entry per processed video.
query_video_ids = []
query_descriptors = []
query_timestamps = []
reference_video_ids = []
reference_descriptors = []
reference_timestamps = []

def _pad_frame_descriptors(frame_descriptors, descriptor_dim=128):
    """Zero-pad per-frame descriptor matrices to a common height and stack them.

    Args:
        frame_descriptors: One entry per frame; each entry is either an
            array with ``descriptor_dim`` columns or ``None`` when no
            descriptors were computed for that frame.
        descriptor_dim: Number of columns of a descriptor matrix.

    Returns:
        A 2-D array made of the vertically stacked, padded frame matrices.
        It has zero rows when no frame produced any descriptor.
    """
    found = [d for d in frame_descriptors if d is not None]
    if not found:
        # Nothing to pad against; max() on an empty sequence would raise.
        return np.zeros((0, descriptor_dim), dtype=np.float32)
    rows = max(len(d) for d in found)
    # Reuse the descriptors' own dtype so empty frames do not upcast the
    # whole stack to float64.
    blank = np.zeros((rows, descriptor_dim), dtype=found[0].dtype)
    padded = [
        blank if d is None else np.pad(d, ((0, rows - len(d)), (0, 0)))
        for d in frame_descriptors
    ]
    return np.vstack(padded)


def _extract_sift_descriptors(video_dir, label):
    """Run the SIFT extractor over every ``.mp4`` file in ``video_dir``.

    Args:
        video_dir: Directory containing the videos.
        label: Word used in the progress messages (e.g. ``'query'``).

    Returns:
        A ``(video_ids, descriptors, timestamps)`` tuple of parallel lists
        with one entry per video: the file name without extension, the
        padded and stacked descriptor array, and the list of per-frame
        timestamps in seconds.
    """
    # List the directory once and keep only videos, so the progress counter
    # reflects the videos actually processed. Sorting makes the output order
    # reproducible, since os.listdir order is arbitrary.
    video_files = sorted(f for f in os.listdir(video_dir) if f.endswith('.mp4'))
    total = len(video_files)
    video_ids, all_descriptors, all_timestamps = [], [], []
    for i, file in enumerate(video_files):
        video_id = os.path.splitext(file)[0]
        cap = cv2.VideoCapture(os.path.join(video_dir, file))
        timestamps = []
        descriptors = []
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                # Extract features from frame
                kp, des = feature_extractor.detectAndCompute(frame, None)
                descriptors.append(des)
                timestamps.append(cap.get(cv2.CAP_PROP_POS_MSEC) / 1000)
        finally:
            # Release the capture even if feature extraction fails mid-video.
            cap.release()
        if not descriptors:
            # The file could not be opened or decoded; there is nothing to save.
            print(f"Skipping {label} video {video_id}: no frames could be read")
            continue
        video_ids.append(video_id)
        all_descriptors.append(_pad_frame_descriptors(descriptors))
        all_timestamps.append(timestamps)
        print(f"Processed {label} video {video_id} ({i+1}/{total})")
    return video_ids, all_descriptors, all_timestamps


# Extract descriptors from query videos
_ids, _descriptors, _timestamps = _extract_sift_descriptors(QUERY_PATH, 'query')
query_video_ids.extend(_ids)
query_descriptors.extend(_descriptors)
query_timestamps.extend(_timestamps)

# Extract descriptors from reference videos
_ids, _descriptors, _timestamps = _extract_sift_descriptors(REFERENCE_PATH, 'reference')
reference_video_ids.extend(_ids)
reference_descriptors.extend(_descriptors)
reference_timestamps.extend(_timestamps)

def _per_video_object_array(items):
    """Pack per-video entries into a 1-D object array, one element per video.

    The entries have different lengths from video to video. Letting NumPy
    convert such a ragged list implicitly triggers a deprecation warning on
    older releases and a ValueError on NumPy 1.24+, so the object array is
    built explicitly instead.
    """
    packed = np.empty(len(items), dtype=object)
    for idx, item in enumerate(items):
        # Scalar-index assignment stores the entry as-is, without broadcasting.
        packed[idx] = item
    return packed


# Save query descriptors to npz file.
# features/timestamps are object arrays, so read the archive back with
# np.load(path, allow_pickle=True).
np.savez(QUERY_OUT_PATH, video_ids=query_video_ids,
         features=_per_video_object_array(query_descriptors),
         timestamps=_per_video_object_array(query_timestamps))

# Save reference descriptors to npz file
np.savez(REFERENCE_OUT_PATH, video_ids=reference_video_ids,
         features=_per_video_object_array(reference_descriptors),
         timestamps=_per_video_object_array(reference_timestamps))

Processed query video Copy of Q100005 (1/5)
Processed query video Copy of Q100001 (2/5)
Processed query video Copy of Q100002 (3/5)
Processed query video Copy of Q100003 (4/5)
Processed query video Copy of Q100004 (5/5)
Processed reference video Copy of R100007 (1/6)
Processed reference video Copy of R100011 (2/6)
Processed reference video Copy of R100004 (3/6)
Processed reference video Copy of R100008 (4/6)
Processed reference video Copy of R100003 (5/6)


## ResNet Feature Extraction

This is a modification of the code above that uses a ResNet model from Keras for feature extraction instead of the SIFT feature extractor. The ResNet model extracts a 2048-dimensional feature vector for each frame. This method will generate better features but takes longer to process videos, so implementing downsampling is recommended.

In [None]:
import os
import numpy as np
import cv2
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.models import Model

# Input directories holding the query and reference videos
QUERY_PATH = 'path/to/query/videos'
REFERENCE_PATH = 'path/to/reference/videos'

# Destination .npz archives for the extracted features
QUERY_OUT_PATH = 'path/query_descriptors.npz'
REFERENCE_OUT_PATH = 'path/reference_descriptors.npz'

# Feature extractor: ImageNet-pretrained ResNet50 without its classification
# head; the extractor exposes the output of the network's last layer.
model = ResNet50(include_top=False, pooling='avg', weights='imagenet')
feature_extractor = Model(inputs=model.input, outputs=model.layers[-1].output)

# Accumulators with one entry per processed video
query_video_ids, query_descriptors, query_timestamps = [], [], []
reference_video_ids, reference_descriptors, reference_timestamps = [], [], []

def _embed_frames(rgb_frames):
    """Return one ResNet feature vector per frame for a batch of RGB frames.

    Args:
        rgb_frames: Non-empty list of equally sized RGB frames (H x W x 3).

    Returns:
        A 2-D array with one row per input frame.
    """
    # Local import keeps this cell self-contained; tensorflow is already a
    # dependency of this cell.
    from tensorflow.keras.applications.resnet50 import preprocess_input

    # The ImageNet weights expect inputs normalised by preprocess_input,
    # which takes RGB images; feeding raw frames degrades the features.
    inputs = preprocess_input(np.stack(rgb_frames).astype(np.float32))
    # verbose=0 silences the per-call progress bar that would otherwise
    # flood the cell output.
    return feature_extractor.predict(inputs, verbose=0)


def _extract_resnet_descriptors(video_dir, label, batch_size=32):
    """Run the ResNet extractor over every ``.mp4`` file in ``video_dir``.

    Args:
        video_dir: Directory containing the videos.
        label: Word used in the progress messages (e.g. ``'query'``).
        batch_size: Number of frames sent to the network per predict call.

    Returns:
        A ``(video_ids, descriptors, timestamps)`` tuple of parallel lists
        with one entry per video: the file name without extension, an array
        with one feature row per frame, and the list of per-frame timestamps
        in seconds.
    """
    # List the directory once and keep only videos, so the progress counter
    # reflects the videos actually processed. Sorting makes the output order
    # reproducible, since os.listdir order is arbitrary.
    video_files = sorted(f for f in os.listdir(video_dir) if f.endswith('.mp4'))
    total = len(video_files)
    video_ids, all_descriptors, all_timestamps = [], [], []
    for i, file in enumerate(video_files):
        video_id = os.path.splitext(file)[0]
        cap = cv2.VideoCapture(os.path.join(video_dir, file))
        timestamps = []
        descriptors = []
        batch = []
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                # Resize frame to fit ResNet input shape
                frame = cv2.resize(frame, (224, 224))
                # OpenCV decodes frames as BGR; convert to RGB for preprocessing.
                batch.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                timestamps.append(cap.get(cv2.CAP_PROP_POS_MSEC) / 1000)
                if len(batch) == batch_size:
                    # Extract features for a full batch of frames at once.
                    descriptors.append(_embed_frames(batch))
                    batch = []
        finally:
            # Release the capture even if feature extraction fails mid-video.
            cap.release()
        if batch:
            # Remaining frames that did not fill a whole batch.
            descriptors.append(_embed_frames(batch))
        if not descriptors:
            # The file could not be opened or decoded; np.vstack([]) would raise.
            print(f"Skipping {label} video {video_id}: no frames could be read")
            continue
        video_ids.append(video_id)
        all_descriptors.append(np.vstack(descriptors))
        all_timestamps.append(timestamps)
        print(f"Processed {label} video {i+1}/{total}")
    return video_ids, all_descriptors, all_timestamps


# Extract descriptors from query videos
_ids, _descriptors, _timestamps = _extract_resnet_descriptors(QUERY_PATH, 'query')
query_video_ids.extend(_ids)
query_descriptors.extend(_descriptors)
query_timestamps.extend(_timestamps)

# Extract descriptors from reference videos
_ids, _descriptors, _timestamps = _extract_resnet_descriptors(REFERENCE_PATH, 'reference')
reference_video_ids.extend(_ids)
reference_descriptors.extend(_descriptors)
reference_timestamps.extend(_timestamps)

def _per_video_object_array(items):
    """Pack per-video entries into a 1-D object array, one element per video.

    Videos differ in frame count, so the per-video entries have different
    lengths. Letting NumPy convert such a ragged list implicitly triggers a
    deprecation warning on older releases and a ValueError on NumPy 1.24+,
    so the object array is built explicitly instead.
    """
    packed = np.empty(len(items), dtype=object)
    for idx, item in enumerate(items):
        # Scalar-index assignment stores the entry as-is, without broadcasting.
        packed[idx] = item
    return packed


# Save query descriptors to npz file.
# features/timestamps are object arrays, so read the archive back with
# np.load(path, allow_pickle=True).
np.savez(QUERY_OUT_PATH, video_ids=query_video_ids,
         features=_per_video_object_array(query_descriptors),
         timestamps=_per_video_object_array(query_timestamps))

# Save reference descriptors to npz file
np.savez(REFERENCE_OUT_PATH, video_ids=reference_video_ids,
         features=_per_video_object_array(reference_descriptors),
         timestamps=_per_video_object_array(reference_timestamps))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed query video 4/5
Processed query video 5/5
Processed reference video 1/6
Processed reference video 2/6
Processed reference video 3/6
Processed reference video 4/6
Processed reference video 5/6


  val = np.asanyarray(val)
