In [7]:
import cv2
import os

import os
from PIL import Image
import torch
from torchvision import transforms
import torch.nn.functional as F

In [8]:
from transformers import AutoFeatureExtractor, AutoModel

model_ckpt = "nateraw/vit-base-beans"
extractor = AutoFeatureExtractor.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

Some weights of the model checkpoint at nateraw/vit-base-beans were not used when initializing ViTModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
def FrameCapture(path):
    input_filename = path.split("/")[-1].split(".")[0]
    # Create a new folder for frames if it doesn't exist
    folder_path = f"/kaggle/working/frames/{input_filename}"
    os.makedirs(folder_path, exist_ok=True)

    # Path to video file
    vidObj = cv2.VideoCapture(path)

    # Used as counter variable
    count = 0

    # Number of frames to select
    num_frames = 10

    # Frame selection interval
    frame_interval = int(vidObj.get(cv2.CAP_PROP_FRAME_COUNT) / num_frames)

    # checks whether frames were extracted
    success = 1
    frame_count = 0

    while success and count < num_frames:
        # vidObj object calls read
        # function to extract frames
        success, image = vidObj.read()

        # Save the frames with frame-count as filename
        filename = "frame%d.jpg" % count
        file_path = os.path.join(folder_path, filename)
        cv2.imwrite(file_path, image)

        count += 1
        frame_count += 1

        # Skip frames to select 10 evenly spaced frames
        vidObj.set(cv2.CAP_PROP_POS_FRAMES, frame_count * frame_interval)

    vidObj.release()

In [10]:
FrameCapture("/kaggle/input/telegram-videos/IMG_2400.MOV")

In [11]:
ls frames/IMG_2400

frame0.jpg  frame2.jpg  frame4.jpg  frame6.jpg  frame8.jpg
frame1.jpg  frame3.jpg  frame5.jpg  frame7.jpg  frame9.jpg


In [12]:
FrameCapture("/kaggle/input/telegram-videos/document_2023-05-20_12-21-33.mp4")

In [13]:
ls frames/document_2023-05-20_12-21-33

frame0.jpg  frame2.jpg  frame4.jpg  frame6.jpg  frame8.jpg
frame1.jpg  frame3.jpg  frame5.jpg  frame7.jpg  frame9.jpg


In [14]:
def load_images_from_folder(folder_path, batch_size=10):
    image_list = []
    for filename in os.listdir(folder_path):
        img_path = os.path.join(folder_path, filename)
        if os.path.isfile(img_path):
            image = Image.open(img_path).convert("RGB")
            image_list.append(image)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    images = torch.stack([transform(image) for image in image_list])

    num_images = len(images)
    num_batches = num_images // batch_size

    # Truncate images to ensure a complete batch
    images = images[:num_batches * batch_size]

    # Reshape images to match batch dimensions
    images = images.view(num_batches, batch_size, 3, 224, 224)

    return images

def extract_embeddings(batch):
    with torch.no_grad():
        embeddings = model(batch).last_hidden_state[:, 0].cpu()
    return embeddings

In [15]:
batch_1 = load_images_from_folder("frames/IMG_2400")
batch_2 = load_images_from_folder("frames/document_2023-05-20_12-21-33")

In [18]:
embeddings_1 = extract_embeddings(batch_1[0])
embeddings_2 = extract_embeddings(batch_2[0])

In [19]:
def compute_scores(emb_one, emb_two):
    """Computes cosine similarity between two vectors."""
    scores = torch.nn.functional.cosine_similarity(emb_one, emb_two, dim=1)
    return scores.numpy().tolist()

def fetch_similar(image, top_k=5):
    """Fetches the `top_k` similar images with `image` as the query."""
    # Prepare the input query image for embedding computation.
    image_transformed = transformation_chain(image).unsqueeze(0)
    new_batch = {"pixel_values": image_transformed.to(device)}

    # Comute the embedding.
    with torch.no_grad():
        query_embeddings = model(**new_batch).last_hidden_state[:, 0].cpu()

    # Compute similarity scores with all the candidate images at one go.
    # We also create a mapping between the candidate image identifiers
    # and their similarity scores with the query image.
    sim_scores = compute_scores(all_candidate_embeddings, query_embeddings)
    similarity_mapping = dict(zip(candidate_ids, sim_scores))
 
    # Sort the mapping dictionary and return `top_k` candidates.
    similarity_mapping_sorted = dict(
        sorted(similarity_mapping.items(), key=lambda x: x[1], reverse=True)
    )
    id_entries = list(similarity_mapping_sorted.keys())[:top_k]

    ids = list(map(lambda x: int(x.split("_")[0]), id_entries))
    labels = list(map(lambda x: int(x.split("_")[-1]), id_entries))
    return ids, labels

In [20]:
video1_features_norm = F.normalize(embeddings_1, p=2, dim=1)
video2_features_norm = F.normalize(embeddings_2, p=2, dim=1)

In [21]:
video1_features_reshaped = video1_features_norm.view(1, -1)
video2_features_reshaped = video2_features_norm.view(1, -1)

In [22]:
cosine_similarity = torch.mm(video1_features_reshaped, video2_features_reshaped.t())

In [23]:
cosine_similarity

tensor([[1.5904]])