In [1]:
import glob
import os
import time

import numpy as np
import pandas as pd
import torch
import tqdm
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
import cv2

def video_to_images(video_path, max_frames=1):
    """Decode the leading frames of a video into RGB arrays.

    Parameters
    ----------
    video_path : str
        Path handed to ``cv2.VideoCapture``.
    max_frames : int or None, default 1
        Upper bound on the number of frames to decode. ``None`` reads until
        the stream ends; a value <= 0 yields an empty list.

    Returns
    -------
    list of numpy.ndarray
        Decoded frames in stream order with the channel axis reversed
        (OpenCV decodes to BGR, so the result is RGB). Empty when the video
        cannot be opened or no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Name the file: with many videos an anonymous error is not actionable.
            print(f"Error: Could not open video: {video_path}")
            return []

        images = []
        while max_frames is None or len(images) < max_frames:
            ret, frame = cap.read()
            if not ret:
                # End of stream or decode failure: keep what was read so far.
                break
            # Fancy indexing copies, so the frame does not alias OpenCV's buffer.
            images.append(frame[:, :, [2, 1, 0]])
        return images
    finally:
        # Always free the decoder handle, including on early return or error.
        cap.release()

# Smoke test: decode only the first frame of one known video
video_path = '../data/videos/5cff3a6f42908c8ef37a2eb205eb.mp4'
images = video_to_images(video_path, max_frames=1)
print(f"Number of frames extracted: {len(images)}")


Number of frames extracted: 1


In [3]:
%%time

# Match on the ".mp4" extension itself, not on any file name that merely ends in "mp4".
mp4_files = glob.glob("../data/videos/*.mp4")

CPU times: user 279 ms, sys: 48.6 ms, total: 327 ms
Wall time: 325 ms


In [4]:
# Bare file keys of the downloaded videos: drop the directory and the extension.
# os.path handles the platform's separator and only strips the trailing suffix.
mp4_names = {os.path.splitext(os.path.basename(f))[0] for f in mp4_files}
list(mp4_names)[:3]

['5d0913024de8a68fe70750381a6a',
 '6b1ecd2d4b51a182e897aeb8f426',
 'c6bc6bed49a3975eccca5b256822']

In [5]:
data = pd.read_csv("../data/yappy_hackaton_2024_400k.csv")
# The file key is the second-to-last "/"-separated segment of the link.
# The vectorised .str accessor yields NaN for a missing link instead of raising.
data['file_key'] = data['link'].str.split('/').str[-2]
# Flag rows whose video file is present among the downloaded mp4 names.
data['downloaded'] = data['file_key'].isin(mp4_names)
print(data.shape)
data.head(2)

(400000, 4)


Unnamed: 0,link,description,file_key,downloaded
0,https://cdn-st.rutubelist.ru/media/b0/e9/ef285...,"#нарезкистримов , #dota2 , #cs2 , #fifa23 , #m...",ef285e0241139fc611318ed33071,False
1,https://cdn-st.rutubelist.ru/media/39/6c/b31bc...,🤫НЕ ВВОДИ ЭТУ КОМАНДУ В РОБЛОКС ! #shorts #rob...,b31bc6864bef9d8a96814f1822ca,False


In [6]:
# Keep only the rows flagged as downloaded and renumber the index from 0
data = data[data["downloaded"]].reset_index(drop=True)

In [7]:
# Display the (rows, columns) of `data` as it stands at this point in the notebook
data.shape

(107225, 4)

### Get image embeddings: embed the first frame of each downloaded video with CLIP
 

In [8]:
# Number of videos to embed. None processes every downloaded video; set an int
# (e.g. 5_000) for a quick, reproducible dry run on a random subset.
N_SAMPLES = None

sample = data.copy() if N_SAMPLES is None else data.sample(N_SAMPLES, random_state=1)
sample.head(2)

Unnamed: 0,link,description,file_key,downloaded
0,https://cdn-st.rutubelist.ru/media/e2/97/f9164...,,f9164f8a41479f961d64842154a7,True
1,https://cdn-st.rutubelist.ru/media/0f/48/8a1ff...,#diy #постановка #юмор #комедия,8a1ff7324073947a31e80f71d001,True


In [9]:
def gen_batch(arr, batch_size=1024):
    """Lazily yield consecutive slices of ``arr``, each at most ``batch_size`` long.

    ``arr`` must support ``len()`` and slicing; the final slice may be shorter.
    """
    starts = range(0, len(arr), batch_size)
    yield from (arr[lo:lo + batch_size] for lo in starts)


def get_single_image_embedding(my_image, processor, model, device):
    """Embed an image (or a list of images) and return the result as NumPy.

    Parameters
    ----------
    my_image
        Image or list of images, forwarded to ``processor`` as ``images``.
    processor
        Callable preprocessor; its result must provide ``"pixel_values"``.
    model
        Object exposing ``get_image_features``.
    device
        Device the pixel tensor is moved to before the forward pass; it
        should be the device the model lives on.

    Returns
    -------
    numpy.ndarray
        Output of ``model.get_image_features`` copied to host memory.
    """
    pixel_values = processor(
        text=None,
        images=my_image,
        return_tensors="pt",
    )["pixel_values"].to(device)

    # Inference only: skipping autograd avoids building a graph nobody uses,
    # which lowers peak memory and speeds up the forward pass.
    with torch.no_grad():
        embedding = model.get_image_features(pixel_values)

    return embedding.detach().cpu().numpy()


In [10]:
from transformers import CLIPProcessor, CLIPModel

# Single source of truth for the checkpoint, so the model and its
# preprocessor cannot drift apart when the checkpoint is changed.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)


2024-06-09 14:46:35.253573: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
%%time

# Prefer the GPU, but fall back to CPU so the notebook still runs on
# machines without CUDA instead of failing at this cell.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.device

CPU times: user 2.13 s, sys: 1.06 s, total: 3.19 s
Wall time: 3.19 s


device(type='cuda', index=0)

In [None]:
chunk_size = 1024        # videos per saved .npy file
model_batch_size = 128   # frames per forward pass

# Same location the original run wrote to, so existing readers of
# batch_{idx}.npy keep working; created up front to fail fast if it is unusable.
embeds_dir = "/home/letfd/lct2024_data/data/clip_numpy_embeds"
os.makedirs(embeds_dir, exist_ok=True)

all_files = sample.file_key.tolist()
all_files_batches = gen_batch(all_files, batch_size=chunk_size)
# Ceil division: gives tqdm a total so it can show progress and an ETA.
n_chunks = (len(all_files) + chunk_size - 1) // chunk_size
# Feed inputs to whatever device the model actually lives on.
model_device = model.device
failed_keys = []

t0 = time.time()
for idx, videos_batch in enumerate(tqdm.tqdm(all_files_batches, total=n_chunks)):

    # Decode the first frame of every video, remembering which rows succeeded
    # so one unreadable file cannot abort the whole run.
    frames = []
    ok_rows = []
    for row, file_key in enumerate(videos_batch):
        path = f"../data/videos/{file_key}.mp4"
        extracted = video_to_images(path, max_frames=1)
        if not extracted:
            failed_keys.append(file_key)
            continue
        frames.append(extracted[0])
        ok_rows.append(row)

    if not frames:
        # A whole chunk failing points to a systemic problem (wrong path, codec).
        raise RuntimeError(
            f"no frame could be decoded for any video in batch {idx}; "
            "check the video directory and OpenCV installation"
        )

    embeds_arr = [
        get_single_image_embedding(batch, processor, model, model_device)
        for batch in gen_batch(frames, batch_size=model_batch_size)
    ]
    ok_embeds = np.concatenate(embeds_arr, axis=0)

    # Row i of the saved array always corresponds to videos_batch[i]; videos
    # that could not be decoded get an all-NaN row instead of shifting the rest.
    embed = np.full((len(videos_batch), ok_embeds.shape[1]), np.nan, dtype=ok_embeds.dtype)
    embed[ok_rows] = ok_embeds
    np.save(os.path.join(embeds_dir, f"batch_{idx}.npy"), embed)

    print(f"finished processing batch {idx}, total time elapsed: {((time.time() - t0) / 60):.3f} minutes")

if failed_keys:
    print(f"{len(failed_keys)} videos could not be decoded (NaN rows written), e.g. {failed_keys[:5]}")

1it [01:23, 83.09s/it]

finished processing batch 0, total time elapsed: 1.385 minutes


2it [03:21, 103.80s/it]

finished processing batch 1, total time elapsed: 3.357 minutes


3it [05:17, 109.54s/it]

finished processing batch 2, total time elapsed: 5.296 minutes


4it [07:17, 113.43s/it]

finished processing batch 3, total time elapsed: 7.286 minutes


5it [09:22, 117.73s/it]

finished processing batch 4, total time elapsed: 9.375 minutes
