In [1]:
import glob
import time

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tqdm
%matplotlib inline

In [2]:
%%time

import cv2

def video_to_images(video_path, max_frames=int(1e9)):
    """Extract the middle and the last frame of a video.

    Parameters
    ----------
    video_path : str
        Path to the video file. The text after the last ``/`` with ``.mp4``
        removed is used as the video key.
    max_frames : int, optional
        Currently unused; kept so that existing callers passing it keep working.

    Returns
    -------
    tuple
        ``(key, total_frames, images)``. ``images`` is a list with the frames
        that could be read, in the order ``[middle, last]``; reading stops at
        the first failed read, so the list may hold 0, 1 or 2 arrays. Each
        frame has its last axis reordered with ``[2, 1, 0]`` (BGR -> RGB for
        OpenCV's default channel order).
        If the video cannot be opened, ``(key, 0, [])`` is returned so callers
        can always unpack three values.
    """
    key = video_path.split('/')[-1].replace('.mp4', '')

    # Load the video
    cap = cv2.VideoCapture(video_path)
    try:
        # Check if video opened successfully
        if not cap.isOpened():
            print("Error: Could not open video.")
            return key, 0, []

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Frame indices for the middle and the last frame
        frames_to_capture = [total_frames // 2, total_frames - 1]

        images = []
        for frame_index in frames_to_capture:
            # Seek to the requested frame, then decode it
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = cap.read()
            if not ret:
                # Stop at the first unreadable frame so `images` stays a prefix
                # of [middle, last]
                break
            # Reverse the channel axis (fancy indexing returns a copy)
            images.append(frame[:, :, [2, 1, 0]])
    finally:
        # Always release the capture handle, including on early return or error
        cap.release()

    return (
        key,
        total_frames,
        images
    )

# Example usage: smoke-test the extractor on a single local file.
# NOTE(review): `video_path` used to exist only in a commented-out line, so this
# cell raised NameError on a fresh kernel (Restart & Run All). Point it at any
# video that is available locally.
video_path = '../data/videos/5cff3a6f42908c8ef37a2eb205eb.mp4'

# `images` holds the full return value of video_to_images, not only the frames.
images = video_to_images(video_path)


CPU times: user 1.85 s, sys: 138 ms, total: 1.99 s
Wall time: 361 ms


In [3]:
# Load the metadata CSV, print its shape and preview the first two rows.
csv_path = "yappy_data_w_clip_embeds.csv"
data = pd.read_csv(csv_path)
print(data.shape)
data.head(2)

(107225, 4)


Unnamed: 0,link,description,file_key,downloaded
0,https://cdn-st.rutubelist.ru/media/e2/97/f9164...,,f9164f8a41479f961d64842154a7,True
1,https://cdn-st.rutubelist.ru/media/0f/48/8a1ff...,#diy #постановка #юмор #комедия,8a1ff7324073947a31e80f71d001,True


In [4]:
# Derive the local .mp4 path of every row from its file key.
data['vid_path'] = data['file_key'].map(
    lambda k: f"/home/letfd/lct2024_data/data/videos/{k}.mp4"
)

In [5]:
%%time

# Time the extractor on one arbitrary row (position 20) of the table.
probe_path = data.iloc[20]['vid_path']
tmp = video_to_images(probe_path)

CPU times: user 1.16 s, sys: 81.2 ms, total: 1.24 s
Wall time: 227 ms


In [6]:
# Parse the videos
# Store the frame (numpy array), file_key, frame_index, num_frames


In [7]:
from multiprocessing import Pool

In [8]:
# %%time

# with Pool(4) as p:
#     res = p.map(video_to_images, data['vid_path'].tolist()[:20])

In [9]:
%%time

# Sequential baseline: run the extractor over the first 20 video paths.
first_paths = data['vid_path'].tolist()[:20]
res2 = list(map(video_to_images, first_paths))

CPU times: user 35.4 s, sys: 2.32 s, total: 37.7 s
Wall time: 6.45 s


### get image embeddings:
 

In [10]:
# Alternative for quick experiments: a reproducible 5,000-row subset.
# sample = data.sample(5_000, random_state=1)
# sample.head(2)

# Work on an independent (deep) copy of the full table and preview it.
sample = data.copy(deep=True)
sample.head(2)

Unnamed: 0,link,description,file_key,downloaded,vid_path
0,https://cdn-st.rutubelist.ru/media/e2/97/f9164...,,f9164f8a41479f961d64842154a7,True,/home/letfd/lct2024_data/data/videos/f9164f8a4...
1,https://cdn-st.rutubelist.ru/media/0f/48/8a1ff...,#diy #постановка #юмор #комедия,8a1ff7324073947a31e80f71d001,True,/home/letfd/lct2024_data/data/videos/8a1ff7324...


In [11]:
def gen_batch(arr, batch_size=1024):
    """Lazily yield consecutive slices of ``arr`` of at most ``batch_size`` items.

    ``arr`` must support ``len()`` and slicing; the final slice may be shorter
    than ``batch_size``. Nothing is yielded for an empty ``arr``.
    """
    starts = range(0, len(arr), batch_size)
    yield from (arr[start:start + batch_size] for start in starts)


def get_single_image_embedding(my_image, processor, model, device):
    """Embed one image or a batch of images and return the result as numpy.

    Parameters
    ----------
    my_image
        Image(s) handed to ``processor`` through its ``images`` argument
        (presumably a single frame or a list of frames -- whatever the
        processor accepts).
    processor
        Callable returning a mapping with a ``"pixel_values"`` tensor when
        called with ``return_tensors="pt"``.
    model
        Object exposing ``get_image_features(pixel_values)``.
    device
        Device the pixel tensor is moved to before the forward pass; it should
        match the device the model lives on.

    Returns
    -------
    numpy.ndarray
        Output of ``model.get_image_features`` copied to host memory.
    """
    import torch  # local import: the notebook does not import torch at the top

    image = processor(
      text = None,
      images = my_image,
      return_tensors="pt"
      )["pixel_values"].to(device)

    # Pure inference: disable autograd so no graph / activations are retained
    # on the device between batches.
    with torch.no_grad():
        embedding = model.get_image_features(image)

    # convert the embeddings to numpy array
    return embedding.detach().cpu().numpy()


In [12]:
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and its matching preprocessor from the same checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)


2024-06-23 21:33:53.107298: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
%%time

# Move the model to the GPU (``.to`` returns the same module) and show where it lives.
model = model.to('cuda')
model.device

CPU times: user 1.8 s, sys: 1.02 s, total: 2.83 s
Wall time: 2.84 s


device(type='cuda', index=0)

In [None]:
chunk_size = 256  # number of videos per saved chunk (1024 is the noted alternative)
model_batch_size = 128  # number of frames per model forward pass
start_batch_idx = 5  # chunks with a lower index are skipped (restart after exception)
# Order in which video_to_images returns frames; it stops at the first failed
# read, so the frames it returns are always a prefix of this list.
frame_positions = ['mid', 'last']

all_files = sample.vid_path.tolist()[:55_000]
all_files_batches = gen_batch(all_files, batch_size=chunk_size)
# Ceil division so tqdm can show progress and an ETA
n_chunks = (len(all_files) + chunk_size - 1) // chunk_size

t0 = time.time()
for idx, videos_batch in tqdm.tqdm(enumerate(all_files_batches), total=n_chunks):

    ## restart after exception
    if idx < start_batch_idx:
        continue

    frames = list()
    frames_meta = list()

    for fpath in videos_batch:
        result = video_to_images(fpath, max_frames=5_000)
        if not result:
            # video_to_images may return an empty value when the file cannot
            # be opened; skip it instead of failing on tuple unpacking.
            print(f"video could not be opened, skipping: {fpath}")
            continue
        vid_key, vid_num_frames, vid_frames = result

        # assert len(vid_frames) == 2
        if len(vid_frames) != 2:
            print(f"video returned not 2 frames: {fpath}; returned {len(vid_frames)} frames; total frames in video: {vid_num_frames}")
        frames.extend(vid_frames)
        # Exactly one metadata row per returned frame, so row i of the saved
        # CSV always describes row i of the saved embedding matrix.
        frames_meta.extend(
            [vid_key, vid_num_frames, frame_pos]
            for frame_pos, _ in zip(frame_positions, vid_frames)
        )

    if not frames:
        # np.concatenate would raise on an empty list; nothing to save here.
        print(f"no frames decoded in batch {idx}, nothing saved")
        continue

    embeds_arr = list()
    for batch in gen_batch(frames, batch_size=model_batch_size):
        # Send inputs to whatever device the model currently lives on
        embed = get_single_image_embedding(batch, processor, model, model.device)
        embeds_arr.append(embed)

    embed = np.concatenate(embeds_arr, axis=0)
    np.save(f"/home/letfd/lct2024_data/data/clip_numpy_embeds_ext/batch_{idx}.npy", embed)
    pd.DataFrame(frames_meta, columns=['file_key', 'num_frames', 'frame_pos']).to_csv(
        f"/home/letfd/lct2024_data/data/clip_numpy_embeds_ext/batch_meta_{idx}.csv"
    )
    
    print(f"finished processing batch {idx}, total time elapsed: {((time.time() - t0) / 60):.3f} minutes")


0it [00:00, ?it/s]

video returned not 2 frames: /home/letfd/lct2024_data/data/videos/8dc4f0d74f1e88263ac240b39f85.mp4; returned 1 frames; total frames in video: 1
video returned not 2 frames: /home/letfd/lct2024_data/data/videos/328039b546039fffaf0d53b66ae9.mp4; returned 1 frames; total frames in video: 1


6it [02:10, 21.71s/it]

finished processing batch 5, total time elapsed: 2.171 minutes
