## Video Embedding Pipeline

1. Save copy in Drive and [Download the supporting files](https://drive.google.com/drive/folders/1KnyDEHNdNot9-CMFLYfWhPPwmsSAR2ye?usp=sharing)
2. Load the supporting files - config.yaml, requirements.txt, step1.txt
3. Run cell 1 to create file directories
4. Move config.yaml to local or cloud directory
5. Follow the instructions above each cell below
6. Outputs should populate into the created file directories
---
Bug handling:
*   Error: `/usr/local/lib/python3.10/dist-packages/pyarrow/_dataset_parquet.pyx in pyarrow._dataset_parquet.ParquetFragmentScanOptions.__init__()`

What to do: Reset the Colab notebook and rerun from step 5. This occurs due to a different version of pyarrow between video2dataset and clip_video_encode. The solution is to rerun the `generate_config` cell after running `main()` and before running `clip_video_encode`.





### 1. Run the following cell to download youtube videos and create keyframes

In [None]:
import os
import pandas as pd
import json
import glob
import configparser
import subprocess
!pip install "pandas>=1.1.5,<2"

def create_directories(config):
    """Create every directory named in ``config``.

    ``config`` maps labels to path strings. Values ending in ``.parquet`` or
    ``.yaml`` are treated as file paths and skipped; every other value is
    created as a directory (parents included), and existing directories are
    left untouched.
    """
    file_suffixes = ('.parquet', '.yaml')
    for target in config.values():
        if target.endswith(file_suffixes):
            # A file location, not a folder: nothing to create.
            continue
        os.makedirs(target, exist_ok=True)

def generate_config(base_directory):
    """Return the table of pipeline paths rooted at ``base_directory``.

    The result maps each label to a path string: ``"directory"`` is the base
    itself, four entries are sub-directories named after their label, and
    ``"keyframe_parquet"`` / ``"config_yaml"`` are file paths directly under
    the base. Only strings are built; nothing is created on disk.
    """
    subdirectory_labels = (
        "original_videos",
        "keyframe_videos",
        "embedding_output",
        "keyframe_embedding_output",
    )
    paths = {"directory": base_directory}
    for label in subdirectory_labels:
        paths[label] = f"{base_directory}/{label}"
    paths["keyframe_parquet"] = f"{base_directory}/keyframe_video_requirements.parquet"
    paths["config_yaml"] = f"{base_directory}/config.yaml"
    return paths
# One path table per environment. generate_config only formats path strings,
# so building both tables touches nothing on disk.
config = {
        "local": generate_config("./pipeline_datasets"),
        "cloud": generate_config("/content/drive/MyDrive/research/pipeline/datasets")
    }

# Set directories
# `mode` selects which entry of `config` the rest of the notebook works with.
mode = "cloud"
selected_config = config[mode]
# Create the folders named in the selected table; create_directories skips the
# entries that end in .parquet / .yaml because those are file paths.
create_directories(selected_config)



In [None]:
def main():
    """Run step 1 of the pipeline end to end.

    In order: install the requirement files, write the dataset requirement
    files, install video2dataset from GitHub, download every video with yt-dlp
    and video2dataset, cut a key-frame-only copy of each downloaded video with
    ffmpeg, and finally write the parquet manifests used by the next step.

    All output locations are read from the module-level ``selected_config``.
    """

    def install_requirements(requirements_file):
        """pip-install the packages listed in ``requirements_file``."""
        subprocess.run(["pip", "install", "-r", requirements_file])

    def install_package_from_github(repo_url):
        """Clone ``repo_url`` into the working directory and install it in editable mode."""
        subprocess.run(["git", "clone", repo_url])
        repo_name = repo_url.split("/")[-1]
        # Strip only a trailing ".git" (the directory name git creates), not
        # every ".git" substring inside the repository name.
        if repo_name.endswith(".git"):
            repo_name = repo_name[:-len(".git")]
        # Run pip inside the checkout through cwd= instead of an
        # os.chdir()/os.chdir("..") pair, so the notebook's working directory
        # can never be left pointing into the clone.
        subprocess.run(["pip", "install", "-e", "."], cwd=repo_name)

    def prepare_dataset_requirements(directory):
        """Write the list of videos to fetch as JSON and CSV under ``directory``."""
        dataset_requirements = {
            "data": [
                {"url": "www.youtube.com/watch?v=nXBoOam5xJs", "caption": "The Deadly Portuguese Man O' War"},
                {"url": "www.youtube.com/watch?v=-tvA3Ezqjl8", "caption": "Top 5 David Attenborough Moments"},
            ]}
        os.makedirs(directory, exist_ok=True)
        with open(f"{directory}/dataset_requirements.json", "w") as f:
            json.dump(dataset_requirements, f)
        df = pd.DataFrame(dataset_requirements['data'])
        df.to_csv(f"{directory}/dataset_requirements.csv", index=False)

    def prepare_clip_encode(directory, output):
        """Write the keyframe and original-video parquet manifests.

        Every ``.mp4`` found under ``selected_config['original_videos']`` is
        paired with the ``.json`` file of the same name next to it, from which
        the duration of the first stream is read.

        Args:
            directory: Folder that receives the two ``*_video_requirements.parquet`` files.
            output: Folder recorded as the location of the ``<id>_key_frames.mp4`` files.

        Raises:
            FileNotFoundError: If a video has no sibling ``.json`` metadata file.
        """
        # Initialize lists to store video metadata
        keyframe_video_locs = []
        original_video_locs = []

        # Iterate over video files to collect metadata
        video_files = glob.glob(f"{selected_config['original_videos']}/**/*[0-9]*.mp4", recursive=True)
        for video_file in video_files:
            video_id = os.path.basename(video_file).split('.')[0]

            # Load JSON metadata. Swap only the extension; a plain
            # str.replace would also rewrite ".mp4" inside a folder name.
            json_meta_path = os.path.splitext(video_file)[0] + '.json'
            with open(json_meta_path, 'r') as f:
                metadata = json.load(f)

            # Extract relevant metadata
            print(metadata)
            duration = metadata['video_metadata']['streams'][0]['duration']

            # Append to keyframe and original video lists
            keyframe_video_locs.append({
                "videoLoc": f"{output}/{video_id}_key_frames.mp4",
                "videoID": video_id,
                "duration": duration,
            })
            original_video_locs.append({
                "videoLoc": video_file,
                "videoID": video_id,
                "duration": duration,
            })

        # Create and save parquets for future pipeline steps
        keyframe_video_df = pd.DataFrame(keyframe_video_locs)
        original_video_df = pd.DataFrame(original_video_locs)
        keyframe_video_df.to_parquet(f'{directory}/keyframe_video_requirements.parquet', index=False)
        original_video_df.to_parquet(f'{directory}/original_video_requirements.parquet', index=False)

    # Download with yt-dlp as well, since the native codec saved by
    # video2dataset is incompatible with clip_video_encode - need to look into soon
    def download_video(video_url, num_threads=10, video_count=1):
        """Download ``video_url`` with yt-dlp into the original-videos folder.

        The file is saved as ``<video_count>.<ext>``. Returns True when yt-dlp
        exits with status 0, False otherwise (including when yt-dlp is not
        installed).
        """
        # An argument list (no shell) keeps URLs containing "&", "?" or spaces
        # intact and rules out shell injection through the CSV contents.
        command = [
            "yt-dlp",
            "-N", str(num_threads),
            "--format", "bestvideo[height<=360][ext=mp4]+bestaudio[ext=m4a]/best[height<=360][ext=mp4]",
            "--write-auto-sub",
            "--sub-lang", "en",
            "--output", f"{selected_config['original_videos']}/{video_count}.%(ext)s",
            str(video_url),
        ]
        try:
            return subprocess.run(command).returncode == 0
        except FileNotFoundError:
            # The shell-based version reported a missing binary as a failed
            # download; keep that contract instead of raising.
            return False

    def run_video2dataset_with_yt_dlp(directory, output):
        """Download each listed video and swap in the yt-dlp copy.

        For every row of ``<directory>/dataset_requirements.csv`` the video is
        fetched with yt-dlp, video2dataset is run on the whole CSV to produce
        the metadata files, and one video2dataset ``.mp4`` under ``output`` is
        replaced by the yt-dlp download.
        """
        os.makedirs(output, exist_ok=True)
        url_list = f'{directory}/dataset_requirements.csv'
        print(url_list)
        df = pd.read_csv(url_list)
        for idx, row in df.iterrows():
            # Step 2: Use yt-dlp to get .mp4 and video2dataset to get video metadata
            temp_name = f"temp_{idx+1}"
            temp_video_path = os.path.join(selected_config["original_videos"], f"{temp_name}.mp4")
            download_success = download_video(row['url'], num_threads=10, video_count=temp_name)
            print(row['url'])
            if not download_success:
                print(f"Failed to download video from {row['url']}")
                continue
            # Step 2: Run video2dataset
            command = [
                'video2dataset',
                '--url_list', url_list,
                '--output_folder', output,
                '--config', selected_config['config_yaml']]
            result = subprocess.run(command, capture_output=True, text=True)
            print("Return code:", result.returncode)
            print("STDOUT:", result.stdout)
            print("STDERR:", result.stderr)

            # Find the video2dataset .mp4 to overwrite. The yt-dlp temp files
            # can live in the same folder and match the same pattern, so they
            # are excluded; otherwise the fresh download could be picked as the
            # "video2dataset file" and deleted.
            video2dataset_file_paths = [
                path
                for path in glob.glob(f"{output}/**/*[0-9]*.mp4", recursive=True)
                if not os.path.basename(path).startswith("temp_")
            ]
            if not video2dataset_file_paths:
                print(f"No video2dataset .mp4 found under {output}; keeping {temp_video_path}")
                continue
            # NOTE(review): still assumes the last glob match belongs to this
            # row; glob order is not guaranteed - confirm the intended mapping.
            video2dataset_file_path = video2dataset_file_paths[-1]  # Assumes the last
            print(video2dataset_file_path)

            # Overwrite the video downloaded by video2dataset with the one downloaded by yt-dlp
            if os.path.exists(video2dataset_file_path):
                os.remove(video2dataset_file_path)
            if os.path.exists(temp_video_path):
                os.rename(temp_video_path, video2dataset_file_path)

    def segment_key_frames(input_file, output_file):
        """Copy only the key frames of ``input_file`` into ``output_file`` with ffmpeg.

        Returns True when ffmpeg exits with status 0, False otherwise
        (including when ffmpeg is not installed).
        """
        # Argument list instead of a shell string, so paths with spaces or
        # shell metacharacters are passed through unchanged.
        command = [
            "ffmpeg", "-y",
            "-loglevel", "error",
            "-discard", "nokey",
            "-i", input_file,
            "-c:s", "copy",
            "-c", "copy",
            "-copyts",
            output_file,
        ]
        try:
            process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except FileNotFoundError:
            return False
        if process.returncode != 0 and process.stderr:
            # Surface ffmpeg's own error text; the caller only knows it failed.
            print(process.stderr.decode(errors="replace"))
        return process.returncode == 0

    # Step 1
    print("Installing basic requirements and shared packages...")
    # Install common requirements between video2dataset and clip-video-encode
    install_requirements("./requirements.txt")
    install_requirements("./step1.txt")
    prepare_dataset_requirements(selected_config["directory"])
    install_package_from_github("https://github.com/iejMac/video2dataset.git")
    run_video2dataset_with_yt_dlp(selected_config["directory"], selected_config["original_videos"])
    # Segment key frames for downloaded videos
    video_files = glob.glob(f"{selected_config['original_videos']}/**/*[0-9]*.mp4", recursive=True)
    for video_file in video_files:
        video_id = os.path.basename(video_file).split('.')[0]
        input_file = video_file
        output_file = os.path.join(selected_config["keyframe_videos"], f"{video_id}_key_frames.mp4")
        print(f"Segmenting key frames for {video_id}...")
        if not segment_key_frames(input_file, output_file):
            print(f"Failed to segment key frames for {video_id}.")
    # Prepare CLIP encode
    prepare_clip_encode(selected_config["directory"], selected_config["keyframe_videos"])


if __name__ == "__main__":
    main()

Installing basic requirements and shared packages...
/content/drive/MyDrive/research/pipeline/datasets/dataset_requirements.csv
www.youtube.com/watch?v=nXBoOam5xJs
Return code: 0
STDOUT: Starting the downloading of this file
Sharding file number 1 of 1 called /content/drive/MyDrive/research/pipeline/datasets/dataset_requirements.csv

[download]   0.0% of   41.99MiB at  Unknown B/s ETA Unknown
[download]   0.0% of   41.99MiB at  Unknown B/s ETA Unknown
[download]   0.0% of   41.99MiB at    6.57MiB/s ETA 00:06  
[download]   0.0% of   41.99MiB at   10.72MiB/s ETA 00:03
[download]   0.1% of   41.99MiB at    9.26MiB/s ETA 00:04
[download]   0.1% of   41.99MiB at    8.54MiB/s ETA 00:04
[download]   0.3% of   41.99MiB at   12.16MiB/s ETA 00:03
[download]   0.6% of   41.99MiB at   18.01MiB/s ETA 00:02
[download]   1.2% of   41.99MiB at   27.68MiB/s ETA 00:01
[download]   2.4% of   41.99MiB at   45.36MiB/s ETA 00:00
[download]   0.0% of   11.31MiB at  Unknown B/s ETA Unknown
[download]   0.0% 


### 2. Run `generate_config` to validate directory paths then run clip-video-encode to get keyframe embeddings


In [None]:
def generate_config(base_directory):
    """Return the table of pipeline paths rooted at ``base_directory``.

    ``"directory"`` maps to the base itself; every other label maps to
    ``<base_directory>/<name>``, where the name is a sub-directory or, for
    ``"keyframe_parquet"`` and ``"config_yaml"``, a file. Only strings are
    built; nothing is created on disk.
    """
    relative_names = {
        "original_videos": "original_videos",
        "keyframe_videos": "keyframe_videos",
        "embedding_output": "embedding_output",
        "keyframe_embedding_output": "keyframe_embedding_output",
        "keyframe_parquet": "keyframe_video_requirements.parquet",
        "config_yaml": "config.yaml",
    }
    layout = {"directory": base_directory}
    layout.update(
        (label, f"{base_directory}/{name}") for label, name in relative_names.items()
    )
    return layout
# Rebuild the path tables in this cell so `selected_config` is defined here
# without depending on the variables of the first cell.
config = {
        "local": generate_config("./pipeline_datasets"),
        "cloud": generate_config("/content/drive/MyDrive/research/pipeline/datasets")
    }

# Set directories
# `mode` selects which entry of `config` is used below.
mode = "cloud"
selected_config = config[mode]
import pandas as pd
# Read the parquet file at the "keyframe_parquet" path. As the last expression
# of the cell, the resulting DataFrame is what the notebook displays.
pd.read_parquet(selected_config['keyframe_parquet'])

Unnamed: 0,videoLoc,videoID,duration
0,/content/drive/MyDrive/research/pipeline/datas...,1,1176.04
1,/content/drive/MyDrive/research/pipeline/datas...,0,332.64


In [None]:
!pip install clip-video-encode
from pandas.core.indexes.accessors import CombinedDatetimelikeProperties
from google.colab.patches import cv2_imshow
from clip_video_encode import clip_video_encode
import pandas as pd
# Now run clip_video_encode
# Positional arguments: the path stored under "keyframe_parquet", then the
# path stored under "keyframe_embedding_output".
# NOTE(review): the keyword arguments are interpreted by clip-video-encode, not
# by this notebook - presumably 25 frame-reading workers, every frame kept
# (take_every_nth=1), and these three parquet columns carried into the output
# metadata. Confirm against the library documentation.
clip_video_encode(
    selected_config["keyframe_parquet"],
    selected_config["keyframe_embedding_output"],
    frame_workers=25,
    take_every_nth=1,
    metadata_columns=['videoLoc', 'videoID', 'duration']
)


Collecting clip-video-encode
  Downloading clip_video_encode-1.3.0-py3-none-any.whl (17 kB)
Collecting torch<2,>=1.7.1 (from clip-video-encode)
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting webdataset<0.2,>=0.1.103 (from clip-video-encode)
  Downloading webdataset-0.1.103-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting open-clip-torch<3.0.0,>=2.0.0 (from clip-video-encode)
  Downloading open_clip_torch-2.22.0-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg (from clip-video-encode)
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting youtube-dl (from clip-

Downloading (…)ip_pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Reading 2 videos using 25 workers...
Worker #0 starting processing 0 videos
Worker #0 done processing 0 videos in 0.022959740000260354[s]

Worker #1 starting processing 0 videosWorker #1 done processing 0 videos in 0.023732984000162105[s]
Worker #2 starting processing 0 videos
Worker #2 done processing 0 videos in 0.03359342799967635[s]
Worker #3 starting processing 0 videos
Worker #3 done processing 0 videos in 0.02856344400015587[s]Worker #4 starting processing 0 videos

Worker #4 done processing 0 videos in 0.025552775000051042[s]
Worker #5 starting processing 0 videos
Worker #5 done processing 0 videos in 0.024817935000100988[s]
Worker #6 starting processing 0 videos
Worker #6 done processing 0 videos in 0.02725544200029617[s]
Worker #7 starting processing 0 videos
Worker #7 done processing 0 videos in 0.027718896999886056[s]
Worker #8 starting processing 0 videos
Worker #8 done processing 0 videos in 0.03375947599988649[s]
Worker #9 starting processing 0 videos
Worker #9 done proc



All jobs completed in 5.63357807400007[s].
