In [299]:
import os
import math
import scann
import shutil
import vertexai
import numpy as np
import pandas as pd
from PIL import Image as img
from moviepy.editor import VideoFileClip
from google.cloud import storage, aiplatform
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from vertexai.vision_models import Image, MultiModalEmbeddingModel, Video, VideoSegmentConfig

In [8]:
project_id = "vtxdemos"
bucket_id = "vtxdemos-mlb"

In [9]:
vertexai.init(project=project_id, location="us-central1")
model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
bucket = storage.Client().bucket(bucket_id)

In [30]:
print(output_filepath)

tmp/part-0.mp4


In [114]:
def processing(required_video_file):
    output_dir_list = []
    output_dir = "../tmp"
    segment_length = 118

    clip = VideoFileClip(required_video_file)
    duration = clip.duration

    start_time = 0
    end_time = segment_length
    i = 1

    basename = os.path.basename(required_video_file).split('.')[0]
    dir_path = os.path.dirname(required_video_file)
    output_path = os.path.join(dir_path, output_dir)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    while start_time < duration:
        output = os.path.join(output_path, f"{basename}_part{i}.mp4")
        print(output)
        ffmpeg_extract_subclip(required_video_file, start_time, min(end_time, duration), targetname=output)
        start_time = end_time
        end_time += segment_length
        output_dir_list.append(f"tmp/{basename}_part{i}.mp4")
        i += 1
    print(f'Video split into {i-1} parts.')
    return output_dir_list

In [72]:
output_dir_list

['tmp/alex_rodriguez_part2.mp4',
 'tmp/alex_rodriguez_part3.mp4',
 'tmp/alex_rodriguez_part4.mp4']

In [78]:
embeddings = model.get_embeddings(
    video=video,
    video_segment_config=VideoSegmentConfig(interval_sec = 5),
)

In [146]:
# thumbnail
def thumbnail(file, timeframe):
    thumbnail_array = VideoFileClip(file).get_frame(timeframe)
    thumbnail_image = img.fromarray(thumbnail_array)
    thumbnail_filename = file.split('/')[-1]
    tn_name = f"{thumbnail_filename.split('.')[0]}_{timeframe}.png"
    thumbnail_image.save(f"thumbnails/{tn_name}")
    return tn_name

In [83]:
thumbnails_dir = "thumbnails"
if os.path.exists(thumbnails_dir):
    shutil.rmtree(thumbnails_dir)
os.makedirs(thumbnails_dir)

In [92]:
output_dir_list

['tmp/alex_rodriguez_part1_part1.mp4', 'tmp/alex_rodriguez_part1_part2.mp4']

In [95]:
preproces_list

['tmp/alex_rodriguez_part1_part1.mp4', 'tmp/alex_rodriguez_part1_part2.mp4']

In [106]:
preproces_list

['tmp/babe_ruth_part1.mp4',
 'tmp/babe_ruth_part2.mp4',
 'tmp/babe_ruth_part3.mp4',
 'tmp/babe_ruth_part4.mp4',
 'tmp/babe_ruth_part5.mp4',
 'tmp/babe_ruth_part6.mp4']

In [119]:
video = Video.load_from_file('tmp/babe_ruth_part1.mp4')
video = Video(gcs_uri="gs://vtxdemos-mlb/babe_ruth.mp4")
embeddings = model.get_embeddings(
    video=video,
    video_segment_config=VideoSegmentConfig(interval_sec = 5),
)

In [143]:
f"thumbnails/{tn_name}"

'thumbnails/babe_ruth_part1.png'

In [145]:
bucket.blob(f"thumbnails/{tn_name}").upload_from_filename(f"thumbnails/{tn_name}")

In [158]:
title_list = []
start_offset_sec_list = []
end_offset_sec_list = []
video_embeddings_list = []
thumbnails_gcs_list = []
video_gcs_list = []
link_prefix = "https://storage.googleapis.com/vtxdemos-mlb/"
output_dir_list = processing("videos/alex_rodriguez.mp4")
videos_dir = "videos/"

for vid in os.listdir(videos_dir):
    vid = videos_dir+vid
    preproces_list = processing(vid)
    for split_vid in preproces_list:
        print(f"Uploading Video {blob_name}")
        blob_name = split_vid.split("/")[-1]
        bucket.blob("video_part/"+blob_name).upload_from_filename(split_vid)
        gcs_blob = f"gs://{bucket_id}/video_part/{blob_name}"
        video = Video(gcs_uri=gcs_blob)
        print("Creating Embeddings")
        embeddings = model.get_embeddings(
            video=video,
            video_segment_config=VideoSegmentConfig(interval_sec = 5),
        )
        for emb in embeddings.video_embeddings:
            print("Creating Thumbnails")
            tn_name = thumbnail(split_vid, emb.start_offset_sec)
            bucket.blob("thumbnails/"+tn_name).upload_from_filename(f"thumbnails/{tn_name}")
            title_list.append(blob_name)
            start_offset_sec_list.append(emb.start_offset_sec)
            end_offset_sec_list.append(emb.end_offset_sec)
            video_embeddings_list.append(emb.embedding)
            thumbnails_gcs_list.append(f"{link_prefix}thumbnails/{tn_name}")
            video_gcs_list.append(f"{link_prefix}video_part/{blob_name}")
            print("done!")

videos/../tmp/alex_rodriguez_part1.mp4
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
videos/../tmp/alex_rodriguez_part2.mp4
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
videos/../tmp/alex_rodriguez_part3.mp4
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Video split into 3 parts.
videos/../tmp/babe_ruth_part1.mp4
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
videos/../tmp/babe_ruth_part2.mp4
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
videos/../tmp/babe_ruth_part3.mp4
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
videos/../tmp/babe_ruth_part4.mp4
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
videos/../tmp/babe_ruth_part5.mp4
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
videos/../tmp/babe_ruth_part6.mp4
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
Video split into 6 parts.
Upload

OSError: MoviePy error: failed to read the duration of file videos/.ipynb_checkpoints.
Here are the file infos returned by ffmpeg:

ffmpeg version 4.2.2-static https://johnvansickle.com/ffmpeg/  Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 8 (Debian 8.3.0-6)
  configuration: --enable-gpl --enable-version3 --enable-static --disable-debug --disable-ffplay --disable-indev=sndio --disable-outdev=sndio --cc=gcc --enable-fontconfig --enable-frei0r --enable-gnutls --enable-gmp --enable-libgme --enable-gray --enable-libaom --enable-libfribidi --enable-libass --enable-libvmaf --enable-libfreetype --enable-libmp3lame --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-librubberband --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libvorbis --enable-libopus --enable-libtheora --enable-libvidstab --enable-libvo-amrwbenc --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libdav1d --enable-libxvid --enable-libzvbi --enable-libzimg
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  5.100 / 55.  5.100
videos/.ipynb_checkpoints: Is a directory


In [160]:
len(video_embeddings_list)

896

In [161]:
data = {
    "title": title_list, 
    "start_offset": start_offset_sec_list,
    "end_offset": end_offset_sec_list,
    "thumbnails_gcs": thumbnails_gcs_list,
    "video_gcs": video_gcs_list,
    "embedding": video_embeddings_list,
}
df = pd.DataFrame(data)

In [162]:
df.to_pickle("df_back.pkl")

In [163]:
df.columns

Index(['title', 'start_offset', 'end_offset', 'thumbnails_gcs', 'video_gcs',
       'embedding'],
      dtype='object')

In [251]:
img = np.array([r["embedding"] for i, r in df.iterrows()])
k = int(np.sqrt(df.shape[0]))

if int(k/20) < 1:
    leave_search = 1
else:
    leave_search = int(k/20)


searcher = scann.scann_ops_pybind.builder(img, num_neighbors=15, distance_measure="dot_product").tree(
    num_leaves=k, num_leaves_to_search=leave_search, training_sample_size=df.shape[0]).score_brute_force(
    2).reorder(10).build()

2024-06-06 02:55:21.832240: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 896
2024-06-06 02:55:21.873206: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 40.880025ms.


In [288]:
query = "game between puerto rico and mexico"

embeddings = model.get_embeddings(
    contextual_text=query,
).text_embedding

In [289]:
neighbors, distances = searcher.search(embeddings,final_num_neighbors=10)

In [290]:
new_df = df.iloc[neighbors,:]

In [291]:
new_df["video_gcs"].iloc[1]

'https://storage.googleapis.com/vtxdemos-mlb/video_part/babe_ruth_part5.mp4'

In [292]:
new_df["thumbnails_gcs"].iloc[1]

'https://storage.googleapis.com/vtxdemos-mlb/thumbnails/babe_ruth_part5_115.0.png'

In [293]:
new_df["start_offset"].iloc[1]

115.0

In [275]:
new_df["end_offset"].iloc[1]

25.0

## Embeddings

In [298]:
def preprocess(df, f_name):
    #df['id'] = df.index
    df.loc[:, 'id'] = df.index
    data = df.to_json(orient='records', lines=True)
    
    with open('data.json', 'w') as f:
        f.write(data)
        
    storage.Client().bucket("vtxdemos-vsearch-datasets").blob(f_name).upload_from_filename("data.json")
    return data
data_1 = preprocess(df, "mlb/data.json")

In [300]:
abnb_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name = f"vs-mlb-index-v1",
    contents_delta_uri = "gs://vtxdemos-vsearch-datasets/mlb",
    dimensions = len(df["embedding"].iloc[0]),
    approximate_neighbors_count = 15,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/254356041555/locations/us-central1/indexes/8294632157058433024/operations/4882665366376939520
MatchingEngineIndex created. Resource name: projects/254356041555/locations/us-central1/indexes/8294632157058433024
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/254356041555/locations/us-central1/indexes/8294632157058433024')


In [301]:
mlb_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name = f"vs-mlb-index-endpoint-v1",
    public_endpoint_enabled = True
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/254356041555/locations/us-central1/indexEndpoints/3580278140875833344/operations/2575133507302981632
MatchingEngineIndexEndpoint created. Resource name: projects/254356041555/locations/us-central1/indexEndpoints/3580278140875833344
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/254356041555/locations/us-central1/indexEndpoints/3580278140875833344')


In [302]:
mlb_index_endpoint.deploy_index(
    index = abnb_index, deployed_index_id = "vs_mlb_deployed_v1"
)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/254356041555/locations/us-central1/indexEndpoints/3580278140875833344
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/254356041555/locations/us-central1/indexEndpoints/3580278140875833344/operations/597490320933912576
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/254356041555/locations/us-central1/indexEndpoints/3580278140875833344


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7fe84b7537f0> 
resource name: projects/254356041555/locations/us-central1/indexEndpoints/3580278140875833344

In [303]:
df.columns

Index(['title', 'start_offset', 'end_offset', 'thumbnails_gcs', 'video_gcs',
       'embedding', 'id'],
      dtype='object')