# BigQuery + Gemini 1.5 for Media

### Libraries

In [None]:
import os
from PIL import Image as img
from google.cloud import bigquery, storage
from moviepy.editor import VideoFileClip

In [None]:
uri = 'gs://vtxdemos-videos/*'

In [None]:
bigquery_client = bigquery.Client()

In [None]:
# Create a Connection to AI/ML APIs

# !bq mk --connection --location=us \
#     --connection_type=CLOUD_RESOURCE gemini_conn

### Create the Model in BQ

In [None]:
%%bigquery
CREATE MODEL `demos_us.gemini_1_5_pro`
REMOTE WITH CONNECTION `us.emb_connection`
OPTIONS(endpoint = 'gemini-1.5-pro');

Query is running:   0%|          |

## Connect Storage Objects (Videos) to BigQuery

In [None]:
%%bigquery
CREATE OR REPLACE EXTERNAL TABLE
  `demos_us.vtxdemos-videos-081624`
WITH CONNECTION `us.emb_connection`
OPTIONS (
  object_metadata = 'SIMPLE',
  uris = ['gs://vtxdemos-videos-embeddings/input_videos/*']
  );

Query is running:   0%|          |

In [None]:
%%bigquery
SELECT * FROM `demos_us.vtxdemos-videos-081624`

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uri,generation,content_type,size,md5_hash,updated,metadata
0,gs://vtxdemos-videos-embeddings/input_videos/a...,1723817696023018,video/mp4,37064786,34f9ce9675bef6dffbf07763c47d0432,2024-08-16 14:14:56.064000+00:00,[]
1,gs://vtxdemos-videos-embeddings/input_videos/a...,1723818594501894,video/mp4,102896992,78ba58d1c35a87fb06b57249f56444f2,2024-08-16 14:29:54.541000+00:00,[]
2,gs://vtxdemos-videos-embeddings/input_videos/b...,1723817703288472,video/mp4,97257306,4fbf28cac6a6bb2e95f71db086e42693,2024-08-16 14:15:03.331000+00:00,[]
3,gs://vtxdemos-videos-embeddings/input_videos/b...,1723818322880394,video/mp4,670756939,28bc311c9e9f013a71feab37f3abaace,2024-08-16 14:25:22.929000+00:00,[]
4,gs://vtxdemos-videos-embeddings/input_videos/c...,1723817817135710,video/mp4,37442739,92f4245a832c5dfd516608860d244513,2024-08-16 14:16:57.176000+00:00,[]
5,gs://vtxdemos-videos-embeddings/input_videos/c...,1723817705139244,video/mp4,65951489,b798e4bfe51b62f13bfa67ccb9edb648,2024-08-16 14:15:05.180000+00:00,[]
6,gs://vtxdemos-videos-embeddings/input_videos/c...,1723817703316069,video/mp4,132529463,b6579833d942e660b750b9713fa3b0e8,2024-08-16 14:15:03.353000+00:00,[]
7,gs://vtxdemos-videos-embeddings/input_videos/c...,1723817702820535,video/mp4,121743040,c91658dda2c5bd12d98a186df72a91fb,2024-08-16 14:15:02.864000+00:00,[]
8,gs://vtxdemos-videos-embeddings/input_videos/c...,1723816999462495,video/mp4,164249637,9846e0d70b6d6dee255c4e1a476311d8,2024-08-16 14:03:19.497000+00:00,[]
9,gs://vtxdemos-videos-embeddings/input_videos/c...,1723818707118795,video/mp4,126907482,02e9bba6acbeed3bc806854382887d3f,2024-08-16 14:31:47.165000+00:00,[]


# Gemini 1.5 to Analyze the Content

## Title

In [None]:
%%bigquery
CREATE OR REPLACE TABLE `vtxdemos.demo_us_outputs.videos_081624_title` AS (
   SELECT
    uri,
    ml_generate_text_llm_result
  FROM
    ML.GENERATE_TEXT(MODEL `demos_us.gemini_1_5_pro`,
      TABLE `demos_us.vtxdemos-videos-081624`,
      STRUCT(
        '''
        Instructions:
        Create a single title for this movie (no more than 4 words)

        Output:
        ''' AS prompt,
        0.2 AS temperature,
        TRUE AS flatten_json_output,
        8192 AS max_output_tokens,
        [
          STRUCT('HARM_CATEGORY_DANGEROUS_CONTENT' AS category, 'BLOCK_LOW_AND_ABOVE' AS threshold),
          STRUCT('HARM_CATEGORY_HATE_SPEECH' AS category, 'BLOCK_LOW_AND_ABOVE' AS threshold),
          STRUCT('HARM_CATEGORY_HARASSMENT' AS category, 'BLOCK_LOW_AND_ABOVE' AS threshold),
          STRUCT('HARM_CATEGORY_SEXUALLY_EXPLICIT' AS category, 'BLOCK_LOW_AND_ABOVE' AS threshold)
        ] AS safety_settings
      )
    )
)

Query is running:   0%|          |

In [None]:
%%bigquery title_df

SELECT uri, ml_generate_text_llm_result as title FROM `vtxdemos.demo_us_outputs.videos_081624_title`

Query is running:   0%|          |

Downloading:   0%|          |

## Video Summarization

In [None]:
%%bigquery
CREATE OR REPLACE TABLE
  `vtxdemos.demo_us_outputs.videos_081624_summary` AS (
 SELECT
    uri,
    ml_generate_text_llm_result
  FROM
    ML.GENERATE_TEXT(MODEL `demos_us.gemini_1_5_pro`,
      TABLE `demos_us.vtxdemos-videos-081624`,
      STRUCT(
        '''
        Instructions:
        Give me a detailed summary.

        Output:
        ''' AS prompt,
        0.2 AS temperature,
        TRUE AS flatten_json_output,
        8192 AS max_output_tokens,
        [
          STRUCT('HARM_CATEGORY_DANGEROUS_CONTENT' AS category, 'BLOCK_LOW_AND_ABOVE' AS threshold),
          STRUCT('HARM_CATEGORY_HATE_SPEECH' AS category, 'BLOCK_LOW_AND_ABOVE' AS threshold),
          STRUCT('HARM_CATEGORY_HARASSMENT' AS category, 'BLOCK_LOW_AND_ABOVE' AS threshold),
          STRUCT('HARM_CATEGORY_SEXUALLY_EXPLICIT' AS category, 'BLOCK_LOW_AND_ABOVE' AS threshold)
        ] AS safety_settings
      )
    )
  );

Executing query with job ID: 3357cb45-9def-4dd3-accb-1009149b6f1c
Query executing: 219.93s

In [None]:
%%bigquery summary_df
SELECT * FROM `vtxdemos.demo_us_outputs.videos_081624_summary`

Query is running:   0%|          |

Downloading:   0%|          |

In [175]:
summary_df.tail(10)

Unnamed: 0,uri,ml_generate_text_llm_result
7,gs://vtxdemos-videos-embeddings/input_videos/b...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...
8,gs://vtxdemos-videos-embeddings/input_videos/v...,The video is a tutorial on how to add captions...
9,gs://vtxdemos-videos-embeddings/input_videos/c...,The video shows highlights of NBA games and th...
10,gs://vtxdemos-videos-embeddings/input_videos/u...,The video is about the top 20 most popular fas...
11,gs://vtxdemos-videos-embeddings/input_videos/c...,The video discusses the top 10 most expensive ...
12,gs://vtxdemos-videos-embeddings/input_videos/c...,00:00 The video is about the OpenAI Spring Upd...
13,gs://vtxdemos-videos-embeddings/input_videos/p...,The video is about a soccer game between the P...
14,gs://vtxdemos-videos-embeddings/input_videos/g...,The speaker is reviewing the new Pixel 9 phone...
15,gs://vtxdemos-videos-embeddings/input_videos/v...,The video is about Vimeo's Summer 2024 Release...
16,gs://vtxdemos-videos-embeddings/input_videos/a...,This video is a montage of Aaron Judge's home ...


## Analytics

In [None]:
%%bigquery
CREATE OR REPLACE TABLE
  `vtxdemos.demo_us_outputs.videos_081624_analytics` AS (
  SELECT
    uri,
    ml_generate_text_llm_result
  FROM
    ML.GENERATE_TEXT(MODEL `demos_us.gemini_1_5_pro`,
      TABLE `demos_us.vtxdemos-videos-081624`,
      STRUCT( 0.2 AS temperature,
        '''
          Instructions:
            Extract the following elements from the video:
            - Important Objects and the count of their occurrences.
            - Famous People's Names and the count of their occurrences.
            - Brands or Ads and the count of their occurrences.

          Constraints:
            - Ensure the output is strictly valid JSON.
            - Do not include any extra text or explanations outside the JSON object.
            - **Crucially, ensure all string values within the JSON output are properly escaped for JSON compatibility.**
            - Do not include any markdown or JSON decorators (e.g. ```JSON```, triple backticks) in the output.

          Output Format:
          {
            "objects": {
              "object_1": count,
              "object_2": count,
              ...
            },
            "people_names": {
              "person_name_1": count,
              "person_name_2": count,
              ...
            },
            "brands": {
              "brand_1": count,
              "brand_2": count,
              ...
            }
          }

        ''' AS PROMPT,
        TRUE AS FLATTEN_JSON_OUTPUT,
        8192 AS max_output_tokens,
        [ STRUCT('HARM_CATEGORY_DANGEROUS_CONTENT' AS category,
          'BLOCK_LOW_AND_ABOVE' AS threshold),
        STRUCT('HARM_CATEGORY_HATE_SPEECH' AS category,
          'BLOCK_LOW_AND_ABOVE' AS threshold),
        STRUCT('HARM_CATEGORY_HARASSMENT' AS category,
          'BLOCK_LOW_AND_ABOVE' AS threshold),
        STRUCT('HARM_CATEGORY_SEXUALLY_EXPLICIT' AS category,
          'BLOCK_LOW_AND_ABOVE' AS threshold) ] AS safety_settings ) ) )

In [None]:
%%bigquery analytics_df
SELECT * FROM `vtxdemos.demo_us_outputs.videos_081624_analytics`

Query is running:   0%|          |

Downloading:   0%|          |

In [177]:
analytics_df.iloc[8]["ml_generate_text_llm_result"]

'{"objects": {"ball": 23, "goal": 10, "soccer field": 25, "goal post": 19, "soccer uniform": 37, "flag": 10}, "people_names": {"Johan Vasquez": 1, "Ruben Duarte": 1, "Alfonso Blanco": 3, "Jorge Ruvalcaba": 12, "Alonso Cabral": 4, "Angel Rico": 2, "Leo Suarez": 1, "Salvador Reyes": 2, "Paco Gonzalez": 2, "Funes Mori": 2, "Luis Cervantes": 1, "Rogelio Funes Mori": 1, "Santi Trigo": 1, "Alan Medina": 1, "Hector Yael Uribe": 1, "Maximiliano Quintero": 1, "Jorge Baba": 3, "Antonio \\"Tota\\" Carbajal": 2}, "brands": {"ViX": 5, "Liga MX": 5, "TUDN": 27, "Telcel": 14, "Suzuki": 7, "DHL": 4, "Caliente.mx": 17, "Bardahl": 6, "Bimbo": 6, "Metro by T-Mobile": 5, "Corona Extra": 6, "Assist Card": 2, "Restonic": 9, "Modelo": 6, "Verizon": 7, "Heineken Silver": 4, "Futbol 2024": 2, "RC Auto Partes": 4, "Testronic": 3, "Caribu": 2}}'

In [178]:
import json

json.loads(analytics_df.iloc[8]["ml_generate_text_llm_result"])

{'objects': {'ball': 23,
  'goal': 10,
  'soccer field': 25,
  'goal post': 19,
  'soccer uniform': 37,
  'flag': 10},
 'people_names': {'Johan Vasquez': 1,
  'Ruben Duarte': 1,
  'Alfonso Blanco': 3,
  'Jorge Ruvalcaba': 12,
  'Alonso Cabral': 4,
  'Angel Rico': 2,
  'Leo Suarez': 1,
  'Salvador Reyes': 2,
  'Paco Gonzalez': 2,
  'Funes Mori': 2,
  'Luis Cervantes': 1,
  'Rogelio Funes Mori': 1,
  'Santi Trigo': 1,
  'Alan Medina': 1,
  'Hector Yael Uribe': 1,
  'Maximiliano Quintero': 1,
  'Jorge Baba': 3,
  'Antonio "Tota" Carbajal': 2},
 'brands': {'ViX': 5,
  'Liga MX': 5,
  'TUDN': 27,
  'Telcel': 14,
  'Suzuki': 7,
  'DHL': 4,
  'Caliente.mx': 17,
  'Bardahl': 6,
  'Bimbo': 6,
  'Metro by T-Mobile': 5,
  'Corona Extra': 6,
  'Assist Card': 2,
  'Restonic': 9,
  'Modelo': 6,
  'Verizon': 7,
  'Heineken Silver': 4,
  'Futbol 2024': 2,
  'RC Auto Partes': 4,
  'Testronic': 3,
  'Caribu': 2}}

In [None]:
import json

for index, row in analytics_df.dropna().iterrows():
  print(row["uri"])
  print(json.loads(row["ml_generate_text_llm_result"]))

gs://vtxdemos-videos-embeddings/input_videos/chaplin_overview.mp4
{'objects': {'hat': 10, 'cane': 3, 'fork': 4, 'shoe': 3, 'plate': 2, 'camera': 1, 'door': 1, 'window': 1, 'ladder': 1, 'car': 2, 'train': 2, 'circle': 1}, 'people_names': {'Charlie Chaplin': 13, 'Douglas Fairbanks': 1, 'Mary Pickford': 1, 'D.W. Griffith': 1}, 'brands': {}}
gs://vtxdemos-videos-embeddings/input_videos/chat_gpt_4o.mp4
{'objects': {'phone': 4, 'computer': 1, 'laptop': 1, 'paper': 1, 'marker': 1, 'table': 1, 'chair': 6, 'plant': 10}, 'people_names': {'Mira Murati': 1, 'Mark Chen': 1, 'Barrett Zoph': 1, 'Mark': 10, 'Barrett': 6, 'Jensen': 1}, 'brands': {'OpenAI': 5, 'ChatGPT': 16, 'Nvidia': 1}}
gs://vtxdemos-videos-embeddings/input_videos/pumas_unam.mp4
{'objects': {'ball': 23, 'goal': 10, 'soccer field': 25, 'goal post': 19, 'soccer uniform': 37, 'flag': 10}, 'people_names': {'Johan Vasquez': 1, 'Ruben Duarte': 1, 'Alfonso Blanco': 3, 'Jorge Ruvalcaba': 12, 'Alonso Cabral': 4, 'Angel Rico': 2, 'Leo Suarez': 

## Embeddings

In [None]:
%%bigquery
CREATE OR REPLACE TABLE
  `vtxdemos.demo_us_outputs.videos_emeddings_081624` AS (
      SELECT * FROM ML.GENERATE_EMBEDDING(
        MODEL `vtxdemos.demos_us.multimodalembedding`,
        TABLE `demos_us.vtxdemos-videos-081624`,
        STRUCT(
          TRUE AS flatten_json_output,
          10 AS interval_seconds
        )
      )
  )

Query is running:   0%|          |

In [None]:
%%bigquery df
SELECT * FROM `vtxdemos.demo_us_outputs.videos_emeddings_081624`

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
df

Unnamed: 0,ml_generate_embedding_result,ml_generate_embedding_status,ml_generate_embedding_start_sec,ml_generate_embedding_end_sec,uri,generation,content_type,size,md5_hash,updated,metadata
0,"[0.00626083231, 0.0537993, 0.0029025, 0.027797...",,70,80,gs://vtxdemos-videos-embeddings/input_videos/b...,1723817703288472,video/mp4,97257306,4fbf28cac6a6bb2e95f71db086e42693,2024-08-16 14:15:03.331000+00:00,[]
1,"[-0.0189423, 0.0573085621, -0.00376170571, 0.0...",,60,70,gs://vtxdemos-videos-embeddings/input_videos/b...,1723817703288472,video/mp4,97257306,4fbf28cac6a6bb2e95f71db086e42693,2024-08-16 14:15:03.331000+00:00,[]
2,"[-0.0109932, 0.0219409987, 0.00889575668, 0.00...",,0,10,gs://vtxdemos-videos-embeddings/input_videos/b...,1723817703288472,video/mp4,97257306,4fbf28cac6a6bb2e95f71db086e42693,2024-08-16 14:15:03.331000+00:00,[]
3,"[0.00327200396, 0.058494132, 0.00913200807, 0....",,110,120,gs://vtxdemos-videos-embeddings/input_videos/b...,1723817703288472,video/mp4,97257306,4fbf28cac6a6bb2e95f71db086e42693,2024-08-16 14:15:03.331000+00:00,[]
4,"[-0.00243466347, 0.0761750937, -0.00546080945,...",,10,20,gs://vtxdemos-videos-embeddings/input_videos/b...,1723817703288472,video/mp4,97257306,4fbf28cac6a6bb2e95f71db086e42693,2024-08-16 14:15:03.331000+00:00,[]
...,...,...,...,...,...,...,...,...,...,...,...
198,"[-0.021082202, 0.0473410897, -0.0450956374, -0...",,0,10,gs://vtxdemos-videos-embeddings/input_videos/v...,1723818302867368,video/mp4,8097043,792f35960a5b342b505c3d3de4ac7d22,2024-08-16 14:25:02.960000+00:00,[]
199,"[0.0126062138, 0.0203445163, -0.0382897556, -0...",,70,80,gs://vtxdemos-videos-embeddings/input_videos/v...,1723818302867368,video/mp4,8097043,792f35960a5b342b505c3d3de4ac7d22,2024-08-16 14:25:02.960000+00:00,[]
200,"[-0.0105335349, 0.0542397723, -0.0200508777, -...",,90,100,gs://vtxdemos-videos-embeddings/input_videos/v...,1723818302867368,video/mp4,8097043,792f35960a5b342b505c3d3de4ac7d22,2024-08-16 14:25:02.960000+00:00,[]
201,"[-0.00796977524, 0.0441429503, -0.0187974, -0....",,110,120,gs://vtxdemos-videos-embeddings/input_videos/v...,1723818302867368,video/mp4,8097043,792f35960a5b342b505c3d3de4ac7d22,2024-08-16 14:25:02.960000+00:00,[]


In [None]:
# Logic to Create Thumbnails from 10 sec segments over Videos.
import os
from google.cloud import storage

thumbnail_list = []
output_thumbnails_bucket = "vtxdemos-videos-embeddings"
output_thumbnail_client = storage.Client().bucket(output_thumbnails_bucket)

# Keep track of the previous video
previous_video = None

def thumbnail(file, timeframe):
  print(f"Processing video: {file}")
  thumbnail_array = VideoFileClip(file).get_frame(timeframe)
  thumbnail_image = img.fromarray(thumbnail_array)
  thumbnail_filename = file.split('/')[-1]
  tn_name = f"{thumbnail_filename.split('.')[0]}_{timeframe}.png"
  thumbnail_image.save(tn_name)
  blob_name = f"thumbnails/{tn_name}"
  output_thumbnail_client.blob(blob_name).upload_from_filename(tn_name)
  thum_uri = f"https://storage.googleapis.com/{output_thumbnails_bucket}/{blob_name}"
  return tn_name, thum_uri

def download_video(file_name):
  if os.path.exists(file_name):
    return
  print(f"Downloading video: {file_name}")
  thumbnail_bucket.blob(f"input_videos/{file_name}").download_to_filename(file_name)

for n, row in df.iterrows():
  uri_dec = row["uri"].split("/")
  bucket_name = uri_dec[2]
  file_name = uri_dec[-1]
  if file_name == "llama31.mp4":
    continue
  print(bucket_name, file_name)
  thumbnail_bucket = storage.Client().bucket(bucket_name)
  download_video(file_name)
  tn_name, thum_uri = thumbnail(file_name, row["ml_generate_embedding_start_sec"])
  thumbnail_list.append(thum_uri)

  # Check if the current video is different from the previous one
  if file_name != previous_video:
    # Delete the previous video if it exists
    if previous_video:
      print("deletevideo")
      print(file_name)
      os.remove(previous_video)
      print(f"Deleted video: {previous_video}")

  # Update the previous video
  previous_video = file_name

vtxdemos-videos-embeddings babe_ruth.mp4
Downloading video: babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings babe_ruth.mp4
Processing video: babe_ruth.mp4
vtxdemos-videos-embeddings us_chains.mp4
Downloading video: us_chains.mp4
Processing video:

In [None]:
#Delete PNG files
for root, dirs, files in os.walk("./"):
    for file in files:
        if file.endswith(".png"):
            file_path = os.path.join(root, file)
            os.remove(file_path)
            print(f"Deleted: {file_path}")

Deleted: ./celebrities_nba_60.png
Deleted: ./african_animals_50.png
Deleted: ./places_for_travel_80.png
Deleted: ./celeb_audience_70.png
Deleted: ./places_for_travel_90.png
Deleted: ./us_chains_0.png
Deleted: ./us_chains_30.png
Deleted: ./aaron_judge_0.png
Deleted: ./us_chains_20.png
Deleted: ./will_ferrel_0.png
Deleted: ./celebrities_nba_100.png
Deleted: ./babe_ruth_biography_50.png
Deleted: ./aaron_judge_30.png
Deleted: ./chaplin_overview_110.png
Deleted: ./us_chains_60.png
Deleted: ./celebrities_nba_10.png
Deleted: ./vimeo_workflow_70.png
Deleted: ./pumas_unam_90.png
Deleted: ./babe_ruth_biography_90.png
Deleted: ./google_pixel_9_30.png
Deleted: ./will_ferrel_70.png
Deleted: ./babe_ruth_80.png
Deleted: ./will_ferrel_40.png
Deleted: ./video_adding_captions_60.png
Deleted: ./celeb_audience_60.png
Deleted: ./babe_ruth_90.png
Deleted: ./will_ferrel_90.png
Deleted: ./chat_gpt_4o_50.png
Deleted: ./celebrities_nba_20.png
Deleted: ./vimeo_workflow_80.png
Deleted: ./chaplin_the_kid_10.png
De

In [None]:
df["thumbnails_uri"] = thumbnail_list
df["public_uri"] = df["uri"].apply(lambda x: x.replace("gs://","https://storage.googleapis.com/"))
df = df[["ml_generate_embedding_result", "ml_generate_embedding_start_sec", "ml_generate_embedding_end_sec", "uri", "public_uri", "thumbnails_uri"]]

In [None]:
df.head()

Unnamed: 0,ml_generate_embedding_result,ml_generate_embedding_start_sec,ml_generate_embedding_end_sec,uri,public_uri,thumbnails_uri
0,"[-0.0109932, 0.0219409987, 0.00889575668, 0.00...",0,10,gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,https://storage.googleapis.com/vtxdemos-videos...
1,"[-0.0160676222, 0.0372462161, 0.00278694532, -...",0,10,gs://vtxdemos-videos-embeddings/input_videos/u...,https://storage.googleapis.com/vtxdemos-videos...,https://storage.googleapis.com/vtxdemos-videos...
2,"[-0.0394800939, 0.0639086813, -0.0153921023, -...",0,10,gs://vtxdemos-videos-embeddings/input_videos/p...,https://storage.googleapis.com/vtxdemos-videos...,https://storage.googleapis.com/vtxdemos-videos...
3,"[-0.0215460453, 0.0362352319, -0.00118816691, ...",0,10,gs://vtxdemos-videos-embeddings/input_videos/a...,https://storage.googleapis.com/vtxdemos-videos...,https://storage.googleapis.com/vtxdemos-videos...
4,"[-0.0278681852, 0.0649537519, 0.0149312979, -0...",0,10,gs://vtxdemos-videos-embeddings/input_videos/c...,https://storage.googleapis.com/vtxdemos-videos...,https://storage.googleapis.com/vtxdemos-videos...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   ml_generate_embedding_result     203 non-null    object
 1   ml_generate_embedding_start_sec  203 non-null    Int64 
 2   ml_generate_embedding_end_sec    203 non-null    Int64 
 3   uri                              203 non-null    object
 4   public_uri                       203 non-null    object
 5   thumbnails_uri                   203 non-null    object
dtypes: Int64(2), object(4)
memory usage: 10.0+ KB


In [None]:
from google.cloud import bigquery

# Sample DataFrame (replace with your actual data)
import pandas as pd
import numpy as np

# BigQuery client
client = bigquery.Client()

# **Important:** Define your BigQuery table schema
table_id = 'vtxdemos.demo_us_outputs.videos_emeddings_end_081624'
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("ml_generate_embedding_result", "FLOAT", mode="REPEATED"),
        bigquery.SchemaField("ml_generate_embedding_start_sec", bigquery.enums.SqlTypeNames.INTEGER),
        bigquery.SchemaField("ml_generate_embedding_end_sec", bigquery.enums.SqlTypeNames.INTEGER),
        bigquery.SchemaField("uri", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("public_uri", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("thumbnails_uri", bigquery.enums.SqlTypeNames.STRING)
    ],
    write_disposition="WRITE_TRUNCATE",  # Change to WRITE_APPEND if needed
)

# Load the DataFrame into BigQuery
job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
job.result()  # Wait for the job to complete

print(f"Loaded {job.output_rows} rows into {table_id}")

Loaded 203 rows into vtxdemos.demo_us_outputs.videos_emeddings_end_081624


In [None]:
%%bigquery df

SELECT * FROM `vtxdemos.demo_us_outputs.videos_emeddings_end_081624`

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
df["uri"].unique()

array(['gs://vtxdemos-videos-embeddings/input_videos/babe_ruth.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/us_chains.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/pumas_unam.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/aaron_judge.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/chat_gpt_4o.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/will_ferrel.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/celeb_audience.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/google_pixel_9.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/vimeo_workflow.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/african_animals.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/celebrities_nba.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/chaplin_the_kid.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/clothing_brands.mp4',
       'gs://vtxdemos-videos-embeddings/input_videos/chap

## Combining Tables

In [None]:
emb_df = df[["ml_generate_embedding_result", "uri", "public_uri", "ml_generate_embedding_start_sec", "ml_generate_embedding_end_sec", "thumbnails_uri"]].copy()

In [None]:
final_df = emb_df.merge(summary_df, on="uri").merge(analytics_df, on="uri").merge(title_df, on="uri")
final_df.head(10)

Unnamed: 0,ml_generate_embedding_result,uri,public_uri,ml_generate_embedding_start_sec,ml_generate_embedding_end_sec,thumbnails_uri,ml_generate_text_llm_result_x,ml_generate_text_llm_result_y,title
0,"[-0.0109932, 0.0219409987, 0.00889575668, 0.00...",gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,0,10,https://storage.googleapis.com/vtxdemos-videos...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...,"{""objects"": {""baseball bat"": 12, ""baseball"": 3...",The Legend of Babe Ruth \n
1,"[-0.00243466347, 0.0761750937, -0.00546080945,...",gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,10,20,https://storage.googleapis.com/vtxdemos-videos...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...,"{""objects"": {""baseball bat"": 12, ""baseball"": 3...",The Legend of Babe Ruth \n
2,"[-0.00163601618, 0.0783343539, -0.00419028383,...",gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,20,30,https://storage.googleapis.com/vtxdemos-videos...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...,"{""objects"": {""baseball bat"": 12, ""baseball"": 3...",The Legend of Babe Ruth \n
3,"[-0.0142161334, 0.0444211289, 0.00266227685, -...",gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,30,40,https://storage.googleapis.com/vtxdemos-videos...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...,"{""objects"": {""baseball bat"": 12, ""baseball"": 3...",The Legend of Babe Ruth \n
4,"[-0.0148623306, 0.0503682978, -0.0188063551, 0...",gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,40,50,https://storage.googleapis.com/vtxdemos-videos...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...,"{""objects"": {""baseball bat"": 12, ""baseball"": 3...",The Legend of Babe Ruth \n
5,"[-0.0171047803, 0.0583979562, -0.0172300171, 0...",gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,50,60,https://storage.googleapis.com/vtxdemos-videos...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...,"{""objects"": {""baseball bat"": 12, ""baseball"": 3...",The Legend of Babe Ruth \n
6,"[-0.0189423, 0.0573085621, -0.00376170571, 0.0...",gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,60,70,https://storage.googleapis.com/vtxdemos-videos...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...,"{""objects"": {""baseball bat"": 12, ""baseball"": 3...",The Legend of Babe Ruth \n
7,"[0.00626083231, 0.0537993, 0.0029025, 0.027797...",gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,70,80,https://storage.googleapis.com/vtxdemos-videos...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...,"{""objects"": {""baseball bat"": 12, ""baseball"": 3...",The Legend of Babe Ruth \n
8,"[-0.0108456435, 0.0541867055, 0.00726296473, 0...",gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,80,90,https://storage.googleapis.com/vtxdemos-videos...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...,"{""objects"": {""baseball bat"": 12, ""baseball"": 3...",The Legend of Babe Ruth \n
9,"[-0.00777684, 0.0664713457, 0.00474223215, 0.0...",gs://vtxdemos-videos-embeddings/input_videos/b...,https://storage.googleapis.com/vtxdemos-videos...,90,100,https://storage.googleapis.com/vtxdemos-videos...,00:00<start_of_image> BEDROOM_ALARM_CLOCK:00:0...,"{""objects"": {""baseball bat"": 12, ""baseball"": 3...",The Legend of Babe Ruth \n


In [None]:
# prompt: drop rows with nulls from final_df

final_df = final_df.dropna()


In [None]:
# Final Table on BigQuery
from google.cloud import bigquery

# Sample DataFrame (replace with your actual data)
import pandas as pd
import numpy as np

# BigQuery client
client = bigquery.Client()

# **Important:** Define your BigQuery table schema
table_id = 'vtxdemos.demo_us_outputs.videos_emeddings_end_081624'
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("ml_generate_embedding_result", "FLOAT", mode="REPEATED"),
        bigquery.SchemaField("ml_generate_embedding_start_sec", bigquery.enums.SqlTypeNames.INTEGER),
        bigquery.SchemaField("ml_generate_embedding_end_sec", bigquery.enums.SqlTypeNames.INTEGER),
        bigquery.SchemaField("uri", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("public_uri", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("thumbnails_uri", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("ml_generate_text_llm_result_x", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("ml_generate_text_llm_result_y", bigquery.enums.SqlTypeNames.STRING)
    ],
    write_disposition="WRITE_TRUNCATE",  # Change to WRITE_APPEND if needed
)

# Load the DataFrame into BigQuery
job = client.load_table_from_dataframe(final_df, table_id, job_config=job_config)
job.result()  # Wait for the job to complete

print(f"Loaded {job.output_rows} rows into {table_id}")

Loaded 120 rows into vtxdemos.demo_us_outputs.videos_emeddings_end_081624
