Python Notebook for Gemini Video Inference

In [None]:
!pip install -q -U google-generativeai gsheet-keyring

In [2]:
import google.generativeai as genai
from IPython.display import Markdown
from google.colab import userdata

In [3]:
# Configure the SDK with the key read from google.colab userdata under the name 'api_key'.
genai.configure(
    api_key=userdata.get('api_key'),
)

In [4]:
# Path of the local clip, and the file handle returned by uploading it.
# Later cells poll `video_file.state` and pass `video_file` to the model.
video_file_path = "shopshort_enc.mp4"
video_file = genai.upload_file(
    path=video_file_path,
)

# Check to see if video is ready

In [None]:
import time

# Poll the uploaded file until it leaves the PROCESSING state.
POLL_INTERVAL_SECONDS = 10
# Upper bound on the total wait so a stuck upload cannot hang "Run All" forever.
MAX_WAIT_SECONDS = 600

deadline = time.monotonic() + MAX_WAIT_SECONDS
while video_file.state.name == "PROCESSING":
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Video {video_file.name} is still PROCESSING after {MAX_WAIT_SECONDS} seconds"
        )
    print('Waiting for video to be processed.')
    time.sleep(POLL_INTERVAL_SECONDS)
    # Re-fetch the file so the next check sees its current state.
    video_file = genai.get_file(video_file.name)

if video_file.state.name == "FAILED":
    # Same exception type as before, with enough context to identify the file.
    raise ValueError(
        f"Video processing failed for {video_file.name} (state: {video_file.state.name})"
    )
print('Video processing complete: ' + video_file.uri)

In [6]:
# Prompts, one per target output. Each is sent together with the uploaded
# video in a later cell. The prompt text is deliberately left untouched.

# Prompt: full website.
website_prompt = """
  Use this video to generate a detailed website for this business.
  Use any context including services, products, etc. Make sure you have a "Contact us" page, and "About" page, etc.
  Be as detailed as possible and include specifics for the type of business, i.e. hotels have details about amenities, stores have hours and products.
  Do not include links to logos or anything not available in the video.
  Ensure your output is in HTML with clean CSS so we can display to the user and they can use it for a possible website.
  """

# Prompt: Facebook business profile.
facebook_prompt = """
  Use this video to generate a detailed Facebook business profile for this business.
  Use any context including services, products, etc.
  Be as detailed as possible and include specifics for the type of business, i.e. hotels have details about amenities, stores have hours and products.
  Do not include links to logos or anything not available in the video.
  Ensure your output is in HTML with clean CSS so we can display to the user. Ideally add a button with JS next to each individual assets so the user can copy them to their clipboard.
  """

# Prompt: TikTok profile (the attribute list is part of the prompt text).
tiktok_prompt = """
  Use this video to generate a TikTok profile.
  Use any context including services, products, etc.
  Be as detailed as possible and include specifics for the type of business, i.e. hotels have details about amenities, stores have hours and products.
  Do not include links to logos or anything not available in the video.
  Make sure you use standard TikTok user attributes: open_id,
display_name,
bio_description,
username
  Ensure your output is in HTML with clean CSS so we can display to the user. Ideally add a button with JS next to each individual assets so the user can copy them to their clipboard.
  """

# Prompt: Google Business profile.
gbp_prompt = """
  Use this video to generate a Google Business profile.
  Use any context including services, products, etc.
  Be as detailed as possible and include specifics for the type of business, i.e. hotels have details about amenities, stores have hours and products.
  Do not include links to logos or anything not available in the video.
  Make sure you use standard Google profile objects.
  Ensure your output is in HTML with clean CSS so we can display to the user. Ideally add a button with JS next to each individual assets so the user can copy them to their clipboard.
  """

# Gemini model handle used by the generation cells that follow.
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
)


In [21]:
# Generate the website draft from the uploaded video, with the request timeout set to 600.
response_website = model.generate_content(
    [video_file, website_prompt],
    request_options={"timeout": 600},
)

In [None]:
# Keep this import: the later display cells call IPython.display.HTML without importing it themselves.
import IPython

# Render the model's website output as HTML in the notebook.
IPython.display.HTML(data=response_website.text)

In [23]:
# Generate the Facebook business profile from the uploaded video, with the request timeout set to 600.
response_facebook = model.generate_content(
    [video_file, facebook_prompt],
    request_options={"timeout": 600},
)

In [None]:
# Render the model's Facebook profile output as HTML (the prompt asks for JS copy buttons, so this is unescaped model output).
IPython.display.HTML(data=response_facebook.text)

In [None]:
# Generate the TikTok profile from the uploaded video, with the request timeout set to 600.
response_tiktok = model.generate_content(
    [video_file, tiktok_prompt],
    request_options={"timeout": 600},
)

In [None]:
# Render the model's TikTok profile output as HTML (the prompt asks for JS copy buttons, so this is unescaped model output).
IPython.display.HTML(data=response_tiktok.text)

In [38]:
# Generate the Google Business profile from the uploaded video, with the request timeout set to 600.
response_gbp = model.generate_content(
    [video_file, gbp_prompt],
    request_options={"timeout": 600},
)

In [None]:
# Render the model's Google Business profile output as HTML (the prompt asks for JS copy buttons, so this is unescaped model output).
IPython.display.HTML(data=response_gbp.text)

### Transcribe video and provide visual descriptions

Frames are sampled at 1 per second, so as long as the video is not fast-paced it's possible to transcribe it and provide a visual description for each shot.

In [None]:
# Prompt asking for a timestamped transcript plus visual descriptions.
prompt = "Transcribe the audio, giving timestamps. Also provide visual descriptions."

# Model handle for this request (same model name as the earlier cells).
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
)

# Send prompt + video, with the request timeout set to 600, then print the raw text reply.
print("Making LLM inference request...")
response = model.generate_content(
    [prompt, video_file],
    request_options={"timeout": 600},
)
print(response.text)

# APPENDIX - OLD PHOTO PROMPTS

In [None]:
import PIL.Image

# Open tran1.png through tran7.png in order and bind them to the seven
# names the prompt cells below reference. One range-driven expression
# replaces seven copy-pasted calls; the names are only bound if every
# file opens successfully.
(
    sample_file_1,
    sample_file_2,
    sample_file_3,
    sample_file_4,
    sample_file_5,
    sample_file_6,
    sample_file_7,
) = [PIL.Image.open(f'tran{index}.png') for index in range(1, 8)]


In [None]:
# Model handle for this request.
model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")

# Ask for profile details (including business hours) as JSON.
# The example object is valid JSON: double-quoted strings and a colon after
# every key. The previous example had `'item3', 'details'` and single quotes,
# which contradicts the "machine parseable JSON" request.
prompt = (
    "Use these images to detect information for the user to create a Google Business profile. "
    "See if you can summarize the business hours. "
    'Ensure your output is machine parseable JSON, i.e. '
    '{"item1": "details", "item2": "details", "item3": "details"}'
)

# Send the prompt followed by all seven images.
response = model.generate_content([
    prompt,
    sample_file_1,
    sample_file_2,
    sample_file_3,
    sample_file_4,
    sample_file_5,
    sample_file_6,
    sample_file_7,
])

# Display the reply as Markdown, prefixed with a blockquote marker.
Markdown(">" + response.text)

In [None]:
# Model handle for this request.
model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")

# Ask for profile details (including a best-guess street address) as JSON.
# The example object is valid JSON: double-quoted strings and a colon after
# every key. The previous example had `'item3', 'details'` and single quotes,
# which contradicts the "machine parseable JSON" request.
prompt = (
    "Use these images to detect information for the user to create a Google Business profile. "
    "Try to guess the street address as the best you can. "
    'Ensure your output is machine parseable JSON, i.e. '
    '{"item1": "details", "item2": "details", "item3": "details"}'
)

# Only images 2, 5 and 6 are sent with this prompt.
response = model.generate_content([
    prompt,
    sample_file_2,
    sample_file_5,
    sample_file_6,
])

# Display the reply as Markdown, prefixed with a blockquote marker.
Markdown(">" + response.text)

In [None]:
# Model handle for this request.
model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")

# Ask for a short business description as JSON.
# The example object uses double-quoted strings so it is valid JSON; the
# previous single-quoted example would be rejected by a strict JSON parser.
prompt = (
    "Use these images to help a user create a Google Business profile. "
    "Your first job is to identify a detailed description of this business (120 words or less). "
    'Ensure your output is machine parseable JSON, i.e. {"description": "this is a business"}'
)

# Only images 1 through 4 are sent with this prompt.
response = model.generate_content([
    prompt,
    sample_file_1,
    sample_file_2,
    sample_file_3,
    sample_file_4,
])

# Display the reply as Markdown, prefixed with a blockquote marker.
Markdown(">" + response.text)