In [1]:
import base64
import time
import typing
import math
import numpy as np

from google.cloud import aiplatform
from google.protobuf import struct_pb2

#libraries to generate image summaries
from vertexai.vision_models import Video
from vertexai.vision_models import VideoSegmentConfig
from vertexai.vision_models import MultiModalEmbeddingModel
from vertexai.language_models import TextEmbeddingModel
from vertexai.vision_models import Image as vision_model_Image
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part as GenerativeModelPart,
    HarmBlockThreshold,
    HarmCategory,
)
from typing import Any, Dict, List, Literal, Optional, Union

from moviepy.editor import VideoFileClip

# Single-turn user request: one video part (Cloud Storage URI plus start/end
# offsets of 600 s and 900 s) followed by a free-form text instruction.
contents = [
    {
        "role": "user",
        "parts": [
            {
                "file_data": {
                    "mime_type": "video/mp4",
                    "file_uri": "gs://raw_nine_files/vlt_video_extract/MAAT/MAAT2024_1_A_HBB.mp4",
                },
                "video_metadata": {
                    "start_offset": dict(seconds=600, nanos=0),
                    "end_offset": dict(seconds=900, nanos=0),
                },
            },
            {"text": "describe what is happening in this video within the given time period"},
        ],
    }
]

generative_multimodal_model = GenerativeModel("gemini-1.5-pro-002")

# Every harm category listed here is mapped to the same threshold, BLOCK_NONE.
safety_settings = {
    harm_category: HarmBlockThreshold.BLOCK_NONE
    for harm_category in (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
}

# Send the request; later cells read the reply from model_response.
model_response = generative_multimodal_model.generate_content(
    contents,
    safety_settings=safety_settings,
)

In [10]:
# Print the text of the first part of the first candidate in the response.
print(model_response.candidates[0].content.parts[0].text)

The video shows contestants on Married at First Sight Australia getting to know each other and then sharing their hopes and dreams for their time on the show.

At [00:00:03–00:00:10] one of the contestants is introduced to three other contestants. Then at [00:00:11–00:00:19], a contestant in a blue jacket and green scarf introduces another contestant by name and describes his appearance and personality. The two then sit together with two others, where one asks about another’s occupation and discusses it at length. One of the contestants then introduces himself and his career as a kickboxer and reveals his softer side. He describes his hopes for the show, saying that he will approach it with an open mind. 

A woman in a red sequined jacket is shown in a car ride at [00:00:27]. She exits the vehicle at [00:11:52] to meet other contestants. The same woman also sits with another contestant in a black suit and shares her hopes and qualities for the man she is about to be married to. At [00:

In [2]:
import json

# Segment boundaries in seconds. They are interpolated into the prompt text
# below and are also read by the request-building cell further down.
start = 600
end = 900

# JSON Schema describing the structure the model is asked to return.
# NOTE: this is already a JSON *string*, not a Python dict.
schema = """{
    "description": "A structured schema to represent detailed information from a video or text analysis",
    "type": "object",
    "properties": {
        "Category": {
            "type": "string",
            "description": "The category or general type of the content"
        },
        "DetailedDescriptionOfEventsAndConversations": {
            "type": "string",
            "description": "A detailed textual description of the events and conversations in the content"
        },
        "BrandsCompanyNamesLogos": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of brands, company names, or logos appearing or mentioned in the content"
        },
        "KeyLocationsAndScenes": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of key locations and scenes appearing or mentioned in the content"
        },
        "KeyThemes": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of key themes discussed or portrayed in the content"
        },
        "PeopleAppearingAndMentioned": {
            "type": "array",
            "items": {
                "type": "string"
            },
            "description": "A list of people who appear or are mentioned in the content"
        }
    },
    "required": [
        "Category",
        "DetailedDescriptionOfEventsAndConversations",
        "BrandsCompanyNamesLogos",
        "KeyLocationsAndScenes",
        "KeyThemes",
        "PeopleAppearingAndMentioned"
    ]
}"""


# --- Prompt fragments; concatenated in order at the bottom of the cell. ---

# Task statement, carrying the segment boundaries defined above.
VAR_VIDEO_SEGMENT = f"Your task is to provide a comprehensive description of this video from segment {start} seconds to {end} seconds.\n"

# Step-by-step instructions (output discipline, product placement, transcript).
VAR_INSTRUCTIONS = """To complete the task you need to follow these steps:\n
                           No greetings, closing remarks, or additional comments. Begin immediately with the video analysis and provide only the requested information in the specified format.\n
                           Idenify all instances of visual product placement. Pay close attention to background details and items held by the characters. List each product placement with the following
                            information: Brand name, product name (if applicable), and a brief description. Include information about product placement into the description generated for the video\n
                           Create a transcript of all the speeches, dialogs, narration.\n
                           Scrupulously examine each scene for any and all visible brand names, logos, and products. Even if a product appears briefly or in the background, it should be included.\n"""

# Constraints on how the description is worded.
VAR_CONSTRAINTS = """Describe the video content objectively, avoiding any subjective opinions or assumptions.\n
                           Specify who is saying what. If a person talking can be seen, specify their name and/or occupation. If it is voice behind the scenes, then describe it as a narrator.\n
                           Be specific when describing. Include all the information that is shown or given.\n
                           Do not show timestamps.\n
                           If an unidentified person is shown in the video first, but then their name is mentioned later in the video, make sure to mention their name in the description from the start.\n
                           """

# Output structure. `schema` is already JSON text, so calling json.dumps() on it
# directly would wrap it in quotes and escape every quote/newline, putting an
# escaped string literal in the prompt instead of the schema object. Parsing it
# first validates the schema and lets json.dumps() emit it as plain, compact JSON.
VAR_STRUCTURE = f"""Organize the description with the following properties, and give a valid json file with JSON schema.<JSONSchema>{json.dumps(json.loads(schema))}</JSONSchema>:
                       \n**Category**\n
                       \n**DetailedDescriptionOfEventsAndConversations**\n
                       \n**BrandsCompanyNamesLogos**\n
                       \n**KeyLocationsAndScenes**\n
                       \n**KeyThemes**\n
                       \n**PeopleAppearingAndMentioned**\n 
                 """

# Category list and per-category guidance for the detailed description.
VAR_CONDITIONS = """Identify a video as one of these categories: News, TV Shows, Live Sport Events, News Analyses. \n
                       When describing the DetailedDescriptionOfEventsAndConversations, consider the following instructions for specific video types:\n
                       * **News:** Pay close attention to transitions, graphics, and on-screen text.\n
                       * **TV Shows:** Describe facial expressions, body language, appearances, and overall mood.\n
                       * **Live Sports Events:** Focus on key moments, like goals or fouls, and describe the overall flow and momentum of the game.\n
                       * **News Analyses:** Identify different perspectives, arguments, and supporting evidence.\n
                       Make sure to mention people's names in the DetailedDescriptionOfEventsAndConversations and in PeopleAppearingAndMentioned as well as any other information about them like their age, occupation, location, etc. \n"""

# One worked example of the expected output format.
VAR_EXAMPLE = """Follow this example for the format of the output:\n
              {
                "Category": "TV Show",
                "DetailedDescriptionOfEventsAndConversations": "The video starts with a man sitting at a dining table, reading a letter. Two Fiji bottles are visible on the benchtop. He has short, light brown hair and a beard. His name is Harrison. The scene changes to Melissa. Melissa says: \"I'm Melissa, and I'm a hairdresser. I'm 41 years old, and I'm from Sydney.\"",
                "BrandsCompanyNamesLogos": ["Lacoste", "Fiji"],
                "KeyLocationsAndScenes": ["Apartment"],
                "KeyThemes": ["Marriage"],
                "PeopleAppearingAndMentioned": [
                "Harrison, 32, Builder, NSW",
                "Melissa, 41, Hairdresser, NSW"
                ]
            }
               """

# Final prompt: the fragments above joined in a fixed order.
video_description_prompt = (
    VAR_VIDEO_SEGMENT
    + VAR_INSTRUCTIONS
    + VAR_CONSTRAINTS
    + VAR_STRUCTURE
    + VAR_CONDITIONS
    + VAR_EXAMPLE
)
 

In [3]:
import base64
import time
import typing
import math
import numpy as np

from google.cloud import aiplatform
from google.protobuf import struct_pb2

#libraries to generate image summaries
from vertexai.vision_models import Video
from vertexai.vision_models import VideoSegmentConfig
from vertexai.vision_models import MultiModalEmbeddingModel
from vertexai.language_models import TextEmbeddingModel
from vertexai.vision_models import Image as vision_model_Image
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part as GenerativeModelPart,
    HarmBlockThreshold,
    HarmCategory,
)
from typing import Any, Dict, List, Literal, Optional, Union

from moviepy.editor import VideoFileClip

# Same request shape as the first cell, but the segment offsets come from
# `start` / `end` and the instruction is the assembled `video_description_prompt`.
contents = [
    {
        "role": "user",
        "parts": [
            {
                "file_data": {
                    "mime_type": "video/mp4",
                    "file_uri": "gs://raw_nine_files/vlt_video_extract/MAAT/MAAT2024_1_A_HBB.mp4",
                },
                "video_metadata": {
                    "start_offset": dict(seconds=start, nanos=0),
                    "end_offset": dict(seconds=end, nanos=0),
                },
            },
            {"text": video_description_prompt},
        ],
    }
]

generative_multimodal_model = GenerativeModel("gemini-1.5-pro-002")

# Sampling settings for this run.
# NOTE(review): response_mime_type='application/json' was left commented out in
# the original cell, so the reply arrives as fenced Markdown text that a later
# cell has to strip before parsing.
generation_config = GenerationConfig(
    temperature=1,
    top_k=40,
    top_p=0.95,
    max_output_tokens=8192,
)

# Every harm category listed here is mapped to the same threshold, BLOCK_NONE.
safety_settings = {
    harm_category: HarmBlockThreshold.BLOCK_NONE
    for harm_category in (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
}

# Send the request; the following cells read the reply from model_response.
model_response = generative_multimodal_model.generate_content(
    contents,
    safety_settings=safety_settings,
    generation_config=generation_config,
)

In [4]:
# Bare last expression: the notebook displays the reply text as a Python string
# repr (escaped newlines and quotes), which exposes the surrounding ```json fence.
model_response.candidates[0].content.parts[0].text

'```json\n{\n  "Category": "TV Show",\n  "DetailedDescriptionOfEventsAndConversations": "The video begins with Jayden, a 26-year-old professional kickboxer from Queensland, arriving at a gathering. He greets Tim, Richard, and John. Richard introduces himself to Jayden. Richard asks Jayden what he does for a living. Jayden says that kickboxing is his full-time job, training twice a day, six days a week. Jayden explains that he\'s not what he appears to be, enjoying snuggles and back tickles when he\'s not fighting. He adds that he\'s a romantic and doesn\'t have a specific type, looking for someone with a good heart. Lucinda, a 43-year-old MC and wedding celebrant from New South Wales, arrives at the same gathering, greeting the other women: Tori and Natasha. Lucinda says that she\'s looking for a confident man, a great cook who is funny and generous of spirit. She unfolds a scroll, revealing her detailed list of desired qualities in a partner, including “handsome” and “spunky,” prompti

In [5]:
# Print the same reply text so its newlines are rendered instead of escaped.
print(model_response.candidates[0].content.parts[0].text)

```json
{
  "Category": "TV Show",
  "DetailedDescriptionOfEventsAndConversations": "The video begins with Jayden, a 26-year-old professional kickboxer from Queensland, arriving at a gathering. He greets Tim, Richard, and John. Richard introduces himself to Jayden. Richard asks Jayden what he does for a living. Jayden says that kickboxing is his full-time job, training twice a day, six days a week. Jayden explains that he's not what he appears to be, enjoying snuggles and back tickles when he's not fighting. He adds that he's a romantic and doesn't have a specific type, looking for someone with a good heart. Lucinda, a 43-year-old MC and wedding celebrant from New South Wales, arrives at the same gathering, greeting the other women: Tori and Natasha. Lucinda says that she's looking for a confident man, a great cook who is funny and generous of spirit. She unfolds a scroll, revealing her detailed list of desired qualities in a partner, including “handsome” and “spunky,” prompting laught

In [6]:
import json

# Raw reply text: first part of the first candidate.
response_text = model_response.candidates[0].content.parts[0].text

# The reply is wrapped in a Markdown code fence (```json ... ```). Remove only
# that outer wrapper. A blanket .replace('```', '') would also delete any triple
# backticks that occur inside the JSON string values themselves.
json_string = response_text.strip()
if json_string.startswith("```"):
    json_string = json_string[3:]
    # Optional language tag right after the opening fence ("json", any case).
    if json_string[:4].lower() == "json":
        json_string = json_string[4:]
if json_string.endswith("```"):
    json_string = json_string[:-3]
json_string = json_string.strip()

# Convert the JSON text to a Python dictionary. strict=False lets the decoder
# accept raw control characters (such as literal newlines) inside string values.
parsed_json = json.loads(json_string, strict=False)

parsed_json
 

{'Category': 'TV Show',
 'DetailedDescriptionOfEventsAndConversations': "The video begins with Jayden, a 26-year-old professional kickboxer from Queensland, arriving at a gathering. He greets Tim, Richard, and John. Richard introduces himself to Jayden. Richard asks Jayden what he does for a living. Jayden says that kickboxing is his full-time job, training twice a day, six days a week. Jayden explains that he's not what he appears to be, enjoying snuggles and back tickles when he's not fighting. He adds that he's a romantic and doesn't have a specific type, looking for someone with a good heart. Lucinda, a 43-year-old MC and wedding celebrant from New South Wales, arrives at the same gathering, greeting the other women: Tori and Natasha. Lucinda says that she's looking for a confident man, a great cook who is funny and generous of spirit. She unfolds a scroll, revealing her detailed list of desired qualities in a partner, including “handsome” and “spunky,” prompting laughter from her 

In [7]:
# Text to export: the fence-stripped JSON text produced by the parsing cell.
text_to_export = json_string

# Destination file name (relative path).
file_name = "output_model1.txt"

# Write as UTF-8 explicitly: the model output contains non-ASCII characters
# (e.g. curly quotes) and open()'s default text encoding is platform dependent.
with open(file_name, "w", encoding="utf-8") as output_file:
    output_file.write(text_to_export)

print(f"The text has been successfully exported to {file_name}")

The text has been successfully exported to output_model1.txt


In [8]:
# Text to export: the unmodified reply text (first part of the first candidate).
text_to_export = model_response.candidates[0].content.parts[0].text

# Destination file name (relative path).
file_name = "output_model_original_text1.txt"

# Write as UTF-8 explicitly: the model output contains non-ASCII characters
# (e.g. curly quotes) and open()'s default text encoding is platform dependent.
with open(file_name, "w", encoding="utf-8") as output_file:
    output_file.write(text_to_export)

print(f"The text has been successfully exported to {file_name}")

The text has been successfully exported to output_model_original_text1.txt


### Let's test with another model


In [9]:
# Switch to a different model. The request (`contents`) built in the earlier
# cell is reused unchanged, so only the model differs between the two runs.
generative_multimodal_model = GenerativeModel("gemini-1.5-flash")

# Same sampling settings as the previous run.
# NOTE(review): response_mime_type='application/json' was left commented out in
# the original cell, so the reply is still parsed from free text afterwards.
generation_config = GenerationConfig(
    temperature=1,
    top_k=40,
    top_p=0.95,
    max_output_tokens=8192,
)

# Every harm category listed here is mapped to the same threshold, BLOCK_NONE.
safety_settings = {
    harm_category: HarmBlockThreshold.BLOCK_NONE
    for harm_category in (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
}

# Rebinds model_response: from here on it holds the second model's reply.
model_response = generative_multimodal_model.generate_content(
    contents,
    safety_settings=safety_settings,
    generation_config=generation_config,
)

In [10]:
# Raw reply text: first part of the first candidate.
response_text = model_response.candidates[0].content.parts[0].text

# The reply may be wrapped in a Markdown code fence (```json ... ```). Remove
# only that outer wrapper. A blanket .replace('```', '') would also delete any
# triple backticks that occur inside the JSON string values themselves.
json_string = response_text.strip()
if json_string.startswith("```"):
    json_string = json_string[3:]
    # Optional language tag right after the opening fence ("json", any case).
    if json_string[:4].lower() == "json":
        json_string = json_string[4:]
if json_string.endswith("```"):
    json_string = json_string[:-3]
json_string = json_string.strip()

# Convert the JSON text to a Python dictionary. strict=False lets the decoder
# accept raw control characters (such as literal newlines) inside string values.
parsed_json = json.loads(json_string, strict=False)

parsed_json

{'Category': 'TV Show',
 'DetailedDescriptionOfEventsAndConversations': "The video starts with two women seated at a table. Lucinda is wearing a pink sparkly jacket and a matching pink skirt. She says: 'So, Tori and Nat, what's on the manifesto?' Nat is wearing a red floral dress. Tori is wearing a floral patterned long-sleeved dress.  Lucinda continues:  'The man, of first down.' Nat replies: 'I mean, yeah, yeah. I don't want him to have a big ego.' Lucinda says: 'Yeah.  And it doesn't have a big ego.  Confident but with nothing to prove.' Tori continues: 'Correct.' Lucinda says: 'I want a nerdy guy.'  Nat replies: 'You're a bit of an intellect, and you need somebody to challenge you on that level.'  Lucinda says: 'Yeah, I love nerds.' She smiles.  The scene changes to a view of a city with buildings lit up at night.  A voice over says:  'Love Lucinda. Here's another one.'  There's a scene change to the three women seated at a table. Lucinda says: 'Yay.' The scene changes to a view of

In [11]:
# Text to export: the fence-stripped JSON text produced by the parsing cell.
text_to_export = json_string

# Destination file name (relative path).
file_name = "output_model2.txt"

# Write as UTF-8 explicitly: the model output can contain non-ASCII characters
# and open()'s default text encoding is platform dependent.
with open(file_name, "w", encoding="utf-8") as output_file:
    output_file.write(text_to_export)

print(f"The text has been successfully exported to {file_name}")

The text has been successfully exported to output_model2.txt


In [12]:
# Text to export: the unmodified reply text (first part of the first candidate).
text_to_export = model_response.candidates[0].content.parts[0].text

# Destination file name (relative path).
file_name = "output_model_original_text2.txt"

# Write as UTF-8 explicitly: the model output can contain non-ASCII characters
# and open()'s default text encoding is platform dependent.
with open(file_name, "w", encoding="utf-8") as output_file:
    output_file.write(text_to_export)

print(f"The text has been successfully exported to {file_name}")

The text has been successfully exported to output_model_original_text2.txt


### Compare the results of the two models