In [1]:
from dotenv import load_dotenv
import openai
import os
from openai import AzureOpenAI
from IPython.display import display, HTML, JSON, Markdown, Image
import base64 

load_dotenv()
AZURE_OPENAI_ENDPOINT=os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY=os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_API_VERSION=os.getenv("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_GPT4o_DEPLOYMENT=os.getenv("AZURE_OPENAI_GPT4o_DEPLOYMENT")

client = openai.AzureOpenAI(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_KEY,
        api_version=AZURE_OPENAI_API_VERSION
)

print(f"Model: {AZURE_OPENAI_GPT4o_DEPLOYMENT}; API Version:{AZURE_OPENAI_API_VERSION}")
print("Azure OpenAI model is ready to use!")

Model: gpt-4o; API Version:2024-10-21
Azure OpenAI model is ready to use!


In [2]:
def call_gpt4o_with_imgs(image_list):
    text=[
    {"role": "system", "content": "You are generating a video summary. Please provide a summary of the video using the frames from the video provided."},
    {"role": "user", "content": [
        *map(lambda x: {"type": "image_url", 
                        "image_url": {
                            "url": f'data:image/jpg;base64,{x}', 
                            "detail": "low"}}, image_list)
        ],
    }
    ]
    response = client.chat.completions.create(
        model=AZURE_OPENAI_GPT4o_DEPLOYMENT,
        messages = text,
        temperature=0.0
    )
    return response.choices[0].message.content

def call_gpt4o(transcript):
    text=[
        {"role": "system", "content": "You are generating a video summary. Please provide a summary of the video transcript."},
        {"role": "user", "content": transcript}
    ]
    response = client.chat.completions.create(
        model=AZURE_OPENAI_GPT4o_DEPLOYMENT,
        messages = text,
        temperature=0.0
    )
    return response.choices[0].message.content

In [3]:
import cv2
from moviepy import VideoFileClip
# moviepy doc: https://zulko.github.io/moviepy/getting_started/updating_to_v2.html
import time
import base64

# We'll be using the OpenAI DevDay Keynote Recap video. You can review the video here: https://www.youtube.com/watch?v=h02ti0Bl6zk
VIDEO_PATH = "data/keynote_recap.mp4"

In [4]:
import os
def process_video(video_path, seconds_per_frame=2):
    base64Frames = []
    base_video_path, _ = os.path.splitext(video_path)

    video = cv2.VideoCapture(video_path)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = video.get(cv2.CAP_PROP_FPS)
    frames_to_skip = int(fps * seconds_per_frame)
    curr_frame=0

    # Loop through the video and extract frames at specified sampling rate
    while curr_frame < total_frames - 1:
        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
        curr_frame += frames_to_skip
    video.release()

    # Extract audio from video
    audio_path = f"{base_video_path}.mp3"
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_path, bitrate="32k")

    print(f"Extracted {len(base64Frames)} frames")
    print(f"Extracted audio to {audio_path}")
    return base64Frames, audio_path

# Extract 1 frame per second. You can adjust the `seconds_per_frame` parameter to change the sampling rate
base64Frames, audio_path = process_video(VIDEO_PATH, seconds_per_frame=1)

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'isommp42', 'creation_time': '2023-12-05T19:15:46.000000Z'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1280, 720], 'bitrate': 191, 'fps': 29.97002997002997, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'creation_time': '2023-12-05T19:15:46.000000Z', 'handler_name': 'ISO Media file produced by Google Inc. Created on: 12/05/2023.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 44100, 'bitrate': 127, 'metadata': {'Metadata': '', 'creation_time': '2023-12-05T19:15:46.000000Z', 'handler_name': 'ISO Media file produced by Google Inc. Created on: 12/05/2023.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 211.0, 'bitrate': 323, 'start': 0.0, 'defaul

                                                                      

MoviePy - Done.
Extracted 218 frames
Extracted audio to data/keynote_recap.mp3




In [5]:

## Display the frames and audio for context
display_handle = display(None, display_id=True)
for img in base64Frames:
    display_handle.update(Image(data=base64.b64decode(img.encode("utf-8")), width=600))
    time.sleep(0.025)

In [6]:
# print the number of frames
print(f"Total frames: {len(base64Frames)}")

Total frames: 218


### Example 1: Summarization
The visual summary is generated by sending the model only the frames from the video. With just the frames, the model is likely to capture the visual aspects, but will miss any details discussed by the speaker.

In [7]:
# grab 20 frames and send them to GPT-4o for summarization
# create a loop to grab every 20 frames and send them to OpenAI for summarization from base64Frames
import copy 

nr_of_pics = len(base64Frames)
nr_of_pics_per_round = 20
start = 0
end = nr_of_pics_per_round-1
messages = []
runs = 1
accumulated_result = ""
while end < nr_of_pics:
    partial_frames = copy.deepcopy(base64Frames[start:end])
    result = call_gpt4o_with_imgs(partial_frames)
    print(f"Run {runs} - {start} to {end}")
    display(result)
    accumulated_result += result
    if nr_of_pics - end < nr_of_pics_per_round:
        break
    start = end+1
    end = start + nr_of_pics_per_round-1
    runs += 1
partial_frames = copy.deepcopy(base64Frames[end+1:nr_of_pics-1])
result = call_gpt4o_with_imgs(partial_frames)
accumulated_result += result

Run 1 - 0 to 19


'The video is a recap of the OpenAI DevDay event, showcasing the venue and the keynote presentation. It begins with the title "OpenAI DevDay" and "Keynote Recap," followed by scenes of the event location, which is adorned with OpenAI branding. The video captures the bustling atmosphere as attendees gather in a large conference hall. The keynote presentation is highlighted, with a speaker on stage addressing the audience. The video concludes with the OpenAI DevDay logo.'

Run 2 - 20 to 39


'The video appears to be a presentation or keynote event where a speaker is introducing and discussing advancements in AI technology. The main focus is on "GPT-4 Turbo," an enhanced version of the GPT-4 model. The speaker is on stage, likely explaining the features and improvements of this new model. Additionally, there is a demonstration of a "JSON Mode: ON" feature, suggesting a technical aspect or functionality related to data handling or output format. The audience is visible, indicating a live event setting.'

Run 3 - 40 to 59


'The video appears to be a presentation discussing advancements in technology, specifically focusing on JSON mode, function calling, and improvements in AI capabilities. The speaker highlights the transition from simple commands to more complex function calls, demonstrating how AI can now handle multiple tasks simultaneously. The presentation also covers six key areas of improvement, including context length, control, and better knowledge. Visuals include a JSON example, a comparison of function calling before and after improvements, and a timeline reference to September 2021, indicating a significant update or release during that time.'

Run 4 - 60 to 79


'The video features a presentation that took place in April 2023. The presenter, whose face is blurred, is discussing advancements in AI technology. Key topics include the introduction of DALL-E 3, GPT-4 with Turbo and Vision capabilities, and TTS (Text-to-Speech) technology. The presentation also covers the concept of Custom Models, indicating a focus on personalized or adaptable AI solutions. The setting appears to be a formal event with an audience, as indicated by the stage setup and the presence of a large screen displaying the topics being discussed.'

Run 5 - 80 to 99


'The video features a presentation where a speaker, whose face is blurred, discusses advancements in technology. The speaker is seen on stage, delivering a talk about increasing efficiency, specifically mentioning "2x tokens per minute," which suggests a focus on improving processing speed or capacity. Additionally, there is a segment showing a user interface for requesting a limit increase, specifically for the "gpt-3.5-turbo" model, indicating a discussion about enhancing or scaling up usage limits for a particular AI model. The presentation appears to be aimed at an audience interested in technological improvements and AI capabilities.'

Run 6 - 100 to 119


'The video appears to be a presentation about GPT-4 Turbo, focusing on its pricing and efficiency. The presenter discusses the cost benefits of GPT-4 Turbo, highlighting that it uses 3 times less input tokens and 2 times less output tokens compared to previous models. The presentation is part of an OpenAI event, as indicated by the "OPENAI DEV DAY" branding. The video also showcases a variety of applications or tools, possibly powered by GPT technology, displayed in a grid format. The term "GPTs" is emphasized, suggesting a focus on the capabilities or applications of GPT models.'

Run 7 - 120 to 139


'The video features a presentation where the speaker discusses a framework involving "Instructions," "Expanded knowledge," and "Actions." The speaker emphasizes the concept of building with natural language, suggesting a focus on leveraging natural language processing or understanding in technology or software development. The presentation appears to be aimed at an audience interested in advancements in technology, possibly in the context of AI or machine learning.'

Run 8 - 140 to 159


'The video appears to be a presentation or keynote event, likely related to technology or software. It begins with a display of three icons on a screen, possibly representing different aspects or features of a product or service. A speaker, whose face is blurred, is seen on stage addressing an audience. The presentation includes a demonstration of a user interface, showcasing categories such as Programming, Data Analysis, Education, Lifestyle, and Just for Fun. Featured GPTs (Generative Pre-trained Transformers) are highlighted, including Reactify, Cumulus, Mid-century Future, and Autodeck, with a mention of DALL-E, an AI model for generating images. The video concludes with an animation of a rotating cube with the text "OPENAI" on its sides, suggesting the involvement of OpenAI in the presentation.'

Run 9 - 160 to 179


'The video appears to be a presentation from an OpenAI event, likely a developer day, as indicated by the "OPENAI DEVDAY" text. The speaker, whose face is blurred, is discussing various features related to APIs. The presentation highlights key concepts such as "Threading," "Retrieval," "Code Interpreter," and "Function Calling," which are displayed on a large screen behind the speaker. The event seems to be focused on introducing or explaining these technical features to an audience, likely developers or tech enthusiasts.'

Run 10 - 180 to 199


'The video appears to be a presentation at an event called "OpenAI Dev Day." The speaker, whose face is blurred, is standing on a stage with a backdrop that repeatedly displays "OPENAI DEV DAY." The speaker is dressed casually in a green sweater and dark pants, and is holding a clicker, suggesting they are giving a talk or presentation, possibly about developments or updates related to OpenAI. The audience is visible in silhouette, indicating a live event setting.'

In [8]:
final = call_gpt4o(accumulated_result)
display(final)

"The video is a recap of the OpenAI DevDay event, highlighting the venue, keynote presentation, and various advancements in AI technology. The event features a bustling atmosphere with attendees gathered in a large conference hall. Key topics discussed include the introduction of GPT-4 Turbo, DALL-E 3, and advancements in Text-to-Speech technology. The presentation emphasizes improvements in AI capabilities, such as increased efficiency, enhanced processing speed, and the ability to handle complex tasks. The speaker, whose face is blurred, discusses features like JSON mode, function calling, and custom models, aimed at developers and tech enthusiasts. Visual elements include a demonstration of a user interface with categories like Programming and Data Analysis, and a focus on the cost benefits and efficiency of GPT-4 Turbo. The event concludes with the OpenAI logo, underscoring the company's role in these technological advancements."