In [1]:
import os
from pathlib import Path
import shutil

import dotenv
from openai import OpenAI

from media_extractor import decode_input
import datauri

# Load environment variables (including the OpenAI API key) from a local .env file.
dotenv.load_dotenv()

# Fail fast when the key is missing *or blank* (e.g. `OPENAI_API_KEY=` in .env),
# instead of deferring the failure to the first API request.
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in .env file")

# No key is passed explicitly; the SDK is expected to read it from the environment.
client = OpenAI()

In [2]:
# Read the system prompt with an explicit encoding so the result does not
# depend on the platform's locale default; surrounding whitespace is trimmed.
SYSTEM_PROMPT = Path("system_prompt.txt").read_text(encoding="utf-8").strip()

In [3]:
# Path of the input clip that the rest of the notebook processes.
video_file = "input.mov"

# Inline preview of the input clip (comment out the two lines below to skip it).
from IPython.display import Video
Video(video_file, width=320)

In [4]:
# Split the input video into an audio data URI and a collection of image URIs.
# NOTE(review): presumably image_uris are frames sampled from the video as
# data URIs — confirm against media_extractor.decode_input.
audio_uri, image_uris = decode_input(video_file)
# Show only the first 50 characters; the full base64 payload is far too long to display.
audio_uri[:50]

'data:audio/mpeg;base64,SUQzBAAAAAACXVRYWFgAAAASAAA'

In [5]:
# Expose the audio data URI as a file path and send it to the transcription endpoint.
with datauri.as_tempfile(audio_uri) as audio_file:
    transcription = client.audio.transcriptions.create(
        file=Path(audio_file),
        model="whisper-1",
    )

# The transcribed speech is used as the user's prompt in the next cell.
user_prompt = transcription.text
user_prompt

"Can you describe what I'm wearing right now?"

In [6]:
# One "image_url" content part per entry in image_uris.
image_parts = [
    {
        "type": "image_url",
        "image_url": {"url": image_uri, "detail": "auto"},
    }
    for image_uri in image_uris
]

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        # The system message goes first so the instructions precede the user
        # turn they apply to (the original sent it after the user message).
        {
            "role": "system",
            "content": SYSTEM_PROMPT,
        },
        # User turn: the transcribed question followed by the images.
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                *image_parts,
            ],
        },
    ],
)

# Text of the first (and, with default settings, only) completion choice.
response_text = response.choices[0].message.content
response_text

"Hey! From what I can see in the video, you're wearing a gray hoodie. Looks comfy!"

In [7]:
# Synthesize the model's reply as MP3 speech.
audio = client.audio.speech.create(
    input=response_text,
    model="tts-1",
    voice="nova",
    response_format="mp3",
)

# Wrap the raw audio bytes in an audio/mpeg data URI for the playback cell.
speech_bytes = audio.read()
response_audio_uri = datauri.from_bytes(speech_bytes, "audio/mpeg")

In [11]:
from IPython.display import Audio

# Expose the response data URI as a file path and render an audio player for it.
# The player is built and displayed inside the `with` block, while the path
# yielded by as_tempfile is still in scope.
with datauri.as_tempfile(response_audio_uri) as response_audio_file:
    player = Audio(response_audio_file)
    display(player)