In [8]:
import os
import base64
import asyncio
import aiohttp
import nest_asyncio
import cv2
import moviepy.editor as mp
import whisper
from pytubefix import YouTube
from pytubefix.cli import on_progress

# Apply nest_asyncio to allow nested event loops in Jupyter Notebooks.
# This must run before any coroutine in this cell is awaited.
nest_asyncio.apply()

# Your OpenAI API Key, read from the OPENAI_API_KEY environment variable so
# the secret is never stored in the notebook or committed to version control.
# NOTE: the key that used to be hard-coded here must be treated as leaked
# and revoked. If the variable is unset the key is empty and API requests
# will be rejected.
api_key = os.environ.get("OPENAI_API_KEY", "")

# Function to download video from YouTube
def download_video(url, filename="video.mp4"):
    """Download a YouTube video to a local file.

    Prints the video title, then saves the stream returned by
    ``get_highest_resolution()``.

    Args:
        url: Address of the YouTube video.
        filename: File name handed to the stream's ``download`` call.
    """
    video = YouTube(url, on_progress_callback=on_progress)
    print(f"Downloading: {video.title}")
    video.streams.get_highest_resolution().download(filename=filename)

# Function to extract audio from video and transcribe it with timestamps
def transcribe_audio(video_path):
    """Transcribe the audio track of a video with per-segment timestamps.

    The audio is written to ``audio.wav`` in the working directory and then
    transcribed with Whisper's ``base.en`` model.

    Args:
        video_path: Path of the video file to transcribe.

    Returns:
        One line per transcribed segment, formatted as
        ``[<start>s - <end>s]: <text>`` and joined with newlines. An empty
        string is returned when the video has no audio track.
    """
    video = mp.VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # No audio track: nothing to transcribe.
            return ""
        audio.write_audiofile("audio.wav")
    finally:
        # Release the clip's readers even if the export fails.
        video.close()

    model = whisper.load_model("base.en")
    result = model.transcribe("audio.wav")

    # Collect transcription with timestamps
    return "\n".join(
        f"[{segment['start']:.2f}s - {segment['end']:.2f}s]: {segment['text']}"
        for segment in result['segments']
    )

# Function to extract frames from video at given intervals
def extract_frames(video_path, output_folder, interval=10):
    """Save one JPEG frame every ``interval`` seconds of a video.

    Frames are written to ``output_folder`` as ``frame_<second>.jpg``, where
    ``<second>`` is the zero-padded offset (in whole seconds) of the frame.
    Extraction stops at the first frame that cannot be read.

    Args:
        video_path: Path of the video file to sample.
        output_folder: Directory for the frames; created if missing.
        interval: Spacing between sampled frames, in seconds.

    Raises:
        ValueError: If no usable frame rate can be read from the video
            (for example because the file could not be opened).
    """
    os.makedirs(output_folder, exist_ok=True)

    video_capture = cv2.VideoCapture(video_path)
    try:
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes
        # every seek position drift further from the intended timestamp.
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        if not fps > 0:
            raise ValueError(f"Could not read a valid frame rate from {video_path!r}")
        duration = int(total_frames / fps)

        for sec in range(0, duration, interval):
            video_capture.set(cv2.CAP_PROP_POS_FRAMES, round(sec * fps))
            success, frame = video_capture.read()
            if not success:
                break
            output_filename = os.path.join(output_folder, f"frame_{sec:04d}.jpg")
            cv2.imwrite(output_filename, frame)
    finally:
        # Always free the capture handle, including on errors.
        video_capture.release()

# Function to encode image in base64
def encode_image(image_path):
    """Return the bytes of the file at ``image_path`` as base64 text.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file contents, decoded as a UTF-8 string.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Asynchronous function to analyze image using Vision API
async def analyze_image(base64_image, session, time_range):
    """Ask the chat completions endpoint to describe a single video frame.

    Args:
        base64_image: Base64-encoded JPEG data of the frame.
        session: Open ``aiohttp`` client session used to send the request.
        time_range: Label for the part of the video the frame belongs to;
            it is included in the prompt as ``Frame from <time_range>``.

    Returns:
        The decoded JSON response. When the request fails, the body cannot be
        decoded, or the response carries no ``choices`` (for example an API
        error such as rate limiting), a placeholder dict of the same shape is
        returned so callers can always read
        ``result['choices'][0]['message']['content']``.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe in detail what you see on the screen. Don't list things out, just write one paragraph describing what you see. Make it detailed, I want the small and the big things. Don't use any markdown formatting."
                    },
                    {
                        "type": "text",
                        "text": f"Frame from {time_range}"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "low"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 150
    }

    fallback = {"choices": [{"message": {"content": "Error in processing the image"}}]}

    try:
        async with session.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) as response:
            result = await response.json()
    except (aiohttp.ClientError, asyncio.TimeoutError, ValueError) as e:
        # Covers connection failures, timeouts and undecodable bodies, so a
        # single bad frame does not abort the whole batch.
        print(f"Error analyzing image: {e}")
        return fallback

    # API-level errors are valid JSON but contain "error" instead of "choices".
    if not isinstance(result, dict) or not result.get("choices"):
        error = result.get("error", result) if isinstance(result, dict) else result
        print(f"Error analyzing image: {error}")
        return fallback

    return result

# Function to aggregate the frame descriptions
async def aggregate_descriptions(descriptions, transcript):
    """Summarize a video from its frame descriptions and transcript.

    The request payload is also dumped to ``prompt.txt`` in the working
    directory so the exact prompt can be inspected.

    Args:
        descriptions: Iterable of frame description strings; they are joined
            with newlines into a single prompt part.
        transcript: Transcript text, sent as ``Transcript: <transcript>``.

    Returns:
        The text content of the first choice in the API response.

    Raises:
        RuntimeError: If the response contains no ``choices`` (for example
            when the API returns an error object).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": """What is this video about based on the information you have? Make it long and detailed, but just in paragraph 
                     form. Maximum 3 paragraphs. Don't list things out, and don't use any markdown formatting. I want you to primarily use the transcript of the video to figure out
                     what it is about and how best to summarize it. However, I do want you to use some of the descriptions from the frames within the video as well.
                     When you do, I want you to use language such as "at 2 minutes 10 seconds, an illustration of XYZ appears, which highlights ABC" as an example.
                     You are also provided some of the temporal information with the images, so I want you to use that in your summary if that makes sense."""},
                    {"type": "text", "text": "\n".join(descriptions)},
                    {"type": "text", "text": f"Transcript: {transcript}"}
                ],
            }
        ],
        "max_tokens": 500
    }

    # Write the payload to a text file. UTF-8 is explicit because transcripts
    # and descriptions may contain characters the platform default cannot encode.
    with open("prompt.txt", "w", encoding="utf-8") as prompt_file:
        prompt_file.write(str(payload))

    async with aiohttp.ClientSession() as session:
        async with session.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) as response:
            result = await response.json()

    # Surface API errors with their message instead of an opaque KeyError.
    choices = result.get("choices") if isinstance(result, dict) else None
    if not choices:
        error = result.get("error", result) if isinstance(result, dict) else result
        raise RuntimeError(f"OpenAI API did not return a summary: {error}")

    return choices[0]['message']['content']

# Format two offsets given in whole seconds as "MM:SS-MM:SS"
def _format_time_range(start_time, end_time):
    return f"{start_time // 60:02d}:{start_time % 60:02d}-{end_time // 60:02d}:{end_time % 60:02d}"


# Main logic with async processing
async def process_video(video_url, video_path="video.mp4", output_folder="images", interval=10):
    """Download a video, describe sampled frames, transcribe it and summarize it.

    Steps: download the video, extract one frame every ``interval`` seconds,
    request a description for every frame concurrently, transcribe the audio,
    then request an overall summary. Descriptions, the transcript and the
    summary are printed.

    Args:
        video_url: Address of the YouTube video.
        video_path: Local file the video is downloaded to.
        output_folder: Directory the extracted frames are written to.
        interval: Seconds between sampled frames.
    """
    # Download video
    download_video(video_url, video_path)

    # Extract frames from the video
    extract_frames(video_path, output_folder, interval)

    # Collect only files named frame_<second>.jpg and take each timestamp from
    # the file name. Relying on the position in the directory listing would
    # mislabel every frame as soon as the folder holds any other file.
    # NOTE: frames left over from an earlier run are still picked up.
    frames = []
    for name in os.listdir(output_folder):
        stem, ext = os.path.splitext(name)
        second = stem[len("frame_"):]
        if stem.startswith("frame_") and ext == ".jpg" and second.isdecimal():
            frames.append((int(second), name))
    frames.sort()  # numeric order, also correct past frame_9999

    time_ranges = [_format_time_range(second, second + interval) for second, _ in frames]

    # Analyze each frame asynchronously
    async with aiohttp.ClientSession() as session:
        tasks = [
            analyze_image(encode_image(os.path.join(output_folder, name)), session, time_range)
            for (_, name), time_range in zip(frames, time_ranges)
        ]
        results = await asyncio.gather(*tasks)

    descriptions = []
    for i, ((_, name), time_range, result) in enumerate(zip(frames, time_ranges, results)):
        try:
            description = result['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            # Response without the expected shape (e.g. an API error object).
            description = "Error in processing the image"
        # Append time range to the description with frame number
        descriptions.append(f"Frame {i} ({time_range}): {description}")
        print(f"Description for {name} ({time_range}): {description}")

    # Extract and transcribe audio with timestamps
    transcription = transcribe_audio(video_path)
    print("Transcription with Timestamps:")
    print(transcription)

    summary = await aggregate_descriptions(descriptions, transcription)
    print("Video Summary:")
    print(summary)

# Video to download, describe and summarize
video_url = "https://www.youtube.com/watch?v=GL2uFYi86kk"

# Top-level await: this runs as written in a Jupyter/IPython cell. In a plain
# Python script an await outside a coroutine is a syntax error.
await process_video(video_url)

Description for frame_0000.jpg: In the frame from the video, we can observe a traffic situation likely caused by poor weather conditions, possibly heavy rain:

1. **Weather Conditions**: The scene is marked by significant rainfall, as evidenced by water droplets on the windshield, which likely contributes to reduced visibility and potentially slippery road conditions.

2. **Road Layout**: The road features a slight curve, as indicated by the yellow curve sign to the right. This suggests that drivers need to navigate carefully due to the road's direction and the poor weather.

3. **Traffic Congestion**: There is a line of vehicles halted or moving very slowly. The presence of diverse vehicles, including a large truck (possibly a Tata, based on the visible branding) and smaller cars, indicates
Description for frame_0003.jpg: In the frame from the video titled "Giant Rock Hits Vehicles on Road in India," several significant observations can be made:

1. **Setting**: The scene is set on a 