In [1]:
import os
import base64
import asyncio
import aiohttp
import nest_asyncio
import cv2
import moviepy.editor as mp
import whisper
from pytubefix import YouTube
from pytubefix.cli import on_progress

# Apply nest_asyncio to allow nested event loops in Jupyter Notebooks.
# This must run before any coroutine below is awaited/driven from the cell.
nest_asyncio.apply()

# OpenAI API key, read from the OPENAI_API_KEY environment variable.
# Never hard-code the key in the notebook: anything committed or shared with
# the file leaks the credential. Falls back to an empty string when the
# variable is unset, in which case API requests will be rejected.
API_KEY = os.environ.get("OPENAI_API_KEY", "")

# General utility functions
def format_timestamp(seconds):
    """Render a duration given in seconds as a zero-padded ``MM:SS`` string.

    Fractional seconds are truncated. Minutes are not wrapped at 60, so a
    duration of an hour or more is shown as e.g. ``75:30``.
    """
    whole_minutes, leftover_seconds = divmod(seconds, 60)
    return f"{int(whole_minutes):02}:{int(leftover_seconds):02}"

def encode_image(image_path):
    """Read a file from disk and return its contents as base64 text.

    Returns the base64 encoding of the file's bytes as a UTF-8 ``str``.
    If the file cannot be read or encoded, the error is printed and
    ``None`` is returned instead of raising.
    """
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
            encoded = base64.b64encode(raw_bytes)
            return encoded.decode('utf-8')
    except Exception as e:
        print(f"Error encoding image: {e}")
        return None

# Video processing functions
def download_video(url, filename="video.mp4"):
    """Download a YouTube video to ``filename`` and report its title.

    The stream returned by ``get_highest_resolution()`` is the one saved.
    Returns the video title on success. On any failure the error is
    printed and the placeholder string "Unknown Title" is returned.
    """
    try:
        video = YouTube(url, on_progress_callback=on_progress)
        title = video.title
        print(f"Downloading: {title}")
        video.streams.get_highest_resolution().download(filename=filename)
    except Exception as e:
        print(f"Error downloading video: {e}")
        return "Unknown Title"
    return title

def extract_frames(video_path, output_folder, interval=10):
    """Save one JPEG frame from a video every ``interval`` seconds.

    Frames are written to ``output_folder`` (created if missing) as
    ``image_1.jpg``, ``image_2.jpg``, ... in chronological order, so the
    n-th file is the frame closest to ``(n - 1) * interval`` seconds.

    Args:
        video_path: Path of the video file to sample.
        output_folder: Directory that receives the JPEG files.
        interval: Spacing between sampled frames, in seconds (must be > 0).

    Raises:
        ValueError: If ``interval`` is not positive.

    If the video cannot be opened or reports no usable frame rate, an
    error is printed and no frames are written.
    """
    if interval <= 0:
        # A zero/negative step would never advance through the video.
        raise ValueError("interval must be a positive number of seconds")

    os.makedirs(output_folder, exist_ok=True)

    video_capture = cv2.VideoCapture(video_path)
    try:
        if not video_capture.isOpened():
            print(f"Error extracting frames: could not open {video_path}")
            return

        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes
        # every sample land earlier than intended, and the error accumulates.
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        if not fps > 0:
            # Also catches NaN. With fps == 0 the frame step would be 0 and
            # the sampling loop would never terminate.
            print(f"Error extracting frames: invalid frame rate ({fps}) for {video_path}")
            return

        sample_index = 0
        frame_count = 1  # 1-based index used in the output file names
        while True:
            # Compute each position from the sample index (not by repeated
            # addition) so rounding errors do not accumulate.
            current_frame = round(sample_index * interval * fps)
            if current_frame >= total_frames:
                break

            video_capture.set(cv2.CAP_PROP_POS_FRAMES, current_frame)
            success, frame = video_capture.read()
            if not success:
                break

            output_filename = os.path.join(output_folder, f"image_{frame_count}.jpg")
            cv2.imwrite(output_filename, frame)
            frame_count += 1
            sample_index += 1
    finally:
        # Always free the decoder, even if reading or writing raised.
        video_capture.release()

def transcribe_audio(video_path):
    """Extract a video's audio track and transcribe it with timestamps.

    The audio is written to ``audio.wav`` in the current working directory
    (this function does not delete it) and transcribed with the Whisper
    ``base.en`` model.

    Args:
        video_path: Path of the video file whose audio should be transcribed.

    Returns:
        One line per transcript segment in the form
        ``[MM:SS - MM:SS]: text``, joined with newlines. An empty string is
        returned (and the error printed) if the video has no audio track or
        anything else fails.
    """
    audio_path = "audio.wav"
    try:
        video = mp.VideoFileClip(video_path)
        try:
            if video.audio is None:
                print("Error transcribing audio: video has no audio track")
                return ""
            video.audio.write_audiofile(audio_path)
        finally:
            # Release the file handle / ffmpeg reader held by the clip.
            video.close()

        model = whisper.load_model("base.en")
        result = model.transcribe(audio_path)

        transcript_with_timestamps = [
            f"[{format_timestamp(segment['start'])} - {format_timestamp(segment['end'])}]: {segment['text']}"
            for segment in result['segments']
        ]
        return "\n".join(transcript_with_timestamps)
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return ""

# Asynchronous API request functions
async def analyze_image(base64_image, session, start_time, end_time):
    """Ask the OpenAI chat-completions API to describe a single video frame.

    Args:
        base64_image: The JPEG frame, base64-encoded as text.
        session: An open ``aiohttp.ClientSession`` used to send the request.
        start_time: Start of the time window the frame represents, in seconds.
        end_time: End of that time window, in seconds.

    Returns:
        The decoded JSON response. It is guaranteed to contain
        ``['choices'][0]['message']['content']``: if the request fails or the
        API answers with an error body, the problem is printed and a
        placeholder response with the content
        "Error in processing the image" is returned instead.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}"
    }

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe in detail the visual content of the image provided. Focus on capturing both small details and the overall context. Avoid listing objects; instead, compose a coherent paragraph that conveys what is happening in the frame."
                    },
                    {
                        "type": "text",
                        "text": f"This frame is from [{format_timestamp(start_time)} - {format_timestamp(end_time)}]."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "low"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 150
    }

    # Same shape as a successful response, so callers can index it uniformly.
    fallback = {"choices": [{"message": {"content": "Error in processing the image"}}]}

    try:
        async with session.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) as response:
            status = response.status
            result = await response.json()
    except Exception as e:
        print(f"Error analyzing image: {e}")
        return fallback

    # API-level failures (bad key, rate limit, ...) arrive as JSON without
    # "choices"; returning them unchanged would make the caller's lookup raise.
    try:
        result['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        print(f"Error analyzing image: unexpected API response (HTTP {status}): {result}")
        return fallback

    return result

async def aggregate_descriptions(descriptions, transcript, title):
    """Combine frame descriptions and the transcript into a video summary.

    Sends one chat-completions request containing the summary instructions,
    the frame descriptions and the transcript. After a response is received,
    the request payload is also saved to ``prompt.txt`` (best-effort, for
    inspection).

    Args:
        descriptions: Iterable of per-frame description strings.
        transcript: Timestamped transcript text.
        title: Title of the video, embedded in the prompt.

    Returns:
        The summary text, or "Error generating summary." if the request
        fails or the API response does not contain a completion.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}"
    }

    content_to_send = [
        {
            "type": "text",
            "text": f"""Create a relatively concise summary of the video titled '{title}'. The summary should be no longer than three paragraphs and should seamlessly integrate both the visual elements and the spoken content from the transcript. Reference visual and audio elements naturally, using temporal cues to connect these domains into an immersive and accurate summary of the video content. Don't cite the timestamp as if it were an in-text citation, instead use a more linguistic approach, segueing between the audio components (from the transcript), visual components (from the descriptions), and the temporal data you have access to which is associated with the audio and visual components. Make it sound a bit more natural and human, and less like an AI. Less descriptive imagery, we aren't in highschool English class so we don't need to evaluate everything. Just use what you have and do a damn good summary."""
        },
        {"type": "text", "text": "\n".join(descriptions)},
        {"type": "text", "text": f"Transcript: {transcript}"}
    ]

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": content_to_send,
            }
        ]
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) as response:
                result = await response.json()
    except Exception as e:
        print(f"Error aggregating descriptions: {e}")
        return "Error generating summary."

    # Save the prompt content to a file. This is a debugging aid only, so a
    # failure here must not throw away a summary that was already received.
    # UTF-8 is explicit because the transcript may contain characters the
    # platform default encoding cannot represent.
    try:
        with open("prompt.txt", "w", encoding="utf-8") as prompt_file:
            prompt_file.write(str(payload))
    except (OSError, ValueError) as e:
        print(f"Error saving prompt: {e}")

    try:
        return result['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        # Error bodies from the API have no "choices"; show what came back.
        print(f"Error aggregating descriptions: unexpected API response: {result}")
        return "Error generating summary."

# Main processing function
def _frame_sort_key(filename):
    """Sort key ordering ``image_<n>.jpg`` files by their numeric index ``n``."""
    stem = os.path.splitext(filename)[0]
    suffix = stem.rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), filename)
    # Files without a numeric suffix go last, in name order.
    return (1, 0, filename)


async def process_video(video_url, video_path="video.mp4", output_folder="images", interval=10):
    """Download a video and produce a summary from its frames and audio.

    Steps: download the video, extract one frame every ``interval`` seconds,
    describe each frame via the vision API (requests run concurrently),
    transcribe the audio, then ask the API for a combined summary.

    Args:
        video_url: URL of the YouTube video.
        video_path: Where the downloaded video is stored (it is kept).
        output_folder: Temporary folder for extracted frames; its files and
            the folder itself are deleted afterwards.
        interval: Seconds between extracted frames.

    Returns:
        The summary text, or ``None`` if an error interrupted processing
        (the error is printed).
    """
    try:
        # Download the video and get its title
        video_title = download_video(video_url, video_path)

        # Extract frames from the video
        extract_frames(video_path, output_folder, interval)

        # A plain lexicographic sort would put image_10.jpg before
        # image_2.jpg and attach the wrong timestamps, so sort numerically.
        frame_files = sorted(os.listdir(output_folder), key=_frame_sort_key)

        # Analyze each frame asynchronously. The time window is stored next
        # to each task so that a skipped frame cannot shift the timestamps
        # of the frames that follow it.
        time_ranges = []
        tasks = []
        async with aiohttp.ClientSession() as session:
            for i, image_file in enumerate(frame_files):
                image_path = os.path.join(output_folder, image_file)
                base64_image = encode_image(image_path)
                if not base64_image:
                    continue

                start_time = i * interval
                end_time = start_time + interval
                time_ranges.append((start_time, end_time))
                tasks.append(analyze_image(base64_image, session, start_time, end_time))

            results = await asyncio.gather(*tasks)

        descriptions = []
        for (start_time, end_time), result in zip(time_ranges, results):
            try:
                description = result['choices'][0]['message']['content']
            except (KeyError, IndexError, TypeError):
                # One malformed response should not abort the whole run.
                description = "Error in processing the image"
            descriptions.append(f"[{format_timestamp(start_time)} - {format_timestamp(end_time)}]: {description}")

        # Extract and transcribe audio with timestamps
        transcription = transcribe_audio(video_path)

        # Generate the final summary
        summary = await aggregate_descriptions(descriptions, transcription, video_title)
        return summary

    except Exception as e:
        print(f"Error processing video: {e}")
        return None
    finally:
        # Cleanup temporary files and folders (except video)
        try:
            if os.path.exists("audio.wav"):
                os.remove("audio.wav")
            if os.path.exists(output_folder):
                for image_file in os.listdir(output_folder):
                    os.remove(os.path.join(output_folder, image_file))
                os.rmdir(output_folder)
        except Exception as cleanup_error:
            print(f"Error during cleanup: {cleanup_error}")

# Example usage
# NOTE: a top-level ``await`` is only valid where an event loop is already
# running for the cell (e.g. Jupyter/IPython). In a plain script, drive the
# coroutine with asyncio.run(...) instead.
video_url = "https://www.youtube.com/watch?v=EkAQAi3a4js"
await process_video(video_url)

Downloading: Multiple Regression, Clearly Explained!!!
 ↳ |████████████████████████████████████████████| 100.0%

CancelledError: 