In [65]:
import sys
print(sys.executable)
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install faster-whisper openai requests tqdm transformers pillow opencv-python torch torchvision torchaudio
!{sys.executable} -m pip install ipywidgets

/Users/estellekim/miniconda3/envs/CIS5810_hw2/bin/python
[0mINFO: pip is looking at multiple versions of opencv-python to determine which version is compatible with other requirements. This could take a while.
[0mCollecting pydantic-core==2.41.4 (from pydantic<3,>=1.9.0->openai)
  Using cached pydantic_core-2.41.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (7.3 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Using cached pydantic-2.12.3-py3-none-any.whl.metadata (87 kB)
Collecting jiter<1,>=0.10.0 (from openai)
  Using cached jiter-0.11.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Using cached httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is still looking at multiple versions of opencv-python to 

In [66]:
import os
from pathlib import Path
from difflib import SequenceMatcher
import json
import time
import requests

#openai for LLM functionality
OPENAI_API_KEY = ""
#assembly for video to text script
ASSEMBLYAI_API_KEY = ""
#google cloud for video analysis (also try Amazon Rekognition, Microsoft Azure Video Indexer, Clarifai)
GCP_API_KEY = ""

VIDEO_PATH = "good_place_clip.mp4"


In [67]:
def transcribe_to_text(video_path: str) -> dict:
    headers = {"authorization": ASSEMBLYAI_API_KEY}
    # uplkoad video
    upload_url = "https://api.assemblyai.com/v2/upload"
    def read_file(filename, chunk_size=5242880):
        with open(filename, 'rb') as f:
            while True:
                data = f.read(chunk_size)
                if not data:
                    break
                yield data
    up_resp = requests.post(upload_url, headers=headers, data=read_file(video_path))
    audio_url = up_resp.json()['upload_url']

    # transcribe
    transcribe_url = "https://api.assemblyai.com/v2/transcript"
    transcribe_req = {"audio_url": audio_url, 
                      "speaker_labels": True,
                      "iab_categories": True, # topics
                      "entity_detection": True,
                      "sentiment_analysis": True,
                      "auto_chapters": True}
    t_resp = requests.post(transcribe_url, headers=headers, json=transcribe_req)
    transcript_id = t_resp.json()['id']

    # keep polling until complete
    poll_url = f"{transcribe_url}/{transcript_id}"
    while True:
        poll_resp = requests.get(poll_url, headers=headers)
        status = poll_resp.json()['status']
        if status == 'completed':
            return poll_resp.json()
        elif status == 'error':
            raise RuntimeError(poll_resp.json()['error'])
        time.sleep(5)


In [68]:
# print(os.getcwd())
# transcript_result = transcribe_to_text(VIDEO_PATH)
# transcript_text = transcript_result.get("text", "")

In [69]:
# print("Text:")
# print(transcript_result.get("text", "")) 

# print("Entities:")
# for e in transcript_result.get("entities", []):
#     print(f"{e['entity_type']}: {e['text']}")

# print("Categories:")
# if "iab_categories_result" in transcript_result:
#     summary = transcript_result["iab_categories_result"].get("summary", {})
#     for name in summary.items():
#         print(name)

In [70]:
from faster_whisper import WhisperModel

model = WhisperModel("small")
segments, info = model.transcribe(VIDEO_PATH)
transcript = [{"start": s.start, "end": s.end, "text": s.text} for s in segments]



In [71]:
transcript_text_only = ''.join([s["text"] for s in transcript])
print(transcript_text_only)
print(transcript)

 What was ever good at being sad? Probably because my mom straight up told me not to be. But this is sad, man. You got a John Locke quote, or a piece of county and wisdom you can throw at me? Those guys were more focused on rules and regulations. For spiritual stuff, you got to turn to the east. I'll take anything you got. Hit me. Picture a wave in the ocean. You can see it, measure it, its height, the way the sunlight refracts when it passes through, and it's there, and you can see it. You know what it is. It's a wave. And then it crashes on the shore, and it's gone, but the water is still there. The wave was just a different wave for the water to be for a little while. That's one conception of death for a Buddhist. The wave returns to the ocean. Where it came from. And where it's supposed to be. Not bad, Buddhists. Not bad. None of this is bad. I need you to do me one last favor. Say goodbye to me now, and leave before I wake up. You can sit on that bench as long as you'd like, and w

In [72]:
# take frames every 1 second of the video to put into image analysis

import cv2

cap = cv2.VideoCapture(VIDEO_PATH)
fps = cap.get(cv2.CAP_PROP_FPS)
frames = []
frame_id = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break
    if frame_id % int(fps) == 0:  # 1 fps
        frames.append(frame)
    frame_id += 1
cap.release()

In [73]:
# print(frames)

In [74]:
# analyze and capture each frame

from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image

# https://huggingface.co/Salesforce/blip-image-captioning-base
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

captions = []
for f in frames:
    image = Image.fromarray(cv2.cvtColor(f, cv2.COLOR_BGR2RGB))
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs)
    captions.append(processor.decode(out[0], skip_special_tokens=True))

In [75]:
print(captions)

['a black background with a white and red flower', 'a person sitting on a balcony watching the sunset', 'a person sitting on a couch watching a sunset', 'a couple sitting on a balcony watching the sunset', 'a couple sitting on a balcony watching the sunset', 'a couple sitting on a couch watching the sunset', 'a woman sitting on a couch', 'a woman sitting on a couch with a man', 'a woman sitting on a couch next to a man', 'a woman sitting on a couch next to a man', 'a woman sitting on a couch', 'a woman sitting on a couch', 'a woman sitting on a couch next to a man', 'a woman sitting on a couch', 'watch this gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi', 'a woman sitting on a couch next to a man', 'watch this video of a $ $ $ from the movie', 'watch this gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi', 'watch this gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi', 'a woman sitting on a couch next to a man', 'a man and woman sitting on a couch', 'a man and woman sitt

In [None]:
from difflib import SequenceMatcher

def similar(a, b): 
    return SequenceMatcher(None, a, b).ratio()

def categorize_scenes(captions, threshold=0.6, max_gap=1, fps=1):
    if not captions: 
        return []

    scenes = []

    # track indices (also corresponds to seconds because fps of snapshots is 1)
    current = {
        "captions": [captions[0]],
        "start_idx": 0, #inclusive
        "end_idx": 1, # exclusive
        "anchor": captions[0], #first caption of each category
        "gap": 0
    }

    for i in range(1, len(captions)):
        curr = captions[i]
        prev = captions[i - 1]

        if similar(curr, prev) >= threshold or similar(curr, current["anchor"]) >= threshold:
            current["captions"].append(curr)
            current["end_idx"] = i + 1
            current["gap"] = 0
        elif current["gap"] < max_gap:
            # tolerate an outlier without splitting
            current["captions"].append(curr)
            current["end_idx"] = i + 1
            current["gap"] += 1
        else:
            # split
            scenes.append({
                "captions": current["captions"],
                "start_time": current["start_idx"] / fps,
                "end_time": current["end_idx"] / fps
            })
            current = {
                "captions": [curr],
                "start_idx": i,
                "end_idx": i + 1,
                "anchor": curr,
                "gap": 0
            }

    # final scene
    scenes.append({
        "captions": current["captions"],
        "start_time": current["start_idx"] / fps,
        "end_time": current["end_idx"] / fps
    })
    return scenes

#adjust threshold and max_gap as needed - higher threshold > more strict grouping
scenes = categorize_scenes(captions, threshold=0.6, max_gap=1, fps=1)

In [77]:
for scene in scenes:
    print(scene)

{'captions': ['a black background with a white and red flower', 'a person sitting on a balcony watching the sunset', 'a person sitting on a couch watching a sunset', 'a couple sitting on a balcony watching the sunset', 'a couple sitting on a balcony watching the sunset', 'a couple sitting on a couch watching the sunset', 'a woman sitting on a couch', 'a woman sitting on a couch with a man', 'a woman sitting on a couch next to a man', 'a woman sitting on a couch next to a man', 'a woman sitting on a couch', 'a woman sitting on a couch', 'a woman sitting on a couch next to a man', 'a woman sitting on a couch', 'watch this gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi'], 'start_time': 0.0, 'end_time': 15.0}
{'captions': ['a woman sitting on a couch next to a man', 'watch this video of a $ $ $ from the movie'], 'start_time': 15.0, 'end_time': 17.0}
{'captions': ['watch this gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi', 'watch this gi gi gi gi gi gi gi gi gi gi gi gi gi gi

In [78]:
from difflib import SequenceMatcher

def similar(a: str, b: str) -> float:
    return SequenceMatcher(None, a, b).ratio()

def representative_caption(captions, sim_fn=similar):

    n = len(captions)
    if n == 0: return "", -1, []
    if n == 1: return captions[0], 0, [1.0]

    scores = [0.0] * n
    for i in range(n):
        ai = captions[i]
        s = 0.0
        for j in range(n):
            if i == j: continue
            s += sim_fn(ai, captions[j])
        scores[i] = s

    best_i = max(range(n), key=lambda i: scores[i])
    return captions[best_i], best_i, scores


combined_scenes = []
for scene in scenes:
    rep, _, _ = representative_caption(scene["captions"])
    scene_dialogue = [
        e["text"] for e in transcript
        if scene["start_time"] <= e["start"] and e["end"] <= scene["end_time"]
    ]
    combined_scenes.append({
        "start_time": scene["start_time"],
        "end_time": scene["end_time"],
        "description": rep, # most representative caption
        "dialogue": scene_dialogue
    })



In [79]:
for scene in combined_scenes:
    print(scene)

{'start_time': 0.0, 'end_time': 15.0, 'description': 'a woman sitting on a couch with a man', 'dialogue': [' What was ever good at being sad?', ' Probably because my mom straight up told me not to be.', ' But this is sad, man.', ' You got a John Locke quote,']}
{'start_time': 15.0, 'end_time': 17.0, 'description': 'a woman sitting on a couch next to a man', 'dialogue': []}
{'start_time': 17.0, 'end_time': 89.0, 'description': 'a man and woman sitting on a couch', 'dialogue': [' Those guys were more focused on rules and regulations.', ' For spiritual stuff, you got to turn to the east.', " I'll take anything you got. Hit me.", ' Picture a wave in the ocean.', ' You can see it, measure it, its height,', ' the way the sunlight refracts when it passes through,', " and it's there, and you can see it.", " You know what it is. It's a wave.", ' And then it crashes on the shore,', " and it's gone,", ' but the water is still there.', ' The wave was just a different wave', ' for the water to be f

In [80]:
#iteration: scikit-learn feature_extraction.text:
#{'start_time': 0.0, 'end_time': 15.0, 'description': 'a black background with a white and red flower a person sitting on a balcony watching the sunset a person', 'dialogue': [' What was ever good at being sad?', ' Probably because my mom straight up told me not to be.', ' But this is sad, man.', ' You got a John Locke quote,']}
# {'start_time': 15.0, 'end_time': 17.0, 'description': 'a woman sitting on a couch next to a man watch this video of a $ $ $ from the', 'dialogue': []}
# {'start_time': 17.0, 'end_time': 89.0, 'description': 'watch this gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi gi', 'dialogue': [' Those guys were more focused on rules and regulations.', ' For spiritual stuff, you got to turn to the east.', " I'll take anything you got. Hit me.", ' Picture a wave in the ocean.', ' You can see it, measure it, its height,', ' the way the sunlight refracts when it passes through,', " and it's there, and you can see it.", " You know what it is. It's a wave.", ' And then it crashes on the shore,', " and it's gone,", ' but the water is still there.', ' The wave was just a different wave', ' for the water to be for a little while.', " That's one conception of death for a Buddhist.", ' The wave returns to the ocean.', ' Where it came from.']}
# {'start_time': 89.0, 'end_time': 144.0, 'description': 'a man and woman are looking at each other people a man and woman sitting next to each other people', 'dialogue': [" And where it's supposed to be.", ' Not bad, Buddhists.', ' Not bad.', ' None of this is bad.', ' I need you to do me one last favor.', ' Say goodbye to me now, and leave before I wake up.']}
# {'start_time': 144.0, 'end_time': 167.0, 'description': 'a woman laying in bed a woman laying in bed a woman laying in bed with a book a woman', 'dialogue': []}
# {'start_time': 167.0, 'end_time': 192.0, 'description': 'a woman standing in the middle of a forest a woman standing in the middle of a forest a woman', 'dialogue': [" and whenever you're ready,", ' you just walk through.', " I'm ready."]}
# {'start_time': 192.0, 'end_time': 203.0, 'description': 'a man and woman kissing in the woods a man and woman standing in front of a tree a man', 'dialogue': []}