In [1]:
import os
import cv2
import pdb
import json
import base64
import traceback
from typing import Any, Dict, List, Optional

import pandas as pd
from rich import print as rich_print
from dotenv import load_dotenv, find_dotenv

from openai import OpenAI, BadRequestError, APIStatusError
from prompts import SYSTEM_PROMPT_FILM_CLASSIFICATION

In [None]:
from IPython.display import display, HTML

In [16]:
# Lift every pandas display cap so long rationale strings and all rows/columns
# are rendered in full instead of being truncated.
pd.options.display.max_colwidth = None  # full content of each cell
pd.options.display.max_columns = None   # every column
pd.options.display.max_rows = None      # every row
pd.options.display.width = None         # no fixed display width

In [2]:
def extract_frames_as_data_urls(
    video_path: str,
    interval_sec: float = 1.0,
    limit: Optional[int] = None,
    jpeg_quality: int = 92,
    target_width: int = 480,
) -> List[Dict[str, str]]:
    """Sample frames from a video and return them as base64 JPEG data URLs.

    One frame is taken every `interval_sec` seconds (converted to a frame
    stride using the FPS reported by the capture), downscaled to
    `target_width` when wider, JPEG-encoded and wrapped in a content item.

    Output element example:
    {"type": "input_image", "image_url": "data:image/jpeg;base64,<...>"}

    Args:
        video_path: Path of the video; passed to `cv2.VideoCapture` as a string.
        interval_sec: Seconds between sampled frames. The resulting stride is
            never smaller than one frame.
        limit: Maximum number of frames to return; `None` means no cap.
            Decoding stops as soon as the cap is reached.
        jpeg_quality: Value passed as `cv2.IMWRITE_JPEG_QUALITY`.
        target_width: Frames wider than this are resized to this width with
            the aspect ratio preserved; narrower frames are kept as-is.

    Returns:
        One dict per successfully encoded sampled frame, in video order.

    Raises:
        ValueError: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(str(video_path))
    out: List[Dict[str, str]] = []

    # try/finally guarantees the capture handle is released even when
    # opening, decoding or encoding raises.
    try:
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        # Fall back to 24 fps when the reported value is missing, zero,
        # negative or NaN (NaN fails the `> 0` comparison).
        if not (fps and fps > 0):
            fps = 24.0
        frame_interval = max(1, int(round(fps * interval_sec)))

        frame_no = 0
        # Stop decoding once `limit` frames are collected instead of reading
        # (and discarding) the remainder of the video.
        while limit is None or len(out) < limit:
            ok, frame = cap.read()
            if not ok:
                break

            if frame_no % frame_interval == 0:
                # Downscale to `target_width` (keeping the aspect ratio)
                # before encoding; never upscale.
                h, w = frame.shape[:2]
                if w > target_width:
                    aspect_ratio = h / w
                    # Guard against a zero height for extremely wide frames.
                    new_height = max(1, int(target_width * aspect_ratio))
                    frame_resized = cv2.resize(frame, (target_width, new_height))
                else:
                    frame_resized = frame

                ok_jpg, buf = cv2.imencode(
                    ".jpg", frame_resized, [int(cv2.IMWRITE_JPEG_QUALITY), int(jpeg_quality)]
                )
                if ok_jpg:
                    print(f'attaching frame_no={frame_no}, original_shape={frame.shape}, resized_shape={frame_resized.shape}')
                    b64 = base64.b64encode(buf.tobytes()).decode("utf-8")
                    out.append({
                        "type": "input_image",
                        "image_url": f"data:image/jpeg;base64,{b64}",
                    })
                else:
                    print(f'skipping frame_no={frame_no}: JPEG encoding failed')

            frame_no += 1
    finally:
        cap.release()

    return out

In [3]:
def make_message(video_content_lst):
    """Build the system + user message pair for one classification request.

    The system message carries `SYSTEM_PROMPT_FILM_CLASSIFICATION`; the user
    message starts with a fixed text instruction followed by the items of
    `video_content_lst` (a list of content items, e.g. the frame dicts
    produced by `extract_frames_as_data_urls`).

    Returns:
        A two-element list: [system_message, user_message].
    """
    system_message = {
        "role": "system",
        "content": [
            {"type": "input_text", "text": SYSTEM_PROMPT_FILM_CLASSIFICATION},
        ],
    }

    instruction_item = {
        "type": "input_text",
        "text": (
            "You are given a sample of frames from the film. "
            "Classify strictly per the schema and return only valid JSON."
        ),
    }
    # List concatenation builds a new list, so the caller's list is not mutated.
    user_message = {
        "role": "user",
        "content": [instruction_item] + video_content_lst,
    }

    return [system_message, user_message]

def chunk_list(items: List[Any], chunk_size: int) -> List[List[Any]]:
    """Partition `items` into consecutive slices of length `chunk_size`.

    Order is preserved. The final slice is shorter when `len(items)` is not
    a multiple of `chunk_size`; an empty input yields an empty list.

    Raises:
        ValueError: If `chunk_size` is zero or negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    chunks: List[List[Any]] = []
    for start in range(0, len(items), chunk_size):
        stop = start + chunk_size
        chunks.append(items[start:stop])
    return chunks

def inference_text(client, input_messages, model: str = "gpt-5"):
    """Send `input_messages` to the model and return the raw response.

    Args:
        client: Object exposing `responses.create(model=..., input=...)`
            (in this notebook, an `OpenAI` client).
        input_messages: Message list to send, e.g. the output of `make_message`.
        model: Model name forwarded to `responses.create`. Defaults to
            "gpt-5", the value that was previously hard-coded.

    Returns:
        Whatever `client.responses.create` returns, unmodified.
    """
    response = client.responses.create(
        model=model,
        input=input_messages,
    )
    # Example `usage` field observed on one response:
    # usage=ResponseUsage(input_tokens=22807, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=1896, output_tokens_details=OutputTokensDetails(reasoning_tokens=1280), total_tokens=24703),
    return response

# initialise client

In [4]:
# Load variables from the nearest .env file (if any) into the environment.
load_dotenv(find_dotenv())

# Fail early with a clear message. Writing `os.getenv(...)` back into
# `os.environ` adds nothing when the key is set and raises an opaque
# TypeError (None is not a str) when it is missing.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file"
    )

# Constructed without arguments; presumably the client reads the key from the
# environment -- see the OpenAI client documentation.
client = OpenAI()

# Final_Destination

In [5]:
# Sample one frame per second from the clip (at most 600 frames) and keep
# them as base64 JPEG data URLs ready to attach to a request.
video_path = "data/YT_download/Final_Destination.mp4"
video_content_lst = extract_frames_as_data_urls(video_path, interval_sec=1, limit=600)
print(f"len(video_content_lst) = {len(video_content_lst)}")

attaching frame_no=0, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=24, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=48, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=72, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=96, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=120, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=144, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=168, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=192, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=216, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=240, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=264, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=288

## Split the sampled frames into chunks

In [7]:
# Group the sampled frames into batches of `chunk_size`; each batch is sent
# to the model as a separate request in the inference cell.
chunk_size = 32
content_chunks = chunk_list(items=video_content_lst, chunk_size=chunk_size)
print(f"num_chunks = {len(content_chunks)}, chunk_size = {chunk_size}")

num_chunks = 5, chunk_size = 32


## inference

In [8]:
# Classify each chunk independently and collect the parsed JSON results.
# A failing chunk (API error, non-JSON reply, missing keys) is reported and
# skipped so that one bad response does not abort the remaining chunks.
output_json_lst = []
failed_chunks = []  # 1-based numbers of chunks that produced no usable result
for idx, content_chunk in enumerate(content_chunks):
    print(f"processing chunk {idx + 1}/{len(content_chunks)} with {len(content_chunk)} frames")
    input_messages = make_message(content_chunk)

    try:
        response = inference_text(client, input_messages)
    except APIStatusError as exc:
        failed_chunks.append(idx + 1)
        print(f"\tchunk {idx + 1} skipped, API error ({type(exc).__name__}): {exc}")
        continue

    try:
        # parse each response as JSON and pull out the fields used downstream
        parsed = json.loads(response.output_text)
        rating = parsed['rating']
        overall_rationale = parsed['overall_rationale']
    except (json.JSONDecodeError, KeyError, TypeError) as exc:
        failed_chunks.append(idx + 1)
        print(f"\tchunk {idx + 1} skipped, unusable response ({type(exc).__name__}): {exc}")
        continue

    # verbose current output
    rich_print(f"\trating: {rating}")
    rich_print(f"\toverall_rationale: {overall_rationale}")

    output_json_lst.append(parsed)

print("done")
if failed_chunks:
    print(f"failed chunks: {failed_chunks}")

processing chunk 1/5 with 32 frames


processing chunk 2/5 with 32 frames


processing chunk 3/5 with 32 frames


processing chunk 4/5 with 32 frames


processing chunk 5/5 with 17 frames


done


## Results table

In [22]:
# One row per chunk: the rating and its rationale, with the rationale
# hard-wrapped at 80 characters so it renders as a readable paragraph.
df = pd.DataFrame({
    'rating': [chunk_result['rating'] for chunk_result in output_json_lst],
    'overall_rationale': [chunk_result['overall_rationale'] for chunk_result in output_json_lst],
}).assign(
    overall_rationale=lambda frame: frame['overall_rationale'].str.wrap(80)
)

In [23]:
# Render the frame as HTML, turning each literal two-character `\n` sequence
# in the generated markup into a <br> tag before displaying it.
html_table = df.to_html()
display(HTML(html_table.replace("\\n", "<br>")))

Unnamed: 0,rating,overall_rationale
0,PG13,"Scenes focus on social drinking at a backyard gathering with numerous bottles shown, mixing of drinks, and group toasts. No violence, sex, nudity, coarse language, or horror evident. Alcohol depictions warrant a PG13 floor under drugs/psychoactive substances."
1,PG,"Backyard gathering scenes feature clear shots of a beer bottle and mild moments of suspense (dark room with an unsettling older woman). No violence, sex, nudity, or strong language evident. Alcohol presence and brief mild threat place it at PG."
2,PG13,"Sample shows disaster/peril scenes with fire and structural collapse, a funeral sequence, and a few intense screams. No gore, sex, nudity, drug use, or strong language evident. Moderate threat and distress place it at PG13."
3,PG13,"Sample frames suggest a thriller with perilous set‑pieces (explosions, collapses, glass breaking, electrical hazards) and a dark, ominous tone (“birth of death”). Violence appears non-gory and non-prolonged, with moderate threat and no sex, nudity, drugs, or coarse language observed."
4,PG13,"Sample frames suggest a horror film with ominous tone and blood‑themed visuals (bloody handprints, red title cards) and brief peril, but no explicit gore, sex, nudity, coarse language or drug use shown. The strongest element is moderate horror imagery."


# Final_Destination_All_Deaths

In [9]:
# Sample one frame per second from the compilation clip, capped at 32 * 5
# frames, and keep them as base64 JPEG data URLs.
video_path = "data/YT_download/Final_Destination_All_Deaths.mp4"
video_content_lst = extract_frames_as_data_urls(video_path, interval_sec=1, limit=32 * 5)
print(f"len(video_content_lst) = {len(video_content_lst)}")

attaching frame_no=0, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=30, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=60, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=90, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=120, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=150, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=180, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=210, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=240, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=270, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=300, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=330, original_shape=(360, 640, 3), resized_shape=(270, 480, 3)
attaching frame_no=36

## Split the sampled frames into chunks

In [24]:
# Group the sampled frames into batches of `chunk_size`; each batch is sent
# to the model as a separate request in the inference cell.
chunk_size = 32
content_chunks = chunk_list(items=video_content_lst, chunk_size=chunk_size)
print(f"num_chunks = {len(content_chunks)}, chunk_size = {chunk_size}")

num_chunks = 5, chunk_size = 32


## inference

In [25]:
# Classify each chunk independently and collect the parsed JSON results.
# A failing chunk (API error, non-JSON reply, missing keys) is reported and
# skipped so that one bad response does not abort the remaining chunks.
output_json_lst = []
failed_chunks = []  # 1-based numbers of chunks that produced no usable result
for idx, content_chunk in enumerate(content_chunks):
    print(f"processing chunk {idx + 1}/{len(content_chunks)} with {len(content_chunk)} frames")
    input_messages = make_message(content_chunk)

    try:
        response = inference_text(client, input_messages)
    except APIStatusError as exc:
        failed_chunks.append(idx + 1)
        print(f"\tchunk {idx + 1} skipped, API error ({type(exc).__name__}): {exc}")
        continue

    try:
        # parse each response as JSON and pull out the fields used downstream
        parsed = json.loads(response.output_text)
        rating = parsed['rating']
        overall_rationale = parsed['overall_rationale']
    except (json.JSONDecodeError, KeyError, TypeError) as exc:
        failed_chunks.append(idx + 1)
        print(f"\tchunk {idx + 1} skipped, unusable response ({type(exc).__name__}): {exc}")
        continue

    # verbose current output
    rich_print(f"\trating: {rating}")
    rich_print(f"\toverall_rationale: {overall_rationale}")

    output_json_lst.append(parsed)

print("done")
if failed_chunks:
    print(f"failed chunks: {failed_chunks}")

processing chunk 1/5 with 32 frames


processing chunk 2/5 with 32 frames


processing chunk 3/5 with 32 frames


processing chunk 4/5 with 32 frames


processing chunk 5/5 with 32 frames


done


### example of response

In [29]:
# Show the most recently collected result. Reading it from `output_json_lst`
# avoids depending on the loop variable `parsed`, which may be stale (left
# over from another section) if the inference cell above was not re-run.
rich_print(output_json_lst[-1])

## Results table

In [26]:
# One row per chunk: the rating and its rationale, with the rationale
# hard-wrapped at 80 characters so it renders as a readable paragraph.
df = pd.DataFrame({
    'rating': [chunk_result['rating'] for chunk_result in output_json_lst],
    'overall_rationale': [chunk_result['overall_rationale'] for chunk_result in output_json_lst],
}).assign(
    overall_rationale=lambda frame: frame['overall_rationale'].str.wrap(80)
)

In [27]:
# Render the frame as HTML, turning each literal two-character `\n` sequence
# in the generated markup into a <br> tag before displaying it.
html_table = df.to_html()
display(HTML(html_table.replace("\\n", "<br>")))

Unnamed: 0,rating,overall_rationale
0,NC16,"Scenes depict intense accidental deaths, including a prolonged strangulation in a bathtub with panic and close-ups, and a sudden vehicle impact resulting in blood splatter. Threat and distress are sustained but gore is limited. Other elements are minimal."
1,M18,"Frames show graphic, close-up depictions of injury and death, including impalement, extensive bleeding, and a likely decapitation, within a sustained horror context. The strength of the gore and intensity of peril place the film at M18 under IMDA guidelines."
2,PG13,"Sample frames show intense peril and accident imagery: car explosions and a truck crash with burning wreckage, and a near‑impalement from a falling ladder. Violence is impactful but lacks gore or explicit injuries. Horror elements are suspenseful but non-detailed. No sex, nudity (beyond non-sexual male torso), drugs, or coarse language observed."
3,M18,"Graphic accidental deaths with blood and gore (impalement through head, crushing by falling object, elevator death, explosion and severed limb) shown with close- ups and recurring peril. Horror tone and mature themes of death/fate. Little to no sex/nudity; brief smoking."
4,M18,"Graphic violence and gore in multiple death sequences (e.g., bisection, burned/charred body on a table, explosive hospital corridor incident) with sustained peril. Other elements (sex, nudity, drugs, language) are not evident from the sample. The cumulative impact of strong-detail injuries places the film at M18."


# Ensemble

In [28]:
# to do

# Generate Report

In [28]:
# to do