In [1]:
import math
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os

def _format_timestamp(ts):
    """Return the overlay label for ``ts``: ``"12.34s"`` for real numbers, ``str(ts)`` otherwise."""
    # NumPy integer/float scalars (e.g. np.float32, np.int64) are not all
    # subclasses of the builtin int/float, so they are listed explicitly.
    if isinstance(ts, (int, float, np.integer, np.floating)):
        return f"{float(ts):.2f}s"
    return str(ts)


def _filename_safe(text):
    """Return ``text`` with characters that are unsafe in filenames replaced."""
    # Keep the historical mapping (":" -> "-", "." -> "_") so filenames
    # produced by earlier runs do not change.
    text = text.replace(":", "-").replace(".", "_")
    # Path separators and the remaining Windows-reserved characters would
    # otherwise break or redirect the output path.
    return "".join("_" if ch in '<>"/\\|?*' or ord(ch) < 32 else ch for ch in text)


def save_each_frame_with_timestamp(
    video_id,
    frames,
    timestamps,
    output_dir="saved_frames",
    figsize=(6, 4),
    font_scale=1.0,
    color=(255, 255, 255),
    thickness=2,
    dpi=100,
    is_bgr=True,
):
    """
    Saves each frame as a separate PNG file with its timestamp drawn in the top-left corner.

    The video ID is not drawn on the image; it is only used as the filename prefix.
    Files are named ``{video_id}_frame_{index}_{timestamp}.png`` where ``index`` is
    1-based and zero-padded, and the path of every written file is printed.

    Parameters:
    -----------
    video_id : str
        The identifier of the video, which will be prepended to each filename.
    frames : sequence of numpy.ndarray
        Image frames. Three-channel frames are treated as BGR (as returned by
        OpenCV) unless ``is_bgr`` is False. Frames with any other channel count
        are displayed as given; single-channel frames are drawn in grayscale.
    timestamps : sequence of float or datetime-like
        One timestamp per frame. Real numbers (including NumPy scalars) are
        formatted as seconds with two decimals; anything else is shown via ``str()``.
    output_dir : str, optional
        Directory where the output images will be saved. It is created if
        missing. Defaults to "saved_frames".
    figsize : tuple (width, height), optional
        Size (in inches) of the matplotlib figure for each frame.
    font_scale : float, optional
        Scale factor for the timestamp text (multiplies a 14 pt base size). Defaults to 1.0.
    color : tuple of three ints, optional
        Text color as 0-255 RGB components (the text is drawn by matplotlib).
        Defaults to white.
    thickness : int, optional
        Currently unused; retained so existing callers keep working.
    dpi : int, optional
        DPI of the figure, which is also the DPI of the saved image. Defaults to 100.
    is_bgr : bool, optional
        Whether three-channel frames are in BGR order and must be reversed to
        RGB before display. Defaults to True, matching the previous behavior.

    Raises:
    -------
    ValueError
        If ``frames`` and ``timestamps`` have different lengths.
    """
    if len(frames) != len(timestamps):
        raise ValueError("The number of frames and timestamps must be the same.")

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Pad the index to at least two digits, and wider when the batch has 100+
    # frames, so the files still sort in frame order.
    index_width = max(2, len(str(len(frames))))

    # matplotlib expects color components in [0, 1]; a plain tuple is hashable
    # and accepted everywhere a color is.
    text_color = tuple(component / 255.0 for component in color)

    for idx, (frame, ts) in enumerate(zip(frames, timestamps), start=1):
        frame = np.asarray(frame)

        # Reverse the channel axis (BGR -> RGB) for matplotlib. Slicing works
        # for every dtype, unlike cv2.cvtColor which rejects e.g. float64.
        if is_bgr and frame.ndim == 3 and frame.shape[2] == 3:
            img = frame[:, :, ::-1]
        else:
            img = frame

        # Without an explicit colormap, scalar images are drawn in matplotlib's
        # default false-color map rather than in gray.
        is_grayscale = frame.ndim == 2 or (frame.ndim == 3 and frame.shape[2] == 1)
        imshow_kwargs = {"cmap": "gray"} if is_grayscale else {}

        ts_text = _format_timestamp(ts)

        # Build a filename that includes video_id: e.g., "Sv9fcuRfk2o_frame_01_12_34s.png"
        safe_ts = _filename_safe(ts_text)
        filename = f"{video_id}_frame_{idx:0{index_width}d}_{safe_ts}.png"
        filepath = os.path.join(output_dir, filename)

        fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
        try:
            ax.imshow(img, **imshow_kwargs)
            ax.axis("off")

            # Overlay timestamp in the top-left corner (x/y are in pixel/data coordinates)
            ax.text(
                5,
                15,
                ts_text,
                color=text_color,
                fontsize=14 * font_scale,
                fontweight="bold",
                backgroundcolor="black",
                alpha=0.6,
            )

            fig.tight_layout(pad=0)
            fig.savefig(filepath, bbox_inches="tight", pad_inches=0)
        finally:
            # Always release the figure, even if rendering or saving failed,
            # so a bad frame does not leak figures for the rest of the session.
            plt.close(fig)

        print(f"Saved: {filepath}")




In [11]:
import selecting_frames

# Where the sample videos live, and which one this notebook works on.
video_directory = "../video_samples"
video_id = "Sv9fcuRfk2o"  # Example video ID

# get_filtered_frames returns two values, unpacked here as the frames and a
# matching sequence that is passed on as their timestamps (exact semantics are
# defined in selecting_frames).
frames, timesteps = selecting_frames.get_filtered_frames(
    video_id,
    directory=video_directory,
)

# Write one annotated image per frame into ./saved_frames.
save_each_frame_with_timestamp(
    video_id=video_id,
    frames=frames,
    timestamps=timesteps,
    output_dir="saved_frames",
)

Saved: saved_frames\Sv9fcuRfk2o_frame_01_0_00s.png
Saved: saved_frames\Sv9fcuRfk2o_frame_02_1_00s.png
Saved: saved_frames\Sv9fcuRfk2o_frame_03_2_00s.png
Saved: saved_frames\Sv9fcuRfk2o_frame_04_2_90s.png
Saved: saved_frames\Sv9fcuRfk2o_frame_05_3_70s.png
Saved: saved_frames\Sv9fcuRfk2o_frame_06_5_00s.png
Saved: saved_frames\Sv9fcuRfk2o_frame_07_6_00s.png
Saved: saved_frames\Sv9fcuRfk2o_frame_08_6_40s.png
Saved: saved_frames\Sv9fcuRfk2o_frame_09_8_00s.png
Saved: saved_frames\Sv9fcuRfk2o_frame_10_9_20s.png


In [None]:
import prompt_builder

# Build the prompt text together with the Base64-encoded frames for this video.
prompt, frames_base_64 = prompt_builder.generate_prompt_and_frames(
    video_id,
    directory=video_directory,
)

# Show the prompt between a header line and a 30-character rule.
print("\n=== Generated Prompt ===\n")
print(prompt)
print("\n" + "=" * 30 + "\n")

# Dump each frame's Base64 string, hard-wrapped so that no printed line is
# longer than wrap_width characters.
wrap_width = 80

for idx, b64 in enumerate(frames_base_64, start=1):
    print(f"--- Frame {idx} (Base64, length={len(b64)}) ---")

    offset = 0
    while offset < len(b64):
        print(b64[offset : offset + wrap_width])
        offset += wrap_width
    print()  # blank line between frames



=== Generated Prompt ===

You are provided with 10 sequential video frames,The video includes audio, which I can describe for you as tags
Additionally, a brief caption describes the video content and audio details. Use this information to create questions that require watching the video to be answerable, avoiding general knowledge questions.
Modality Definitions:
Visual: Answer is fully derived from the video frames alone. Audio: Answer relies only on audio information. Audio-Visual: Both audio and visual information are essential for a 100% accurate answer.
Question Categories:
Relative Position (Visual): Ask about the position of one object relative to another.
Description: Ask 2 questions. Visual: about visual details, excluding movement. Audio: about audio-only details (example background sounds)
Action: Ask 2 questions. Visual: about movements in the video. Audio-Visual: Link sounds to movements (example "What sound accompanies a movement?")
Temporal: Ask 1 question which event h