In [None]:
#set up to insert the collected data to the backend

from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy import text
from fastapi.responses import JSONResponse
from fastapi import status
from typing import List
from pydantic import BaseModel
import os

from sqlalchemy.exc import SQLAlchemyError




class SubtitleData(BaseModel):
    # Pydantic model for one subtitle row handed to insert_data_async.
    sub: str  # subtitle text, bound to the :sub parameter of the INSERT
    start: float  # bound to :start (presumably seconds — confirm with the producer)
    end: float  # bound to :end (presumably seconds — confirm with the producer)

class ImageData(BaseModel):
    # Pydantic model for the images attached to one time range.
    start: float  # bound to :start for every image of this entry
    end: float  # bound to :end for every image of this entry
    images: List[bytes]  # base64-encoded or binary content; insert_data_async writes one row per element

async def insert_data_async(subtitles: List[SubtitleData], images_data: List[ImageData], session: AsyncSession):
    """Insert subtitles and images through ``session`` and commit once at the end.

    Args:
        subtitles: Rows for the ``subtitles`` table (one INSERT per item).
        images_data: Entries for the ``images`` table; every element of an
            entry's ``images`` list becomes its own row sharing that entry's
            ``start``/``end``.
        session: Async SQLAlchemy session used for all statements.

    Returns:
        A ``JSONResponse`` with status 200 when everything was committed, or
        status 500 when any error occurred. Errors are reported through the
        response, not raised.
    """
    try:
        # The statements do not depend on the row, so build them once.
        # NOTE(review): `end` is a reserved word in some SQL dialects (e.g.
        # PostgreSQL) and may need quoting — confirm against the target database.
        subtitle_stmt = text(
            "INSERT INTO subtitles (sub, start, end) VALUES (:sub, :start, :end)"
        )
        image_stmt = text(
            "INSERT INTO images (start, end, image) VALUES (:start, :end, :image)"
        )

        # Insert Subtitles
        for subtitle in subtitles:
            await session.execute(
                subtitle_stmt,
                {'sub': subtitle.sub, 'start': subtitle.start, 'end': subtitle.end},
            )

        # Insert Images with binary data
        for image_data in images_data:
            for image in image_data.images:
                await session.execute(image_stmt, {
                    'start': image_data.start,
                    'end': image_data.end,
                    'image': image  # Insert binary data
                })

        # Commit all changes
        await session.commit()

        return JSONResponse(status_code=status.HTTP_200_OK, content={"detail": "Data inserted successfully"})
    except SQLAlchemyError as e:
        print(f"Database error: {e}")
        await session.rollback()
        return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content={"detail": "Une erreur est survenue lors de l'insertion."})
    except Exception as e:
        print(f"Unexpected error: {e}")
        # Rows executed before the failure are still pending in the session;
        # discard them so a later commit on the same session cannot persist a
        # partial batch. The rollback is best-effort so that this handler
        # still answers with a 500 response instead of raising.
        try:
            await session.rollback()
        except Exception as rollback_error:
            print(f"Rollback failed: {rollback_error}")
        return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content={"detail": "Une erreur inattendue est survenue."})


In [None]:
#Data Collection Pipeline (ETL) Milestone
"""The subtitles for the videos were repeating themselves, as you can see in this example:
    {
      "start": "00:00:02.990",
      "end": "00:00:03.000",
      "text": "in this video I would like to start the"
    },
    {
      "start": "00:00:03.000",
      "end": "00:00:04.870",
      "text": "in this video I would like to start the discussion about convolutional new"
    },
    so I implemented logic to remove the duplication.
"""
from datasets import Dataset, Video
from pathlib import Path
from piq import ssim
from torchvision.transforms.functional import rgb_to_grayscale
import json
from datetime import datetime
from torchvision.transforms.functional import to_pil_image
from PIL import ImageDraw, ImageFont
import textwrap
from PIL import Image
from bson import Binary
from io import BytesIO

#build the dataset
folder_path = Path("./videos/")
# Every .mp4 found recursively below the videos folder, kept as plain strings.
all_file_paths = list(map(str, folder_path.rglob("*.mp4")))
# videos_object = Dataset.from_dict({"video": all_file_paths}).cast_column("video", Video())

# Merged subtitle segments; the caption-processing loop appends to this list.
data = []
#images_data = []
# Offset meant to keep time continuous across all videos; the caption loop
# adds it to each segment's end time.
adjust_time = 0
# Scratch strings used by the caption loop: the caption being cleaned and the
# text accumulated for the segment under construction.
current_sub_text, frame_sub_text = "", ""
#for sutitle drawing
# pick whatever point-size you like:
#pt_size = 25

# Pillow will resolve this name from its own bundled fonts:
# font = ImageFont.truetype("DejaVuSans.ttf", pt_size)

# def is_unique(prev_tensor, curr_tensor, treshold=0.99):
#     # Ensure both are in [0, 1] and float32fro
#     prev = prev_tensor.unsqueeze(0).float() /255.0
#     curr = curr_tensor.unsqueeze(0).float() / 255.0

#     #convert to grayscale for faster comparison
#     prev = rgb_to_grayscale(prev)
#     curr = rgb_to_grayscale(curr)

#     score = ssim(prev, curr, data_range=1.0)

#     return score.item() < treshold

#convert time to sec
def to_seconds(t):
    """Convert a caption timestamp to a number of seconds.

    Args:
        t: Either a number (returned as ``float(t)``) or a string of the form
            ``"HH:MM:SS"``, ``"HH:MM:SS.fff"`` or ``"MM:SS.fff"``. A comma is
            accepted in place of the decimal point, and the hour field is not
            limited to 0-23.

    Returns:
        The timestamp in seconds as a float.

    Raises:
        ValueError: If a string does not have two or three ``:``-separated
            fields, or a field is not numeric.
    """
    if not isinstance(t, str):
        return float(t)

    # Accept "," as well as "." as the decimal separator.
    fields = t.strip().replace(",", ".").split(":")
    if len(fields) not in (2, 3):
        raise ValueError(f"Unrecognised timestamp: {t!r}")

    whole_seconds, _, fraction = fields.pop().partition(".")
    # Pad/truncate the fraction to microseconds, the same way "%f" reads it.
    microseconds = int(fraction.ljust(6, "0")[:6]) if fraction else 0
    minutes = int(fields.pop())
    hours = int(fields.pop()) if fields else 0
    return hours * 3600 + minutes * 60 + int(whole_seconds) + microseconds / 1e6


# def save_binary_image(binary_data: Binary, filename: str):
#     image = Image.open(BytesIO(binary_data))
#     image.save(f"{filename}.png")  # Add the appropriate extension (e.g., .png)


# def encode_image_binary(pil_img: Image.Image) -> Binary:
#     buffer = BytesIO()
#     pil_img.save(buffer, format="PNG")  # or "JPEG"
#     return Binary(buffer.getvalue())


# Get the subtitles of each video and clean the repeating segments
#for index, video_object in enumerate(videos_object):
# For every video: load its captions, strip the text repeated from the previous
# caption, and merge the captions into windows of roughly three seconds that
# are appended to `data`.
for video_path in all_file_paths:
    # The captions live next to the video as "<name>.json". Swap only the
    # suffix; a plain str.replace would also rewrite "mp4" inside folder names.
    captions_path = Path(video_path).with_suffix(".json")
    with open(captions_path, "r", encoding="utf-8") as metadata:
        subs = json.load(metadata)["captions"]
    if not subs:
        # No captions for this video: nothing to merge.
        continue

    # Caption whose start time anchors the current three-second window.
    prev_sub = subs[0]
    # Base text to remove from the next sub if present.
    base_sub_text = ""
    images_per_sub = []
    # Start each video with an empty accumulator so text left over from the
    # previous video cannot leak into this one.
    frame_sub_text = ""
    i = 1
    while i < len(subs):
        start = to_seconds(subs[i]["start"])
        # Merge subtitles that start less than 3 seconds after the anchor.
        while i < len(subs) and (to_seconds(subs[i]["start"]) - to_seconds(prev_sub["start"]) < 3):
            current_sub_text = subs[i]["text"]
            # Clean the subtitle: drop the part already seen in the previous one.
            if base_sub_text in current_sub_text:
                current_sub_text = current_sub_text.replace(base_sub_text, "").strip(" ")

                if current_sub_text != "":
                    base_sub_text = current_sub_text
            frame_sub_text = frame_sub_text + " " + current_sub_text
            i = i + 1

        if frame_sub_text != "":
            # The caption that closes the window supplies the end time. When
            # the window ran to the end of the list, fall back to the last
            # caption so the final segment of the video is kept too.
            closing_sub = subs[i] if i < len(subs) else subs[-1]
            end = to_seconds(closing_sub["end"]) + adjust_time
            # Save the subtitle
            data.append({
                "video": video_path,
                "sub": frame_sub_text,
                "start": start,
                "end": end,
            })
            frame_sub_text = ""
        if i < len(subs):
            # Re-anchor even when the window was empty (a gap of 3 s or more);
            # otherwise every later caption would stay outside the window and
            # the rest of the video would be skipped.
            # NOTE(review): the closing caption contributes only its end time;
            # its own text is not appended, as in the original logic —
            # presumably fine because rolling captions repeat text; confirm.
            prev_sub = subs[i]
        i = i + 1
                #draw subtitles into image
                # draw = ImageDraw.Draw(image)
                # # wrap into ~40-char lines
                # lines = textwrap.wrap(frame_sub_text, width=40)
                # # compute line height (font height + spacing)
                # bbox = font.getbbox("Ay")
                # line_height = (bbox[3] - bbox[1]) + 4
                # bottom_padding = 60
                # left_padding   = 60

                # block_height = len(lines) * line_height
                # y            = image.height - bottom_padding - block_height

                # for line in lines:
                #     draw = ImageDraw.Draw(image)
                #     lines = textwrap.wrap(frame_sub_text, width=160)
                #     bbox = font.getbbox("Ay")
                #     line_h = (bbox[3]-bbox[1]) + 4
                #     y = image.height - 30 - line_h*len(lines)
                #     for line in lines:
                #         draw.text((30, y), line, fill="white", font=font)
                #         y += line_h
                #end draw
                #reset it so the it can be reused to store the next set of subtitle

            # images_per_sub.append(encode_image_binary(image))
            # if frame_sub_text != "" and i <= len(subs):
            #     images_data.append({"start" : start, "end" : end,"images" :images_per_sub})
            #     #save_binary_image(images_per_sub[0], "./images")
            #     images_per_sub = []
            #     frame_sub_text = ""
                
            #image.save(f"unique_frames/{(frame_time + adjust_time):.3f}.png")
                #print(next_sub)
            #print(data)
            # prev = tensor
#print(data)
    #update it with the time on the video last frame
    #adjust_time = adjust_time + to_seconds(current_sub["end"])
#print(data)


In [None]:
#Featurization Pipeline Milestone
"""Compute the embedding of every chunk, then store it in the Qdrant vector database.
  """
#SEMANTIC CHUNKING
"""Uses a sentence transformer to group subtitles that are semantically close. We compute the embeddings of two subtitles at a time
and merge them if they are semantically close."""
from sentence_transformers import SentenceTransformer, util
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
import uuid

# Connect to Qdrant
# Embedding model; its output dimension determines the collection's vector size.
model = SentenceTransformer("all-MiniLM-L6-v2")
vector_size = model.get_sentence_embedding_dimension()

# Qdrant connection and target collection.
collection_name = "subtitle_chunks"
client = QdrantClient(host="localhost", port=6333)

# (Re)create the collection with cosine distance over the model's vectors.
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)

# Semantic merging

# Greedy semantic merging: walk the segments in order and fold each one into
# the chunk being built when their embeddings are similar enough.
subtitles = data
merged_chunks = []

if not subtitles:
    raise ValueError("No subtitle data found!")

# Chunk currently being grown, and the embedding of its full text.
current = subtitles[0].copy()
current_embedding = model.encode(current["sub"], convert_to_tensor=True)

for next_sub in subtitles[1:]:
    next_embedding = model.encode(next_sub["sub"], convert_to_tensor=True)

    similarity = util.cos_sim(current_embedding, next_embedding).item()
    # Never merge across videos: a chunk keeps the "video" and "start" of its
    # first segment, so absorbing a segment from another file would pair that
    # start with an end time taken from a different video.
    same_video = next_sub["video"] == current["video"]

    if same_video and similarity > 0.75:
        # Merge with previous
        current["end"] = next_sub["end"]
        current["sub"] += " " + next_sub["sub"]
        # Re-embed the grown text so the next comparison uses the whole chunk.
        current_embedding = model.encode(current["sub"], convert_to_tensor=True)
    else:
        merged_chunks.append(current)
        current = next_sub.copy()
        current_embedding = next_embedding

merged_chunks.append(current)  # Add final chunk

# Insert into Qdrant

# One point per merged chunk: a random UUID as id, the embedding of the chunk
# text as vector, and the chunk's video/time range/text as payload.
points = [
    PointStruct(
        id=str(uuid.uuid4()),
        vector=model.encode(chunk["sub"]).tolist(),
        payload={
            "video": chunk["video"],
            "start": chunk["start"],
            "end": chunk["end"],
            "text": chunk["sub"],
        },
    )
    for chunk in merged_chunks
]

# Send all points to the collection in a single upsert call.
client.upsert(collection_name=collection_name, points=points)


In [None]:

#Retrieval Milestone 1
""" This is the first part of this milestone. It retrieves from Qdrant the subtitles that are closest to the questions, along
with their time frames and the video that they belong to."""

from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from collections import defaultdict

# Load the sentence-embedding model used to embed the queries
# (same model name as the one used when the chunks were stored).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Connect to Qdrant and name the collection that search_query reads from
client = QdrantClient(host="localhost", port=6333)
collection_name = "subtitle_chunks"

# Questions to answer; each one is embedded and searched separately
queries = [
    "Using only the videos, explain how ResNets work.",
    "Using only the videos, explain the advantages of CNNs over fully connected networks.",
    "Using only the videos, explain the binary cross entropy loss function."
]

# Function to search Qdrant for each query
def search_query(query, top_k=3):
    """Embed ``query`` and return the ``top_k`` nearest points from Qdrant.

    Uses the module-level ``model``, ``client`` and ``collection_name``.
    The return value is whatever ``client.search`` returns.
    """
    query_vector = model.encode(query).tolist()
    return client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
    )
# loop to group results by video and merge subtitles
# For each query: search, group the hits by video, and collapse every group
# into one entry spanning the earliest start to the latest end.
query_results = []

for query in queries:
    results = search_query(query)

    # Temporary dict to group items by video
    video_groups = defaultdict(lambda: {"start": float("inf"), "end": float("-inf"), "text": []})

    for result in results:
        payload = result.payload
        # Loop names are chosen so they do not rebind the module-level `data`
        # (the subtitle list) or `text` (imported from sqlalchemy), which
        # other cells still rely on.
        group = video_groups[payload["video"]]

        # Update min start and max end
        group["start"] = min(group["start"], payload["start"])
        group["end"] = max(group["end"], payload["end"])
        group["text"].append(payload["text"])

    # Grouped data for the current query; texts are joined in hit order.
    query_items = [
        {
            "video": video_name,
            "start": video_group["start"],
            "end": video_group["end"],
            "text": " ".join(video_group["text"]),
        }
        for video_name, video_group in video_groups.items()
    ]

    query_results.append({
        "query": query,
        "results": query_items
    })

# Pretty print results
#print.pprint(query_results)


In [None]:
#Retrieval Milestone 2
"""This part retrieves the frames from the relevant videos that match the timestamps of the
subtitles and writes the subtitles on those frames. Finally, it outputs a video that answers the question asked."""
import cv2
from pathlib import Path
import gc

# Folder holding the source videos (not referenced by the code visible in this cell).
folder_path = Path("./videos/")

# Function that extracts the relevant frames for a timestamp range and writes the subtitles on them,
# given the video the timestamps originated from, the subtitle to write, and the time range.
def extract_frames_opencv(video_path, start_time, end_time, subtitle_text):
    """Read the frames of a time range and draw the subtitle on each of them.

    Args:
        video_path: Path of the video file to open.
        start_time: Start of the range, in seconds (multiplied by the video's
            FPS to get the first frame index).
        end_time: End of the range, in seconds; clamped to the frame count
            reported by the video.
        subtitle_text: Text drawn on every frame. Its x position moves
            linearly from 0 to ``frame_width - text_width`` over the range.

    Returns:
        A list of RGB frames (converted from OpenCV's BGR). Empty when the
        video cannot be opened or read, or when the range contains no frame.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.0
    thickness = 2
    text_color = (255, 255, 255)  # White
    # Distance of the text baseline from the bottom edge. 80 px reproduces the
    # former fixed y=1000 on 1080-pixel-tall frames while keeping the text
    # visible on shorter ones.
    bottom_margin = 80

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Failed to read from {video_path}")
            return []

        fps = cap.get(cv2.CAP_PROP_FPS)
        start_frame = int(start_time * fps)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        end_frame = min(int(end_time * fps), total_frames)

        # Read the first frame to get dimensions and text width
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        ret, frame = cap.read()
        if not ret:
            print(f"Failed to read from {video_path}")
            return []

        frame_height, frame_width = frame.shape[:2]
        (text_width, text_height), _ = cv2.getTextSize(subtitle_text, font, font_scale, thickness)
        max_x = frame_width - text_width
        # Never place the baseline above the text height, so the text stays
        # inside very small frames as well.
        y_position = max(frame_height - bottom_margin, text_height)

        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)  # Reset position again
        total_frame_count = end_frame - start_frame

        frames = []
        for i in range(total_frame_count):
            ret, frame = cap.read()
            if not ret:
                break

            # Calculate dynamic x position
            x = int(max_x * (i / (total_frame_count - 1))) if total_frame_count > 1 else 0

            # Draw text
            cv2.putText(frame, subtitle_text, (x, y_position), font, font_scale, text_color, thickness, cv2.LINE_AA)

            # Convert BGR to RGB
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        return frames
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()

#to produce the video given the relevant frames
def write_video(frames, output_path, fps=30):
    """Encode ``frames`` into an mp4v video at ``output_path``.

    Args:
        frames: Sequence of RGB frames; the size of the first frame is used
            as the video size.
        output_path: Destination file. Missing parent folders are created.
        fps: Frame rate of the written video.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    if not frames:
        print(f"No frames to write for {output_path}")
        return

    # Make sure the target folder exists before opening the writer; otherwise
    # the writer cannot be opened and nothing would be written.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    height, width = frames[0].shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    if not out.isOpened():
        # Do not claim success when the writer could not be opened.
        print(f"Could not open {output_path} for writing")
        out.release()
        return

    try:
        for frame in frames:
            # Frames are stored as RGB; OpenCV writes BGR.
            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        out.release()
    print(f"Video saved to {output_path}")

# processing loop
# Build one answer video per query: gather the subtitled frames of every
# matching segment, then encode them into ./output_videos/.
for items in query_results:
    print(items)
    frames = []

    for result in items["results"]:
        video_path = "./" + f"{result['video']}"
        segment_frames = extract_frames_opencv(
            video_path,
            result["start"],
            result["end"],
            result["text"],
        )
        frames += segment_frames

        # Drop the per-segment list right away to keep memory use down.
        del segment_frames
        gc.collect()

    # The query text, with spaces replaced by underscores, names the file.
    video_title = items["query"]
    output_path = "./output_videos/" + video_title.replace(' ', '_') + ".mp4"
    write_video(frames, output_path)

    del frames
    gc.collect()
