## Qwen 0.6B Model

In [None]:
# Root folder for one course; every path below is built by appending to it,
# so it must end with a path separator.
FOLDER_NAME = "Data/Machine-Learning/"

VIDEO_DIR = FOLDER_NAME + "Videos"             # Folder containing input video files
AUDIO_DIR = FOLDER_NAME + "Audios"             # Folder to store extracted audio files
CHUNK_DIR = FOLDER_NAME + "Audio-Chunks"       # Folder to save audio chunks after VAD
# Directory containing SRT files
srt_directory = FOLDER_NAME + "SRT-Files"
# Plain-text dump of the extracted sentences, one per line
sentences_file = FOLDER_NAME + 'sentences.txt'
# TSV with one row per sentence: filename, timestamp, sentence
metadata_file = FOLDER_NAME + 'srt-embedding-metadata.tsv'
# Pickled list of grouped (multi-sentence) texts
grouped_sentences_file = FOLDER_NAME + "grouped_sentences.pkl"
# Pickled dict mapping each grouped text to its filename and timestamps
grouped_sent_to_metadata_file = FOLDER_NAME + "grouped_sent_to_metadata.pkl"
# FAISS index built from the grouped-sentence embeddings
grouped_sentences_embeddings_file = FOLDER_NAME + "grouped-sentences-embeddings.idx"
# Subtitle file written alongside the stitched answer video
final_video_stitched_output_srt_file = FOLDER_NAME + "stitched_output.srt"
# Stitched answer video
final_video_file = FOLDER_NAME + "answer.mp4"

### Install the Summariser Model

In [None]:
!pip3 install bert-extractive-summarizer

Necessary Libraries

In [None]:
import ffmpeg
import os
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import re
import os
from tqdm import tqdm
import csv
import pickle

# Paths (VIDEO_DIR, AUDIO_DIR, CHUNK_DIR, ...) are defined in the configuration
# cell at the top of the notebook.


# Create output directories if they don't exist
os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(CHUNK_DIR, exist_ok=True)

# Audio processing: sample rate used when extracting audio and when reading it for VAD
TARGET_SAMPLE_RATE = 16000  # or 32000 Hz depending on your use case

# VAD settings
MIN_CHUNK_DURATION_SEC = 30  # Minimum duration (in seconds) for an audio chunk
USE_ONNX_MODEL = False      # Set True to use ONNX version of Silero VAD

from silero_vad import (
    load_silero_vad, read_audio, get_speech_timestamps, 
    save_audio, VADIterator
)

# faiss_index = faiss.read_index("Data/sentence_embeddings.index")

### Model for Semantic Search

In [None]:
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

## Video to Audio Conversion

In [None]:
# Convert every .mp4 file in the input folder to a mono WAV file at
# TARGET_SAMPLE_RATE. Sorted so repeated runs visit the videos in the same order.
for filename in sorted(os.listdir(VIDEO_DIR)):
    if not filename.endswith(".mp4"):
        continue

    input_path = os.path.join(VIDEO_DIR, filename)
    # splitext swaps only the real extension; str.replace(".mp4", ".wav") would
    # also rewrite a ".mp4" that appears earlier in the file name.
    output_path = os.path.join(AUDIO_DIR, os.path.splitext(filename)[0] + ".wav")

    print(f"Processing: {input_path} -> {output_path}")

    # Extract the audio track: single channel (ac=1), resampled (ar=...)
    input_video = ffmpeg.input(input_path)
    output_audio = ffmpeg.output(input_video.audio, output_path, ac=1, ar=TARGET_SAMPLE_RATE)
    ffmpeg.run(output_audio, overwrite_output=True)

    # Probe the generated audio file for details
    audio_info = ffmpeg.probe(
        output_path,
        v="error",
        select_streams="a",
        show_entries="stream=codec_name,codec_type,sample_rate,channels,bit_rate,duration",
    )
    audio_stream = audio_info['streams'][0]

    codec_name = audio_stream['codec_name']
    sample_rate = int(audio_stream['sample_rate'])
    channels = int(audio_stream['channels'])
    bit_rate = audio_stream.get('bit_rate', 'N/A')
    duration_sec = float(audio_stream['duration'])
    duration_ms = duration_sec * 1000

    print(f"Audio extracted: {output_path}")
    # print(f"Codec: {codec_name}, Sample Rate: {sample_rate} Hz, Channels: {channels}, Bit Rate: {bit_rate}, Duration: {duration_ms} ms\n")

print("Video to Audio converted successfully!.")

Voice Activity Detection Algorithm on Audio Files - Converting into Smaller Chunks

In [None]:
# Load Silero VAD model.
# Bound to its own name: this notebook also binds `model` to the sentence
# embedding model, and reusing that name here would overwrite it.
vad_model = load_silero_vad(onnx=USE_ONNX_MODEL)


def process_audio_file(audio_path, output_chunk_dir):
    """Split one audio file into chunks at speech boundaries and save them.

    Chunks are cut at the end of a detected speech segment once at least
    MIN_CHUNK_DURATION_SEC seconds have accumulated since the previous cut.
    They are contiguous (each starts where the previous one ended) and are
    written as ``1.wav``, ``2.wav``, ... inside *output_chunk_dir*. The final
    chunk runs to the end of the audio.

    Args:
        audio_path: Path of the audio file to split.
        output_chunk_dir: Directory that receives the chunk files; created
            if it does not exist.
    """
    os.makedirs(output_chunk_dir, exist_ok=True)

    wav = read_audio(audio_path, sampling_rate=TARGET_SAMPLE_RATE)
    speech_timestamps = get_speech_timestamps(
        wav, vad_model, sampling_rate=TARGET_SAMPLE_RATE, return_seconds=True
    )

    # Without any detected speech there is no boundary to cut at (and indexing
    # the last segment below would raise IndexError).
    if not speech_timestamps:
        print(f"No speech detected in {audio_path}, no chunks saved")
        return

    # Segment end times, formatted to 4 decimal places
    segment_ends = [float(f"{segment['end']:.4f}") for segment in speech_timestamps]

    vad_iterator = VADIterator(vad_model, sampling_rate=TARGET_SAMPLE_RATE)
    chunk_count = 0  # only the count is needed, so chunk audio is not kept in memory
    current_chunk_start = 0

    for end in segment_ends:
        if (end - current_chunk_start) >= MIN_CHUNK_DURATION_SEC:
            chunk_wav = wav[int(current_chunk_start * TARGET_SAMPLE_RATE):int(end * TARGET_SAMPLE_RATE)]
            chunk_count += 1
            chunk_path = os.path.join(output_chunk_dir, f"{chunk_count}.wav")
            save_audio(chunk_path, chunk_wav, sampling_rate=TARGET_SAMPLE_RATE)
            current_chunk_start = end

    # Save the remainder (from the last cut to the end of the audio) if any
    # speech was left after the last cut.
    if current_chunk_start < segment_ends[-1]:
        chunk_wav = wav[int(current_chunk_start * TARGET_SAMPLE_RATE):]
        chunk_count += 1
        chunk_path = os.path.join(output_chunk_dir, f"{chunk_count}.wav")
        save_audio(chunk_path, chunk_wav, sampling_rate=TARGET_SAMPLE_RATE)

    # NOTE(review): the iterator is only used for this reset, presumably to
    # clear VAD state between files - confirm it is still needed.
    vad_iterator.reset_states()
    print(f"Processed {audio_path}, saved chunks in {output_chunk_dir}")

def process_all_audio_files():
    """Run chunking on every .wav file found directly inside AUDIO_DIR.

    Each file's chunks go to a sub-folder of CHUNK_DIR named after the file
    (without its extension). Prints a message and returns when AUDIO_DIR is
    missing.
    """
    if not os.path.exists(AUDIO_DIR):
        print(f"Audio folder '{AUDIO_DIR}' does not exist.")
        return

    wav_names = [entry for entry in sorted(os.listdir(AUDIO_DIR)) if entry.endswith(".wav")]
    for wav_name in wav_names:
        # The file stem doubles as the per-audio chunk folder name
        stem, _extension = os.path.splitext(wav_name)
        process_audio_file(
            os.path.join(AUDIO_DIR, wav_name),
            os.path.join(CHUNK_DIR, stem),
        )


# Run VAD-based chunking over all extracted audio files.
process_all_audio_files()


### Transcribing Audio Files

In [None]:
import whisper
import os
import nltk
from nltk.tokenize import sent_tokenize

# Download necessary NLTK data
nltk.download('punkt')

# Load Whisper model once.
# It gets its own name so the sentence-embedding `model` used by the search
# cells is not overwritten, and it falls back to CPU when no GPU is available.
import torch

whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model = whisper.load_model("large", device=whisper_device)


def transcribe_audio(audio_path, output_dir):
    """Transcribe one audio file and save the transcript, one sentence per line.

    The transcript is written to ``<output_dir>/<audio base name>.txt``.

    Args:
        audio_path: Path of the audio file to transcribe.
        output_dir: Directory for the transcript; created if missing.
    """
    # Extract base name (without extension)
    base_name = os.path.splitext(os.path.basename(audio_path))[0]

    # Transcribe the audio (language fixed to English)
    result = whisper_model.transcribe(audio_path, language="en")

    # Combine all text segments, then re-split into proper sentences
    full_text = " ".join(segment["text"].strip() for segment in result["segments"])
    sentences = sent_tokenize(full_text)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Save transcript (sentence-wise) to a text file
    txt_file = os.path.join(output_dir, f"{base_name}.txt")
    with open(txt_file, "w", encoding="utf-8") as f:
        f.writelines(sentence.strip() + "\n" for sentence in sentences)

    print(f"✅ Transcription saved: {txt_file}")


# NOTE(review): root_audio_dir and root_output_dir are not assigned in this
# cell; in this notebook they are first assigned in a later cell, so running
# the cells top to bottom raises NameError here. Confirm the intended folders
# and define them before this loop.
# Iterate through the sub-folders of root_audio_dir.
for subfolder in sorted(os.listdir(root_audio_dir)):
    subfolder_path = os.path.join(root_audio_dir, subfolder)

    # Ensure it's a directory
    if not os.path.isdir(subfolder_path):
        continue

    # Define corresponding output directory
    output_subfolder = os.path.join(root_output_dir, subfolder)

    # Process each audio file in the subfolder
    for file in sorted(os.listdir(subfolder_path)):
        if file.endswith(".wav"):
            audio_file_path = os.path.join(subfolder_path, file)
            transcribe_audio(audio_file_path, output_subfolder)

In [None]:
import os
import shutil

# Root directory containing numbered subfolders
root_audio_dir = "Data/Speech-Processing/extracted_text_speech_processing"
root_output_dir = "Data/Speech-Processing/Transcribe-Text"

# Define folder paths: transcripts are copied from f2 into the matching
# audio-chunk sub-folder of f1.
f1 = "Data/Speech-Processing/Audio-Chunks-Speech-Processing"
f2 = "Data/Speech-Processing/Transcribe-Text"

# Only numerically named directories are processed; stray entries such as
# ".ipynb_checkpoints" would otherwise make int() raise ValueError in the sort.
numbered_subfolders = sorted(
    (
        entry for entry in os.listdir(f2)
        if entry.isdecimal() and os.path.isdir(os.path.join(f2, entry))
    ),
    key=int,
)

# Iterate over directories
for subfolder in numbered_subfolders:
    source_folder = os.path.join(f2, subfolder)
    destination_folder = os.path.join(f1, subfolder)

    # Ensure the destination folder exists
    if not os.path.exists(destination_folder):
        print(f"Warning: Destination folder {destination_folder} does not exist, skipping...")
        continue

    # Copy all .txt files from source to destination
    for file in os.listdir(source_folder):
        if file.endswith(".txt"):
            src_file = os.path.join(source_folder, file)
            dest_file = os.path.join(destination_folder, file)
            shutil.copy2(src_file, dest_file)  # copy2 preserves metadata
            print(f"Copied {src_file} -> {dest_file}")

print("All .txt files copied successfully!")


In [None]:
import subprocess
import os

# Define paths
root_corpus_dir = "Data/Speech-Processing/Audio-Chunks-Speech-Processing"
dict_path = "english_us_arpa"
model_path = "english_us_arpa"
root_output_dir = "Data/Speech-Processing/complete_timestamp"

# Only numerically named directories are aligned; stray entries such as
# ".ipynb_checkpoints" would otherwise make int() raise ValueError in the sort.
numbered_subfolders = sorted(
    (
        entry for entry in os.listdir(root_corpus_dir)
        if entry.isdecimal() and os.path.isdir(os.path.join(root_corpus_dir, entry))
    ),
    key=int,
)

# Run the aligner once per numbered folder
for subfolder in numbered_subfolders:
    subfolder_corpus_path = os.path.join(root_corpus_dir, subfolder)
    subfolder_output_path = os.path.join(root_output_dir, subfolder)

    # Ensure the output directory exists
    os.makedirs(subfolder_output_path, exist_ok=True)

    # Run the alignment command. Arguments are passed as a list (no shell),
    # and check=True stops the loop as soon as one alignment fails.
    cmd = [
        "mfa", "align", "--clean", "--output_format", "csv",
        subfolder_corpus_path, dict_path, model_path, subfolder_output_path
    ]
    subprocess.run(cmd, check=True)
    print(f"Alignment completed for {subfolder}!")

In [None]:
import os
import pandas as pd
import librosa
from natsort import natsorted  # To ensure files are processed in correct order

def get_audio_length(audio_file):
    """Return the duration of *audio_file* as reported by librosa.

    The file is loaded at its native sample rate (``sr=None``) so the
    duration is computed from the samples as stored.
    """
    samples, native_rate = librosa.load(audio_file, sr=None)
    duration = librosa.get_duration(y=samples, sr=native_rate)
    return duration

def convert_to_srt_time(seconds):
    """Convert seconds to SRT time format (hh:mm:ss,ms).

    The value is rounded to the nearest millisecond before it is split into
    fields. Truncating the fractional part instead loses a millisecond to
    float error (1.001 would render as ``00:00:01,000``).

    Args:
        seconds: Non-negative time offset in seconds; negative values are
            clamped to zero.

    Returns:
        The formatted timestamp string, e.g. ``"01:01:01,500"``.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_sec, millisec = divmod(total_ms, 1000)
    hours, remainder = divmod(total_sec, 3600)
    minutes, sec = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}"

def merge_chunks_to_srt(csv_folder, txt_folder, audio_folder, srt_file):
    """Build one sentence-level SRT file from per-chunk alignments.

    For every ``.csv`` in *csv_folder* (natural sort order) the matching
    ``.txt`` transcript and ``.wav`` chunk with the same base name are read.
    Each transcript line becomes one subtitle whose start is the begin time
    of its first word and whose end is the end time of its last word. Words
    are matched to sentences purely by count, in order. Chunk-local times are
    shifted by the summed length of the preceding chunks.

    Args:
        csv_folder: Folder with alignment CSVs (columns ``Type``, ``Begin``,
            ``End`` are read).
        txt_folder: Folder with one-sentence-per-line transcripts.
        audio_folder: Folder with the chunk audio files.
        srt_file: Path of the SRT file to write; its parent folder is
            created if needed.
    """
    chunk_files = natsorted([f for f in os.listdir(csv_folder) if f.endswith(".csv")])  # Sort correctly

    # Opening the output for writing would fail if its folder is missing
    output_dir = os.path.dirname(srt_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    total_offset = 0  # Offset to adjust timestamps
    subtitle_index = 1  # SRT subtitle counter

    with open(srt_file, "w", encoding="utf-8") as f:
        for chunk in chunk_files:
            # splitext swaps only the real extension
            stem = os.path.splitext(chunk)[0]
            csv_path = os.path.join(csv_folder, chunk)
            txt_path = os.path.join(txt_folder, stem + ".txt")
            audio_path = os.path.join(audio_folder, stem + ".wav")  # Assuming audio has the same name

            # Get audio length of this chunk
            chunk_length = get_audio_length(audio_path)

            # Read CSV and keep word rows only
            df = pd.read_csv(csv_path)
            df = df[df["Type"] == "words"].reset_index(drop=True)

            # Read sentences from the corresponding TXT file
            with open(txt_path, "r", encoding="utf-8") as transcript:
                sentences = [line.strip() for line in transcript]

            word_index = 0  # Track position in the word list

            for sentence in sentences:
                words = sentence.split()

                # A blank line has no words; without this guard the end index
                # below would be word_index - 1 (i.e. -1 for the first line).
                if not words:
                    continue

                if word_index >= len(df):
                    break  # Avoid index error

                # Get the start and end timestamps for the sentence
                last_word_index = min(word_index + len(words), len(df)) - 1
                start_time = df.loc[word_index, 'Begin'] + total_offset
                end_time = df.loc[last_word_index, 'End'] + total_offset

                # Write to SRT
                f.write(f"{subtitle_index}\n")
                f.write(f"{convert_to_srt_time(start_time)} --> {convert_to_srt_time(end_time)}\n")
                f.write(f"{sentence}\n\n")

                subtitle_index += 1  # Increment subtitle number
                word_index += len(words)  # Move to the next sentence

            # Update offset for next chunk
            total_offset += chunk_length

    print(f"✅ Sentence-level SRT file saved as: {srt_file}")

# Example usage
csv_base_folder = "Data/Speech-Processing/complete_timestamp"  # Change to your folder path
txt_base_folder = "Data/Speech-Processing/audio_chunks_text"
audio_base_folder = "Data/Speech-Processing/Audio-Chunks-Speech-Processing"
srt_base_folder = "Data/Speech-Processing/complete_srt"

# The SRT files are written directly into this folder, so it must exist
os.makedirs(srt_base_folder, exist_ok=True)

# Only numerically named directories are processed; stray entries such as
# ".ipynb_checkpoints" would otherwise make int() raise ValueError in the sort.
numbered_subfolders = sorted(
    (
        entry for entry in os.listdir(audio_base_folder)
        if entry.isdecimal() and os.path.isdir(os.path.join(audio_base_folder, entry))
    ),
    key=int,
)

for subfolder in numbered_subfolders:
    subfolder_path = os.path.join(audio_base_folder, subfolder)

    # Per-lecture inputs share the sub-folder name; the output is <name>.srt
    csv_subfolder = os.path.join(csv_base_folder, subfolder)
    txt_subfolder = os.path.join(txt_base_folder, subfolder)
    audio_subfolder = os.path.join(audio_base_folder, subfolder)
    srt_file_path = os.path.join(srt_base_folder, subfolder + ".srt")

    merge_chunks_to_srt(csv_subfolder, txt_subfolder, audio_subfolder, srt_file_path)


## SRT to Sentences generation

In [None]:
# Function to extract combined sentences with timestamps from .srt file
def extract_sentences_from_srt(file_path):
    """Read an SRT file and return its text regrouped into sentences.

    Subtitle text lines are concatenated until a line ends with ``.``, ``!``
    or ``?``. Each sentence is paired with a range running from the start of
    the cue its first line came from to the end of the cue its last line came
    from, so a sentence spread over several cues covers all of them. Several
    sentences inside one cue all receive that cue's range. Text left over at
    the end of the file without closing punctuation is returned as a final
    sentence.

    Args:
        file_path: Path of the ``.srt`` file (UTF-8, with or without BOM).

    Returns:
        ``(sentences, timestamps)``: two parallel lists. Timestamps look like
        ``"00:00:01.000 --> 00:00:03.500"`` (comma separators are normalised
        to dots); the timestamp is ``""`` for text seen before any cue.
    """
    timestamp_pattern = re.compile(
        r'(\d{2}:\d{2}:\d{2}[.,]\d{3}) --> (\d{2}:\d{2}:\d{2}[.,]\d{3})'
    )
    sentence_end_pattern = re.compile(r'[.!?]$')

    sentences = []
    timestamps = []
    pending_lines = []            # text lines of the sentence being assembled
    cue_start = cue_end = ""      # range of the cue currently being read
    sentence_start = sentence_end = ""

    def format_range(start, end):
        return f"{start} --> {end}" if start else ""

    # utf-8-sig drops a leading BOM, which would otherwise stop the first cue
    # number from being recognised as a number.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for raw_line in file:
            line = raw_line.strip()

            # Check for timestamp lines
            timestamp_match = timestamp_pattern.match(line)
            if timestamp_match:
                cue_start = timestamp_match.group(1).replace(',', '.')
                cue_end = timestamp_match.group(2).replace(',', '.')
                continue

            # Skip empty lines and cue identifiers
            if not line or line.isdigit():
                continue

            # The first line of a sentence fixes its start; every further
            # line pushes its end to the end of the current cue.
            if not pending_lines:
                sentence_start = cue_start
            pending_lines.append(line)
            sentence_end = cue_end

            # If sentence ends, save it
            if sentence_end_pattern.search(line):
                sentences.append(" ".join(pending_lines))
                timestamps.append(format_range(sentence_start, sentence_end))
                pending_lines = []

    # Keep trailing text that never reached closing punctuation
    if pending_lines:
        sentences.append(" ".join(pending_lines))
        timestamps.append(format_range(sentence_start, sentence_end))

    return sentences, timestamps



# Initialize lists for all sentences, timestamps, and filenames
all_sentences = []
all_timestamps = []
all_filenames = []

# Process all SRT files with tqdm.
# Sorted: os.listdir order is arbitrary, and the order here fixes the row
# order of the metadata file and of everything built from it.
srt_files = sorted(f for f in os.listdir(srt_directory) if f.endswith('.srt'))
for file_name in tqdm(srt_files, desc="Processing SRT files"):
    file_path = os.path.join(srt_directory, file_name)
    sentences, timestamps = extract_sentences_from_srt(file_path)
    all_sentences.extend(sentences)
    all_timestamps.extend(timestamps)
    all_filenames.extend([file_name] * len(sentences))  # associate each sentence with its file

# print("Encoding sentences into embeddings...")
# Uncomment for all miniLM and all mpnet.
# sentence_embeddings = np.array(
#     model.encode(all_sentences) 
# ).astype('float32')

# sentence_embeddings = model.encode(all_sentences)

# Create FAISS index (use Inner Product for cosine similarity)
# embedding_dimension = sentence_embeddings.shape[1]
# faiss_index = faiss.IndexFlatIP(embedding_dimension)
# faiss_index.add(sentence_embeddings)

# Save FAISS index
# faiss.write_index(faiss_index, "Data/sentence_embeddings.index")

# Save metadata to file: one tab-separated row per sentence. Tabs and newlines
# inside a sentence are replaced so each row stays on one line with 3 fields.
with open(metadata_file, 'w', encoding='utf-8') as file:
    file.write("filename\ttimestamp\tsentence\n")
    for fname, timestamp, sentence in zip(all_filenames, all_timestamps, all_sentences):
        clean_sentence = sentence.replace('\t', ' ').replace('\n', ' ')
        file.write(f"{fname}\t{timestamp}\t{clean_sentence}\n")

# Save sentences to a text file, one per line
with open(sentences_file, 'w', encoding='utf-8') as file:
    for sentence in all_sentences:
        file.write(sentence.strip().replace('\n', ' ') + '\n')

# Summary
# print(f"\nEmbeddings created for {len(all_sentences)} sentences from {len(srt_files)} SRT files.")
# print("FAISS index saved as 'Data/sentence_embeddings.index'")
print(f"Metadata saved as '{metadata_file}'")
print(f"Sentences saved as '{sentences_file}'")

## Finding Related Sentences to a Question

### Grouping Sentences with an N-gram (Sliding-Window) Technique

In [None]:
# Load (filename, timestamp, sentence) rows from the metadata TSV.
metadata_list = []
skipped_rows = 0
with open(metadata_file, "r", encoding="utf-8", newline="") as f:
    # The file is written as raw tab-separated text, so quoting is disabled:
    # with the default quoting a sentence that starts with '"' would be read
    # as a quoted field and could swallow the following rows.
    reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    next(reader, None)  # skip the header row
    for row in reader:
        # Rows without exactly three fields or without a "start --> end"
        # timestamp cannot be grouped later, so they are skipped and counted.
        if len(row) != 3 or "-->" not in row[1]:
            skipped_rows += 1
            continue
        filename, timestamp, sentence = row
        metadata_list.append((filename.strip(), timestamp.strip(), sentence.strip()))

if skipped_rows:
    print(f"Warning: skipped {skipped_rows} malformed metadata row(s)")

group_size = 3  # number of consecutive sentences per group
grouped_sentences = []
grouped_sent_to_metadata = {}

def extract_start_end(ts):
    """Split a ``"start --> end"`` range into its two stripped halves.

    Raises ValueError when *ts* does not contain exactly one ``-->``.
    """
    begin, finish = (piece.strip() for piece in ts.split("-->"))
    return begin, finish

# Group by filename first (rows of one file are contiguous in metadata_list),
# then slide a window of group_size sentences over each file.
from itertools import groupby

for filename, file_group in groupby(metadata_list, key=lambda x: x[0]):
    file_group = list(file_group)
    # A file with fewer than group_size sentences still gets one (shorter)
    # group; otherwise its sentences would never be indexed.
    window_count = max(len(file_group) - group_size + 1, 1)
    for i in range(window_count):
        group = file_group[i:i+group_size]
        grouped_text = " ".join(sent for _, _, sent in group)
        first_start, _ = extract_start_end(group[0][1])
        _, last_end = extract_start_end(group[-1][1])
        timestamp_range = f"{first_start} --> {last_end}"
        individual_timestamps = [ts for _, ts, _ in group]
        grouped_sentences.append(grouped_text)
        # NOTE: keyed by text, so if the same grouped text occurs more than
        # once the later occurrence replaces the earlier metadata.
        grouped_sent_to_metadata[grouped_text] = {
            "filename": filename,
            "timestamp_range": timestamp_range,
            "individual_timestamps": individual_timestamps
        }

# Save pickles
with open(grouped_sentences_file, "wb") as f:
    pickle.dump(grouped_sentences, f)

with open(grouped_sent_to_metadata_file, "wb") as f:
    pickle.dump(grouped_sent_to_metadata, f)

# Show one entry as a sanity check (nothing to show when no groups were built)
first_key = next(iter(grouped_sent_to_metadata), None)
if first_key is None:
    print("No grouped sentences were created.")
else:
    print("Grouped Sentence to Metadata First element:", first_key, "->", grouped_sent_to_metadata[first_key])


In [None]:
# Load grouped sentences
with open(grouped_sentences_file, "rb") as f:
    grouped_sentences = pickle.load(f)

# Load precomputed metadata mapping
with open(grouped_sent_to_metadata_file, "rb") as f:
    grouped_sent_to_metadata = pickle.load(f)

if not grouped_sentences:
    raise ValueError("No grouped sentences to encode; build the grouped sentences first.")

# Other cells in this notebook rebind `model` (VAD, Whisper). Reload the
# embedding model if the name no longer refers to a SentenceTransformer.
if not isinstance(model, SentenceTransformer):
    model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

print("Encoding grouped sentences into embeddings...")

batch_size = 16  # you can adjust this based on memory

# normalize_embeddings=True yields unit-length vectors, which is what makes
# the inner-product index below rank by cosine similarity.
grouped_embeddings = np.asarray(
    model.encode(
        grouped_sentences,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,
    ),
    dtype='float32',  # FAISS works on float32 vectors
)

# Create FAISS index (Inner Product for cosine similarity)
embedding_dim = grouped_embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(embedding_dim)
faiss_index.add(grouped_embeddings)

# Save FAISS index
faiss.write_index(faiss_index, grouped_sentences_embeddings_file)

print("✅ FAISS index created and saved!")

### QA

In [None]:
from datetime import datetime
import torch
import pickle
import faiss
import numpy as np
import re

# Load the index and the data it was built from
faiss_index = faiss.read_index(grouped_sentences_embeddings_file)

with open(grouped_sentences_file, "rb") as f:
    grouped_sentences = pickle.load(f)

with open(grouped_sent_to_metadata_file, "rb") as f:
    grouped_sent_to_metadata = pickle.load(f)

# Other cells in this notebook rebind `model` (VAD, Whisper). Reload the
# embedding model if the name no longer refers to a SentenceTransformer.
if not isinstance(model, SentenceTransformer):
    model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

student_question = input("Enter your question: ")
# FAISS expects a 2-D float32 array of query vectors
question_embedding = np.asarray(
    model.encode(student_question, prompt_name="query"), dtype="float32"
)
if question_embedding.ndim == 1:
    question_embedding = np.expand_dims(question_embedding, axis=0)

distances, indices = faiss_index.search(question_embedding, 10)

related_results = []
for idx in indices[0]:
    # FAISS pads the result with -1 when the index holds fewer vectors than
    # requested; -1 would silently pick the last sentence, so skip it.
    if idx < 0 or idx >= len(grouped_sentences):
        continue

    grouped_sent = grouped_sentences[idx]
    meta = grouped_sent_to_metadata.get(grouped_sent, None)

    if meta:
        filename = meta["filename"]
        timestamp_range = meta["timestamp_range"]
        individual_timestamps = meta.get("individual_timestamps", [])
    else:
        filename = "Unknown"
        timestamp_range = "Unknown"
        individual_timestamps = []

    related_results.append(
        (filename, timestamp_range, grouped_sent, individual_timestamps)
    )

print("Question:", student_question)
print("\nTop Related Sentences with Metadata:")
for filename, timestamp, sent, indiv_ts in related_results:
    print(f"- [{filename} | {timestamp}] {sent}")
    if indiv_ts:
        print("  ↳ Individual timestamps:", indiv_ts)
    print()

# Group results by file
grouped_by_file = {}
for filename, timestamp, sent, _ in related_results:
    grouped_by_file.setdefault(filename, []).append((timestamp, sent))

# Helper to parse timestamps
def parse_ts(ts):
    """Parse a timestamp range into ``(start, end)`` datetime objects.

    Both separators used in this notebook are accepted: ``"start --> end"``
    (metadata) and ``"start -> end"`` (merged results). Splitting on the bare
    ``"->"`` leaves a dangling ``-`` on the start of a ``-->`` range, which
    then fails to parse.

    Args:
        ts: Range string whose halves are formatted ``HH:MM:SS.fff``.

    Returns:
        Tuple ``(start, end)`` of ``datetime`` objects (date part 1900-01-01).

    Raises:
        ValueError: If *ts* is not a two-part range or a half does not match
            the expected format.
    """
    # Imported locally under a private name because another cell runs
    # `import datetime`, which rebinds the global name to the module.
    from datetime import datetime as _datetime

    parts = re.split(r"\s*-+>\s*", ts.strip())
    if len(parts) != 2:
        raise ValueError(f"Expected 'start -> end' timestamp range, got {ts!r}")

    fmt = "%H:%M:%S.%f"
    start, end = parts
    return _datetime.strptime(start, fmt), _datetime.strptime(end, fmt)

# Helper to clean and normalize sentences
def clean_sentences(sentences):
    """Strip sentences, drop trailing ``.!?`` and remove empties/duplicates.

    The first occurrence of each cleaned sentence is kept, in input order.
    """
    def _normalise(raw):
        text = raw.strip()
        # Peel closing punctuation one character at a time, re-stripping
        # whitespace that becomes exposed in between.
        while text.endswith((".", "!", "?")):
            text = text[:-1].strip()
        return text

    normalised = (_normalise(item) for item in sentences)
    # dict keys keep insertion order, giving an order-preserving de-duplication
    return list(dict.fromkeys(text for text in normalised if text))

def extract_file_number(fname):
    """Return the first run of digits in *fname* as an int.

    Names without any digit yield ``float("inf")`` so they sort last.
    """
    digits = re.search(r"(\d+)", fname)
    if digits is None:
        return float("inf")
    return int(digits.group(1))

# Ranges arrive as "start --> end" (metadata) while merged ranges are written
# as "start -> end". Everything is normalised to "start -> end" up front so
# sorting, merging and later consumers all see a single format.
_range_separator = re.compile(r"\s*-+>\s*")


def _normalise_range(ts_range):
    """Return *ts_range* rewritten as 'start -> end'."""
    return " -> ".join(_range_separator.split(ts_range.strip()))


# Sort by file and start timestamp. Results without a usable range are left
# out, and each timestamp list is copied so that merging below does not
# modify the lists stored in the metadata mapping.
related_results_sorted = sorted(
    (
        (filename, _normalise_range(ts_range), text, list(indiv_ts))
        for filename, ts_range, text, indiv_ts in related_results
        if ts_range != "Unknown"
    ),
    key=lambda x: (extract_file_number(x[0]), parse_ts(x[1])[0])
)

# Merge overlapping timestamps within same file
merged_results = []
for filename, ts_range, text, indiv_ts in related_results_sorted:
    sentences = clean_sentences(text.split('. '))

    if not merged_results:
        merged_results.append([
            filename, ts_range, '. '.join(sentences) + '.', indiv_ts
        ])
        continue

    prev = merged_results[-1]
    prev_start, prev_end = parse_ts(prev[1])
    curr_start, curr_end = parse_ts(ts_range)

    if filename == prev[0] and curr_start <= prev_end:
        # Overlapping (or touching) range in the same file: extend the
        # previous entry instead of starting a new one.
        new_start = prev_start.strftime("%H:%M:%S.%f")[:-3]
        new_end = max(prev_end, curr_end).strftime("%H:%M:%S.%f")[:-3]
        prev[1] = f"{new_start} -> {new_end}"

        prev_sentences = clean_sentences(prev[2].split('. '))
        combined_sentences = prev_sentences + sentences
        prev[2] = '. '.join(clean_sentences(combined_sentences)) + '.'

        # Combine timestamps; overlapping groups share sentences, so only
        # timestamps not already present are added.
        for stamp in indiv_ts:
            if stamp not in prev[3]:
                prev[3].append(stamp)
    else:
        merged_results.append([filename, ts_range, '. '.join(sentences) + '.', indiv_ts])

# Print merged results
print("\nOrdered & Merged Top Related Sentences (duplicates removed):")
for i, (fname, ts, sent, indiv_ts) in enumerate(merged_results, 1):
    print(f"\nGroup {i}")
    print("Text:", sent)
    print("Metadata:", (fname, ts))
    if indiv_ts:
        print("Individual timestamps:", indiv_ts)

In [None]:
# Convert each merged result into a dict-based segment record.
segment_list = []

for filename, ts_range, text, individual_timestamps in merged_results:
    distance = 0.0  # placeholder for similarity or ranking score
    segment_list.append(
        dict(
            filename=filename,
            timestamp_range=ts_range,
            text=text,
            individual_timestamps=individual_timestamps,
            distance=distance,
        )
    )

# Show the segments that were prepared (one blank line after each).
for seg in segment_list:
    print(
        f"[{seg['filename']} | {seg['timestamp_range']}]",
        f"Text: {seg['text']}",
        f"Individual timestamps: {seg['individual_timestamps']}",
        "",
        sep="\n",
    )


In [None]:
from moviepy.editor import VideoFileClip, concatenate_videoclips, ColorClip
import os
import datetime

def format_srt_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    The value is rounded to the nearest millisecond before it is split into
    fields. Truncating the fractional part instead loses a millisecond to
    float error (1.001 would render as ``00:00:01,000``).

    Args:
        seconds: Non-negative offset in seconds; negative values are clamped
            to zero.

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

def create_continuous_srt(clips_info, output_filename=final_video_stitched_output_srt_file, transition_sec=0.01):
    """Write an SRT file whose cues follow the stitched video's timeline.

    Cue *n* starts where clip *n* starts in the stitched output: the running
    time advances by each clip's duration plus the pause between clips.

    Args:
        clips_info: Iterable of ``(duration, sentence, start, video_file)``
            tuples in playback order; only ``duration`` and ``sentence`` are
            used here.
        output_filename: Path of the SRT file to write.
        transition_sec: Length in seconds of the pause between two clips.
    """
    srt_lines = []
    current_time = 0.0

    for idx, (duration, sentence, _start, _video_file) in enumerate(clips_info, start=1):
        start_time = format_srt_timestamp(current_time)
        end_time = format_srt_timestamp(current_time + duration)
        srt_lines.append(f"{idx}\n{start_time} --> {end_time}\n{sentence}\n")
        current_time += duration + transition_sec  # Account for pause

    # Explicit UTF-8: the platform default encoding may not be able to
    # represent every character of the subtitle text.
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(srt_lines))

def parse_timestamp(timestamp_str):
    """Split a timestamp range into its ``(start, end)`` strings.

    Both separators used in this notebook are accepted: ``"start --> end"``
    (metadata) and ``"start -> end"`` (merged results).

    Args:
        timestamp_str: The range string.

    Returns:
        Tuple ``(start, end)`` with surrounding whitespace removed.

    Raises:
        ValueError: If the string is not a two-part range.
    """
    parts = re.split(r"\s*-+>\s*", timestamp_str.strip())
    if len(parts) != 2:
        raise ValueError(f"Expected 'start -> end' timestamp range, got {timestamp_str!r}")
    start, end = parts
    return start, end

def stitch_video_from_segments(segment_list, srt_filename=final_video_stitched_output_srt_file, pause_duration=0.01):
    """Cut the given segments out of their source videos and join them.

    The joined video is written to ``final_video_file`` with a short black
    pause between consecutive clips, and a matching SRT file is written to
    *srt_filename*. Segments that cannot be processed are reported and
    skipped.

    Args:
        segment_list: Segments in playback order. Each is either a dict with
            the keys ``"filename"``, ``"timestamp_range"`` and ``"text"``, or
            a sequence starting with ``(filename, timestamp, sentence)``.
        srt_filename: Path of the SRT file to write.
        pause_duration: Length in seconds of the black pause between clips.
    """
    clips = []         # playback order, including the black pause clips
    opened_videos = []  # source videos to close once writing is finished
    clips_info = []

    try:
        for segment in segment_list:
            # Iterating a dict yields its keys, so dict segments must be read
            # by key rather than tuple-unpacked.
            if isinstance(segment, dict):
                filename = segment["filename"]
                timestamp = segment["timestamp_range"]
                sentence = segment["text"]
            else:
                filename, timestamp, sentence = segment[:3]

            lecture_no = os.path.splitext(filename)[0]
            video_file = os.path.join(VIDEO_DIR, lecture_no + ".mp4")

            # Best effort: a segment that fails is reported and skipped so
            # the remaining segments can still be stitched.
            try:
                start, end = parse_timestamp(timestamp)
                video = VideoFileClip(video_file)
                opened_videos.append(video)
                clip = video.subclip(start, end)
            except Exception as e:
                print(f"Error processing segment ({filename}, {timestamp}): {e}")
                continue

            # Insert a black pause, sized like the upcoming clip, between
            # clips. Checking `clips` (not the segment index) avoids a
            # leading pause when earlier segments were skipped.
            if clips:
                black_clip = ColorClip(size=clip.size, color=(0, 0, 0), duration=pause_duration)
                black_clip = black_clip.set_fps(clip.fps)
                clips.append(black_clip)

            clips.append(clip)
            clips_info.append((clip.duration, sentence, start, video_file))

        if not clips:
            print("No valid clips found.")
            return

        final_clip = concatenate_videoclips(clips, method="chain")
        try:
            final_clip.write_videofile(
                final_video_file,
                codec="libx264",
                preset="ultrafast",
                threads=4,
                audio_codec="aac"
            )
        finally:
            final_clip.close()

        create_continuous_srt(clips_info, output_filename=srt_filename, transition_sec=pause_duration)
    finally:
        # Close the source videos opened above
        for video in opened_videos:
            video.close()

stitch_video_from_segments(segment_list, srt_filename=final_video_stitched_output_srt_file, pause_duration=0.01)


### Summarisation

In [None]:
from summarizer import Summarizer

# Initialize summarizer once with its default settings; the summarisation
# cell calls this same object.
summarizer = Summarizer()

### Summarise the Retrieved Segments (with Timestamps)

In [None]:
# Extract texts and timestamps
texts = [sent for _, _, sent, _ in merged_results]
timestamps = [ts for _, ts, _, _ in merged_results]

# Combine all texts
full_text = " ".join(texts)

# Run summarization (nothing to summarise when there are no results)
summary_text = summarizer(full_text, ratio=0.6) if full_text.strip() else ""

if isinstance(summary_text, list):
    summary_text = " ".join(summary_text)

# Split summary into sentences. Splitting on ". " leaves the final period on
# the last sentence; it is stripped so that code which appends "." when
# printing does not produce "..".
summary_sentences = [
    sentence
    for sentence in (part.strip().rstrip('.').strip() for part in summary_text.split('. '))
    if sentence
]

def find_timestamp(sentence):
    """Return the timestamp of the merged text that *sentence* came from.

    Matching is case-insensitive and done in two passes over ``texts``:
    first a text containing the whole sentence, then, as a fallback, a text
    containing its first 20 characters. Trying the full sentence first stops
    a shared opening phrase in an earlier text from winning over the text
    that actually contains the sentence.

    Args:
        sentence: A sentence from the summary.

    Returns:
        The matching entry of ``timestamps``, or ``"Unknown"`` if no text
        matches.
    """
    needle = sentence.lower()
    lowered_texts = [text.lower() for text in texts]

    # Pass 1: the whole sentence appears in a text
    for text, ts in zip(lowered_texts, timestamps):
        if needle in text:
            return ts

    # Pass 2: only the beginning of the sentence appears in a text
    prefix = needle[:20]
    for text, ts in zip(lowered_texts, timestamps):
        if prefix in text:
            return ts

    return "Unknown"

# Print summary with timestamps: one line per summary sentence, prefixed with
# the timestamp returned by find_timestamp ("Unknown" when nothing matches).
print("\nSummary with timestamps:\n")
for s in summary_sentences:
    ts = find_timestamp(s)
    # A closing period is appended to every printed sentence
    print(f"[{ts}] {s}.")
