## Qwen 0.6B Model

### Install SUMMARISER MODEL

In [1]:
!pip3 install bert-extractive-summarizer



Necessary Libraries

In [2]:
import ffmpeg
import os
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import re
import os
from tqdm import tqdm
import csv
import pickle

# Paths
VIDEO_DIR = "Data/Videos"             # Folder containing input video files
AUDIO_DIR = "Data/Audios"             # Folder to store extracted audio files
CHUNK_DIR = "Data/Audio-Chunks"       # Folder to save audio chunks after VAD

# Create output directories if they don't exist
os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(CHUNK_DIR, exist_ok=True)

# Audio processing
TARGET_SAMPLE_RATE = 16000  # or 32000 Hz depending on your use case

# VAD settings
MIN_CHUNK_DURATION_SEC = 30  # Minimum duration for an audio chunk
USE_ONNX_MODEL = False      # Set True to use ONNX version of Silero VAD

from silero_vad import (
    load_silero_vad, read_audio, get_speech_timestamps, 
    save_audio, VADIterator
)

# faiss_index = faiss.read_index("Data/sentence_embeddings.index")

  from .autonotebook import tqdm as notebook_tqdm


### Model for Semantic Search

In [3]:
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

### Summarisation

In [2]:
from summarizer import Summarizer

# Initialize summarizer
summarizer = Summarizer()

  from .autonotebook import tqdm as notebook_tqdm


## Video to Audio Conversion

In [None]:
# Loop through all .mp4 files in the input folder
for filename in os.listdir(VIDEO_DIR):
    if filename.endswith(".mp4"):
        input_path = os.path.join(VIDEO_DIR, filename)
        output_path = os.path.join(AUDIO_DIR, filename.replace(".mp4", ".wav"))

        print(f"Processing: {input_path} -> {output_path}")
        
        # Extract audio
        input_video = ffmpeg.input(input_path)
        output_audio = ffmpeg.output(input_video.audio, output_path, ac=1, ar=TARGET_SAMPLE_RATE)
        ffmpeg.run(output_audio, overwrite_output=True)
        
        # Probe the generated audio file for details
        audio_info = ffmpeg.probe(output_path, v="error", select_streams="a", show_entries="stream=codec_name,codec_type,sample_rate,channels,bit_rate,duration")
        
        codec_name = audio_info['streams'][0]['codec_name']
        sample_rate = int(audio_info['streams'][0]['sample_rate'])
        channels = int(audio_info['streams'][0]['channels'])
        bit_rate = audio_info['streams'][0].get('bit_rate', 'N/A')
        duration_sec = float(audio_info['streams'][0]['duration'])
        duration_ms = duration_sec * 1000
        
        print(f"Audio extracted: {output_path}")
        # print(f"Codec: {codec_name}, Sample Rate: {sample_rate} Hz, Channels: {channels}, Bit Rate: {bit_rate}, Duration: {duration_ms} ms\n")

print("Video to Audio converted successfully!.")

Voice Activity Detection Algorithm on Audio Files - Converting into Smaller Chunks

In [None]:
# Load Silero VAD model
model = load_silero_vad(onnx=USE_ONNX_MODEL)

def process_audio_file(audio_path, output_chunk_dir):
    """Process an audio file, split it into chunks, and save them."""
    wav = read_audio(audio_path, sampling_rate=TARGET_SAMPLE_RATE)
    speech_timestamps = get_speech_timestamps(
        wav, model, sampling_rate=TARGET_SAMPLE_RATE, return_seconds=True
    )
    
    # Format timestamps to 4 decimal places
    for segment in speech_timestamps:
        segment['start'] = float(f"{segment['start']:.4f}")
        segment['end'] = float(f"{segment['end']:.4f}")
    
    vad_iterator = VADIterator(model, sampling_rate=TARGET_SAMPLE_RATE)
    chunks = []
    current_chunk_start = 0
    
    for segment in speech_timestamps:
        start, end = segment['start'], segment['end']
        if (end - current_chunk_start) >= MIN_CHUNK_DURATION_SEC:
            chunk_wav = wav[int(current_chunk_start * TARGET_SAMPLE_RATE):int(end * TARGET_SAMPLE_RATE)]
            chunk_path = os.path.join(output_chunk_dir, f"{len(chunks) + 1}.wav")
            save_audio(chunk_path, chunk_wav, sampling_rate=TARGET_SAMPLE_RATE)
            chunks.append((current_chunk_start, end, chunk_wav))
            current_chunk_start = end
    
    # Save the last chunk if necessary
    if current_chunk_start < speech_timestamps[-1]['end']:
        chunk_wav = wav[int(current_chunk_start * TARGET_SAMPLE_RATE):]
        chunk_path = os.path.join(output_chunk_dir, f"{len(chunks) + 1}.wav")
        save_audio(chunk_path, chunk_wav, sampling_rate=TARGET_SAMPLE_RATE)
        chunks.append((current_chunk_start, speech_timestamps[-1]['end'], chunk_wav))
    
    vad_iterator.reset_states()
    print(f"Processed {audio_path}, saved chunks in {output_chunk_dir}")

def process_all_audio_files():
    """Process all .wav files in the main audio folder and save their chunks."""
    if not os.path.exists(AUDIO_DIR):
        print(f"Audio folder '{AUDIO_DIR}' does not exist.")
        return
    
    for file_name in sorted(os.listdir(AUDIO_DIR)):
        if file_name.endswith(".wav"):
            audio_path = os.path.join(AUDIO_DIR, file_name)
            audio_id = os.path.splitext(file_name)[0]  # Extract the number without extension
            output_chunk_dir = os.path.join(CHUNK_DIR, audio_id)
            process_audio_file(audio_path, output_chunk_dir)

# Process VAD on Audio Files.
process_all_audio_files()


## SRT to Sentences generation

In [6]:
# Function to extract combined sentences with timestamps from .srt file
def extract_sentences_from_srt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    sentences = []
    timestamps = []
    current_sentence = ""
    current_timestamp = ""

    for line in lines:
        line = line.strip()

        # Check for timestamp lines
        timestamp_match = re.match(r'(\d{2}:\d{2}:\d{2}[.,]\d{3}) --> (\d{2}:\d{2}:\d{2}[.,]\d{3})', line)
        if timestamp_match:
            current_timestamp = timestamp_match.group(1).replace(',', '.') + ' --> ' + timestamp_match.group(2).replace(',', '.')
            continue

        # Skip empty lines and cue identifiers
        if not line or line.isdigit():
            continue

        # Add line to current sentence
        current_sentence += " " + line if current_sentence else line

        # If sentence ends, save it
        if re.search(r'[.!?]$', line):
            sentences.append(current_sentence.strip())
            timestamps.append(current_timestamp)
            current_sentence = ""
            current_timestamp = ""

    return sentences, timestamps

# Directory containing SRT files
srt_directory = 'Data/SRT-Files'

# Initialize lists for all sentences, timestamps, and filenames
all_sentences = []
all_timestamps = []
all_filenames = []

# Process all SRT files with tqdm
srt_files = [f for f in os.listdir(srt_directory) if f.endswith('.srt')]
for file_name in tqdm(srt_files, desc="Processing SRT files"):
    file_path = os.path.join(srt_directory, file_name)
    sentences, timestamps = extract_sentences_from_srt(file_path)
    all_sentences.extend(sentences)
    all_timestamps.extend(timestamps)
    all_filenames.extend([file_name] * len(sentences))  # associate each sentence with its file

# print("Encoding sentences into embeddings...")
# Uncomment for all miniLM and all mpnet.
# sentence_embeddings = np.array(
#     model.encode(all_sentences) 
# ).astype('float32')

# sentence_embeddings = model.encode(all_sentences)

# Create FAISS index (use Inner Product for cosine similarity)
# embedding_dimension = sentence_embeddings.shape[1]
# faiss_index = faiss.IndexFlatIP(embedding_dimension)
# faiss_index.add(sentence_embeddings)

# Save FAISS index
# faiss.write_index(faiss_index, "Data/sentence_embeddings.index")

# Save metadata to file
metadata_file = 'Data/srt-embedding-metadata.tsv'
with open(metadata_file, 'w', encoding='utf-8') as file:
    file.write("filename\ttimestamp\tsentence\n")
    for fname, timestamp, sentence in zip(all_filenames, all_timestamps, all_sentences):
        clean_sentence = sentence.replace('\t', ' ').replace('\n', ' ')
        file.write(f"{fname}\t{timestamp}\t{clean_sentence}\n")

# Save sentences to a text file
sentences_file = 'Data/sentences.txt'
with open(sentences_file, 'w', encoding='utf-8') as file:
    for sentence in all_sentences:
        file.write(sentence.strip().replace('\n', ' ') + '\n')

# Summary
# print(f"\nEmbeddings created for {len(all_sentences)} sentences from {len(srt_files)} SRT files.")
# print("FAISS index saved as 'Data/sentence_embeddings.index'")
print(f"Metadata saved as '{metadata_file}'")
print(f"Sentences saved as '{sentences_file}'")

Processing SRT files: 100%|██████████| 31/31 [00:00<00:00, 715.81it/s]

Metadata saved as 'Data/srt-embedding-metadata.tsv'
Sentences saved as 'Data/sentences.txt'





## Finding Related Sentences to a Question

### Grouping Sentences N-gram technique

In [7]:
metadata_list = []
with open("Data/srt-embedding-metadata.tsv", "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader)
    for row in reader:
        filename, timestamp, sentence = row
        metadata_list.append((filename, timestamp, sentence))

grouped_sentences = []
grouped_sent_to_metadata = {}
group_size = 3

def extract_start_end(ts):
    start, end = ts.split("-->")
    return start.strip(), end.strip()

for i in range(len(metadata_list) - group_size + 1):
    group = metadata_list[i:i+group_size]
    grouped_text = " ".join(sent for _, _, sent in group)
    filename = group[0][0]
    first_start, _ = extract_start_end(group[0][1])
    _, last_end = extract_start_end(group[-1][1])
    timestamp_range = f"{first_start} -> {last_end}"
    grouped_sentences.append(grouped_text)
    grouped_sent_to_metadata[grouped_text] = (filename, timestamp_range)

with open("Data/grouped_sentences.pkl", "wb") as f:
    pickle.dump(grouped_sentences, f)

with open("Data/grouped_sent_to_metadata.pkl", "wb") as f:
    pickle.dump(grouped_sent_to_metadata, f)

first_key = next(iter(grouped_sent_to_metadata))
print("Grouped Sentence to Metadata First element:", first_key, "->", grouped_sent_to_metadata[first_key])


Grouped Sentence to Metadata First element: Hello, welcome to the NPTEL online certification course on deep learning. Now, we are discussing about the discriminant function and the decision boundary among different classes. So, in the previous class we have considered two simple cases where the covariance matrices of the different classes they are same and in one of the case we have assumed that the covariance matrix is of the form sigma square i where sigma is the variance of all the components of the vectors. -> ('8.srt', '00:00:00.160 -> 00:01:04.390')


In [8]:
# Load grouped sentences
with open("Data/grouped_sentences.pkl", "rb") as f:
    grouped_sentences = pickle.load(f)

# Load precomputed metadata mapping
with open("Data/grouped_sent_to_metadata.pkl", "rb") as f:
    grouped_sent_to_metadata = pickle.load(f)
print("Encoding grouped sentences into embeddings...")

batch_size = 16  # you can adjust this based on memory
all_embeddings = []

for i in tqdm(range(0, len(grouped_sentences), batch_size), desc="Encoding batches"):
    batch = grouped_sentences[i:i+batch_size]
    batch_emb = model.encode(batch)  # encode the batch
    all_embeddings.append(batch_emb)

# Combine all batches into one array and convert to float32
grouped_embeddings = np.vstack(all_embeddings).astype('float32')

# Create FAISS index (Inner Product for cosine similarity)
embedding_dim = grouped_embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(embedding_dim)
faiss_index.add(grouped_embeddings)

# Save FAISS index
faiss.write_index(faiss_index, "Data/grouped-sentences-embeddings.idx")

print("✅ FAISS index created and saved!")

Encoding grouped sentences into embeddings...


Encoding batches: 100%|██████████| 319/319 [23:39<00:00,  4.45s/it]


✅ FAISS index created and saved!


In [5]:
import torch
print("MPS available:", torch.backends.mps.is_available())
print("MPS enabled:", torch.backends.mps.is_built())

MPS available: True
MPS enabled: True


### QA

In [7]:
from datetime import datetime
import torch
import pickle
import faiss
import numpy as np
import re

device = torch.device("mps")
model.to(device)

faiss_index = faiss.read_index("Data/grouped-sentences-embeddings.idx")

with open("Data/grouped_sentences.pkl", "rb") as f:
    grouped_sentences = pickle.load(f)

with open("Data/grouped_sent_to_metadata.pkl", "rb") as f:
    grouped_sent_to_metadata = pickle.load(f)

student_question = input("Enter your question: ")
question_embedding = model.encode(student_question, prompt_name="query")
if question_embedding.ndim == 1:
    question_embedding = np.expand_dims(question_embedding, axis=0)

distances, indices = faiss_index.search(question_embedding, 10)

related_results = []
for idx in indices[0]:
    grouped_sent = grouped_sentences[idx]
    filename, timestamp_range = grouped_sent_to_metadata.get(
        grouped_sent, ("Unknown", "Unknown")
    )
    related_results.append((filename, timestamp_range, grouped_sent))

print("Question:", student_question)
print("\nTop Related Sentences with Metadata:")
for filename, timestamp, sent in related_results:
    print(f"- [{filename} | {timestamp}] {sent}\n")

# Group results by file
grouped_by_file = {}
for filename, timestamp, sent in related_results:
    if filename not in grouped_by_file:
        grouped_by_file[filename] = []
    grouped_by_file[filename].append((timestamp, sent))

# Helper to parse timestamps
def parse_ts(ts):
    start, end = ts.split("->")
    fmt = "%H:%M:%S.%f"
    return datetime.strptime(start.strip(), fmt), datetime.strptime(end.strip(), fmt)

# Helper to clean and normalize sentences
def clean_sentences(sentences):
    cleaned = []
    seen = set()
    for s in sentences:
        s = s.strip()
        while s and s[-1] in ".!?":
            s = s[:-1].strip()
        if s and s not in seen:
            cleaned.append(s)
            seen.add(s)
    return cleaned

def extract_file_number(fname):
    # Extract numeric part from "1.srt", "2.srt" etc.
    match = re.search(r"(\d+)", fname)
    return int(match.group(1)) if match else float('inf')

# Order by start timestamp
related_results_sorted = sorted(
    related_results,
    key=lambda x: (extract_file_number(x[0]), parse_ts(x[1])[0])
)

# Merge sentences and remove duplicates
merged_results = []
for filename, ts_range, text in related_results_sorted:
    sentences = clean_sentences(text.split('. '))

    if not merged_results:
        merged_results.append([filename, ts_range, '. '.join(sentences) + '.'])
        continue

    prev = merged_results[-1]
    prev_start, prev_end = parse_ts(prev[1])
    curr_start, curr_end = parse_ts(ts_range)

    if filename == prev[0] and curr_start <= prev_end:
        new_start = prev_start.strftime("%H:%M:%S.%f")[:-3]
        new_end = max(prev_end, curr_end).strftime("%H:%M:%S.%f")[:-3]
        prev[1] = f"{new_start} -> {new_end}"

        prev_sentences = clean_sentences(prev[2].split('. '))
        combined_sentences = prev_sentences + sentences
        prev[2] = '. '.join(clean_sentences(combined_sentences)) + '.'
    else:
        merged_results.append([filename, ts_range, '. '.join(sentences) + '.'])

print("\nOrdered & Merged Top Related Sentences (duplicates removed):")
for i, (fname, ts, sent) in enumerate(merged_results, 1):
    print(f"\nGroup {i}")
    print("Text:", sent)
    print("Metadata:", (fname, ts))

Question: Explain Linear Regression?

Top Related Sentences with Metadata:
- [2.srt | 00:01:19.590 -> 00:01:30.569] So what is a linear regression? It is one of the simplest kind of supervised learning model. And it predicts a real value number which is regression.

- [2.srt | 00:00:06.419 -> 00:00:26.550] In this video, we're going to talk about linear regression. So we'll begin by the definition of linear regression, and we'll talk about how this model can optimize to get the best estimate value. And then we're going to talk about important quantities for linear regression, such as a fitness performance metric, things like that.

- [2.srt | 00:00:05.429 -> 00:00:17.170] Hi everyone. In this video, we're going to talk about linear regression. So we'll begin by the definition of linear regression, and we'll talk about how this model can optimize to get the best estimate value.

- [2.srt | 00:04:20.089 -> 00:04:40.349] And this is called linear combination. So this type of model, whethe

In [10]:
segment_list = []

for filename, ts_range, text in merged_results:
    # distance placeholder
    distance = 0.0
    segment_list.append((filename, ts_range, text, distance))

# Now segment_list is ready to use
print(segment_list)

[('2.srt', '00:00:05.429 -> 00:00:30.570', "Hi everyone. In this video, we're going to talk about linear regression. So we'll begin by the definition of linear regression, and we'll talk about how this model can optimize to get the best estimate value. And then we're going to talk about important quantities for linear regression, such as a fitness performance metric, things like that. And we'll talk about how statistically significant these estimate values are.", 0.0), ('2.srt', '00:01:07.469 -> 00:01:30.569', "And our goal is to tweak this parameter by optimization so that the model makes a prediction that's close to the target as much as possible. So what is a linear regression? It is one of the simplest kind of supervised learning model. And it predicts a real value number which is regression.", 0.0), ('2.srt', '00:01:40.769 -> 00:02:06.859', "That means the user doesn't need to figure out some design parameters in advance or during the training. And importantly, linear regression m

In [11]:
from moviepy.editor import VideoFileClip, concatenate_videoclips, ColorClip
import os
import datetime

def format_srt_timestamp(seconds):
    td = datetime.timedelta(seconds=seconds)
    total_seconds = int(td.total_seconds())
    milliseconds = int((td.total_seconds() - total_seconds) * 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{int(seconds):02},{milliseconds:03}"

def create_continuous_srt(clips_info, output_filename="Data/stitched_output.srt", transition_sec=0.01):
    srt_lines = []
    current_time = 0.0

    for idx, (duration, sentence, start, video_file) in enumerate(clips_info, start=1):
        start_time = format_srt_timestamp(current_time)
        end_time = format_srt_timestamp(current_time + duration)
        srt_lines.append(f"{idx}\n{start_time} --> {end_time}\n{sentence}\n")
        current_time += duration + transition_sec  # Account for pause
    with open(output_filename, "w") as f:
        f.write("\n".join(srt_lines))

def parse_timestamp(timestamp_str):
    start, end = timestamp_str.split(" -> ")
    return start.strip(), end.strip()

def stitch_video_from_segments(segment_list, srt_filename="Data/stitched_output.srt", pause_duration=0.01):
    clips = []
    clips_info = []
    sources = set()

    for idx, (filename, timestamp, sentence, distance) in enumerate(segment_list):
        lecture_no = os.path.splitext(filename)[0]
        video_file = "Data/Videos/" + lecture_no + ".mp4"
        start, end = parse_timestamp(timestamp)
        sources.add("Lecture - " + lecture_no)

        try:
            clip = VideoFileClip(video_file).subclip(start, end)

            # Resize black screen to match clip size
            if idx > 0:
                black_clip = ColorClip(size=clip.size, color=(0, 0, 0), duration=pause_duration)
                black_clip = black_clip.set_fps(clip.fps)
                clips.append(black_clip)

            clips.append(clip)
            clips_info.append((clip.duration, sentence, start, video_file))
        except Exception as e:
            print(f"Error processing segment ({filename}, {timestamp}): {e}")

    if not clips:
        print("No valid clips found.")
        return

    final_clip = concatenate_videoclips(clips, method="chain")

    final_clip.write_videofile(
        "Data/answer.mp4",
        codec="libx264",
        preset="ultrafast",
        threads=4,
        audio_codec="aac"
    )

    create_continuous_srt(clips_info, output_filename=srt_filename, transition_sec=pause_duration)

stitch_video_from_segments(segment_list, srt_filename="Data/stitched_output.srt", pause_duration=0.01)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Moviepy - Building video Data/answer.mp4.
MoviePy - Writing audio in answerTEMP_MPY_wvf_snd.mp4


chunk:  21%|██        | 485/2293 [00:00<00:02, 866.40it/s, now=None]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
chunk:  43%|████▎     | 995/2293 [00:01<00:01, 726.15it/s, now=None]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
chunk:  71%|███████   | 1626/2293 [00:02<00:00, 710.59it/s, now=None]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the envir

MoviePy - Done.
Moviepy - Writing video Data/answer.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready Data/answer.mp4


In [6]:
key = list(grouped_sent_to_metadata.keys())[0]
print(key)
print(grouped_sent_to_metadata[key])

Hi everyone. In this video, we're going to talk about multilinear regression. So last time we talked about multilinear regression with the higher order terms of a single variable, and this time we're going to talk about multilinear regression model when there are multiple variables.
('2.srt', '00:00:06.169 --> 00:00:06.910', '00:00:05.429 --> 00:00:06.209')


### Summarisation

In [None]:
# Combine all top-K grouped sentences into one text
all_text = " ".join([sent for _, _, sent in related_results])
print(all_text)

print("SUJAL IS NOT HERE")
# Summarize across groups (num_sentences controls how many groups to pick)
summary_text = summarizer(all_text, num_sentences=4)  # pick top 4 important groups
print("SUJAL CODE IS WORKING")
# Map summary back to original grouped sentences to get metadata
important_groups = []
for filename, timestamp, grouped_sent in related_results:
    # Check if this grouped sentence appears in the summary
    if grouped_sent in summary_text:
        important_groups.append((filename, timestamp, grouped_sent))

# Print important groups with metadata
print("\nMost Important Groups with Metadata:")
for filename, timestamp, grouped_sent in important_groups:
    print(f"- [{filename} | {timestamp}] {grouped_sent}\n")


SUJAL IS NOT HERE


: 