## Generate Questions from Sentences

In [None]:
from transformers import pipeline
from tqdm import tqdm

# Text2text pipeline used to turn each input sentence into a question.
# NOTE(review): the "-hl" suffix in the model name suggests it may expect
# highlight-formatted input rather than raw sentences -- confirm against the
# model card before relying on the quality of the generated questions.
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

input_file = "Data/sentences.txt"
output_file = "Data/generated_questions.txt"

# Read the source sentences, trimming whitespace and dropping blank lines.
with open(input_file, "r", encoding="utf-8") as file:
    sentences = [stripped for stripped in map(str.strip, file) if stripped]

# Generate one sequence per sentence and stream the results to disk, one
# generated text per line, so partial progress is already in the file while
# the loop is still running.
with open(output_file, "w", encoding="utf-8") as out_file:
    progress = tqdm(sentences, desc="Generating Questions", unit="sentence")
    for sentence in progress:
        questions = qg_pipeline(sentence, max_length=128, num_return_sequences=1)
        out_file.writelines(item["generated_text"] + "\n" for item in questions)

print("Question generation complete! Questions saved in 'Data/generated_questions.txt'.")


## Create Embeddings for Questions

In [3]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorise the generated questions.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load the generated questions (one per line), skipping blank lines.
# The file is written as UTF-8 by the generation step, so it is read back with
# the same explicit encoding instead of the platform default, and through a
# context manager so the handle is closed deterministically.
questions_file = "Data/generated_questions.txt"
with open(questions_file, "r", encoding="utf-8") as f:
    questions = [line.strip() for line in f if line.strip()]

# Encode all questions into a float32 matrix (one row per question).
question_embeddings = np.array(model.encode(questions)).astype("float32")

# Persist the embeddings together with the question list they were built from.
# Both artifacts go under Data/, which is where the classification cell looks
# for "questions_list.txt"; row i of the matrix corresponds to line i of the list.
np.save("Data/questions_embeddings.npy", question_embeddings)
with open("Data/questions_list.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(questions))

print("Embeddings saved successfully to `questions_embeddings.npy` file!")

Embeddings saved successfully to `questions_embeddings.npy` file!


## Classify Whether a Question Is Clear or Vague

In [1]:
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Local paths: every artifact used below lives under a single data folder.
#    (questions_path is defined here but is not read anywhere in this cell.)
data_folder = "Data"
embeddings_path, questions_path, sentences_path, faiss_index_path = (
    os.path.join(data_folder, artifact_name)
    for artifact_name in (
        "questions_embeddings.npy",
        "questions_list.txt",
        "sentences.txt",
        "sentence_embeddings.index",
    )
)

# 2. Sentence-embedding model used to encode incoming queries.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# 3. Precomputed question embeddings, loaded from the .npy file.
question_embeddings = np.load(embeddings_path)

# 4. Lecture sentences: one per line, whitespace-trimmed, blank lines dropped.
with open(sentences_path, "r", encoding="utf-8") as f:
    lecture_sentences = list(filter(None, map(str.strip, f)))

# 5. FAISS index read from disk.
faiss_index = faiss.read_index(faiss_index_path)

# 4. 🔍 Similarity classification function
def classify_question(query, threshold=0.60):
    """Label a query as "Clear" or "Vague" by similarity to the known questions.

    The query is encoded with the module-level ``model`` and compared, by
    cosine similarity, against every row of the module-level
    ``question_embeddings``. The best match decides the label.

    Args:
        query: The question text to classify.
        threshold: Minimum best-match cosine similarity for a "Clear" label.

    Returns:
        A ``(label, max_similarity)`` tuple, where ``label`` is ``"Clear"`` or
        ``"Vague"`` and ``max_similarity`` is a plain Python ``float``.
        Blank queries and an empty reference set yield ``("Vague", 0.0)``.
    """
    # A blank query has nothing to compare; skip the model call entirely.
    if not query or not query.strip():
        return "Vague", 0.0

    # Without reference questions there is no maximum to take.
    if len(question_embeddings) == 0:
        return "Vague", 0.0

    query_embedding = model.encode([query]).astype("float32")
    similarities = cosine_similarity(query_embedding, question_embeddings)[0]

    # Convert the numpy scalar so callers receive an ordinary float.
    max_similarity = float(np.max(similarities))
    label = "Clear" if max_similarity >= threshold else "Vague"
    return label, max_similarity

## Ask the User for a Query

In [3]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer

# Load model and metadata
model = SentenceTransformer('all-MiniLM-L6-v2')  # or your preferred model
metadata_df = pd.read_csv('Data/srt-embedding-metadata.tsv', sep='\t')
lecture_sentences = metadata_df['sentence'].tolist()

# Build FAISS index (only once at startup). Row i of the index corresponds to
# row i of metadata_df, which is what lets search results be mapped back below.
sentence_embeddings = model.encode(lecture_sentences).astype('float32')
faiss_index = faiss.IndexFlatL2(sentence_embeddings.shape[1])
faiss_index.add(sentence_embeddings)

# Maximum index distance for a lecture sentence to count as related.
distance_threshold = 0.7


def _find_related_sentences(question, max_distance):
    """Return lecture sentences whose index distance to `question` is small.

    Args:
        question: The student's question text.
        max_distance: Inclusive upper bound on the FAISS distance.

    Returns:
        A list of ``(sentence, filename, timestamp, distance)`` tuples in the
        order FAISS returned them. Hits with a distance of exactly 0 and
        sentences ending in '?' are excluded.
    """
    question_embedding = model.encode([question]).astype('float32')
    distances, indices = faiss_index.search(question_embedding, len(lecture_sentences))

    related = []
    for distance, row_id in zip(distances[0], indices[0]):
        # FAISS pads missing results with id -1; .iloc[-1] would silently
        # return the last metadata row, so skip those explicitly.
        if row_id < 0 or not (0 < distance <= max_distance):
            continue
        # Look the row up once instead of once per field.
        row = metadata_df.iloc[row_id]
        if row["sentence"].endswith('?'):
            continue
        related.append((row["sentence"], row["filename"], row["timestamp"], distance))
    return related


# Interactive loop
while True:
    try:
        student_question = input("\n📩 Enter your question (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt is treated as a request to quit
        # rather than surfacing a traceback.
        student_question = 'exit'

    if student_question.lower() == 'exit':
        print("👋 Exiting...")
        break

    # Nothing was typed: prompt again instead of classifying an empty string.
    if not student_question:
        continue

    status, score = classify_question(student_question)

    if status == "Vague":
        print(f"⚠️ Your question seems unclear (Similarity: {score:.2f}). Please provide more details.")
        continue

    print(f"✅ Your question is clear (Similarity: {score:.2f}). Proceeding with the answer...")

    related_sentences = _find_related_sentences(student_question, distance_threshold)

    print("Question:", student_question)
    print("\n🔎 Related Sentences:")
    if not related_sentences:
        print("❌ No related sentences found...")
    else:
        for sentence, filename, timestamp, distance in related_sentences:
            print(f"- 📁 {filename} 🕒 {timestamp}\n  💬 {sentence} (Distance: {distance:.4f})\n")


✅ Your question is clear (Similarity: 1.00). Proceeding with the answer...
Question: What is Machine Learning?

🔎 Related Sentences:
- 📁 1.srt 🕒 00:06:07.960 --> 00:06:12.370
  💬 As you can see, machine learning is a top skill in the jobs that involves AI skills. (Distance: 0.5102)

- 📁 1.srt 🕒 00:02:55.280 --> 00:03:02.489
  💬 Machine learning consists of different types of learning, such as supervised learning, unsupervised learning, or reinforcement learning. (Distance: 0.5319)

- 📁 1.srt 🕒 00:03:03.539 --> 00:03:06.799
  💬 Many machine learning models, they are coming from statistical learning. (Distance: 0.5381)

- 📁 1.srt 🕒 00:00:07.940 --> 00:00:11.130
  💬 This video will talk about introduction to machine learning. (Distance: 0.5936)

- 📁 1.srt 🕒 00:02:42.550 --> 00:02:48.879
  💬 So machine learning is part of data science and it is also a subfield of artificial intelligence. (Distance: 0.6000)

- 📁 1.srt 🕒 00:05:36.290 --> 00:05:41.000
  💬 And here is the Google trend on the t