## Generate Questions from Sentences

In [None]:
from transformers import pipeline
from tqdm import tqdm

# Text-to-text pipeline used to turn each input sentence into a question.
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

input_file = "Data/sentences.txt"
output_file = "Data/generated_questions.txt"

# Read the source sentences, one per line, dropping blank lines.
with open(input_file, "r", encoding="utf-8") as file:
    sentences = [text for text in (raw.strip() for raw in file) if text]

# Generate one question per sentence and write each on its own line.
with open(output_file, "w", encoding="utf-8") as out_file:
    progress = tqdm(sentences, desc="Generating Questions", unit="sentence")
    for sentence in progress:
        generated = qg_pipeline(sentence, max_length=128, num_return_sequences=1)
        out_file.writelines(item["generated_text"] + "\n" for item in generated)

print("Question generation complete! Questions saved in 'Data/generated_questions.txt'.")


## Create Embeddings for Questions

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load the generated questions (one per line, blank lines dropped).
# The file is written as UTF-8 by the generation step, so read it as UTF-8
# and use a context manager so the handle is closed deterministically.
questions_file = "Data/generated_questions.txt"
with open(questions_file, "r", encoding="utf-8") as f:
    questions = [line.strip() for line in f if line.strip()]

# Encode all questions; store as float32
question_embeddings = np.array(model.encode(questions)).astype("float32")

# Save embeddings and the matching question list side by side in Data/.
# The list previously went to the working directory, while the
# classification step builds its path as Data/questions_list.txt.
np.save("Data/questions_embeddings.npy", question_embeddings)
with open("Data/questions_list.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(questions))

print("Embeddings saved successfully to `questions_embeddings.npy` file!")

## Classify Whether a Question Is Clear or Vague

In [1]:
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 1. 📂 Define local paths (every artifact is expected under the Data/ folder)
data_folder = "Data"
embeddings_path = os.path.join(data_folder, "questions_embeddings.npy")
# NOTE(review): questions_path is defined but never read in the visible code.
questions_path = os.path.join(data_folder, "questions_list.txt")
sentences_path = os.path.join(data_folder, "sentences.txt")
faiss_index_path = os.path.join(data_folder, "sentence_embeddings.index")

# 2. 🤖 Load Sentence-BERT model (used by classify_question to embed queries)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# 3. 📥 Load the precomputed question embeddings
# (only the embeddings are loaded here; the question texts are not)
question_embeddings = np.load(embeddings_path)

# Load lecture sentences: one per line, stripped, blank lines skipped
with open(sentences_path, "r", encoding="utf-8") as f:
    lecture_sentences = [line.strip() for line in f if line.strip()]

# Load the FAISS index from disk
# (presumably built over the lecture sentence embeddings — TODO confirm)
faiss_index = faiss.read_index(faiss_index_path)

# 4. 🔍 Similarity classification function
def classify_question(query, threshold=0.60):
    """Label a query as "Clear" or "Vague" by its closest known question.

    The query is embedded with the module-level ``model`` and compared, via
    cosine similarity, against every row of ``question_embeddings``.

    Args:
        query: The question text to classify.
        threshold: Minimum best-match similarity for the query to be "Clear".

    Returns:
        A ``(label, score)`` tuple, where ``label`` is "Clear" or "Vague" and
        ``score`` is the highest cosine similarity found.
    """
    encoded_query = model.encode([query]).astype("float32")
    # cosine_similarity returns one row per query; only one query is passed.
    scores = cosine_similarity(encoded_query, question_embeddings)[0]
    best_score = np.max(scores)
    if best_score >= threshold:
        label = "Clear"
    else:
        label = "Vague"
    return label, best_score

## Ask the User for a Query

In [2]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import csv

# Load the model and FAISS index
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
faiss_index = faiss.read_index("Data/sentence_embeddings.index")

# Load lecture sentences (one per line, blank lines dropped).
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open('Data/sentences.txt', 'r', encoding='utf-8') as file:
    lecture_sentences = [line.strip() for line in file if line.strip()]

# Load (filename, timestamp, sentence) metadata rows.
# NOTE(review): rows without exactly 3 columns are skipped, so lecture_data[i]
# only lines up with lecture_sentences[i] if every row is well-formed — confirm.
lecture_data = []
with open('Data/srt-embedding-metadata.tsv', 'r', encoding='utf-8') as file:
    tsv_reader = csv.reader(file, delimiter='\t')
    for row in tsv_reader:
        if len(row) == 3:
            filename, timestamp, sentence = row
            lecture_data.append((filename.strip(), timestamp.strip(), sentence.strip()))

# Get student's question
student_question = input("Enter your question: ")
question_embedding = np.array(model.encode([student_question])).astype('float32')

# Search all sentences (max number can be total sentences in the index)
distances, indices = faiss_index.search(question_embedding, len(lecture_sentences))

# Define a distance threshold (lower means more similar)
distance_threshold = 0.7

related_sentences = []
related_results = []
for i, distance in zip(indices[0], distances[0]):
    # FAISS pads the result with -1 labels when it finds fewer than k
    # neighbours; a negative index would silently pick the last sentence.
    if i < 0:
        continue
    sentence = lecture_sentences[i]

    # Keep sentences within the distance threshold that are not questions.
    # A distance of exactly 0 is excluded as well.
    if 0 < distance <= distance_threshold and not sentence.endswith('?'):
        related_sentences.append((sentence, distance))
        filename, timestamp, _ = lecture_data[i]
        related_results.append((filename, timestamp, sentence, distance))


# Display related sentences with distances
print("\n Related Sentences:")
for sentence, distance in related_sentences:
    print(f"- {sentence} - {distance:.4f}")

print("\nRelated Sentences with Metadata:")
for filename, timestamp, sentence, distance in related_results:
    print(f"- [{filename}] [{timestamp}] {sentence} - Distance: {distance:.4f}")


 Related Sentences:
- As you can see, machine learning is a top skill in the jobs that involves AI skills. - 0.5102
- Machine learning consists of different types of learning, such as supervised learning, unsupervised learning, or reinforcement learning. - 0.5319
- Many machine learning models, they are coming from statistical learning. - 0.5381
- This video will talk about introduction to machine learning. - 0.5936
- So machine learning is part of data science and it is also a subfield of artificial intelligence. - 0.6000
- And here is the Google trend on the term on machine learning and software engineering. - 0.6220
- So machine learning extends the statistical learning by including more complex algorithms, which deal with more complex data and bigger data, and more efficient algorithms. - 0.6400
- Machine learning, we mentioned that machine learning several times during the talk about data science. - 0.6611
- Here are some few examples of machine learning tasks. - 0.6639
- It is o