In [None]:
import os
import torch
from sentence_transformers import SentenceTransformer, util
import os
import pickle
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

In [None]:
# Pick the torch device once: CUDA when torch reports it as available, CPU otherwise.
# NOTE(review): DEVICE is not referenced by any cell visible here — confirm whether
# the models were meant to be placed on it explicitly.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [None]:
# Load every heavyweight resource shared by the later cells: the sentence
# encoder, the embeddings of the generated reference questions, the
# course/institution table and the zero-shot intent classifier.
print("\n Loading Sentence-BERT model...")
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Loaded Model...")

print("Loading Generated Questions...")
generated_qs_path = "data/intermediate/generated_questions.txt"
if not os.path.exists(generated_qs_path):
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, while an exception only stops this cell (and a "Run All") and
    # leaves a readable message behind.
    raise FileNotFoundError(f"'generated_questions.txt' not found at {generated_qs_path!r}")

with open(generated_qs_path, "r", encoding="utf-8") as f:
    # One question per line; blank lines are ignored.
    generated_questions = [line.strip() for line in f if line.strip()]
if not generated_questions:
    # Without reference questions there is nothing to compare user queries against.
    raise ValueError(f"{generated_qs_path!r} contains no questions.")
generated_embeddings = model.encode(generated_questions, convert_to_tensor=True)
print("Loaded Generated Questions.")


print("Loading Courses and Institutions Data")
# NOTE(security): unpickling can execute arbitrary code. Only load data.pkl
# when it was produced by a trusted step of this project.
with open("data/processed/data.pkl", "rb") as f:
    df = pickle.load(f)


# Keep a single row per distinct value of the "reviews" column.
df_unique = df.drop_duplicates(subset=["reviews"]).reset_index(drop=True)
print("Loaded courses and institutions data.")

print("Loading Intent Classifier Model..")
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
print("Loaded Intent Classifier Model")

## Ask the user to select an institution and a course

In [None]:
def _prompt_for_index(prompt, options):
    """Prompt for a 1-based menu number and return the matching 0-based index.

    Re-prompts until the reply is an integer in 1..len(options). This prevents
    a reply of 0 (or a negative number) from silently wrapping around to the
    end of the list, and avoids a traceback on non-numeric or out-of-range input.

    Raises:
        ValueError: if `options` is empty, since no reply could ever be valid.
    """
    if len(options) == 0:
        raise ValueError("Nothing to choose from.")
    while True:
        reply = input(prompt).strip()
        try:
            number = int(reply)
        except ValueError:
            number = 0  # not a number: fall through to the range message below
        if 1 <= number <= len(options):
            return number - 1
        print(f"Please enter a number between 1 and {len(options)}.")


# --- Ask user to select an institution ---
institutions = sorted(df_unique["institution"].dropna().unique())
print("\n🏫 Available Institutions:")
for number, inst in enumerate(institutions, start=1):
    print(f"{number}. {inst}")

inst_idx = _prompt_for_index("\n👉 Select an institution by number: ", institutions)
selected_institution = institutions[inst_idx]
print(f"\n✅ You selected: {selected_institution}")

# --- Filter courses ---
# Only courses offered by the chosen institution are listed.
inst_courses = sorted(
    df_unique.loc[df_unique["institution"] == selected_institution, "name"].dropna().unique()
)
print("\n📚 Courses in this institution:")
for number, course in enumerate(inst_courses, start=1):
    print(f"{number}. {course}")

course_idx = _prompt_for_index("\n👉 Select a course by number: ", inst_courses)
selected_course = inst_courses[course_idx]
print(f"\n✅ You selected course: {selected_course}")

## Evaluating Models

In [None]:
# Hand-labelled probe questions for the relevance check.
# Positive examples: questions about a course and its reviews.
course_review_questions = [
    "How was the instructor’s teaching?",
    "Was the course content clear?",
    "How effective were the assignments and projects?",
    "Would you recommend it?",
    "Were videos easy to follow?",
    "How well did the instructor explain complex topics?",
    "Were the assignments helpful for practice?",
    "Was the course organized and easy to navigate?",
    "Did you find the learning platform user-friendly?",
    "What improvements would you suggest for this course?",
]

# Negative examples: small talk unrelated to any course.
non_course_review_questions = [
    "What’s your hobby?",
    "Do you like music?",
    "How are you?",
    "Beach or mountains?",
    "What do you enjoy doing in your free time?",
    "Have you traveled anywhere interesting recently?",
    "What’s your favorite way to relax after studying or working?",
    "Is there a skill you'd love to master one day?",
    "Do you prefer reading books or watching shows?",
    "Favorite movie?",
]

# Positives first, then negatives. true_labels lines up index-for-index with
# all_questions (1 = course-related, 0 = not course-related).
all_questions = [*course_review_questions, *non_course_review_questions]
true_labels = [1 for _ in course_review_questions] + [0 for _ in non_course_review_questions]

# --- Relevance-threshold evaluation ---
# Every probe question is scored by its best cosine similarity against the
# generated reference questions; a question counts as "course related" when
# that score reaches the threshold. Several thresholds are compared on the
# labelled probe set.

# Reference embeddings on the CPU, matching the CPU query embeddings below.
reference_embeddings = generated_embeddings.cpu()


def _best_reference_similarity(question):
    """Return the highest cosine similarity between `question` and any reference question."""
    # Local names only, so the notebook-level `query_embedding` of the
    # interactive query cell is not overwritten by this evaluation.
    embedding = model.encode(question, convert_to_tensor=True).cpu()
    return torch.max(util.pytorch_cos_sim(embedding, reference_embeddings)).item()


print("\n--- Evaluation Results ---\n")

# Score each probe question exactly once; the sample report at the end reuses
# these values instead of encoding the same questions again.
cosine_scores_list = [_best_reference_similarity(question) for question in all_questions]
score_by_question = dict(zip(all_questions, cosine_scores_list))

# Evaluate thresholds 0.5, 0.6, ..., 1.0. linspace with an explicit count is
# used because np.arange with a float step (0.5, 1.1, 0.1) produces a seventh
# value of about 1.1, which no cosine similarity can reach; rounding removes
# float noise such as 0.7999999999999999 from the table.
thresholds = np.round(np.linspace(0.5, 1.0, 6), 1)

scores = np.array(cosine_scores_list)
expected = np.array(true_labels)
accuracy_results = [
    (threshold, ((scores >= threshold).astype(int) == expected).mean())
    for threshold in thresholds
]

# Create a DataFrame for displaying results
accuracy_df = pd.DataFrame(accuracy_results, columns=["Threshold", "Accuracy"])

# Row (Threshold, Accuracy) with the highest accuracy; idxmax keeps the first,
# i.e. the lowest threshold, when several tie.
best_threshold = accuracy_df.loc[accuracy_df["Accuracy"].idxmax()]

# Predictions for all probe questions at the best threshold (1 = related).
predicted_labels = (scores >= best_threshold["Threshold"]).astype(int).tolist()

# Display results in a table format
print("\nThresholds and their corresponding accuracies:")
print(accuracy_df)

print(f"\nBest Threshold: {best_threshold['Threshold']:.2f} with Accuracy: {best_threshold['Accuracy']*100:.2f}%")

# Preview the first five questions of each category; the trailing " - ..."
# stands for the remaining ones.
print("Course Related Questions:")
for question in course_review_questions[:5]:
    print(f" - {question}")
print(" - ...")
print()

print("Non Course Related Questions:")
for question in non_course_review_questions[:5]:
    print(f" - {question}")
print(" - ...")


def _print_sample_predictions(heading, sample_questions):
    """Print the stored score and the prediction at the best threshold for each sample question."""
    print(heading)
    for question in sample_questions:
        max_score = score_by_question[question]
        related = "Related" if max_score >= best_threshold["Threshold"] else "Not Related"
        print(f"Question: {question}")
        print(f"Cosine Similarity: {max_score:.4f}")
        print(f"Prediction: {related}")
        print()


# Display the first 3 questions from each category with their cosine similarity
_print_sample_predictions("Output of model from Course Related questions:", course_review_questions[:3])
_print_sample_predictions("Output of model from Non-Course Related questions:", non_course_review_questions[:3])

In [None]:
## Intent Classification

In [None]:
# Single source of truth for the intent configuration. Each row is
# (intent key, summarize?, question-style label given to the zero-shot classifier).
# Rows with summarize=False enable "sentiment" and "nlg"; rows with
# summarize=True enable only "summarization".
intent_table = [
    ("yes_no", False, "Is the user asking a yes or no question?"),
    ("instructor", False, "Is the user asking about the instructor?"),
    ("content", False, "Is the user asking about the course content or topics?"),
    ("difficulty", False, "Is the user asking about how difficult the course is?"),
    ("career", False, "Is the user asking about career outcomes or job relevance?"),
    ("general_opinion", False, "Is the user asking for general opinions from students?"),
    ("course_overview", True, "Is the user asking for a summary or overview of the course?"),
    ("prerequisites", True, "Is the user asking about course prerequisites?"),
    ("schedule", True, "Is the user asking about the course schedule or duration?"),
    ("fees", False, "Is the user asking about course fees or costs?"),
    ("certification", False, "Is the user asking about course certification or accreditation?"),
]

# intent key -> which downstream steps are switched on for that intent.
intent_actions = {
    key: {"sentiment": not summarize, "nlg": not summarize, "summarization": summarize}
    for key, summarize, _ in intent_table
}

# Print the routing table, one row per intent.
row_format = "{:<20} {:<10} {:<10} {:<15}"
print(row_format.format("Intent Type", "Sentiment", "NLG", "Summarization"))
print("-" * 55)
for intent, actions in intent_actions.items():
    print(
        row_format.format(
            intent,
            str(actions["sentiment"]),
            str(actions["nlg"]),
            str(actions["summarization"]),
        )
    )


# Candidate labels passed to the zero-shot classifier, in table order.
intent_labels_readable = [label for _, _, label in intent_table]

# Classifier label -> intent key used to index intent_actions.
label_map = {label: key for key, _, label in intent_table}

In [None]:
# Interactive check: decide whether the user's question is course related and,
# if so, classify its intent and report which processing steps apply.
query = input("Enter your question related to the course: ").strip()

# Encode the query
query_embedding = model.encode(query, convert_to_tensor=True).cpu()

# Compute cosine similarity with reference (course-related) embeddings
reference_embeddings = generated_embeddings.cpu()
cosine_scores = util.pytorch_cos_sim(query_embedding, reference_embeddings)
max_score = torch.max(cosine_scores).item()

# Minimum best-match similarity for a question to count as course related.
threshold = 0.7
# The value is a similarity (higher = closer), not a distance.
print("Cosine similarity of the question: ", max_score)
is_related = max_score >= threshold

if is_related:
    print("Related to the course reviews. Proceeding...")
    print("Question:", query)

    result = classifier(query, candidate_labels=intent_labels_readable, multi_label=False)
    # Print scores for each label
    print("Intent Scores:")
    for label, score in zip(result["labels"], result["scores"]):
        print(f"{label:<60} {score:.4f}")

    # The top intent is read from index 0 of the returned labels/scores.
    predicted_label = result["labels"][0]
    predicted_score = result["scores"][0]
    predicted_intent = label_map[predicted_label]
    actions = intent_actions[predicted_intent]

    print(f"\nBest Intent: {predicted_intent} (Score: {predicted_score:.4f})")

    if actions["sentiment"]:
        print("Perform sentiment analysis")

    if actions["nlg"]:
        print("Generate answer using NLG")

    # Driven by the intent's own "summarization" flag rather than inferred
    # from "nlg" being off, so the routing table stays the single authority.
    if actions["summarization"]:
        print("Summarize relevant content instead of generating an answer")

else:
    print(f"Question is not related to the course (Similarity Score: {max_score:.2f}). Please enter a course-related question.")


In [None]:
# Accuracy check for the zero-shot intent classifier: one hand-written
# question per intent, paired position-by-position with the intent it should
# be mapped to.
questions = [
    "Is this course suitable for beginners?",
    "How experienced is the instructor?",
    "What topics are covered in this course?",
    "Is this course difficult?",
    "Will this course help me in my career?",
    "What do students think about this course?",
    "Can you give me an overview of the course?",
    "What are the prerequisites for this course?",
    "What is the course schedule?",
    "How much does the course cost?",
    "Will I receive a certificate after completing the course?",
]

true_intents = [
    "yes_no",
    "instructor",
    "content",
    "difficulty",
    "career",
    "general_opinion",
    "course_overview",
    "prerequisites",
    "schedule",
    "fees",
    "certification",
]

# Collect only the misses; the number of hits follows from the total.
mispredicted_questions = []

for question, expected_intent in zip(questions, true_intents):
    print(f"\nQuestion: {question}")
    result = classifier(question, candidate_labels=intent_labels_readable, multi_label=False)
    # The prediction is read from index 0 of the returned labels.
    predicted_label = result["labels"][0]
    predicted_intent = label_map[predicted_label]
    print(f"Predicted Intent: {predicted_intent}")
    if predicted_intent != expected_intent:
        mispredicted_questions.append((question, predicted_intent, expected_intent))

total_questions = len(questions)
correct_predictions = total_questions - len(mispredicted_questions)

accuracy = (correct_predictions / total_questions) * 100
print(f"\nAccuracy: {accuracy:.2f}%")

if len(mispredicted_questions) > 0:
    print("\nMispredicted Questions:")
    for asked, got, wanted in mispredicted_questions:
        print(f"Question: {asked}\nPredicted: {got}, Correct: {wanted}\n")


In [None]:
# Retrieve the reviews of the selected course that are semantically close to
# the user's question.
print("\n📄 Fetching reviews for the selected course...")
# Filter reviews for the selected course and institution
course_mask = (df_unique["institution"] == selected_institution) & (df_unique["name"] == selected_course)
filtered_reviews = df_unique.loc[course_mask, "reviews"].dropna().tolist()

if not filtered_reviews:
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, while an exception only stops this cell.
    raise ValueError("No reviews found for the selected course.")
print("Total Reviews for the course:", len(filtered_reviews))

# Encode reviews using Sentence-BERT
print("Encoding reviews using Sentence-BERT...")
review_embeddings = model.encode(filtered_reviews, convert_to_tensor=True).cpu()
print("Encoded all reviews.")

# Re-encode the question here rather than trusting a `query_embedding` left
# over from an earlier cell: other cells assign that name too, so after
# out-of-order execution it may not belong to `query`.
query_embedding = model.encode(query, convert_to_tensor=True).cpu()

# Compute cosine similarity between query and each review
cosine_scores = util.pytorch_cos_sim(query_embedding, review_embeddings)[0]

# Keep reviews with similarity ≥ 0.5
threshold = 0.5
related_reviews = [
    (review, score.item())
    for review, score in zip(filtered_reviews, cosine_scores)
    if score.item() >= threshold
]

print("Question: ", query)
print("Related Sentences Count: ", len(related_reviews))
if related_reviews:
    print(f"\nReviews related to the query (threshold ≥ {threshold}): \n")
    # Most similar reviews first.
    for review, score in sorted(related_reviews, key=lambda pair: pair[1], reverse=True):
        print(f"- Score: {score:.4f} | Review: {review}")
else:
    print(f"\nNo reviews found with similarity ≥ {threshold}.")