# EduAI Connect — RAG Pipeline Evaluation
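This notebook runs 15 test questions through the RAG pipeline so that answer quality can be checked by hand.
Run it from the project root (`cd ~/Documents/eduai-connect`): the FAISS index is loaded from the relative path `faiss_index`.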

In [None]:
# Setup — same as rag_chain.py
import boto3
from langchain_aws import ChatBedrock, BedrockEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# One Bedrock runtime client is shared by the chat model and the embeddings.
region = "us-east-1"
bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)

# Chat model used to generate answers (temperature 0, max 1000 tokens).
llm = ChatBedrock(
    model_id="anthropic.claude-3-sonnet-20240229-v1:0",
    client=bedrock_runtime,
    model_kwargs={"temperature": 0, "max_tokens": 1000},
)

# Embedding model handed to FAISS when the saved index is loaded.
embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1",
    client=bedrock_runtime,
)

# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data from disk (see the LangChain FAISS documentation). Only point
# this at an index that this project built itself.
vectorstore = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)

# The retriever is configured with k=4 documents per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Prompt with two input variables, {context} and {question}. It instructs the
# model to say it doesn't know when the context lacks the answer.
prompt = ChatPromptTemplate.from_template(
    """
You are an education analytics assistant that helps teachers understand student performance data.
Use the context below to answer the question. If the answer is not in the context, say you don't know.

Context:
{context}

Question:
{question}

Answer:
"""
)

# Pipeline: fill the prompt, call the chat model, parse the reply to a string.
chain = prompt | llm | StrOutputParser()

def ask(question: str) -> str:
    """Answer *question* from documents retrieved out of the FAISS index.

    The question text is sent to the retriever unchanged, the retrieved
    documents' ``page_content`` is joined with blank lines to form the
    context, and the prompt -> LLM -> string-parser chain is invoked on it.

    Args:
        question: Natural-language question to answer.

    Returns:
        The output of ``chain.invoke`` (the chain ends in ``StrOutputParser``).
    """
    docs = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)
    # The prompt template declares only {context} and {question}, so those
    # are the only inputs supplied to the chain.
    return chain.invoke({"context": context, "question": question})

print("Setup complete.")

## Test Questions
Each question exercises a different RAG capability. Compare every answer against the `EXPECTED` comment in its cell and check that it is both correct and relevant.

In [None]:
# Q1 — Lookup of a single named student
# EXPECTED: Jessica Wallace's full profile, including grades and attendance
question = "Tell me about Jessica Wallace"
print(ask(question))

In [None]:
# Q2 — Students who are struggling academically
# EXPECTED: students with D grades or scores below 70
question = "Which students are failing?"
print(ask(question))

In [None]:
# Q3 — Query scoped to a single course
# EXPECTED: students enrolled in Physics with their scores, so that the
# lowest Physics grade can be identified
question = "Who has the lowest grade in Physics?"
print(ask(question))

In [None]:
# Q4 — Attendance
# EXPECTED: students whose attendance is below 0.80
question = "Which students have poor attendance?"
print(ask(question))

In [None]:
# Q5 — Engagement
# EXPECTED: students with low engagement scores (below 60)
question = "Who has the lowest engagement?"
print(ask(question))

In [None]:
# Q6 — Top performers
# EXPECTED: students whose grades are mostly A
question = "Who are the top performing students?"
print(ask(question))

In [None]:
# Q7 — Vague phrasing (written as a query-transformation test)
# NOTE: ask() as defined in this notebook sends the question to the retriever
# unchanged, so this exercises retrieval on the raw wording.
# EXPECTED: at-risk students are still found despite the vague phrasing
question = "Who needs help?"
print(ask(question))

In [None]:
# Q8 — Cross-student comparison within one course (the question covers Math only)
# EXPECTED: a comparison of Math scores across students
question = "How are students doing in Math?"
print(ask(question))

In [None]:
# Q9 — Filter by grade level
# EXPECTED: 9th graders together with their performance
# NOTE: the retriever is configured with k=4, so the answer can only draw on
# the documents returned for this one query.
question = "Show me all 9th grade students and their grades"
print(ask(question))

In [None]:
# Q10 — At-risk identification
# EXPECTED: students showing a combination of low grades, low attendance and
# low engagement
question = "Which students are at risk of failing based on their grades and attendance?"
print(ask(question))

In [None]:
# Q11 — Course with the most struggling students
# EXPECTED: the course that has the most D/F grades
# NOTE: the retriever is configured with k=4, so the answer can only draw on
# the documents returned for this one query.
question = "Which course has the most students struggling?"
print(ask(question))

In [None]:
# Q12 — Ranking on a specific metric
# EXPECTED: students listed by engagement score in ascending order (lowest first)
# NOTE: the retriever is configured with k=4, so the answer can only draw on
# the documents returned for this one query.
question = "Rank the students by engagement score from lowest to highest"
print(ask(question))

In [None]:
# Q13 — Actionable recommendations
# EXPECTED: specific intervention suggestions
question = "What actions should I take to help students who are struggling in Biology?"
print(ask(question))

In [None]:
# Q14 — Hallucination check: the question asks about data that doesn't exist
# EXPECTED: the model says it doesn't know and does NOT make up an answer
question = "What is the average SAT score for our students?"
print(ask(question))

In [None]:
# Q15 — Hallucination check: the question asks about a student that doesn't exist
# EXPECTED: the model says it has no information on this student
question = "Tell me about John Smith"
print(ask(question))

## Scoring
After running all 15 questions, assign each answer one of the following scores and record it in the table below:
- **Correct**: Answer uses real data from the context and is accurate
- **Partially Correct**: Answer is on topic but missing details or slightly wrong
- **Incorrect**: Answer is wrong or hallucinated
- **Good Refusal**: Correctly said "I don't know" when data wasn't available (Q14, Q15)

| Question | Score | Notes |
|----------|-------|-------|
| Q1 | | |
| Q2 | | |
| Q3 | | |
| Q4 | | |
| Q5 | | |
| Q6 | | |
| Q7 | | |
| Q8 | | |
| Q9 | | |
| Q10 | | |
| Q11 | | |
| Q12 | | |
| Q13 | | |
| Q14 | | |
| Q15 | | |