# Module 7 Assessment — RAG Pipelines (GRADING TEMPLATE)

**This notebook contains hidden tests for automated grading.**

DO NOT distribute to students.

---
## Setup

In [None]:
!pip -q install sentence-transformers scikit-learn faiss-cpu

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import faiss
from typing import List, Tuple
from dataclasses import dataclass
import json
import traceback

# Load the embedding model once for the whole notebook. The knowledge-base cell
# uses this `model` object to encode the document texts.
# NOTE(review): student retrievers presumably encode queries with the same
# object so that query and document vectors are comparable - confirm against
# the reference solution.
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Setup complete!")

---
## Knowledge Base

In [None]:
# Knowledge base (do not modify)
# Six short passages, each a dict with an "id", the passage "text" and the
# "source" file name. doc_001-doc_005 are about interest rates / monetary
# policy; doc_006 is a deliberately unrelated sports item.
knowledge_base = [
    {
        "id": "doc_001",
        "text": "The central bank raised interest rates by 25 basis points to combat inflation.",
        "source": "monetary_policy.pdf"
    },
    {
        "id": "doc_002",
        "text": "Higher borrowing costs are expected to slow consumer spending and reduce inflationary pressure.",
        "source": "monetary_policy.pdf"
    },
    {
        "id": "doc_003",
        "text": "Mortgage rates have risen to their highest level in two decades.",
        "source": "housing_report.pdf"
    },
    {
        "id": "doc_004",
        "text": "The Federal Reserve's dual mandate requires balancing employment with price stability.",
        "source": "fed_overview.pdf"
    },
    {
        "id": "doc_005",
        "text": "Bank earnings improved as net interest margins widened due to higher rates.",
        "source": "earnings_summary.pdf"
    },
    {
        "id": "doc_006",
        "text": "The championship football match ended in a dramatic penalty shootout.",
        "source": "sports_news.pdf"
    }
]

# Embed every document text once, here, with the model loaded in the Setup cell
# (the vectors are computed at run time, not loaded from disk). Row i of
# `doc_embeddings` corresponds to knowledge_base[i] because `texts` preserves
# the list order.
# NOTE(review): normalize_embeddings=True is passed, presumably so the vectors
# are unit-length and an inner product can stand in for cosine similarity -
# confirm against the sentence-transformers documentation.
texts = [doc["text"] for doc in knowledge_base]
doc_embeddings = model.encode(texts, normalize_embeddings=True)

print(f"Knowledge base loaded: {len(knowledge_base)} documents")

---
## Data Class

In [None]:
@dataclass
class RetrievedChunk:
    """A chunk retrieved from the knowledge base.

    Plain record type. The Task 1 grader checks that ``retrieve_top_k``
    returns instances of this class and reads their ``score`` and ``text``.
    """
    # Passage text; grade_task1 lower-cases it for its relevance keyword check.
    text: str
    # Ranking score; grade_task1 expects results ordered by this, highest first.
    score: float
    # NOTE(review): presumably the knowledge-base entry's "source" - confirm.
    source: str
    # NOTE(review): presumably the knowledge-base entry's "id" - confirm.
    doc_id: str

print("RetrievedChunk class defined")

---
## Task 1 — Implement Retriever Function (20 points)

In [None]:
# Task 1: Implement the retriever function

def retrieve_top_k(query: str, k: int = 3) -> List[RetrievedChunk]:
    """Retrieve top-k most similar documents for the query.

    Template stub: as written it returns ``None``; the student's
    implementation replaces the ``pass`` below.

    Contract checked by ``grade_task1`` (called with
    ``"Why did rates increase?"`` and ``k=3``):
      * returns a non-empty ``list`` whose first element is a
        ``RetrievedChunk``;
      * the list has exactly 3 entries;
      * entries are ordered by ``score``, highest first;
      * the top entry's text contains "rate", "interest" or "central bank".
    """
    # STUDENT CODE GOES HERE
    pass

---
## Task 2 — Build RAG Prompt (15 points)

In [None]:
# Task 2: Build RAG prompt

def build_rag_prompt(chunks: List[RetrievedChunk], question: str) -> str:
    """Build an evidence-first RAG prompt.

    Template stub: as written it returns ``None``; the student's
    implementation replaces the ``pass`` below.

    Contract checked by ``grade_task2``:
      * returns a ``str`` of at least 50 characters;
      * contains "CONTEXT" and "QUESTION" (matched case-insensitively);
      * contains a grounding instruction, detected as the word "only"
        together with "context" or "provided".
    """
    # STUDENT CODE GOES HERE
    pass

---
## Task 3 — Implement Guardrails (20 points)

In [None]:
# Task 3: Implement guardrails

def retrieve_with_guardrails(
    query: str, 
    k: int = 5, 
    min_score: float = 0.3, 
    min_chunks: int = 2
) -> dict:
    """Retrieve with score threshold and minimum chunk guardrails.

    Template stub: as written it returns ``None``; the student's
    implementation replaces the ``pass`` below.

    Contract checked by ``grade_task3``:
      * returns a ``dict`` containing a ``"refused"`` key;
      * ``"refused"`` is false for "Why did rates increase?" with
        ``min_score=0.2``;
      * ``"refused"`` is true for "What is the best pizza topping?" with
        ``min_score=0.5, min_chunks=3``, and the dict then also carries a
        ``"reason"`` key.
    """
    # STUDENT CODE GOES HERE
    pass

---
## Task 4 — Complete RAG Pipeline (15 points)

In [None]:
# Task 4: Complete RAG pipeline

def rag_pipeline(question: str, min_score: float = 0.3) -> dict:
    """Complete RAG pipeline with guardrails.

    Template stub: as written it returns ``None``; the student's
    implementation replaces the ``pass`` below.

    Contract checked by ``grade_task4``:
      * returns a ``dict`` containing a ``"refused"`` key;
      * for "Why did rates increase?" (default ``min_score``) the result is
        not refused and carries ``"prompt"`` and ``"num_chunks"``; the prompt
        text mentions "context";
      * for "What is quantum computing?" with ``min_score=0.6`` the result is
        refused and carries a ``"message"`` key.
    """
    # STUDENT CODE GOES HERE
    pass

---
## Task 5 — Evaluate Retrieval Quality (10 points)

In [None]:
# Task 5: Evaluate retrieval quality

def precision_at_k(query: str, expected_keywords: List[str], k: int = 3) -> float:
    """Calculate Precision@k for retrieval evaluation.

    Template stub: as written it returns ``None``; the student's
    implementation replaces the ``pass`` below.

    Contract checked by ``grade_task5`` (query "Why did rates increase?",
    ``k=3``):
      * returns a numeric value in ``[0, 1]``;
      * the value is at least 0.5 for keywords ``["rate", "interest"]``;
      * the value for ``["pizza", "sports"]`` is strictly lower than that.
    """
    # STUDENT CODE GOES HERE
    pass

---
## Task 6 — Written Explanation (20 points)

In [None]:
# Task 6: Written explanation
#
# Placeholder for the student's free-text answer. grade_task6 awards nothing
# unless the stripped text is at least 50 characters long; it then scores
# length (80+ words for full credit, 50-79 for half), keyword coverage
# (near-miss, guardrails/refusal, risk shift) and the absence of the markers
# flagged by check_ai_generated.

rag_failure_explanation = """

"""

---
# HIDDEN GRADING TESTS

**Everything below this line is for automated grading only.**

In [None]:
# ============================================================
# GRADING INFRASTRUCTURE
# ============================================================

# (task key, maximum points) pairs, in the order they appear in the report.
_TASK_MAX_POINTS = (
    ("task1_retriever", 20),
    ("task2_prompt", 15),
    ("task3_guardrails", 20),
    ("task4_pipeline", 15),
    ("task5_precision", 10),
    ("task6_written", 20),
)

# Score sheet filled in by the grade_task* functions: every task starts with
# zero points and empty feedback, and gets its own independent record.
results = {
    task_key: {"points": 0, "max": max_pts, "feedback": ""}
    for task_key, max_pts in _TASK_MAX_POINTS
}

# AI detection patterns.
# Each phrase is matched as a case-insensitive substring of the text, so a
# short entry such as "robust" also matches inside "robustness".
AI_PHRASES = [
    "as an ai",
    "i'm an ai",
    "language model",
    "i cannot",
    "i can't provide",
    "certainly!",
    "absolutely!",
    "great question",
    "happy to help",
    "here's a comprehensive",
    "let me explain",
    "in summary",
    "firstly",
    "secondly",
    "thirdly",
    "furthermore",
    "moreover",
    "it's important to note",
    "it is worth noting",
    "delve into",
    "dive into",
    "explore this",
    "robust",
    "leverage",
    "utilize",
    "facilitate",
    "comprehensive overview",
    "key takeaways",
]

# Typographic characters flagged as AI markers: dashes, curly quotes,
# the single-character ellipsis and the bullet.
FORBIDDEN_CHARS = [
    '—',
    '–',
    '“',
    '”',
    '‘',
    '’',
    '…',
    '•',
]


def check_ai_generated(text):
    """Return a list of human-readable AI-marker findings for ``text``.

    Phrase findings come first (in ``AI_PHRASES`` order, matched against the
    lower-cased text), followed by typography findings (in
    ``FORBIDDEN_CHARS`` order, matched against the text as given). An empty
    list means nothing was flagged.
    """
    lowered = text.lower()
    phrase_hits = [
        f"AI phrase: '{candidate}'"
        for candidate in AI_PHRASES
        if candidate in lowered
    ]
    glyph_hits = [
        f"AI typography: '{glyph}'"
        for glyph in FORBIDDEN_CHARS
        if glyph in text
    ]
    return phrase_hits + glyph_hits

print("Grading infrastructure loaded")

In [None]:
# ============================================================
# TASK 1 GRADING: Retriever Function (20 points)
# ============================================================

def grade_task1():
    """Grade Task 1 (``retrieve_top_k``) and record the outcome in ``results``.

    Five checks worth 4 points each (20 total):
      1. ``retrieve_top_k`` exists and is callable;
      2. it returns a non-empty list whose first item is a ``RetrievedChunk``;
      3. it returns exactly 3 items for ``k=3``;
      4. items are ordered by ``score`` descending;
      5. the top item's text contains "rate", "interest" or "central bank".

    Checks 1 and 2 are gating: failing either stops grading. Any exception
    raised while exercising the student's code is caught and reported in the
    feedback; points earned before the exception are kept.

    Note that the unmodified template stub is callable, so it earns the 4
    points of check 1.

    Side effects: overwrites ``results["task1_retriever"]["points"]`` and
    ``["feedback"]``. Returns ``None``.
    """
    task = results["task1_retriever"]
    # Start from zero on every call: points are accumulated with += below, so
    # without this reset re-running the grading cell would double-count.
    task["points"] = 0
    feedback = []

    try:
        # Test 1: Function exists and is callable (4 pts)
        if retrieve_top_k is None or not callable(retrieve_top_k):
            feedback.append("Function not implemented")
            return
        task["points"] += 4
        feedback.append("Function exists (+4)")

        # Test 2: Returns list of RetrievedChunk (4 pts)
        chunks = retrieve_top_k("Why did rates increase?", k=3)
        if not isinstance(chunks, list):
            feedback.append("Does not return list")
            return
        if len(chunks) == 0:
            feedback.append("Returns empty list")
            return
        if not isinstance(chunks[0], RetrievedChunk):
            feedback.append("Does not return RetrievedChunk objects")
            return
        task["points"] += 4
        feedback.append("Returns RetrievedChunk list (+4)")

        # Test 3: Returns correct number of chunks (4 pts)
        if len(chunks) == 3:
            task["points"] += 4
            feedback.append("Correct k=3 count (+4)")
        else:
            feedback.append(f"Wrong count: expected 3, got {len(chunks)}")

        # Test 4: Sorted by score descending (4 pts)
        scores = [c.score for c in chunks]
        if scores == sorted(scores, reverse=True):
            task["points"] += 4
            feedback.append("Correctly sorted by score (+4)")
        else:
            feedback.append("Not sorted by score descending")

        # Test 5: Top result is relevant (4 pts)
        top_text = chunks[0].text.lower()
        if any(term in top_text for term in ("rate", "interest", "central bank")):
            task["points"] += 4
            feedback.append("Top result is relevant (+4)")
        else:
            feedback.append("Top result not relevant to query")

    except Exception as e:
        feedback.append(f"Error: {str(e)}")
    finally:
        # Runs on the early returns above as well, so the feedback is stored
        # exactly once no matter how grading ended.
        task["feedback"] = "; ".join(feedback)

# Run the Task 1 grader, then echo its score line and feedback.
grade_task1()
print(
    "Task 1: {points}/{max}".format(**results["task1_retriever"]),
    "Feedback: {feedback}".format(**results["task1_retriever"]),
    sep="\n",
)

In [None]:
# ============================================================
# TASK 2 GRADING: RAG Prompt Builder (15 points)
# ============================================================

def grade_task2():
    """Grade Task 2 (``build_rag_prompt``) and record the outcome in ``results``.

    Skipped (0 points) unless Task 1 scored at least 8, because the test
    chunks are produced by the student's ``retrieve_top_k``.

    Five checks worth 3 points each (15 total):
      1. ``build_rag_prompt`` exists and is callable;
      2. it returns a string of at least 50 characters;
      3. the prompt contains "CONTEXT" (case-insensitive);
      4. the prompt contains "QUESTION" (case-insensitive);
      5. the prompt contains "only" together with "context" or "provided".

    Checks 1 and 2 are gating. Exceptions from student code are caught and
    reported in the feedback; points earned before the exception are kept.

    Side effects: overwrites ``results["task2_prompt"]["points"]`` and
    ``["feedback"]``. Returns ``None``.
    """
    task = results["task2_prompt"]
    # Start from zero on every call: points are accumulated with += below, so
    # without this reset a re-run would double-count, and a stale score would
    # survive the "Skipped" path.
    task["points"] = 0
    feedback = []
    question = "Why did rates increase?"

    try:
        # Need Task 1 to work
        if results["task1_retriever"]["points"] < 8:
            feedback.append("Skipped: Task 1 not working")
            return

        # Test 1: Function exists and is callable (3 pts)
        if build_rag_prompt is None or not callable(build_rag_prompt):
            feedback.append("Function not implemented")
            return
        task["points"] += 3
        feedback.append("Function exists (+3)")

        # Get test chunks
        chunks = retrieve_top_k(question, k=3)
        prompt = build_rag_prompt(chunks, question)

        # Test 2: Returns string (3 pts)
        if not isinstance(prompt, str) or len(prompt) < 50:
            feedback.append("Does not return valid string")
            return
        task["points"] += 3
        feedback.append("Returns valid string (+3)")

        prompt_upper = prompt.upper()

        # Test 3: Contains CONTEXT section (3 pts)
        if "CONTEXT" in prompt_upper:
            task["points"] += 3
            feedback.append("Has CONTEXT section (+3)")
        else:
            feedback.append("Missing CONTEXT section")

        # Test 4: Contains QUESTION section (3 pts)
        if "QUESTION" in prompt_upper:
            task["points"] += 3
            feedback.append("Has QUESTION section (+3)")
        else:
            feedback.append("Missing QUESTION section")

        # Test 5: Contains grounding instruction (3 pts)
        prompt_lower = prompt.lower()
        if "only" in prompt_lower and ("context" in prompt_lower or "provided" in prompt_lower):
            task["points"] += 3
            feedback.append("Has grounding instruction (+3)")
        else:
            feedback.append("Missing grounding instruction")

    except Exception as e:
        feedback.append(f"Error: {str(e)}")
    finally:
        # Runs on the early returns above as well, so the feedback is stored
        # exactly once no matter how grading ended.
        task["feedback"] = "; ".join(feedback)

# Run the Task 2 grader, then echo its score line and feedback.
grade_task2()
print(
    "Task 2: {points}/{max}".format(**results["task2_prompt"]),
    "Feedback: {feedback}".format(**results["task2_prompt"]),
    sep="\n",
)

In [None]:
# ============================================================
# TASK 3 GRADING: Guardrails (20 points)
# ============================================================

def grade_task3():
    """Grade Task 3 (``retrieve_with_guardrails``) and record the outcome in ``results``.

    Skipped (0 points) unless Task 1 scored at least 8.

    Five checks worth 4 points each (20 total):
      1. ``retrieve_with_guardrails`` exists and is callable;
      2. it returns a dict containing a ``"refused"`` key;
      3. a relevant query with ``min_score=0.2`` is not refused;
      4. an irrelevant query with ``min_score=0.5, min_chunks=3`` is refused;
      5. that refusal carries a ``"reason"`` key.

    Checks 1 and 2 are gating. Exceptions from student code are caught and
    reported in the feedback; points earned before the exception are kept.

    Side effects: overwrites ``results["task3_guardrails"]["points"]`` and
    ``["feedback"]``. Returns ``None``.
    """
    task = results["task3_guardrails"]
    # Start from zero on every call: points are accumulated with += below, so
    # without this reset a re-run would double-count, and a stale score would
    # survive the "Skipped" path.
    task["points"] = 0
    feedback = []

    try:
        # Need Task 1 to work
        if results["task1_retriever"]["points"] < 8:
            feedback.append("Skipped: Task 1 not working")
            return

        # Test 1: Function exists and is callable (4 pts)
        if retrieve_with_guardrails is None or not callable(retrieve_with_guardrails):
            feedback.append("Function not implemented")
            return
        task["points"] += 4
        feedback.append("Function exists (+4)")

        # Test 2: Returns dict with correct keys (4 pts)
        result = retrieve_with_guardrails("Why did rates increase?", min_score=0.3)
        if not isinstance(result, dict):
            feedback.append("Does not return dict")
            return
        if "refused" not in result:
            feedback.append("Missing 'refused' key")
            return
        task["points"] += 4
        feedback.append("Returns dict with refused key (+4)")

        # The flag is compared with == rather than `is` on purpose, so a
        # non-builtin boolean (e.g. a NumPy bool) still compares equal.

        # Test 3: Does NOT refuse for relevant query with low threshold (4 pts)
        result_relevant = retrieve_with_guardrails("Why did rates increase?", min_score=0.2)
        if result_relevant.get("refused") == False:
            task["points"] += 4
            feedback.append("Accepts relevant query (+4)")
        else:
            feedback.append("Incorrectly refused relevant query")

        # Test 4: DOES refuse for irrelevant query with high threshold (4 pts)
        result_irrelevant = retrieve_with_guardrails(
            "What is the best pizza topping?", min_score=0.5, min_chunks=3
        )
        was_refused = result_irrelevant.get("refused") == True
        if was_refused:
            task["points"] += 4
            feedback.append("Refuses irrelevant query (+4)")
        else:
            feedback.append("Should refuse irrelevant query with high threshold")

        # Test 5: Has 'reason' key when refused (4 pts)
        # Only evaluated for an actual refusal; a missing refusal has already
        # been reported by Test 4.
        if was_refused:
            if "reason" in result_irrelevant:
                task["points"] += 4
                feedback.append("Provides reason on refusal (+4)")
            else:
                feedback.append("Missing 'reason' key on refusal")

    except Exception as e:
        feedback.append(f"Error: {str(e)}")
    finally:
        # Runs on the early returns above as well, so the feedback is stored
        # exactly once no matter how grading ended.
        task["feedback"] = "; ".join(feedback)

# Run the Task 3 grader, then echo its score line and feedback.
grade_task3()
print(
    "Task 3: {points}/{max}".format(**results["task3_guardrails"]),
    "Feedback: {feedback}".format(**results["task3_guardrails"]),
    sep="\n",
)

In [None]:
# ============================================================
# TASK 4 GRADING: Complete Pipeline (15 points)
# ============================================================

def grade_task4():
    """Grade Task 4 (``rag_pipeline``) and record the outcome in ``results``.

    Skipped (0 points) unless Task 2 scored at least 6 and Task 3 at least 8.

    Five checks worth 3 points each (15 total):
      1. ``rag_pipeline`` exists and is callable;
      2. it returns a dict containing a ``"refused"`` key;
      3. the relevant query is not refused and the result carries
         ``"prompt"`` and ``"num_chunks"``;
      4. an irrelevant query with ``min_score=0.6`` is refused and the result
         carries ``"message"``;
      5. the successful prompt mentions "context" (case-insensitive).

    Checks 1 and 2 are gating. Exceptions from student code are caught and
    reported in the feedback; points earned before the exception are kept.

    Side effects: overwrites ``results["task4_pipeline"]["points"]`` and
    ``["feedback"]``. Returns ``None``.
    """
    task = results["task4_pipeline"]
    # Start from zero on every call: points are accumulated with += below, so
    # without this reset a re-run would double-count, and a stale score would
    # survive the "Skipped" path.
    task["points"] = 0
    feedback = []

    try:
        # Need Tasks 2 and 3 to work
        if results["task2_prompt"]["points"] < 6 or results["task3_guardrails"]["points"] < 8:
            feedback.append("Skipped: Tasks 2 and 3 required")
            return

        # Test 1: Function exists and is callable (3 pts)
        if rag_pipeline is None or not callable(rag_pipeline):
            feedback.append("Function not implemented")
            return
        task["points"] += 3
        feedback.append("Function exists (+3)")

        # Test 2: Returns dict with correct structure (3 pts)
        result = rag_pipeline("Why did rates increase?")
        if not isinstance(result, dict) or "refused" not in result:
            feedback.append("Does not return dict with 'refused' key")
            return
        task["points"] += 3
        feedback.append("Returns dict with refused key (+3)")

        # Test 3: Successful case has prompt and num_chunks (3 pts)
        if result.get("refused") == False:
            if "prompt" in result and "num_chunks" in result:
                task["points"] += 3
                feedback.append("Success case has prompt and num_chunks (+3)")
            else:
                feedback.append("Missing prompt or num_chunks on success")
        else:
            feedback.append("Unexpectedly refused relevant query")

        # Test 4: Refusal case has message (3 pts)
        result_refused = rag_pipeline("What is quantum computing?", min_score=0.6)
        if result_refused.get("refused") == True:
            if "message" in result_refused:
                task["points"] += 3
                feedback.append("Refusal has message (+3)")
            else:
                feedback.append("Missing message on refusal")
        else:
            feedback.append("Should refuse irrelevant query")

        # Test 5: Prompt contains context (3 pts)
        # One case-insensitive test is enough; checking the upper-cased and
        # the lower-cased prompt separately was the same check twice.
        if result.get("refused") == False and result.get("prompt"):
            if "context" in result["prompt"].lower():
                task["points"] += 3
                feedback.append("Prompt includes context (+3)")
            else:
                feedback.append("Prompt missing context section")

    except Exception as e:
        feedback.append(f"Error: {str(e)}")
    finally:
        # Runs on the early returns above as well, so the feedback is stored
        # exactly once no matter how grading ended.
        task["feedback"] = "; ".join(feedback)

# Run the Task 4 grader, then echo its score line and feedback.
grade_task4()
print(
    "Task 4: {points}/{max}".format(**results["task4_pipeline"]),
    "Feedback: {feedback}".format(**results["task4_pipeline"]),
    sep="\n",
)

In [None]:
# ============================================================
# TASK 5 GRADING: Precision@k (10 points)
# ============================================================

def grade_task5():
    """Grade Task 5 (``precision_at_k``) and record the outcome in ``results``.

    Skipped (0 points) unless Task 1 scored at least 8.

    Five checks worth 2 points each (10 total):
      1. ``precision_at_k`` exists and is callable;
      2. it returns a real number;
      3. the value lies in ``[0, 1]``;
      4. the value is at least 0.5 for rate-related keywords;
      5. irrelevant keywords yield a strictly lower value.

    Checks 1 and 2 are gating. Exceptions from student code are caught and
    reported in the feedback; points earned before the exception are kept.

    Side effects: overwrites ``results["task5_precision"]["points"]`` and
    ``["feedback"]``. Returns ``None``.
    """
    # Imported here so the block does not depend on an edit to the import cell.
    import numbers

    task = results["task5_precision"]
    # Start from zero on every call: points are accumulated with += below, so
    # without this reset a re-run would double-count, and a stale score would
    # survive the "Skipped" path.
    task["points"] = 0
    feedback = []

    try:
        # Need Task 1 to work
        if results["task1_retriever"]["points"] < 8:
            feedback.append("Skipped: Task 1 not working")
            return

        # Test 1: Function exists and is callable (2 pts)
        if precision_at_k is None or not callable(precision_at_k):
            feedback.append("Function not implemented")
            return
        task["points"] += 2
        feedback.append("Function exists (+2)")

        # Test 2: Returns a real number (2 pts)
        # numbers.Real covers int and float as before and, in addition,
        # NumPy scalar types such as np.float32 that are not subclasses of
        # the builtin float.
        p = precision_at_k("Why did rates increase?", ["rate", "interest"], k=3)
        if not isinstance(p, numbers.Real):
            feedback.append("Does not return numeric value")
            return
        task["points"] += 2
        feedback.append("Returns numeric value (+2)")

        # Test 3: Returns value in [0, 1] range (2 pts)
        if 0.0 <= p <= 1.0:
            task["points"] += 2
            feedback.append("Value in valid range (+2)")
        else:
            feedback.append(f"Value {p} not in [0, 1]")

        # Test 4: Reasonable value for relevant query (2 pts)
        # With k=3 and keywords about rates, at least 2 of 3 hits are
        # expected; the pass mark is 0.5.
        if p >= 0.5:
            task["points"] += 2
            feedback.append(f"Reasonable precision {p:.2f} (+2)")
        else:
            feedback.append(f"Precision {p:.2f} seems too low")

        # Test 5: Different value for irrelevant keywords (2 pts)
        p_irrelevant = precision_at_k("Why did rates increase?", ["pizza", "sports"], k=3)
        if p_irrelevant < p:
            task["points"] += 2
            feedback.append("Lower precision for irrelevant keywords (+2)")
        else:
            feedback.append("Should have lower precision for irrelevant keywords")

    except Exception as e:
        feedback.append(f"Error: {str(e)}")
    finally:
        # Runs on the early returns above as well, so the feedback is stored
        # exactly once no matter how grading ended.
        task["feedback"] = "; ".join(feedback)

# Run the Task 5 grader, then echo its score line and feedback.
grade_task5()
print(
    "Task 5: {points}/{max}".format(**results["task5_precision"]),
    "Feedback: {feedback}".format(**results["task5_precision"]),
    sep="\n",
)

In [None]:
# ============================================================
# TASK 6 GRADING: Written Explanation (20 points)
# ============================================================

def grade_task6():
    """Grade Task 6 (``rag_failure_explanation``) and record the outcome in ``results``.

    Scoring (20 total):
      1. text present, at least 50 characters after stripping: 4 pts (gating);
      2. length: 4 pts for 80+ words, 2 pts for 50-79 words;
      3. mentions a near-miss term: 3 pts;
      4. mentions a guardrail/refusal term: 3 pts;
      5. mentions a risk-shift term: 3 pts;
      6. AI markers: +3 pts when ``check_ai_generated`` finds none, otherwise
         a deduction of 2 pts per marker capped at 5 (never below 0 overall).

    Keyword checks are case-insensitive substring matches. Exceptions are
    caught and reported in the feedback; points earned before the exception
    are kept.

    Side effects: overwrites ``results["task6_written"]["points"]`` and
    ``["feedback"]``. Returns ``None``.
    """
    task = results["task6_written"]
    # Start from zero on every call: points are accumulated with += below, so
    # without this reset re-running the grading cell would double-count.
    task["points"] = 0
    feedback = []

    try:
        # Test 1: Variable exists and has content (4 pts)
        if not rag_failure_explanation or len(rag_failure_explanation.strip()) < 50:
            feedback.append("Explanation missing or too short")
            return
        task["points"] += 4
        feedback.append("Has content (+4)")

        text = rag_failure_explanation.strip()
        text_lower = text.lower()

        # Test 2: Adequate length (4 pts)
        word_count = len(text.split())
        if word_count >= 80:
            task["points"] += 4
            feedback.append(f"Good length: {word_count} words (+4)")
        elif word_count >= 50:
            task["points"] += 2
            feedback.append(f"Adequate length: {word_count} words (+2)")
        else:
            feedback.append(f"Too short: {word_count} words")

        # Test 3: Mentions near-miss concept (3 pts)
        near_miss_terms = ["near-miss", "near miss", "semantically similar", "similar but wrong", "plausible but incorrect"]
        if any(term in text_lower for term in near_miss_terms):
            task["points"] += 3
            feedback.append("Discusses near-miss (+3)")
        else:
            feedback.append("Missing near-miss discussion")

        # Test 4: Mentions refusal/guardrails (3 pts)
        # "refus" is a stem: it matches refuse, refusal, refusing, ...
        guardrail_terms = ["refus", "guardrail", "threshold", "reject", "filter"]
        if any(term in text_lower for term in guardrail_terms):
            task["points"] += 3
            feedback.append("Discusses guardrails/refusal (+3)")
        else:
            feedback.append("Missing guardrails discussion")

        # Test 5: Mentions risk shift (3 pts)
        shift_terms = ["shift", "not eliminate", "doesn't eliminate", "still", "reduce", "risk"]
        if any(term in text_lower for term in shift_terms):
            task["points"] += 3
            feedback.append("Discusses risk shift (+3)")
        else:
            feedback.append("Missing risk shift discussion")

        # Test 6: AI-generated content check
        # Clean text earns 3 pts; otherwise deduct 2 pts per marker, capped
        # at 5, without letting the task score go negative.
        ai_issues = check_ai_generated(text)
        if ai_issues:
            penalty = min(5, len(ai_issues) * 2)
            task["points"] = max(0, task["points"] - penalty)
            feedback.append(f"AI markers detected (-{penalty}): {ai_issues[:3]}")
        else:
            task["points"] += 3
            feedback.append("No AI markers (+3)")

    except Exception as e:
        feedback.append(f"Error: {str(e)}")
    finally:
        # Runs on the early return above as well, so the feedback is stored
        # exactly once no matter how grading ended.
        task["feedback"] = "; ".join(feedback)

# Run the Task 6 grader, then echo its score line and feedback.
grade_task6()
print(
    "Task 6: {points}/{max}".format(**results["task6_written"]),
    "Feedback: {feedback}".format(**results["task6_written"]),
    sep="\n",
)

In [None]:
# ============================================================
# FINAL RESULTS
# ============================================================

# Roll the per-task entries up into overall totals.
total_points = sum(entry["points"] for entry in results.values())
max_points = sum(entry["max"] for entry in results.values())

# Human-readable report: banner, one stanza per task, then the grand total.
print("=" * 60, "MODULE 7 ASSESSMENT RESULTS", "=" * 60, sep="\n")

for task_name, task_data in results.items():
    print(
        f"\n{task_name}:",
        f"  Score: {task_data['points']}/{task_data['max']}",
        f"  Feedback: {task_data['feedback']}",
        sep="\n",
    )

print(
    "\n" + "=" * 60,
    f"TOTAL: {total_points}/{max_points} ({100*total_points/max_points:.1f}%)",
    "=" * 60,
    sep="\n",
)

# Save results to JSON: the same totals plus the full per-task breakdown.
final_results = dict(
    module="Module 7 - RAG Pipelines",
    total_points=total_points,
    max_points=max_points,
    percentage=round(100 * total_points / max_points, 1),
    tasks=results,
)

with open("assessment_result.json", "w") as f:
    f.write(json.dumps(final_results, indent=2))

print("\nResults saved to assessment_result.json")