# Week 8: Retrieval-Augmented Generation (Solution)

Complete solutions for the Week 8 RAG exercises.

In [None]:
import numpy as np
from typing import Dict, List, Tuple, Optional
import chromadb
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import os
import logging

# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger; used below to report ingestion progress.
logger = logging.getLogger(__name__)

## Part 1: Document Chunking - SOLUTION

In [None]:
class DocumentChunker:
    """Split raw text into word-bounded chunks suitable for embedding.

    ``chunk_size`` is the maximum number of whitespace-separated words per
    chunk. ``overlap`` is the number of words shared by consecutive chunks
    produced by :meth:`chunk_text`; :meth:`chunk_by_sentence` does not use it.
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str) -> List[str]:
        """Return sliding-window chunks of at most ``chunk_size`` words.

        Consecutive windows start ``chunk_size - overlap`` words apart, so each
        chunk repeats the last ``overlap`` words of the previous one.

        Raises:
            ValueError: if ``chunk_size`` is not positive, or ``overlap`` is
                negative or not smaller than ``chunk_size`` (the window would
                never advance, or would skip words).
        """
        if self.chunk_size <= 0:
            raise ValueError(
                f'chunk_size must be positive, got {self.chunk_size}'
            )
        if not 0 <= self.overlap < self.chunk_size:
            raise ValueError(
                f'overlap must satisfy 0 <= overlap < chunk_size, '
                f'got overlap={self.overlap}, chunk_size={self.chunk_size}'
            )

        words = text.split()
        step = self.chunk_size - self.overlap
        chunks = []
        for start in range(0, len(words), step):
            end = start + self.chunk_size
            chunks.append(' '.join(words[start:end]))
            # Once a window reaches the final word, any further window would
            # contain only words already present in this chunk.
            if end >= len(words):
                break
        return chunks

    def chunk_by_sentence(self, text: str) -> List[str]:
        """Group whole sentences into chunks of roughly ``chunk_size`` words.

        Sentences are never split: a sentence is appended to the current chunk
        unless doing so would exceed ``chunk_size`` words, in which case a new
        chunk is started. A single sentence longer than ``chunk_size`` therefore
        becomes its own oversized chunk. ``overlap`` is not applied here.
        """
        # Imported lazily so the class is usable without nltk when only
        # chunk_text is needed.
        import nltk

        try:
            sentences = nltk.sent_tokenize(text)
        except LookupError:
            # Tokenizer data is missing: fetch it once and retry, instead of
            # hitting the downloader on every call. Newer NLTK releases load
            # 'punkt_tab' rather than 'punkt', so request both.
            nltk.download('punkt', quiet=True)
            nltk.download('punkt_tab', quiet=True)
            sentences = nltk.sent_tokenize(text)

        chunks = []
        current_chunk = []
        current_length = 0
        for sent in sentences:
            sent_len = len(sent.split())
            if current_length + sent_len > self.chunk_size and current_chunk:
                # Adding this sentence would overflow: flush and start afresh.
                chunks.append(' '.join(current_chunk))
                current_chunk = [sent]
                current_length = sent_len
            else:
                current_chunk.append(sent)
                current_length += sent_len
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

## Part 2: Vector Store - SOLUTION

In [None]:
class VectorStore:
    """Thin wrapper around a Chroma collection with local sentence embeddings.

    Documents are embedded with the ``all-MiniLM-L6-v2`` SentenceTransformer
    model and stored under sequential ids of the form ``doc_<n>``.
    """

    def __init__(self, collection_name: str = 'documents'):
        self.client = chromadb.Client()
        self.collection = self.client.get_or_create_collection(collection_name)
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        # get_or_create_collection may return a collection that already holds
        # documents (e.g. when a notebook cell is re-run). Continue numbering
        # after them so new ids do not collide with existing ``doc_<n>`` ids.
        # NOTE(review): assumes existing entries were added via this class.
        self.doc_id = self.collection.count()

    def add_documents(self, documents: List[str], metadata: Optional[List[Dict]] = None):
        """Embed ``documents`` and add them to the collection.

        Args:
            documents: Texts to index. An empty list is a no-op.
            metadata: Optional per-document metadata, one dict per document.

        Raises:
            ValueError: if ``metadata`` is given but its length differs from
                ``documents``.
        """
        if not documents:
            return

        # Pass None rather than placeholder ``{}`` dicts when no metadata is
        # supplied: some chromadb releases reject empty metadata dicts.
        metadatas = metadata if metadata else None
        if metadatas is not None and len(metadatas) != len(documents):
            raise ValueError(
                f'metadata length ({len(metadatas)}) does not match '
                f'documents length ({len(documents)})'
            )

        embeddings = self.embedding_model.encode(documents).tolist()
        ids = [f'doc_{self.doc_id + i}' for i in range(len(documents))]
        self.collection.add(
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas,
            ids=ids,
        )
        self.doc_id += len(documents)

    def search(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
        """Return up to ``top_k`` ``(document, distance)`` pairs for ``query``.

        Pairs are returned in the order Chroma reports them. Returns an empty
        list when the collection is empty or ``top_k`` is not positive.
        """
        available = self.collection.count()
        if top_k <= 0 or available == 0:
            return []

        query_embedding = self.embedding_model.encode([query]).tolist()
        results = self.collection.query(
            query_embeddings=query_embedding,
            # Never request more neighbours than the collection contains.
            n_results=min(top_k, available),
        )
        # Results are nested per query; only one query was sent.
        docs = results['documents'][0]
        distances = results['distances'][0]
        return list(zip(docs, distances))

## Part 3: RAG System - SOLUTION

In [None]:
class RAGSystem:
    """Retrieve context from a vector store and answer questions with an LLM."""

    def __init__(self, vector_store: VectorStore, llm_model: str = 'gpt-3.5-turbo'):
        self.vector_store = vector_store
        # The API key is read from the OPENAI_API_KEY environment variable.
        self.llm_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        self.model = llm_model

    def retrieve(self, query: str, top_k: int = 3) -> List[str]:
        """Return the texts of the ``top_k`` best-matching stored documents."""
        results = self.vector_store.search(query, top_k)
        # Drop the distance scores; only the document text is used as context.
        return [doc for doc, _ in results]

    def generate(self, query: str, context: List[str]) -> str:
        """Ask the chat model to answer ``query`` using ``context`` passages.

        Returns the model's reply text, or an empty string when the API returns
        a message with no text content.
        """
        context_text = '\n\n'.join(context)
        prompt = f'Context:\n{context_text}\n\nQuestion: {query}\n\nAnswer based on the context:'
        response = self.llm_client.chat.completions.create(
            model=self.model,
            messages=[{'role': 'user', 'content': prompt}],
            temperature=0.3
        )
        # ``content`` is nullable in the API response; honour the ``str``
        # return annotation instead of leaking None to callers.
        return response.choices[0].message.content or ''

    def query(self, question: str, top_k: int = 3) -> Dict:
        """Run retrieval then generation for ``question``.

        Args:
            question: The user's question.
            top_k: Number of context passages to retrieve (default 3).

        Returns:
            A dict with keys ``'question'``, ``'answer'`` and ``'context'``
            (the list of retrieved passages passed to the model).
        """
        context = self.retrieve(question, top_k)
        answer = self.generate(question, context)
        return {
            'question': question,
            'answer': answer,
            'context': context
        }

## Part 4: Enterprise Chatbot - SOLUTION

In [None]:
class EnterpriseKnowledgeChatbot:
    """Question-answering chatbot over an ingested set of documents.

    Wires together a sentence-based chunker (300-word chunks), a vector store
    backed by the ``enterprise_kb`` collection, and a RAG pipeline.
    """

    def __init__(self):
        self.chunker = DocumentChunker(chunk_size=300, overlap=50)
        self.vector_store = VectorStore('enterprise_kb')
        self.rag_system = RAGSystem(self.vector_store)

    def ingest_documents(self, documents: List[str]):
        """Chunk each document by sentence and index all chunks."""
        all_chunks = []
        for doc in documents:
            all_chunks.extend(self.chunker.chunk_by_sentence(doc))
        # Skip the store call when there is nothing to index (no documents, or
        # only empty ones) rather than sending an empty batch.
        if all_chunks:
            self.vector_store.add_documents(all_chunks)
        logger.info(
            'Ingested %d chunks from %d documents',
            len(all_chunks),
            len(documents),
        )

    def chat(self, question: str) -> str:
        """Return only the answer text for ``question``."""
        result = self.rag_system.query(question)
        return result['answer']

    def evaluate_answer(self, answer: str, context: List[str]) -> Dict:
        """Compute simple heuristics about an answer.

        Returns:
            A dict with:
            ``'answer_length'`` - number of whitespace-separated words;
            ``'uses_context'`` - True if the first 50 characters of any
            non-blank context passage appear verbatim in the answer;
            ``'context_count'`` - number of context passages supplied.
        """
        # Blank snippets are skipped: an empty string is a substring of every
        # answer and would otherwise always count as a citation.
        has_citation = any(
            ctx[:50].strip() and ctx[:50] in answer for ctx in context
        )
        return {
            'answer_length': len(answer.split()),
            'uses_context': bool(has_citation),
            'context_count': len(context)
        }

## Example Usage

In [None]:
# Build the chatbot: this creates the chunker, the vector store and the
# RAG pipeline. The LLM client reads its key from OPENAI_API_KEY.
chatbot = EnterpriseKnowledgeChatbot()

# Sample documents: a tiny knowledge base covering three unrelated topics.
docs = [
    'Our company was founded in 2020. We specialize in AI solutions for healthcare.',
    'Our vacation policy allows 20 days per year. Employees can roll over up to 5 unused days.',
    'The office is located at 123 Main St. Working hours are 9 AM to 5 PM.'
]

# Index the documents, then ask a question answered by the second one.
chatbot.ingest_documents(docs)
answer = chatbot.chat('What is the vacation policy?')
print(f'Answer: {answer}')