[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kgweber-cwru/coding-with-ai-wn26/blob/main/series-2-coding-llms/week-4-embeddings-and-rag-concepts/assignment.ipynb)

# Week 4 Assignment: Build Your Own Document Search System

## Objective
Create a semantic search system for a collection of documents from your domain. Demonstrate its retrieval quality, then use the retrieved documents to answer questions.

## Requirements
1. Collect or create 10+ documents relevant to your work
2. Build a document store with embeddings
3. Implement semantic search
4. Demonstrate search with multiple queries
5. Build a simple RAG system that answers questions using your documents

In [None]:
import os
import sys
from pathlib import Path

# True when the `google.colab` module is already loaded, i.e. the notebook is running in Colab.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # Notebook shell escape (not plain Python): installs the packages this notebook imports.
    !pip install -q google-genai google-auth python-dotenv numpy
    from google.colab import auth
    # Authenticate the current Colab user before any Google Cloud client is created.
    auth.authenticate_user()
    try:
        PROJECT_ID = input("Enter your Google Cloud Project ID (press Enter to use default ADC): ").strip()
    except Exception:
        # Any failure of the prompt is treated the same as entering nothing.
        PROJECT_ID = ""
    if PROJECT_ID:
        # Exported only when a project was entered; otherwise the variable is left untouched.
        os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
else:
    def find_service_account_json(max_up=6):
        """Locate a service-account JSON key near the current working directory.

        Starting at ``Path.cwd()`` and moving up one parent per step, each
        directory is checked for ``series-2-coding-llms/creds`` and then
        ``creds``. Within a directory the ``*.json`` files are sorted by name
        so the same key is picked on every run, even when several are present.

        Args:
            max_up: Maximum number of directory levels to inspect, counting
                the current directory as the first level.

        Returns:
            The resolved absolute path of the key file as a string, or
            ``None`` when no JSON file is found.
        """
        current = Path.cwd()
        for _ in range(max_up):
            for creds_dir in (current / "series-2-coding-llms" / "creds", current / "creds"):
                if not creds_dir.is_dir():
                    continue
                # glob() order depends on the filesystem; sort for a stable pick,
                # and skip directories that merely happen to end in ".json".
                json_files = sorted(f for f in creds_dir.glob("*.json") if f.is_file())
                if json_files:
                    return str(json_files[0].resolve())
            if current.parent == current:
                # Filesystem root reached: climbing further would re-check the same place.
                break
            current = current.parent
        return None

    # Prefer a key file found on disk; only without one is a .env file consulted.
    sa_path = find_service_account_json()
    if not sa_path:
        try:
            from dotenv import load_dotenv
            load_dotenv()
        except Exception:
            # Best effort: a missing or failing python-dotenv is deliberately non-fatal.
            pass
    else:
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = sa_path


In [None]:
import numpy as np
from google import genai
from google.genai import types
import google.auth

# Resolve Application Default Credentials together with the project they carry (may be None).
creds, project = google.auth.default()
# An exported GOOGLE_CLOUD_PROJECT takes precedence over the ADC project. Using `or`
# instead of a .get() default also ignores a variable that is set but empty.
project = os.environ.get("GOOGLE_CLOUD_PROJECT") or project
if not project:
    # Do not abort here: the client may still resolve a project on its own.
    # The hint makes a failure in the next line easier to diagnose.
    print("⚠️ No Google Cloud project detected. Set GOOGLE_CLOUD_PROJECT if client creation fails.")
client = genai.Client(vertexai=True, project=project, location="us-central1")
print(f"Using project: {project}")

print("✅ Environment loaded successfully!")

## Helper Functions

In [None]:
def get_embedding(text, model="text-embedding-004"):
    """Embed *text* with the shared ``client`` and return the resulting vector.

    Newlines are replaced with single spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Embedding model name passed through to ``embed_content``.

    Returns:
        The ``values`` of the first embedding in the response.
    """
    single_line = " ".join(text.split("\n"))
    result = client.models.embed_content(model=model, contents=single_line)
    first_embedding = result.embeddings[0]
    return first_embedding.values

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        vec1: First vector (any sequence NumPy can convert to a float array).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero magnitude. Cosine similarity is
        undefined there; returning 0.0 avoids a NaN, which would otherwise
        sort to the top of a descending ``argsort`` ranking.
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0.0:
        return 0.0
    return np.dot(a, b) / norm_product

class SimpleDocumentStore:
    """In-memory store that pairs each document text with its embedding.

    Embeddings are obtained from ``get_embedding`` and searches rank
    documents by ``cosine_similarity`` against the query embedding.
    """

    def __init__(self):
        # Parallel lists: embeddings[i] is the vector for documents[i].
        self.documents = []
        self.embeddings = []

    def add_document(self, text):
        """Embed *text* and append it to the store."""
        # Embed first so a failed embedding call leaves both lists unchanged.
        embedding = get_embedding(text)
        self.documents.append(text)
        self.embeddings.append(embedding)

    def add_documents(self, texts):
        """Add every text in *texts*, in order, via ``add_document``."""
        for text in texts:
            self.add_document(text)

    def search(self, query, top_k=3):
        """Return the ``top_k`` documents most similar to *query*.

        Args:
            query: Text to embed and compare against the stored documents.
            top_k: Maximum number of hits; ``None`` returns every document.

        Returns:
            A list of dicts with keys ``"document"``, ``"similarity"`` and
            ``"index"`` (a plain ``int`` position in the store), ordered from
            most to least similar. Empty when the store holds no documents
            or ``top_k`` is not positive.
        """
        # Nothing can be returned in these cases, so skip the embedding request.
        if not self.documents or (top_k is not None and top_k <= 0):
            return []
        query_embedding = get_embedding(query)
        similarities = [cosine_similarity(query_embedding, emb) for emb in self.embeddings]
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [
            # int() turns the NumPy index into a plain, JSON-serialisable int.
            {"document": self.documents[i], "similarity": similarities[i], "index": int(i)}
            for i in top_indices
        ]

    def __len__(self):
        """Return the number of stored documents."""
        return len(self.documents)

## Step 1: Describe Your Document Collection

**YOUR DESCRIPTION**

Domain: [your field]

Document type: [e.g., research summaries, protocols, FAQs, case studies]

Why this collection: [explain relevance]

Number of documents: [X]

## Step 2: Create Your Document Collection

In [None]:
# YOUR DOCUMENTS HERE
# Replace the placeholders with your own texts, one string per document.
my_documents = [
    "Document 1 text...",
    "Document 2 text...",
    # Add at least 10 documents
]

print(f"Created collection with {len(my_documents)} documents")

# Check the collection before any embedding requests are made for it.
MIN_DOCUMENTS = 10  # the assignment asks for 10+ documents
if len(my_documents) < MIN_DOCUMENTS:
    print(f"⚠️ Only {len(my_documents)} documents; the assignment asks for at least {MIN_DOCUMENTS}.")
blank_positions = [pos for pos, doc in enumerate(my_documents, 1) if not doc.strip()]
if blank_positions:
    print(f"⚠️ Empty documents at positions: {blank_positions}")

## Step 3: Build Your Document Store

In [None]:
# Start from an empty store, then embed and index the documents one at a time
store = SimpleDocumentStore()
print("Adding documents...")
for doc_text in my_documents:
    store.add_document(doc_text)
print(f"✅ Indexed {len(store)} documents")

## Step 4: Test Semantic Search

In [None]:
# Run each query against the store and list its three best matches
test_queries = [
    "YOUR QUERY 1",
    "YOUR QUERY 2",
    "YOUR QUERY 3",
]

for query in test_queries:
    # Header: the query itself followed by a 70-character rule
    print(f"\nQuery: {query}")
    print("="*70)
    results = store.search(query, top_k=3)
    # Ranks are numbered from 1; only the first 100 characters of each hit are shown
    for i, result in enumerate(results, start=1):
        print(f"{i}. [Score: {result['similarity']:.3f}] {result['document'][:100]}...")

## Step 5: Build RAG System

In [None]:
def rag_query(question, doc_store, top_k=2, model="gemini-2.5-flash-lite", temperature=0.3):
    """Answer *question* with retrieval-augmented generation.

    The ``top_k`` best matches from ``doc_store.search`` are numbered and
    placed in the prompt as context; the shared ``client`` then generates
    the answer.

    Args:
        question: The user's question; also used as the retrieval query.
        doc_store: Object with ``search(query, top_k=...)`` returning dicts
            that contain a ``"document"`` key.
        top_k: Number of documents to retrieve as context.
        model: Generation model name passed to ``generate_content``.
        temperature: Sampling temperature for the generation config.

    Returns:
        A dict with ``"answer"`` (the generated text) and ``"sources"``
        (the retrieval results). When nothing is retrieved, the model is
        not called and ``"answer"`` is a fixed notice instead.
    """
    # Retrieve
    results = doc_store.search(question, top_k=top_k)
    if not results:
        # With no context there is nothing to ground the answer in, so skip
        # the model call rather than let it answer from an empty prompt.
        return {
            "answer": "No relevant documents were found to answer this question.",
            "sources": results,
        }
    context = "\n\n".join(f"Source {i}: {r['document']}" for i, r in enumerate(results, 1))

    # Generate
    # Built line by line so the prompt text stays exactly as before, including
    # the whitespace-only second line.
    prompt = (
        "Answer based on the context below.\n"
        "    \n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {question}\n"
        "\n"
        "Answer:"
    )

    response = client.models.generate_content(
        model=model,
        contents=prompt,
        config=types.GenerateContentConfig(temperature=temperature),
    )

    return {"answer": response.text, "sources": results}

# Ask each question through the RAG pipeline and show the answer with its sources
questions = [
    "YOUR QUESTION 1",
    "YOUR QUESTION 2",
]

for question in questions:
    # Header: the question followed by a 70-character rule
    print(f"\nQ: {question}")
    print("="*70)
    result = rag_query(question, store)
    print(f"A: {result['answer']}")
    # Sources are numbered from 1 and cut to their first 80 characters
    print("\nSources:")
    for i, src in enumerate(result['sources'], start=1):
        print(f"  {i}. {src['document'][:80]}...")
    print()

## Reflection

### 1. How well did semantic search perform?

**YOUR ANSWER**

### 2. Did RAG answers improve with retrieved context?

**YOUR ANSWER**

### 3. What would make this system more useful?

**YOUR ANSWER**

## Bonus: If documents are long, implement chunking

Adapt the chunking function from this week's concepts material, then rebuild your store so that it indexes chunks instead of full documents.

In [None]:
# BONUS CODE HERE