# Práctica 9: Sistemas RAG con LangChain y Chroma

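**Fuente original:** <https://www.pinecone.io/learn/openai-gen-qa/> (tutorial publicado por Pinecone; en este notebook se adapta para usar Chroma como base de datos vectorial).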


Este notebook demuestra cómo implementar un sistema RAG (Retrieval-Augmented Generation) usando **LangChain** y **Chroma** como base de datos vectorial local. 

## Conceptos Clave a Aprender

1. **Embeddings vectoriales** para representación semántica
2. **Bases de datos vectoriales** para búsqueda eficiente
3. **RAG (Retrieval-Augmented Generation)** para mejorar respuestas de LLM
4. **LangChain** como framework unificado para aplicaciones de IA

In [None]:
%pip install -qU openai langchain langchain-openai langchain-chroma datasets tiktoken chromadb python-dotenv --upgrade

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment, so that os.getenv("OPENAI_API_KEY") can find the key when the
# OpenAI clients are created further down.
load_dotenv() 

In [None]:
import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain.chains import RetrievalQA

# Set up the two OpenAI-backed LangChain clients used by the rest of the
# notebook. Both read the API key from the OPENAI_API_KEY environment variable.

# Embedding model: turns text into vectors for the vector store.
embeddings = OpenAIEmbeddings(
    api_key=os.environ.get("OPENAI_API_KEY"),
    model="text-embedding-3-small",
)

# Chat model: temperature 0 requests the least random sampling.
llm = ChatOpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    temperature=0,
    model="gpt-4o-mini",
)

In [None]:
# Baseline experiment: a factual question with a false premise, sent to the
# chat model as-is so we can see how it answers with no retrieved context.
query = "who was the 12th person on the moon and when did they land?"

# Now query WITHOUT context using LangChain
response = llm.invoke(query)
# `response` is a message object; its text is in `.content`
print(response.content)

In [None]:
# Small convenience wrapper so later cells can get an answer in one call
def complete(prompt):
    """Send ``prompt`` to the module-level chat model and return the reply text."""
    return llm.invoke(prompt).content

# A domain-specific question; asked here without any retrieved context.
query = (
    "Which training method should I use for sentence transformers when "
    "I only have pairs of related sentences?"
)

# Last expression of the cell, so the notebook displays the model's answer.
complete(query)

In [None]:
# Use LangChain OpenAI embeddings
# Two short example sentences, embedded together in a single call.
sample_texts = [
    "Sample document text goes here",
    "there will be several phrases in each batch"
]

# Generate embeddings using LangChain
# `vectors` holds one embedding (a list of floats) per input text; the cells
# below inspect its length and contents.
vectors = embeddings.embed_documents(sample_texts)

print(f"Generated {len(vectors)} vectors")
print(f"Each vector has {len(vectors[0])} dimensions")
print(f"First few values of first vector: {vectors[0][:5]}")

In [None]:
# We have created two vectors (one for each sentence input)
len(vectors)

In [None]:
# We have created two 1536-dimensional vectors
len(vectors[0]), len(vectors[1])

In [None]:
# Display the full embedding of the first sample sentence. No new embedding is
# computed here; this indexes the `vectors` list produced above.
vectors[0]

In [None]:
from datasets import load_dataset

# Load the 'train' split of the YouTube transcriptions dataset.
# Later cells index it by row (data[i] -> dict) and by slice
# (data[i:j]['text'] -> list of strings).
data = load_dataset('jamescalam/youtube-transcriptions', split='train')
data  # last expression of the cell: shows the dataset summary

In [None]:
data[0]

In [None]:
from tqdm.auto import tqdm

# Merge consecutive transcript rows into longer, overlapping passages so that
# each stored chunk carries enough surrounding context for retrieval.
new_data = []

window = 20  # number of sentences to combine
stride = 4  # number of sentences to 'stride' over, used to create overlap

for i in tqdm(range(0, len(data), stride)):
    # `i_end` is an EXCLUSIVE bound: the window covers rows i .. i_end - 1.
    # The previous version used the same index both as an exclusive slice
    # bound and as an inclusive row index. The 'end' timestamp and the title
    # check therefore came from a row that was not in the text, the last row
    # of the dataset was never included, and the window could be empty.
    i_end = min(len(data), i + window)
    first = data[i]
    last = data[i_end - 1]  # i < len(data), so the window has at least one row
    if first['title'] != last['title']:
        # the window straddles the end of one video and the start of the
        # next, so we skip it
        continue
    text = ' '.join(data[i:i_end]['text'])
    # create the new merged dataset; all metadata except 'end' comes from the
    # first row of the window
    new_data.append({
        'start': first['start'],
        'end': last['end'],
        'title': first['title'],
        'text': text,
        'id': first['id'],
        'url': first['url'],
        'published': first['published'],
        'channel_id': first['channel_id']
    })

In [None]:
new_data[0]

In [None]:
# Choose where the local Chroma database will live.
# Chroma is used as a local database, so no external service is required.
# NOTE: this cell only sets the path and prints status messages; the
# vectorstore object itself is constructed in a later cell.

# Directory passed to the Chroma vectorstore as `persist_directory`
persist_directory = "./chroma_db"  # Local directory for the database

print("Initializing Chroma vectorstore...")
print(f"Database will be stored in: {persist_directory}")
print("Chroma is ready to use!")

In [None]:
# Build the LangChain Chroma vectorstore. It uses the OpenAI embedding client
# defined earlier and the local directory chosen above.
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
    collection_name="youtube_transcriptions",
)

# Status report (plain strings: nothing is interpolated here)
print(
    "Chroma vectorstore created successfully!",
    "Collection name: youtube_transcriptions",
    "Embedding model: text-embedding-3-small",
    sep="\n",
)

In [None]:
# Get information about the vectorstore
# NOTE(review): `_collection` is an underscore-prefixed (private) attribute of
# the LangChain wrapper, so it may change between library versions.
collection_count = vectorstore._collection.count()
print(f"Number of documents in vectorstore: {collection_count}")
print("Vectorstore is ready for use!")

In [None]:
# Summarise the vectorstore: collection name, current entry count and the
# (hard-coded) label of the embedding model.
collection_info = dict(
    collection_name=vectorstore._collection.name,
    total_documents=vectorstore._collection.count(),
    embedding_function="OpenAI text-embedding-3-small",
)

# One print call with a newline separator produces the same four lines
print(
    "Chroma Vectorstore Information:",
    f"Collection Name: {collection_info['collection_name']}",
    f"Total Documents: {collection_info['total_documents']}",
    f"Embedding Model: {collection_info['embedding_function']}",
    sep="\n",
)

In [None]:
from tqdm.auto import tqdm
import datetime
import time
from time import sleep

batch_size = 100  # how many documents we process at once

# Metadata fields copied from each merged chunk onto its Document
metadata_keys = ('id', 'start', 'end', 'title', 'url', 'published', 'channel_id')

# 1-based numbers of the batches that still failed after the retry
failed_batches = []

# Process data in batches and add to vectorstore
print("Starting to add documents to Chroma vectorstore...")

for i in tqdm(range(0, len(new_data), batch_size)):
    # find end of batch
    i_end = min(len(new_data), i+batch_size)
    meta_batch = new_data[i:i_end]
    batch_number = i // batch_size + 1

    # Create Document objects for LangChain.
    # The element variable is deliberately NOT called `data`: a loop variable
    # with that name would rebind the global `data` dataset to a plain dict
    # and break the windowing cell if it is run again afterwards.
    documents = [
        Document(
            page_content=record['text'],
            metadata={key: record[key] for key in metadata_keys},
        )
        for record in meta_batch
    ]
    # Explicit ids make ingestion repeatable: running this cell again against
    # the persistent store overwrites existing entries instead of appending
    # duplicates. Assumes each chunk's 'id' is unique.
    ids = [record['id'] for record in meta_batch]

    # Add documents to Chroma vectorstore
    # LangChain handles the embedding generation automatically
    try:
        vectorstore.add_documents(documents, ids=ids)
        time.sleep(1)  # Small delay to avoid rate limits
    except Exception as e:
        # Best-effort ingestion: report, wait, retry once, then move on
        print(f"Error processing batch {batch_number}: {e}")
        time.sleep(5)  # Wait longer if there's an error
        # Retry the batch
        try:
            vectorstore.add_documents(documents, ids=ids)
        except Exception as e2:
            print(f"Failed to process batch {batch_number} after retry: {e2}")
            failed_batches.append(batch_number)

if failed_batches:
    # Make permanently skipped batches visible instead of leaving them buried
    # in the progress output
    print(f"\nBatches that could not be stored: {failed_batches}")

print(f"\nCompleted! Total documents in vectorstore: {vectorstore._collection.count()}")

In [None]:
# Perform similarity search using LangChain Chroma
query = "Which training method should I use for sentence transformers when I only have pairs of related sentences?"

# Search for similar documents
# Returns (document, score) pairs for the k=2 best matches.
# NOTE(review): with Chroma the "score" is presumably a distance, i.e. lower
# means more similar; confirm before comparing scores across queries.
docs = vectorstore.similarity_search_with_score(query, k=2)

# Print search results in a readable format
print("Search results:")
print("-" * 80)
for i, (doc, score) in enumerate(docs, 1):
    print(f"\nMatch {i} (Score: {score:.3f})")
    print(f"ID: {doc.metadata['id']}")
    print("\nMetadata:")
    print(f"  Title: {doc.metadata['title']}")
    # the .1f format requires 'start' and 'end' to be numeric
    print(f"  Time: {doc.metadata['start']:.1f}s - {doc.metadata['end']:.1f}s") 
    print(f"  URL: {doc.metadata['url']}")
    print(f"  Published: {doc.metadata['published']}")
    print("\nText:")
    # Show at most the first 300 characters; the conditional expression binds
    # last, so it chooses between the two complete strings.
    print("  " + doc.page_content[:300] + "..." if len(doc.page_content) > 300 else "  " + doc.page_content)

In [None]:
# Default maximum number of characters of retrieved context placed in a prompt
limit = 3750


def retrieve(query, k=3, max_chars=None):
    """Build a context-augmented prompt for ``query``.

    The ``k`` most similar chunks are fetched from the module-level
    ``vectorstore`` and joined with a ``---`` separator. The longest prefix of
    that ranked list whose joined length does not exceed ``max_chars`` is
    kept. If even the first chunk alone is too long, it is truncated to
    ``max_chars`` characters so the prompt never exceeds the limit and never
    loses its best match.

    Args:
        query: The user question.
        k: Number of chunks to request from the vectorstore.
        max_chars: Maximum length of the joined context in characters.
            ``None`` means use the module-level ``limit``.

    Returns:
        The prompt string: instruction, context, question and "Answer:" cue.
    """
    if max_chars is None:
        max_chars = limit

    # Use LangChain's similarity search
    docs = vectorstore.similarity_search(query, k=k)

    # Extract contexts from documents
    contexts = [doc.page_content for doc in docs]

    separator = "\n\n---\n\n"

    # Start from ALL contexts and drop the last one only while the joined
    # text is too long. The previous loop never tested the full set, so it
    # discarded the last context even when everything fitted.
    kept = contexts
    while kept and len(separator.join(kept)) > max_chars:
        kept = kept[:-1]

    # Nothing fits: keep a truncated copy of the best match instead of
    # sending an over-long prompt.
    if contexts and not kept:
        kept = [contexts[0][:max_chars]]

    # build our prompt with the retrieved contexts included
    prompt_start = (
        "Answer the question based on the context below.\n\n"
        "Context:\n"
    )
    prompt_end = f"\n\nQuestion: {query}\nAnswer:"

    return prompt_start + separator.join(kept) + prompt_end

# First we retrieve relevant items from Chroma
# `retrieve` returns the complete prompt (instruction + retrieved context +
# question); we print it to inspect exactly what will be sent to the model.
query = "Which training method should I use for sentence transformers when I only have pairs of related sentences?"
query_with_contexts = retrieve(query)
print(query_with_contexts)

In [None]:
# Then we complete the context-infused query using our simple approach:
# the prompt built by `retrieve` is sent to the chat model via `complete`.
print("=== Manual RAG Implementation ===")
print(complete(query_with_contexts))

In [None]:
# Alternative: Use LangChain's RetrievalQA for a complete RAG system
from langchain.chains import RetrievalQA

# Create a retriever from the vectorstore that requests the 3 most similar
# chunks for each query
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

# Create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # "stuff" means put all retrieved docs into the prompt
    retriever=retriever,
    return_source_documents=True,  # adds "source_documents" to the result
    verbose=True
)

# Test the complete RAG system
query = "Which training method should I use for sentence transformers when I only have pairs of related sentences?"
# Use .invoke(): calling the chain object directly (qa_chain({...})) is the
# deprecated Chain.__call__ path, and .invoke matches llm.invoke used above.
result = qa_chain.invoke({"query": query})

print("=== RAG System Response ===")
print(result["result"])
print("\n=== Source Documents Used ===")
for i, doc in enumerate(result["source_documents"], 1):
    print(f"\nDocument {i}:")
    print(f"Title: {doc.metadata['title']}")
    # only the first 200 characters of each source chunk are shown
    print(f"Text: {doc.page_content[:200]}...")