In [14]:
# Data Ingestion

import os

from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment
load_dotenv()

# Fail fast when the OpenAI credential is missing or empty
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("OPENAI_API_KEY not found in environment variables!")


In [17]:
from sentence_transformers import SentenceTransformer

# Instantiate the Hugging Face sentence-embedding model once; the id is kept in
# a single place so the loader call and the log line cannot drift apart.
_EMBEDDING_MODEL_ID = "BAAI/bge-large-en-v1.5"
embedding_model = SentenceTransformer(_EMBEDDING_MODEL_ID)
print("✓ Loaded embedding model: " + _EMBEDDING_MODEL_ID)

# Instruction the BGE v1.5 model card recommends prepending to short retrieval
# queries (passages/documents are embedded without any prefix).
BGE_QUERY_INSTRUCTION = "Represent this sentence for searching relevant passages: "


def get_embedding(text, input_type="document"):
    """Generate embeddings using the Hugging Face model.

    Args:
        text: A single string, or a list of strings to embed in one batched call.
        input_type: "document" (default) embeds the text unchanged. "query"
            prepends the BGE retrieval instruction before embedding. Any other
            value is treated like "document", so existing callers are unaffected.

    Returns:
        The result of ``encode(...).tolist()``: a list of floats for a single
        string, or a list of such lists when a list of strings is passed.
    """
    if input_type == "query":
        if isinstance(text, str):
            text = BGE_QUERY_INSTRUCTION + text
        else:
            text = [BGE_QUERY_INSTRUCTION + item for item in text]

    embedding = embedding_model.encode(text, convert_to_tensor=False)
    return embedding.tolist()

✓ Loaded embedding model: BAAI/bge-large-en-v1.5


In [None]:
# Smoke-test the embedding helper. Report the vector size and a short prefix
# instead of printing every component of the vector.
embeddings = get_embedding("AI TECHNOLOGY")
print(f"Embedding dimension: {len(embeddings)}")
print(embeddings[:5])

In [None]:

## Using PyPDFLoader here (note on PyMuPDFLoader --> points to html)

# load the pdf, and split it
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Source PDF and chunking parameters
PDF_URL = "https://www.fidelity.com/bin-public/060_www_fidelity_com/documents/about-fidelity/2024-Fidelity-Investments-Annual-Report.pdf"
CHUNK_SIZE = 400
CHUNK_OVERLAP = 20

# Fetch and parse the PDF
loader = PyPDFLoader(PDF_URL)
data = loader.load()

# Break the loaded documents into small, slightly overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
documents = text_splitter.split_documents(data)

In [None]:
# Show the chunk count and a small sample rather than dumping every chunk
print(f"Total chunks: {len(documents)}")
documents[:3]

In [None]:
## DOCS to prepare for insertions

# Embed every chunk with ONE batched call instead of one model call per chunk;
# get_embedding forwards a list of strings to the model's encode() and returns
# one vector per input string, in the same order.
chunk_texts = [doc.page_content for doc in documents]
chunk_vectors = get_embedding(chunk_texts) if chunk_texts else []

# Pair each chunk's text with its vector; this is the record shape the
# insertion cells read ("text" / "embedding" keys).
docs_to_insert = [
    {"text": text, "embedding": vector}
    for text, vector in zip(chunk_texts, chunk_vectors)
]

In [None]:
# Print the text of the first 5 prepared documents
for i, doc in enumerate(docs_to_insert[:5]):
    print(f"--- Document {i} ---")
    print(doc['text'])

# This cell only inspects the in-memory list; it does not write to the vector
# store, so the count is reported as "prepared", not "inserted".
print(f"\nTotal documents prepared for insertion: {len(docs_to_insert)}")

In [None]:
import chromadb

# On-disk location of the Chroma database
PERSIST_DIR = "./chroma_db_data"

# Open a persistent client on that path (this creates the database)
client = chromadb.PersistentClient(
    path=PERSIST_DIR,
)

print("✓ ChromaDB created at: {}".format(PERSIST_DIR))


In [None]:
# Create the collection WITH its HNSW index settings up front. These are the
# same values the later "PHASE 2" cell requests; index settings such as the
# distance metric are applied when a collection is created, so they have to be
# supplied the first time "ragpdf" is created rather than afterwards.
# NOTE(review): a "ragpdf" collection persisted by an earlier run keeps the
# settings it was created with — delete ./chroma_db_data to rebuild it.
collection = client.get_or_create_collection(
    name="ragpdf",
    metadata={
        "hnsw:space": "cosine",
        "hnsw:construction_ef": 200,
        "hnsw:search_ef": 100,
        "hnsw:M": 16,
    },
)

# get_or_create may open an existing collection, so report "ready", not "created"
print("✓ Collection ready: ragpdf")

In [None]:
import uuid

# Use dedicated names here: rebinding `documents` to plain strings would break
# a re-run of the cell that builds docs_to_insert from the LangChain documents.
insert_texts = [doc["text"] for doc in docs_to_insert]
insert_embeddings = [doc["embedding"] for doc in docs_to_insert]

# Deterministic IDs (chunk position + text) make the insert idempotent: running
# this cell again against the persistent store overwrites the same records
# instead of appending duplicates under fresh random UUIDs.
insert_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{position}:{text}"))
    for position, text in enumerate(insert_texts)
]

# Insert (upsert = add new IDs, overwrite existing ones); skip when there is
# nothing to write.
if insert_ids:
    collection.upsert(
        ids=insert_ids,
        documents=insert_texts,
        embeddings=insert_embeddings,
    )

# Only a cell's last expression is displayed, so print the count explicitly
print(f"Collection now holds {collection.count()} records")
collection

In [None]:
#### PHASE 2


import chromadb
import uuid

# Connect to ChromaDB
client = chromadb.PersistentClient(path="./chroma_db_data")

# HNSW index settings requested for the collection
index_metadata = {
    "hnsw:space": "cosine",
    "hnsw:construction_ef": 200,
    "hnsw:search_ef": 100,
    "hnsw:M": 16,
}

# Create collection with vector index configuration.
# NOTE(review): index settings take effect when a collection is CREATED; if
# "ragpdf" already exists, this call opens it with whatever settings it was
# created with — confirm against the installed chromadb version.
collection = client.get_or_create_collection(
    name="ragpdf",
    metadata=index_metadata,
)

# Surface a mismatch instead of silently searching with a different metric
active_space = (collection.metadata or {}).get("hnsw:space")
if active_space != index_metadata["hnsw:space"]:
    print(
        f"⚠ Collection 'ragpdf' reports hnsw:space={active_space!r}; "
        "delete the collection (or ./chroma_db_data) and re-run to apply the cosine index settings"
    )

your_texts = [doc["text"] for doc in docs_to_insert]
your_embeddings = [doc["embedding"] for doc in docs_to_insert]

# Deterministic IDs (chunk position + text) so that re-running this cell
# overwrites the same records rather than adding another copy of every chunk
# under fresh random UUIDs.
your_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{position}:{text}"))
    for position, text in enumerate(your_texts)
]

# upsert = add new IDs, overwrite existing ones; skip when there is nothing to write
if your_ids:
    collection.upsert(
        ids=your_ids,
        documents=your_texts,
        embeddings=your_embeddings,
    )

print(f"✓ Vector search index created with {collection.count()} documents")

In [None]:
# Pull a handful of stored records, explicitly requesting both the text and
# the embedding vectors
sample_fields = ['documents', 'embeddings']
results = collection.get(include=sample_fields, limit=5)

In [None]:
# Echo `results` as the cell's last expression so the notebook renders it
results

In [None]:
def get_retrieved_context(query_text, n_results=5):
    """Retrieve the stored chunks most similar to a query as one context string.

    Args:
        query_text: The question or search text to embed.
        n_results: How many chunks to request from the collection (default 5,
            the value previously hard-coded).

    Returns:
        The retrieved chunk texts joined with "\\n---\\n"; an empty string when
        the search returns no documents.
    """
    # 1. Embed the input query text, flagged as a query so the embedding helper
    #    can treat it differently from stored documents
    query_embedding = get_embedding(query_text, input_type="query")

    # 2. Vector Search ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=['documents']
    )

    # One list of documents comes back per query embedding; a single query was
    # sent, so read the first list. Tolerate a missing/empty result and drop
    # None entries so the join below cannot fail.
    per_query_documents = results.get('documents') or [[]]
    retrieved_documents = [doc for doc in per_query_documents[0] if doc is not None]

    return "\n---\n".join(retrieved_documents)

In [None]:
#### Phase 3


from openai import OpenAI

# Define the question (a stray opening curly quote was removed from the text)
query = "According to the retrieved text, what does Fidelity emphasize about supporting customers?"

# 1. RETRIEVAL: fetch the most relevant chunks as a single context string
context_string = get_retrieved_context(query)

# 2. GENERATION: build the RAG prompt, restricting the model to the retrieved context
rag_prompt = f"""
Use ONLY the provided context to answer the question.
If the answer is not in the context, state that explicitly.

QUESTION: {query}

CONTEXT:
{context_string}
"""

# 3. LLM API call: the entire RAG prompt is sent as one user message
openai_client = OpenAI()
model_name = "gpt-4o"

completion = openai_client.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "user", "content": rag_prompt}
    ]
)

# Output the final answer (text of the first returned choice)
final_answer = completion.choices[0].message.content
print("\n--- LLM Final Answer ---")
print(final_answer)