In [1]:
# Data Ingestion

import os

from dotenv import load_dotenv

load_dotenv()

# Read your API key
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables!")


In [2]:
from sentence_transformers import SentenceTransformer

# Load HF embedding model
embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
print(f"✓ Loaded embedding model: BAAI/bge-large-en-v1.5")

def get_embedding(text, input_type="document"):
    """Generate embeddings using Hugging Face model"""
    embedding = embedding_model.encode(text, convert_to_tensor=False)
    return embedding.tolist()

✓ Loaded embedding model: BAAI/bge-large-en-v1.5


In [3]:
embeddings = get_embedding("AI TECHNOLOGY")
print(embeddings)

[0.01917918398976326, 0.034076254814863205, -0.014155448414385319, -0.011123940348625183, 0.008389163762331009, -0.0028036769945174456, -0.018184814602136612, -0.0040381792932748795, 0.003224150976166129, 0.033781394362449646, -0.017017804086208344, 0.023310476914048195, -0.013705801218748093, -0.025031737983226776, -0.007939236238598824, 0.0038879583589732647, 0.0019610035233199596, 0.0027043744921684265, 0.0016844150377437472, -0.0020721701439470053, 0.011545512825250626, 0.045777078717947006, -0.042421914637088776, -0.02855719067156315, -0.0338626392185688, 0.028979672119021416, 0.010323243215680122, 0.00826785247772932, 0.07991677522659302, 0.0356597863137722, -0.003495182376354933, -0.018108438700437546, 0.02561355195939541, -0.030184539034962654, -0.008680817671120167, -0.02041185274720192, -0.005613876506686211, -0.01719573512673378, -0.05497545003890991, -0.044544536620378494, -0.011060413904488087, -0.0038015968166291714, -0.02026393823325634, -0.1032286062836647, -0.092809744

In [4]:

## PyMuPDFLoader --> points to html

# load the pdf, and split it
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# load
loader = PyPDFLoader("https://www.fidelity.com/bin-public/060_www_fidelity_com/documents/about-fidelity/2024-Fidelity-Investments-Annual-Report.pdf")
data = loader.load()

# split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
documents = text_splitter.split_documents(data)

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


In [6]:
documents

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2025-02-13T13:05:16-05:00', 'author': 'Fidelity Investments', 'keywords': '2024, Fidelity Investments, Fidelity, Annual Report', 'moddate': '2025-02-13T18:31:58-05:00', 'subject': "Learn how Fidelity's financial strength and operational stability allows us to deliver products and services that meet the needs of our customers and clients.", 'title': '2024 Fidelity Investments Annual Report', 'trapped': '/False', 'source': 'https://www.fidelity.com/bin-public/060_www_fidelity_com/documents/about-fidelity/2024-Fidelity-Investments-Annual-Report.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Annual Report\n2024'),
 Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2025-02-13T13:05:16-05:00', 'author': 'Fidelity Investments', 'keywords': '2024, Fidelity Investments, Fidelity, Annual R

In [7]:
## DOCS to prepare for insertions

docs_to_insert = [{
    "text"  : doc.page_content,
    "embedding" : get_embedding(doc.page_content)
} for doc in documents]

In [8]:
# Print the text of the first 5 documents in the final list
for i, doc in enumerate(docs_to_insert[:5]):
    print(f"--- Document {i} ---")
    print(doc['text'])

# Check the total count
print(f"\nTotal documents successfully inserted: {len(docs_to_insert)}")

--- Document 0 ---
Annual Report
2024
--- Document 1 ---
About Fidelity
Fidelity’s mission is to strengthen the 
financial well-being of our customers 
and deliver better outcomes for the 
clients and businesses we serve. 
Fidelity’s strength comes from the 
scale of our diversified, market-
leading financial services businesses 
that serve individuals, families, 
employers, wealth management 
firms, and institutions. With assets
--- Document 2 ---
under administration of $15.1 trillion, 
including discretionary assets of  
$5.9 trillion, we focus on meeting 
the unique needs of a broad and 
growing customer base. Privately 
held for 78 years, Fidelity employs 
more than 77,000 associates across 
North America, Europe, Asia, and 
Australia. For more information,  
visit Fidelity.com.
1
Customer Engagement
--- Document 3 ---
Customer Engagement
Our robust service offerings allow us to meet our customers where they are — whether that is in 
person, on social media, over the phone, or thr

In [None]:
from pydantic.v1 import BaseModel
import chromadb

# Define where to save the database
PERSIST_DIR = "./chroma_db_data"

# Initialize Persistent Client (this creates the database)
client = chromadb.PersistentClient(path=PERSIST_DIR)

print(f"✓ ChromaDB created at: {PERSIST_DIR}")


In [None]:
collection = client.get_or_create_collection(
    name="ragpdf")

print(f"✓ Created cluster: ragpdf")

In [None]:
import uuid

ids = [str(uuid.uuid4()) for _ in docs_to_insert]
documents = [doc["text"] for doc in docs_to_insert]
embeddings = [doc["embedding"] for doc in docs_to_insert]

# Insert
collection.add(
    ids=ids,
    documents=documents,
    embeddings=embeddings
    )

collection.count()
collection

In [None]:
#### PHASE 2


import chromadb
import uuid

# Connect to ChromaDB
client = chromadb.PersistentClient(path="./chroma_db_data")

# Create collection with vector index configuration
collection = client.get_or_create_collection(
    name="ragpdf",
    metadata={
        "hnsw:space": "cosine",
        "hnsw:construction_ef": 200,
        "hnsw:search_ef": 100,
        "hnsw:M": 16
    }
)

your_ids = [str(uuid.uuid4()) for _ in docs_to_insert]
your_texts = [doc["text"] for doc in docs_to_insert]
your_embeddings = [doc["embedding"] for doc in docs_to_insert]

collection.add(
    ids=your_ids,
    documents=your_texts,
    embeddings=your_embeddings
)

print(f"✓ Vector search index created with {collection.count()} documents")

In [None]:
results = collection.get(
    limit=5,
    include=['documents', 'embeddings'] # Explicitly ask for text and embeddings
)

In [None]:
results

In [None]:
def get_retrieved_context(query_text):
    # 1. Embed the input query text
    query_embedding = get_embedding(query_text)

    # 2. Vector Search ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=5,
        include=['documents']
    )

    retrieved_documents = results['documents'][0]
    context_string = "\n---\n".join(retrieved_documents)


    return context_string

In [None]:
#### Phase 3


from openai import OpenAI

# Define the question
query = "“According to the retrieved text, what does Fidelity emphasize about supporting customers?"

# 1. RETRIEVAL: Get the context string using the corrected function
context_string = get_retrieved_context(query)

# 2. GENERATION: Construct the RAG Prompt
rag_prompt = f"""
Use ONLY the provided context to answer the question.
If the answer is not in the context, state that explicitly.

QUESTION: {query}

CONTEXT:
{context_string}
"""

# 3. LLM API Call (Corrected message format)
openai_client = OpenAI()
model_name = "gpt-4o"

completion = openai_client.chat.completions.create(
    model=model_name,
    messages=[

        # Fix 2: User message contains the entire RAG prompt
        {"role": "user", "content": rag_prompt}
    ]
)


In [None]:
# Output the final answer
final_answer = completion.choices[0].message.content
print("\n--- LLM Final Answer ---")
print(final_answer)