In [None]:
from dotenv import load_dotenv

# Load key/value pairs from a local .env file (python-dotenv) so that later
# cells can read API keys and settings through os.getenv / os.environ.
load_dotenv()

In [None]:
import os

# OPENAI_API_KEY is needed by every later cell that talks to OpenAI, so fail
# fast with a readable message. Re-assigning os.environ[name] = os.getenv(name)
# would instead raise "TypeError: str expected, not NoneType" when it is unset.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your environment or .env file."
    )

# LangSmith tracing is optional: turn it on only when credentials exist, so a
# missing tracing key does not block the rest of the notebook.
if os.getenv("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    if not os.getenv("LANGCHAIN_PROJECT"):
        print("LANGCHAIN_PROJECT is not set; traces will go to LangSmith's default project.")
else:
    print("LANGCHAIN_API_KEY is not set; LangSmith tracing is left disabled.")

In [None]:
from unstructured.partition.pdf import partition_pdf

file_path = '/Users/kamal/Desktop/AgenticAI/Uploads/Attention Is All You Need.pdf'

# Layout / extraction options: high-resolution strategy with table structure
# inference, and Image/Table blocks returned inline in the element payload.
# NOTE(review): an output directory is passed together with
# extract_image_block_to_payload=True - confirm whether the directory is used.
extraction_options = dict(
    infer_table_structure=True,
    strategy="hi_res",
    extract_image_block_types=["Image", "Table"],
    image_output_dir_path="/Users/kamal/Desktop/AgenticAI/Side Projects/RAG Assignment/Images",
    extract_image_block_to_payload=True,
)

# Chunking options: split by title, with a hard cap of 10000 characters, a
# soft break after 6000, and merging of text pieces shorter than 2000.
chunking_options = dict(
    chunking_strategy="by_title",
    max_characters=10000,
    combine_text_under_n_chars=2000,
    new_after_n_chars=6000,
)

chunks = partition_pdf(filename=file_path, **extraction_options, **chunking_options)

In [None]:
# Display the partitioned chunks together with how many were produced.
chunks, len(chunks)

In [None]:
# Spot-check a single chunk (index 8); assumes at least 9 chunks were produced.
chunks[8]

In [None]:
import os
import uuid
import base64
from collections import Counter
import openai

EMBEDDING_MODEL = "text-embedding-3-small"

openai.api_key = os.getenv("OPENAI_API_KEY")
image_output_dir = "/Users/kamal/Desktop/AgenticAI/Side Projects/RAG Assignment/Images"
os.makedirs(image_output_dir, exist_ok=True)

# One dict per indexed chunk: id, embedding, content, type, section_title,
# page_number and image_path (list of files written for that chunk).
documents = []
# Chunks that could not be embedded; they are reported and left out, because
# a placeholder all-zero vector has no defined cosine similarity.
skipped_chunk_ids = []

for chunk in chunks:
    chunk_id = str(uuid.uuid4())
    content_pieces, page_numbers, image_payloads = [], [], []

    # The attribute can exist but hold None, so normalise to an empty list
    # instead of relying on getattr's default.
    orig_elements = getattr(chunk.metadata, 'orig_elements', None) or []

    for idx, elem in enumerate(orig_elements):
        if (text := getattr(elem, 'text', None)):
            content_pieces.append(text)

        if (page := getattr(elem.metadata, 'page_number', None)) is not None:
            page_numbers.append(page)

        # Defer writing images until we know the chunk will be indexed.
        if (b64 := getattr(elem.metadata, 'image_base64', None)):
            image_payloads.append((idx, b64))

    # Fall back to the chunk's own text / page when it carries no original
    # elements, so such chunks are not indexed as empty.
    if not content_pieces and (chunk_text := getattr(chunk, 'text', None)):
        content_pieces.append(chunk_text)
    if not page_numbers and (chunk_page := getattr(chunk.metadata, 'page_number', None)) is not None:
        page_numbers.append(chunk_page)

    content = "\n".join(content_pieces)
    # Use the page on which most of the chunk's elements sit; -1 when unknown.
    page_number = Counter(page_numbers).most_common(1)[0][0] if page_numbers else -1

    # The embeddings endpoint rejects empty input, so skip the call entirely.
    if not content.strip():
        print(f"Skipping chunk {chunk_id}: no text to embed")
        skipped_chunk_ids.append(chunk_id)
        continue

    try:
        embedding_response = openai.embeddings.create(
            input=content,
            model=EMBEDDING_MODEL
        )
        embedding = embedding_response.data[0].embedding
    except Exception as e:
        print(f"Embedding error in chunk {chunk_id}: {e}")
        skipped_chunk_ids.append(chunk_id)
        continue

    # Persist inline images; a bad payload only loses that one image.
    # NOTE(review): files are always named .png regardless of the payload's
    # actual image format - confirm against the element's mime type.
    image_paths = []
    for idx, b64 in image_payloads:
        try:
            img_bytes = base64.b64decode(b64)
            img_path = os.path.join(image_output_dir, f"{chunk_id}_{idx}.png")
            with open(img_path, "wb") as f:
                f.write(img_bytes)
            image_paths.append(img_path)
        except Exception as e:
            print(f"Image decode error in chunk {chunk_id}: {e}")

    doc = {
        "id": chunk_id,
        "embedding": embedding,
        "content": content,
        # NOTE(review): if chunks expose neither `type` nor `section_title`,
        # these always take the defaults - confirm the intended attributes.
        "type": getattr(chunk, 'type', "Composite"),
        "section_title": getattr(chunk, 'section_title', "") or "",
        "page_number": page_number,
        "image_path": image_paths,
    }

    documents.append(doc)

if skipped_chunk_ids:
    print(f"Skipped {len(skipped_chunk_ids)} of {len(chunks)} chunks (no text or embedding failure).")


In [None]:
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, list_collections

# Field size limits, shared by the schema and the truncation below so the two
# cannot drift apart.
CONTENT_MAX_LENGTH = 65535
SECTION_TITLE_MAX_LENGTH = 256
IMAGE_PATH_MAX_LENGTH = 1024


def _truncate_utf8(text, max_bytes):
    """Return `text` cut so that its UTF-8 encoding fits in `max_bytes`.

    Truncation is done on bytes (not characters) and never leaves a partial
    multi-byte character at the end.
    """
    encoded = text.encode("utf-8")
    if len(encoded) <= max_bytes:
        return text
    return encoded[:max_bytes].decode("utf-8", errors="ignore")


def _join_whole_paths(paths, max_bytes, sep=","):
    """Join as many complete paths as fit in `max_bytes`; never cut one in half."""
    kept, used = [], 0
    for path in paths:
        needed = len(path.encode("utf-8")) + (len(sep) if kept else 0)
        if used + needed > max_bytes:
            break
        kept.append(path)
        used += needed
    return sep.join(kept)


# 1. Connect to Milvus
connections.connect(alias="default", host="localhost", port="19530")

# 2. Define schema
fields = [
    FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=36),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1536),
    FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=CONTENT_MAX_LENGTH),
    FieldSchema(name="type", dtype=DataType.VARCHAR, max_length=20),
    FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=SECTION_TITLE_MAX_LENGTH),
    FieldSchema(name="page_number", dtype=DataType.INT64),
    FieldSchema(name="image_path", dtype=DataType.VARCHAR, max_length=IMAGE_PATH_MAX_LENGTH),
]

schema = CollectionSchema(fields, description="RAG Assignment")
collection_name = "rag_assignment"

# 3. Create collection if not exists
if collection_name not in list_collections():
    collection = Collection(name=collection_name, schema=schema)
else:
    collection = Collection(name=collection_name)

# 4. Prepare and insert data (column-oriented, in schema field order).
# Content is limited only by the schema, not by an arbitrary shorter cut,
# so the text stored for retrieval matches what was embedded.
# NOTE(review): every run inserts under fresh ids, so re-running this cell
# adds duplicates to an existing collection.
if documents:
    data_to_insert = [
        [doc["id"] for doc in documents],
        [doc["embedding"] for doc in documents],
        [_truncate_utf8(doc["content"], CONTENT_MAX_LENGTH) for doc in documents],
        [doc["type"] for doc in documents],
        [_truncate_utf8(doc["section_title"], SECTION_TITLE_MAX_LENGTH) for doc in documents],
        [doc["page_number"] for doc in documents],
        [_join_whole_paths(doc["image_path"], IMAGE_PATH_MAX_LENGTH) for doc in documents],
    ]

    collection.insert(data_to_insert)
    collection.flush()
else:
    print("No documents to insert; skipping insert.")

index_params = {
    "metric_type": "COSINE",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}

# Only build the index the first time; a re-run against an already indexed
# collection should not issue another create_index call.
if not collection.has_index():
    collection.create_index(field_name="embedding", index_params=index_params)

collection.load()

print(f"Inserted {len(documents)} documents into Milvus collection '{collection_name}'.")

In [None]:
from langchain.vectorstores import Milvus
from langchain.embeddings import OpenAIEmbeddings

# Query-side embedding model; same model name as the one used when the
# chunks were embedded for indexing.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Point LangChain at the existing "rag_assignment" collection, telling it
# which fields hold the text and the vector.
milvus_connection = {"host": "localhost", "port": "19530"}
milvus_index = {"index_type": "IVF_FLAT", "metric_type": "COSINE"}

vector_store = Milvus(
    embedding_function=embeddings,
    collection_name="rag_assignment",
    connection_args=milvus_connection,
    text_field="content",
    vector_field="embedding",
    index_params=milvus_index,
)

In [None]:
# Wrap the vector store as a retriever that returns the 3 closest hits per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

In [None]:
from sentence_transformers import CrossEncoder

# Question used for retrieval here and for answer synthesis in later cells.
query = "What is the multihead attention?"

# Stage 1: candidate passages from the vector-store retriever.
top_k_docs = retriever.get_relevant_documents(query)
candidate_texts = [hit.page_content for hit in top_k_docs]

# Stage 2: score each (query, passage) pair with a cross-encoder.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')
pairs = list(zip([query] * len(candidate_texts), candidate_texts))
scores = cross_encoder.predict(pairs)

# Order the candidates by descending score; ties keep retrieval order.
scored_docs = list(zip(top_k_docs, scores))
scored_docs.sort(key=lambda pair: pair[1], reverse=True)
ranked_docs = [hit for hit, _ in scored_docs]

In [None]:
# Show the retrieved documents in reranked order (highest cross-encoder score first).
ranked_docs

In [None]:
from langchain import hub
from langchain.chains import LLMChain
from langchain_community.chat_models import ChatOpenAI

# Prompt template pulled from the LangChain hub and the chat model that fills it.
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

# Context is the text of the two best reranked documents, blank-line separated.
context = "\n\n".join(doc.page_content for doc in ranked_docs[:2])

chain = LLMChain(llm=llm, prompt=prompt)
chain_inputs = {"context": context, "question": query}
response = chain.run(chain_inputs)
print(response)

In [None]:
from langchain import hub
from langchain.chains import LLMChain
from langchain_community.chat_models import ChatOpenAI
from IPython.display import Image, display
import os

# Load prompt and LLM
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

# Prepare top 2 documents
top_docs = ranked_docs[:2]
context = "\n\n".join([doc.page_content for doc in top_docs])

# Run RAG synthesis
chain = LLMChain(llm=llm, prompt=prompt)
response = chain.run({"context": context, "question": query})

# Print response
print("🔍 Synthesized Answer:\n")
print(response)

# Display associated images (if any)
print("\n🖼️ Associated Images from Top Documents:\n")
for i, doc in enumerate(top_docs, start=1):
    # image_path is stored as a comma-separated string; treat a missing or
    # None value as "no images".
    image_paths = doc.metadata.get("image_path", "") or ""
    paths = [p.strip() for p in image_paths.split(",") if p.strip()]
    for path in paths:
        if os.path.exists(path):
            print(f"Document {i} - Image: {os.path.basename(path)}")
            display(Image(filename=path))
        else:
            # Say which file is missing instead of printing a blank line, so
            # stale or truncated paths are visible to the reader.
            print(f"Document {i} - Image not found: {path}")
