In [1]:
import fitz  # PyMuPDF
from bs4 import BeautifulSoup
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForCausalLM


def extract_text_layout(file_path):
    doc = fitz.open(file_path)
    full_text = []
    for page in doc:
        html = page.get_text("html")
        soup = BeautifulSoup(html, "html.parser")
        full_text.append(soup.get_text(separator=" "))
    return "\n".join(full_text)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pdf_text = extract_text_layout("data/PC_US_Elections.pdf")

In [3]:
from transformers import AutoTokenizer

# Use LLaMA-2 tokenizer instead of FLAN-T5
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_auth_token=True)

def smart_chunk_text(text, max_tokens=512, overlap_tokens=50):
    paragraphs = text.split("\n")
    chunks = []
    current_chunk = []
    current_token_count = 0

    for para in paragraphs:
        if not para.strip():
            continue

        para_tokens = tokenizer(para)["input_ids"]
        if current_token_count + len(para_tokens) <= max_tokens:
            current_chunk.append(para)
            current_token_count += len(para_tokens)
        else:
            combined = "\n".join(current_chunk)
            chunks.append(combined)

            # Handle overlap
            if overlap_tokens > 0 and chunks:
                last_chunk_tokens = tokenizer(combined)["input_ids"][-overlap_tokens:]
                overlap_text = tokenizer.decode(last_chunk_tokens)
                current_chunk = [overlap_text, para]
                current_token_count = len(tokenizer(" ".join(current_chunk))["input_ids"])
            else:
                current_chunk = [para]
                current_token_count = len(para_tokens)

    if current_chunk:
        chunks.append("\n".join(current_chunk))

    return chunks

# Run on your parsed PDF text
chunks = smart_chunk_text(pdf_text)
print(f"✅ Total chunks: {len(chunks)}")
print("🧾 Sample chunk:\n", chunks[0][:500])




✅ Total chunks: 45
🧾 Sample chunk:
  INSTITUTIONAL EQUITY RESEARCH  
 Page | 1 | PHILLIPCAPITAL INDIA RESEARCH  
 DISCLAIMER FOR U.S. BASED INVESTORS . The Agent of  PhillipCapital (India) Pvt. Ltd.  in the United States is Marco Polo Securities Inc, a non-affiliated broker-dealer registered with the US Securities and  
 Exchange Commission. The activities of  PhillipCapital (India) Pvt. Ltd.  in the United States will be affected only to the extent permitted by Rule 15a-6 under the US Securities Exchange Act of 1934 and in  
 acc


In [4]:
class Embedder:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_chunks(self, chunks):
        return self.model.encode(
            chunks,
            convert_to_numpy=True,
            show_progress_bar=True
        ).astype("float32")

embedder = Embedder()
embeddings = embedder.embed_chunks(chunks)
print("✅ Embeddings shape:", embeddings.shape)



: 

: 

In [None]:
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
print("✅ FAISS index created and populated.")

In [None]:
def retrieve_chunks_for_query(query, top_k=3):
    query_vec = embedder.model.encode([query], convert_to_numpy=True).astype("float32")
    D, I = index.search(query_vec, top_k)
    return [chunks[i] for i in I[0]]

query = "What are the Biden policies?"
retrieved_chunks = retrieve_chunks_for_query(query)
print("🔍 Retrieved Chunks:\n")
for i, chunk in enumerate(retrieved_chunks):
    print(f"Chunk {i+1}:", chunk[:300], "\n")


In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# ✅ Cell 7: Use LLaMA-2 to Generate the Answer
class LLM:
    def __init__(self, model_id="meta-llama/Llama-2-7b-chat-hf"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype="auto",
            use_auth_token=True
        )
        self.generator = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)

    def answer_query(self, query, context_chunks):
        context = "\n".join(context_chunks)
        prompt = f"""<s>[INST] <<SYS>>Use the following context to answer the question as accurately as possible.<<SYS>>\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer: [/INST]"""
        
        result = self.generator(prompt, max_new_tokens=256, max_length=1024, do_sample=False)[0]["generated_text"]
        return result

llm_model = LLM()
answer = llm_model.answer_query(query, retrieved_chunks)
print("🧠 LLaMA-2 Answer:\n", answer)
