In [None]:
from pypdf import PdfReader

def load_pdf(path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        The text of all pages in reading order, each page followed by a
        newline (so the result ends with a newline when there is at least
        one page, and is ``""`` for a PDF with no pages).
    """
    reader = PdfReader(path)
    # `or ""` keeps a page that yields no text (None/empty) from breaking the
    # concatenation; a single join avoids rebuilding the string on every page.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

text = load_pdf("sample.pdf")

# Show only the beginning of the document: printing the full text floods the
# cell output and defeats the purpose of a preview.
print(text[:500])  # preview

In [None]:
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Source text; it is split on any run of whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size`` so that every step moves forward.

    Returns:
        A list of chunk strings (words re-joined with single spaces). The list
        is empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, "
            f"chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further chunk would consist
        # only of words already covered by this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
# Chunk the extracted document text with the default size and overlap.
chunks = chunk_text(text)
print(f"Number of chunks: {len(chunks)}")

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Fail with a clear message instead of an IndexError on `embeddings.shape[1]`
# when the PDF produced no text to embed.
if not chunks:
    raise ValueError("No chunks to index: the document produced no extractable text.")

model = SentenceTransformer("all-MiniLM-L6-v2")

# FAISS expects a C-contiguous float32 matrix; making that explicit here also
# avoids the extra copy that a second np.array(...) call would create.
embeddings = np.ascontiguousarray(model.encode(chunks), dtype=np.float32)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [None]:
def retrieve(query, top_k=3):
    """Return the chunks whose embeddings are closest to ``query``.

    Uses the notebook-level ``model`` to embed the query, ``index`` to search,
    and ``chunks`` to map the returned positions back to text.

    Args:
        query: Question or search string to embed.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings, in the order returned by
        the index search. Empty when ``top_k`` is not positive or there are
        no chunks.
    """
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_vec = np.asarray(model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_vec, k)

    # FAISS fills missing neighbours with -1; without this filter Python's
    # negative indexing would silently return the last chunk instead.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]


In [None]:
from transformers import pipeline

# Text-to-text generation pipeline used to produce answers from a prompt.
qa_model = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
)

def answer_question(query):
    """Answer ``query`` from the retrieved document chunks.

    The chunks returned by ``retrieve`` are joined with newlines, placed in a
    prompt together with the question, and passed to ``qa_model``.

    Args:
        query: The question to answer.

    Returns:
        The ``generated_text`` of the first result produced by ``qa_model``.
    """
    retrieved = retrieve(query)
    context = "\n".join(retrieved)
    prompt = f"Answer the question based on context:\nContext: {context}\n\nQuestion: {query}"
    outputs = qa_model(prompt, max_length=200)
    first_result = outputs[0]
    return first_result["generated_text"]


In [None]:
# Ask a single sample question and show it next to the generated answer.
query = "New owner of drylab family?"
print(f"Q: {query}")
print(f"A: {answer_question(query)}")


# Quality: token-budgeted context and multi-turn chat history

In [None]:

from PyPDF2 import PdfReader

# ----------------------------
# 1. PDF LOADING + CHUNKING
# ----------------------------
def load_pdf(path):
    """Extract the text of every page of a PDF into a single string.

    NOTE(review): this cell imports ``PdfReader`` from PyPDF2 while the first
    cell imports it from pypdf; whichever import ran last is the one used.

    Args:
        path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        The text of all pages in reading order, each page followed by a
        newline (``""`` for a PDF with no pages).
    """
    reader = PdfReader(path)
    page_texts = []
    for page in reader.pages:
        # Treat a page that yields no text (None/empty) as an empty string so
        # the concatenation below cannot fail on it.
        page_texts.append((page.extract_text() or "") + "\n")
    # One join instead of repeated `+=` avoids quadratic string rebuilding.
    return "".join(page_texts)

def chunk_text(text, chunk_size=800, overlap=100):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Source text; it is split on any run of whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``; otherwise the window would never
            advance and the loop below would not terminate.

    Returns:
        A list of chunk strings (words re-joined with single spaces). The list
        is empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, "
            f"chunk_size={chunk_size}"
        )

    words = text.split()
    stride = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # This chunk already reaches the last word; a further chunk would only
        # repeat words that are part of it.
        if end >= len(words):
            break
        start += stride
    return chunks

In [None]:
from transformers import pipeline, AutoTokenizer
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# ----------------------------
# 2. EMBEDDINGS + TOKEN-BUDGET HELPERS
# ----------------------------
# Token limits used by the prompt-building helpers.
MODEL_MAX_TOKENS = 512  # Flan-T5 small/medium max input tokens
MAX_NEW_TOKENS = 128    # tokens the QA model may generate per answer

# Conversation memory, stored as (question, answer) pairs.
chat_history = []

# Tokenizer matching the QA model, used for counting prompt tokens.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# Sentence-embedding model for this section of the notebook.
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def truncate_prompt_to_budget(prompt, tokenizer, max_input_tokens):
    """Cut ``prompt`` down to at most ``max_input_tokens`` tokens.

    Args:
        prompt: Text to check against the budget.
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            and ``decode(ids, skip_special_tokens=...)``.
        max_input_tokens: Maximum number of tokens to keep.

    Returns:
        ``prompt`` unchanged when it already fits; otherwise the decoded text
        of its first ``max_input_tokens`` tokens.
    """
    token_ids = tokenizer.encode(prompt, add_special_tokens=False)
    over_budget = len(token_ids) > max_input_tokens
    if over_budget:
        kept_ids = token_ids[:max_input_tokens]
        return tokenizer.decode(kept_ids, skip_special_tokens=True)
    return prompt
# Prompt budget derived from the named model limit rather than a repeated
# literal 512, so changing MODEL_MAX_TOKENS updates the budget as well.
MAX_INPUT_TOKENS = MODEL_MAX_TOKENS - MAX_NEW_TOKENS  # model limit minus expected output

def build_strict_context(retrieved_chunks, chat_history, tokenizer, history_limit=2,
                         max_input_tokens=None, reserved_tokens=50):
    """Assemble document context and chat history within a shared token budget.

    Chunks are added first, in the given order; the first chunk that does not
    fit is cut to the remaining budget and ends the context. Whatever budget
    is left is then filled with the most recent question/answer pairs.

    Args:
        retrieved_chunks: Chunk strings, most relevant first.
        chat_history: Sequence of ``(question, answer)`` pairs, oldest first.
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            and ``decode(ids, skip_special_tokens=...)``.
        history_limit: Maximum number of most recent turns to consider;
            ``0`` (or a negative value) disables history.
        max_input_tokens: Total prompt budget; defaults to the notebook-level
            ``MAX_INPUT_TOKENS`` when ``None``.
        reserved_tokens: Tokens held back from the budget for the instruction
            text of the prompt.

    Returns:
        A ``(history_text, context_text)`` tuple of strings.
    """
    if max_input_tokens is None:
        max_input_tokens = MAX_INPUT_TOKENS
    budget = max(max_input_tokens - reserved_tokens, 0)

    # 1) Add chunks until we hit the token limit
    context_tokens = 0
    context_parts = []
    for chunk in retrieved_chunks:
        chunk_ids = tokenizer.encode(chunk, add_special_tokens=False)
        remaining = budget - context_tokens
        if len(chunk_ids) > remaining:
            if remaining > 0:
                context_parts.append(
                    tokenizer.decode(chunk_ids[:remaining], skip_special_tokens=True)
                )
                # Count the partial chunk too, otherwise history could be
                # added on top of an already exhausted budget.
                context_tokens += remaining
            break
        context_parts.append(chunk + "\n")
        context_tokens += len(chunk_ids)
    context_text = "".join(context_parts)

    # 2) Add history if it fits. `lst[-0:]` is the whole list, so a limit of 0
    #    has to be handled explicitly.
    recent_turns = chat_history[-history_limit:] if history_limit > 0 else []
    kept_pairs = []
    # Walk newest-first so that, when the budget is tight, the latest turns
    # are the ones kept.
    for q, a in reversed(recent_turns):
        qa_text = f"Q: {q}\nA: {a}\n"
        qa_len = len(tokenizer.encode(qa_text, add_special_tokens=False))
        if context_tokens + qa_len > budget:
            break
        kept_pairs.append(qa_text)
        context_tokens += qa_len
    # Restore chronological order for the prompt.
    history_text = "".join(reversed(kept_pairs))

    return history_text, context_text


In [None]:

# ----------------------------
# 3. QA MODEL WITH MEMORY & CLEAN CONTEXT
# ----------------------------
import torch  # imported in this cell so the device check below is self-contained

# A hard-coded device=0 fails on machines without a CUDA GPU; fall back to the
# CPU (-1) when CUDA is not available.
qa_model = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    device=0 if torch.cuda.is_available() else -1,
)

def answer_question_token_safe(query, index, chunks, history_limit=2, top_k=3):
    """Answer ``query`` from retrieved chunks while staying inside the token budget.

    Retrieves the ``top_k`` chunks, builds a budgeted context, adds as many of
    the most recent chat turns as fit, renders the prompt, generates an answer
    with ``qa_model`` and appends ``(query, answer)`` to ``chat_history``.

    Args:
        query: The user's question.
        index: Vector index forwarded to ``retrieve``.
        chunks: Chunk texts forwarded to ``retrieve``.
        history_limit: Maximum number of most recent turns to include;
            ``0`` (or a negative value) disables history.
        top_k: Number of chunks to retrieve.

    Returns:
        The generated answer text.
    """
    # 1) retrieve top-k relevant chunks
    # NOTE(review): the `retrieve` visible in this notebook only accepts
    # (query, top_k); confirm a 4-argument version is defined before this runs.
    retrieved_chunks = retrieve(query, index, chunks, top_k=top_k)

    # 2) build token-safe context
    # NOTE(review): `build_context_token_safe` is not defined in the visible
    # cells; presumably it returns (context_text, tokens_used) - confirm.
    reserved_tokens_for_instructions = 50
    context_budget = MAX_INPUT_TOKENS - reserved_tokens_for_instructions
    context, used_tokens = build_context_token_safe(retrieved_chunks, tokenizer, context_budget)

    # 3) prepare chat history (multi-turn)
    # `lst[-0:]` is the whole list, so a limit of 0 must be handled explicitly.
    recent_turns = chat_history[-history_limit:] if history_limit > 0 else []
    history_budget = context_budget - used_tokens
    kept_pairs = []
    history_token_count = 0
    # Walk newest-first so the latest turns survive a tight budget; follow-up
    # questions depend on them most.
    for q, a in reversed(recent_turns):
        pair_text = f"Q: {q}\nA: {a}\n"
        pair_tokens = len(tokenizer.encode(pair_text, add_special_tokens=False))
        if history_token_count + pair_tokens > history_budget:
            break
        kept_pairs.append(pair_text)
        history_token_count += pair_tokens
    history_text = "".join(reversed(kept_pairs))  # back to chronological order

    # 4) assemble full prompt
    def _render_prompt(context_block):
        # Fill the fixed prompt template with the given context text.
        return f"""
You are a helpful assistant.
Previous conversation:
{history_text}
Answer the question ONLY using the context below.
If the answer is not in the context, say "I don't know from the document."

Context:
{context_block}

Question: {query}
"""

    # 5) shrink the context, not the prompt tail: the question is the last part
    #    of the prompt, so cutting the end of an over-long prompt would drop it.
    scaffold_tokens = len(tokenizer.encode(_render_prompt(""), add_special_tokens=False))
    context = truncate_prompt_to_budget(
        context, tokenizer, max(MAX_INPUT_TOKENS - scaffold_tokens, 0)
    )
    prompt = _render_prompt(context)

    # Final safety net in case instructions, history and question alone
    # already exceed the budget.
    prompt = truncate_prompt_to_budget(prompt, tokenizer, MAX_INPUT_TOKENS)

    # 6) generate answer
    response = qa_model(
        prompt,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False
    )[0]["generated_text"]

    # 7) save in history and return
    chat_history.append((query, response))
    return response

# ----------------------------
# Example usage
# ----------------------------
# chunks = ["Your PDF chunks here..."]
# index, _ = build_faiss_index(chunks)

# Three questions asked in sequence; the last one relies on the chat history
# recorded by the first two calls.
for question in (
    "What dataset was used?",
    "What problem do Sparse Feature Networks aim to solve?",
    "How does that relate to the dataset?",
):
    print(answer_question_token_safe(question, index, chunks))


In [None]:

# ----------------------------
# 4. MAIN EXECUTION
# ----------------------------
if __name__ == "__main__":
    # Load and chunk the PDF (replace the path with your own document).
    pdf_text = load_pdf("sample.pdf")
    chunks = chunk_text(pdf_text, chunk_size=800, overlap=100)

    # Build the FAISS index.
    # NOTE(review): `build_faiss_index` is not defined in the visible cells;
    # presumably it returns (index, vectors) - confirm it exists before running.
    index, vectors = build_faiss_index(chunks)

    # Ask a question. The visible `answer_question` accepts only `query`, so
    # passing (query, index, chunks) raises TypeError; the token-safe variant
    # has the matching signature.
    query = "What problem do Sparse Feature Networks aim to solve ?"
    print("Q:", query)
    print("A:", answer_question_token_safe(query, index, chunks))
