In [None]:
!pip install pinecone sentence-transformers pypdf openai

In [None]:
# PDF parsing backend used by extract_text().
from pypdf import PdfReader

def extract_text(pdf_path):
    """Return the text of every page of a PDF, one newline after each page.

    Args:
        pdf_path: Path (or file-like object) accepted by ``PdfReader``.

    Returns:
        A single string made of each page's extracted text followed by "\n".
        Pages that yield no text contribute just the newline.
    """
    reader = PdfReader(pdf_path)
    # ``or ""`` guards against a page whose extraction yields None
    # (e.g. image-only pages on some backends) instead of crashing on
    # ``None + "\n"``. join() also avoids quadratic string concatenation.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)


In [None]:
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into fixed-size character windows that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Number of characters shared between consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of substrings of ``text``; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would make the window
            stand still or move backwards (previously an infinite loop).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, any further window would
        # only repeat characters already contained in this chunk.
        if end >= len(text):
            break
        start += step
    return chunks


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model; loaded once at import time of this cell.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [None]:
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# SECURITY: the API key must never be committed to the notebook. A key was
# previously hard-coded in this cell; treat it as leaked and revoke/rotate it
# in the Pinecone console. The key is now read from the PINECONE_API_KEY
# environment variable, with an interactive prompt as a fallback.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "pdf-rag-index"

# Vector size of the index; it has to match the embedding model's output
# (384 for the MiniLM model used in this notebook).
EMBEDDING_DIM = 384

# Create the index only if it does not exist yet
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)



In [None]:
def retrieve_chunks(query, top_k=5):
    """Return the stored chunk texts that best match ``query``.

    The query is embedded with the module-level ``embedder``, converted to a
    float32 list, and sent to the module-level Pinecone ``index``.

    Args:
        query: The text to search for.
        top_k: Maximum number of matches requested from the index.

    Returns:
        A list with the ``metadata["text"]`` of every returned match, in the
        order the index returned them.
    """
    query_vector = embedder.encode(query).astype(np.float32).tolist()
    result = index.query(vector=query_vector, top_k=top_k, include_metadata=True)

    texts = []
    for match in result["matches"]:
        texts.append(match["metadata"]["text"])
    return texts


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and seq2seq model are loaded from the same checkpoint.
LLM_CHECKPOINT = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(LLM_CHECKPOINT)
llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_CHECKPOINT)



In [None]:
def build_safe_prompt(query, chunks, max_input_tokens=512):
    """Build a prompt containing as many leading chunks as fit the token budget.

    Chunks are added in order; the first chunk that would push the tokenized
    prompt over ``max_input_tokens - 50`` stops the selection (later chunks
    are not tried).

    Args:
        query: The user's question, appended after the context.
        chunks: Candidate context strings, most relevant first.
        max_input_tokens: Token limit the prompt is sized against.

    Returns:
        The prompt string: instructions, the selected context joined by blank
        lines, the question, and a trailing "Answer:" cue.
    """
    # The instruction text previously contained a mis-encoded apostrophe
    # ("donâ€™t") and a doubled space; both ended up verbatim in the prompt.
    instructions = (
        "Use the context to answer the question.\n"
        "If the answer is not in the context, say \"I don't know.\"\n\n"
        "Context:\n"
    )
    # Headroom kept below max_input_tokens when selecting chunks.
    token_budget = max_input_tokens - 50

    def _render(selected):
        context = "\n\n".join(selected)
        return f"{instructions}{context}\n\nQuestion: {query}\nAnswer:"

    used = []
    for chunk in chunks:
        candidate = _render(used + [chunk])
        if len(tok(candidate)["input_ids"]) > token_budget:
            break
        used.append(chunk)

    return _render(used)



def answer_question(query):
    """Answer ``query`` from the indexed document.

    Retrieves up to five chunks, builds a prompt sized for 512 input tokens,
    tokenizes it (truncating at 512 tokens) and runs greedy generation of at
    most 150 new tokens.

    Args:
        query: The user's question.

    Returns:
        The decoded text of the first generated sequence, without special
        tokens.
    """
    context_chunks = retrieve_chunks(query, top_k=5)
    encoded = tok(
        build_safe_prompt(query, context_chunks, max_input_tokens=512),
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )

    generated = llm.generate(**encoded, max_new_tokens=150, do_sample=False)
    first_sequence = generated[0]
    return tok.decode(first_sequence, skip_special_tokens=True)


In [None]:
import gradio as gr
import numpy as np
import uuid

# Function for Gradio interface
_UPSERT_BATCH_SIZE = 100       # vectors per upsert request, to stay under request-size limits
_INDEX_SYNC_TIMEOUT_S = 30     # how long to wait for freshly upserted vectors to become visible
_indexed_pdf_digest = None     # SHA-256 of the PDF whose chunks are currently in the index


def _file_digest(path):
    """Return the SHA-256 hex digest of the file at ``path``."""
    import hashlib

    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        for block in iter(lambda: handle.read(1 << 20), b""):
            digest.update(block)
    return digest.hexdigest()


def _clear_index():
    """Delete every vector in the default namespace, tolerating an empty one."""
    try:
        index.delete(delete_all=True, namespace="")
    except Exception as exc:
        # NOTE(review): Pinecone serverless reportedly answers 404 when the
        # namespace has never been written to; nothing to clear in that case.
        # Any other failure is re-raised unchanged.
        if getattr(exc, "status", None) != 404:
            raise


def _wait_until_searchable(expected_count, timeout_s=_INDEX_SYNC_TIMEOUT_S):
    """Poll index stats until at least ``expected_count`` vectors are reported.

    Best effort only: returns False if the count is not reached in time.
    """
    import time

    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if index.describe_index_stats().total_vector_count >= expected_count:
            return True
        time.sleep(1)
    return False


def _index_pdf(pdf_path):
    """Extract, chunk, embed and upload one PDF, replacing the index contents.

    Returns the number of chunks uploaded (0 when the PDF has no usable text,
    in which case the index is left untouched).
    """
    # 1-2. Extract text and chunk it; whitespace-only chunks carry no content
    document_text = extract_text(pdf_path)
    chunks = [chunk for chunk in chunk_text(document_text) if chunk.strip()]
    print(f"Processed {len(chunks)} chunks from the uploaded PDF.")
    if not chunks:
        return 0

    # 3. Embed chunks
    embeddings = embedder.encode(chunks).astype(np.float32)

    # 4. Prepare vectors for Pinecone
    vectors = [
        {"id": str(uuid.uuid4()), "values": emb.tolist(), "metadata": {"text": chunk}}
        for chunk, emb in zip(chunks, embeddings)
    ]

    # 5. Clear previous data in Pinecone and upload new vectors in batches
    print("Clearing Pinecone index...")
    _clear_index()
    print(f"Upserting {len(vectors)} new vectors to Pinecone...")
    for start in range(0, len(vectors), _UPSERT_BATCH_SIZE):
        index.upsert(vectors=vectors[start:start + _UPSERT_BATCH_SIZE])

    # Upserts become queryable with a delay; wait so the first question is
    # not answered from an empty or stale index.
    if not _wait_until_searchable(len(vectors)):
        print("Warning: index did not report all vectors in time; results may be incomplete.")
    print("PDF processed and indexed successfully!")
    return len(vectors)


def chatbot_interface(pdf_file, query):
    """Gradio callback: index the uploaded PDF (if new) and answer ``query``.

    Args:
        pdf_file: The uploaded file as passed by Gradio, either an object
            exposing the path as ``.name`` or a plain path string, or None.
        query: The user's question.

    Returns:
        The generated answer, or a short message when the upload or the
        question is missing or the PDF contains no extractable text.
    """
    global _indexed_pdf_digest

    if pdf_file is None:
        return "Please upload a PDF file first."
    if not query or not query.strip():
        return "Please enter a question."

    # Accept both a tempfile-like object (``.name``) and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Re-process the PDF only for a new upload, not for every question
    # asked about the same document.
    digest = _file_digest(pdf_path)
    if digest != _indexed_pdf_digest:
        if _index_pdf(pdf_path) == 0:
            return "No extractable text was found in the uploaded PDF."
        _indexed_pdf_digest = digest

    # Now answer the question using the indexed document
    return answer_question(query)

# Build the two-input UI (PDF upload + question box) and launch Gradio
pdf_input = gr.File(label="Upload PDF")
query_input = gr.Textbox(label="Your Query")

iface = gr.Interface(
    fn=chatbot_interface,
    inputs=[pdf_input, query_input],
    outputs="text",
    title="PDF RAG Chatbot",
)
iface.launch(debug=True)