In [6]:
!pip install pinecone sentence-transformers pypdf openai

Collecting pinecone
  Downloading pinecone-8.0.0-py3-none-any.whl.metadata (11 kB)
Collecting pinecone-plugin-assistant<4.0.0,>=3.0.1 (from pinecone)
  Downloading pinecone_plugin_assistant-3.0.1-py3-none-any.whl.metadata (30 kB)
Collecting packaging>=20.9 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-8.0.0-py3-none-any.whl (745 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m745.9/745.9 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_assistant-3.0.1-py3-none-any.whl (280 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.9/280.9 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-24.2-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging, pinecone-plugin-assistant, pin

In [7]:
from pypdf import PdfReader

# NOTE(review): emits a single blank line and nothing else; looks like a leftover — confirm and remove.
print()

def extract_text(pdf_path):
    """Extract the text of every page of a PDF into one string.

    Args:
        pdf_path: Value handed unchanged to ``PdfReader`` to open the document.

    Returns:
        The concatenation of each page's text, every page followed by a
        newline. A page that yields no text contributes only its newline.
    """
    reader = PdfReader(pdf_path)
    # `or ''` guards against a page whose extraction yields None (presumably
    # image-only pages on some pypdf/PyPDF2 versions), which would otherwise
    # raise TypeError on concatenation. A single join avoids repeated
    # string re-allocation on large documents.
    return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)





In [8]:
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one. The
    final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Characters shared between neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (empty list for empty ``text``).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive step would never advance (endless loop),
            and a negative overlap would silently skip text between chunks.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]


In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used both to embed PDF chunks for indexing and to
# embed user queries at retrieval time (the same model must be used for both).
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


In [11]:
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# SECURITY: the API key must never be written into the notebook. The key that
# used to be hard-coded in this cell has to be treated as leaked and revoked
# in the Pinecone console. Read it from the environment instead, falling back
# to a non-echoing interactive prompt.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)

index_name = "pdf-rag-index"

# Create the index only if it does not exist yet, so re-running the cell is safe.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # because MiniLM has 384-dim embeddings
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle used by the retrieval and indexing code in later cells.
index = pc.Index(index_name)



In [12]:
def retrieve_chunks(query, top_k=5):
    """Return the stored text of the index entries most similar to ``query``.

    The query is embedded with the module-level ``embedder``, sent to the
    module-level ``index`` as a float32 vector, and the ``"text"`` metadata
    field of each match is collected in the order the index returns them.

    Args:
        query: The user's question.
        top_k: Maximum number of matches to request from the index.

    Returns:
        A list with one text string per match.
    """
    query_vector = embedder.encode(query).astype(np.float32).tolist()
    result = index.query(vector=query_vector, top_k=top_k, include_metadata=True)

    texts = []
    for match in result["matches"]:
        texts.append(match["metadata"]["text"])
    return texts


In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One checkpoint name for both objects so the tokenizer and the seq2seq model
# can never drift apart.
_FLAN_T5_CHECKPOINT = "google/flan-t5-base"

tok = AutoTokenizer.from_pretrained(_FLAN_T5_CHECKPOINT)
llm = AutoModelForSeq2SeqLM.from_pretrained(_FLAN_T5_CHECKPOINT)



tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
def build_safe_prompt(query, chunks, max_input_tokens=512, reserved_tokens=50):
    """Assemble a QA prompt that keeps as many leading chunks as fit the budget.

    Chunks are added in the given order. Before each addition the whole
    candidate prompt (instructions + context + question) is tokenized with the
    module-level ``tok``; the first chunk that would push the prompt over
    ``max_input_tokens - reserved_tokens`` stops the process, and it and all
    later chunks are left out.

    Args:
        query: The user's question, placed after the context.
        chunks: Context passages, in the order they should be tried.
        max_input_tokens: Token limit of the model input.
        reserved_tokens: Headroom kept free below ``max_input_tokens``
            (defaults to the previously hard-coded 50).

    Returns:
        The final prompt string. If not even the first chunk fits, the prompt
        contains an empty context section.
    """
    instructions = (
        "Use the context to answer the question.\n"
        "If the answer is not in the  context, say 'I don’t know.'\n\n"
        "Context:\n"
    )

    def render(context_chunks):
        # Single source of truth for the prompt layout, used both for the
        # length probe and for the final result.
        context = "\n\n".join(context_chunks)
        return f"{instructions}{context}\n\nQuestion: {query}\nAnswer:"

    budget = max_input_tokens - reserved_tokens
    selected = []
    for chunk in chunks:
        candidate = render(selected + [chunk])
        if len(tok(candidate)["input_ids"]) > budget:
            break
        selected.append(chunk)

    return render(selected)



def answer_question(query):
    """Answer ``query`` from the indexed document using retrieval + generation.

    Retrieves the five closest chunks, packs them into a prompt limited to
    512 tokens, and decodes the model's greedy (non-sampled) output of at most
    150 new tokens.

    Args:
        query: The user's question.

    Returns:
        The generated answer with special tokens stripped.
    """
    context_chunks = retrieve_chunks(query, top_k=5)
    prompt_text = build_safe_prompt(query, context_chunks, max_input_tokens=512)

    # Truncation is a safety net in case the prompt still exceeds the limit.
    encoded = tok(prompt_text, return_tensors="pt", truncation=True, max_length=512)
    generated = llm.generate(**encoded, max_new_tokens=150, do_sample=False)

    first_sequence = generated[0]
    return tok.decode(first_sequence, skip_special_tokens=True)


In [None]:
import gradio as gr
import numpy as np
import uuid

# Function for Gradio interface
def _reindex_pdf_chunks(chunks):
    """Replace the contents of the Pinecone index with embeddings of ``chunks``."""
    # Local import: only this helper needs the exception type.
    from pinecone.exceptions import NotFoundException

    embeddings = embedder.encode(chunks).astype(np.float32)
    vectors = [
        {"id": str(uuid.uuid4()), "values": emb.tolist(), "metadata": {"text": chunk}}
        for chunk, emb in zip(chunks, embeddings)
    ]

    print("Clearing Pinecone index...")
    try:
        index.delete(delete_all=True, namespace="")  # Clear the entire index
    except NotFoundException:
        # A serverless index that holds no vectors yet reports the namespace
        # as missing; there is nothing to clear, so carry on.
        pass

    print(f"Upserting {len(vectors)} new vectors to Pinecone...")
    # Batch the upload so large PDFs stay under Pinecone's per-request limits.
    index.upsert(vectors=vectors, batch_size=100)
    print("PDF processed and indexed successfully!")


def chatbot_interface(pdf_file, query):
    """Gradio callback: index the uploaded PDF, then answer ``query`` from it.

    Every call re-extracts, re-chunks and re-embeds the PDF and replaces the
    whole Pinecone index with it before answering.

    NOTE(review): Pinecone writes are eventually consistent, so a query issued
    right after the upsert may not see all new vectors yet — confirm whether a
    short wait is needed.

    Args:
        pdf_file: The uploaded file from ``gr.File`` — either an object with a
            ``.name`` path attribute or a plain path string; ``None`` if the
            user uploaded nothing.
        query: The user's question.

    Returns:
        The generated answer, or a short message explaining what is missing
        (no file, empty question, or a PDF without extractable text).
    """
    if pdf_file is None:
        return "Please upload a PDF file first."
    if not query or not query.strip():
        return "Please enter a question."

    # Accept both file-like wrappers (``.name``) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    document_text = extract_text(pdf_path)
    if not document_text.strip():
        # E.g. scanned/image-only PDFs: nothing to embed, and the existing
        # index is left untouched.
        return "No extractable text was found in this PDF."

    chunks = chunk_text(document_text)
    print(f"Processed {len(chunks)} chunks from the uploaded PDF.")

    _reindex_pdf_chunks(chunks)

    # Now answer the question using the newly indexed document
    return answer_question(query)

# Build the two-input UI (PDF upload + question box) and start the server.
pdf_input = gr.File(label="Upload PDF")
query_input = gr.Textbox(label="Your Query")

iface = gr.Interface(
    fn=chatbot_interface,
    inputs=[pdf_input, query_input],
    outputs="text",
    title="PDF RAG Chatbot",
)

# debug=True keeps the cell running so errors and logs show up in its output.
iface.launch(debug=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://9aaf098119af257457.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Processed 3 chunks from the uploaded PDF.
Clearing Pinecone index...
Upserting 3 new vectors to Pinecone...
PDF processed and indexed successfully!
