In [ ]:
!pip install langchain langchain-community langchain-ibm chromadb pypdf gradio

In [ ]:
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_ibm import WatsonxLLM, WatsonxEmbeddings
import gradio as gr

# --- watsonx.ai connection settings -----------------------------------------
# Each value is read from the environment; the second argument is the fallback
# used when the variable is not set.
WATSONX_APIKEY = os.environ.get("WATSONX_APIKEY", "YOUR_API_KEY")
WATSONX_PROJECTID = os.environ.get("WATSONX_PROJECT_ID", "YOUR_PROJECT_ID")
WATSONX_URL = os.environ.get("WATSONX_URL", "https://us-south.ml.cloud.ibm.com")

# Connection arguments shared by the LLM client and the embeddings client.
_watsonx_connection = {
    "url": WATSONX_URL,
    "apikey": WATSONX_APIKEY,
    "project_id": WATSONX_PROJECTID,
}

# Generation parameters handed to the LLM as-is.
_generation_params = {"temperature": 0.1, "max_new_tokens": 512}

# Text-generation model used to answer questions.
llm = WatsonxLLM(
    model_id="mistralai/mixtral-8x7b-instruct-v01",
    params=_generation_params,
    **_watsonx_connection,
)

# Embedding model used to vectorise document chunks and queries.
embeddings = WatsonxEmbeddings(
    model_id="ibm/slate-125m-english-rtrvr",
    **_watsonx_connection,
)

In [ ]:
# Load one document set from each supported source: a local PDF, a local
# UTF-8 text file, and a web page.
pdf_loader = PyPDFLoader("research_paper.pdf")
pdf_docs = pdf_loader.load()

text_loader = TextLoader("notes.txt", encoding="utf-8")
text_docs = text_loader.load()

web_loader = WebBaseLoader("https://arxiv.org/")
web_docs = web_loader.load()

# Sanity check: how many documents each loader produced.
print(*map(len, (pdf_docs, text_docs, web_docs)))

In [ ]:
# Break the PDF documents into overlapping chunks (chunk_size=1000,
# chunk_overlap=200). Only pdf_docs is split here; text_docs and web_docs are
# not chunked in this cell.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
split_docs = splitter.split_documents(pdf_docs)

# Preview the first 500 characters of the first chunk as the cell output.
first_chunk = split_docs[0]
first_chunk.page_content[:500]

In [ ]:
# Embed the text of the first three chunks as a quick check of the
# embeddings client.
sample_texts = [chunk.page_content for chunk in split_docs[:3]]
sample_embeddings = embeddings.embed_documents(sample_texts)

# Cell output: (number of vectors returned, length of the first vector).
(len(sample_embeddings), len(sample_embeddings[0]))

In [ ]:
# Build a Chroma vector store from the chunks, using the "quest_chroma"
# directory for persistence.
vectordb = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
    persist_directory="quest_chroma",
)

# NOTE(review): explicit persist() is reportedly deprecated / unnecessary with
# recent Chroma versions -- confirm against the installed version.
vectordb.persist()

# Cell output: the count reported by the store's underlying collection.
vectordb._collection.count()

In [ ]:
# Wrap the vector store as a retriever with the k=4 search setting.
retriever = vectordb.as_retriever(search_kwargs={"k": 4})

# Use the Runnable `invoke` entry point; `get_relevant_documents` is the
# deprecated spelling of the same call.
docs = retriever.invoke("What is the main idea?")

# Cell output: the first 300 characters of the first retrieved document.
docs[0].page_content[:300]

In [ ]:
# Module-level state for the interactive app. All three start as None;
# load_pdf() assigns them and ask() reads current_qa.
current_vectordb = current_retriever = current_qa = None

def load_pdf(pdf):
    """Index an uploaded PDF and make it the active document for ask().

    Args:
        pdf: The value delivered by the Gradio file component: None when
            nothing was uploaded, a path string, or an object exposing the
            file path as ``.name``.

    Returns:
        A short status message for display in the UI.
    """
    global current_vectordb, current_retriever, current_qa
    import uuid  # local import: only needed to name the per-upload collection

    # Clicking "Load" without a file must not crash on `pdf.name`.
    if pdf is None:
        return "Upload first"

    # Accept both a plain path string and a file wrapper with `.name`.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name

    pages = PyPDFLoader(pdf_path).load()
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = chunker.split_documents(pages)

    # Nothing to index (e.g. no extractable text): keep the current state.
    if not chunks:
        return "No text found in PDF"

    # Give every upload its own collection so that chunks from a previously
    # loaded PDF cannot leak into the answers for this one.
    new_vectordb = Chroma.from_documents(
        chunks,
        embeddings,
        collection_name=f"pdf-{uuid.uuid4().hex}",
    )
    new_retriever = new_vectordb.as_retriever(search_kwargs={"k": 4})
    new_qa = RetrievalQA.from_chain_type(
        llm=llm, retriever=new_retriever, chain_type="stuff"
    )

    # Swap the globals only after everything was built successfully, then
    # drop the collection of the previously loaded PDF.
    previous_vectordb = current_vectordb
    current_vectordb = new_vectordb
    current_retriever = new_retriever
    current_qa = new_qa
    if previous_vectordb is not None:
        previous_vectordb.delete_collection()

    return "PDF loaded"

def ask(q):
    """Answer a question about the currently loaded PDF.

    Args:
        q: The question text entered by the user.

    Returns:
        The chain's "result" string, or a short hint when no PDF has been
        loaded yet or the question is empty.
    """
    if not current_qa:
        return "Upload first"

    # Do not send an empty or whitespace-only query to the model.
    if not q or not q.strip():
        return "Please enter a question"

    # `invoke` replaces the deprecated direct call on the chain object.
    return current_qa.invoke({"query": q})["result"]

import gradio as gr
# Minimal UI: upload a PDF, index it, then ask questions about it.
# The component creation order is kept exactly as in the original cell.
with gr.Blocks() as demo:
    # Upload + indexing controls.
    pdf = gr.File()
    status = gr.Textbox()
    loadbtn = gr.Button(value="Load")
    # "Load" passes the uploaded file to load_pdf and shows its status message.
    loadbtn.click(fn=load_pdf, inputs=pdf, outputs=status)

    # Question / answer controls.
    q = gr.Textbox(value="What this paper is talking about?")
    a = gr.Textbox(lines=6)
    askbtn = gr.Button(value="Ask")
    # "Ask" sends the question text to ask() and shows the returned answer.
    askbtn.click(fn=ask, inputs=q, outputs=a)

demo.launch()