<a href="https://colab.research.google.com/github/kartiktongaria/PDFChatKit/blob/main/chat_with_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [None]:
# Hugging Face Hub identifiers for the models this notebook can use.
# Language models:
LLM_FALCON_SMALL = "tiiuae/falcon-7b-instruct"
LLM_FLAN_T5_BASE = "google/flan-t5-base"
# Embedding model:
EMB_SBERT_MPNET_BASE = "sentence-transformers/all-mpnet-base-v2"

In [None]:
# Notebook-wide settings; edit these before running the pipeline cell.
config = dict(
    # Directory handed to Chroma for persistence; None means no directory is given.
    persist_directory=None,
    # Forwarded to the LLM factory function as its load_in_8bit argument.
    load_in_8bit=False,
    # Name of the embedding model.
    embedding=EMB_SBERT_MPNET_BASE,
    # Name of the language model; consulted when the QA chain is set up.
    llm=LLM_FLAN_T5_BASE,
)


In [None]:
def create_sbert_mpnet():
    """Create the embedding model identified by EMB_SBERT_MPNET_BASE.

    Returns:
        A HuggingFaceEmbeddings instance configured to run on "cuda" when
        torch reports a CUDA device, and on "cpu" otherwise.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    embedding_kwargs = {"device": target_device}
    return HuggingFaceEmbeddings(
        model_name=EMB_SBERT_MPNET_BASE,
        model_kwargs=embedding_kwargs,
    )


In [None]:
def create_flan_t5_base(load_in_8bit=False):
    """Build a text2text-generation pipeline around google/flan-t5-base.

    Args:
        load_in_8bit: When True, load the weights in 8-bit with
            ``device_map="auto"`` (this path needs the bitsandbytes and
            accelerate packages). When False, load the model normally and
            run it on the first CUDA device if one is available, otherwise
            on the CPU.

    Returns:
        A transformers pipeline that generates at most 100 new tokens per call.
    """
    model_name = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if load_in_8bit:
        # Quantized weights are placed by device_map at load time, so the
        # pipeline must not be asked to move the model afterwards.
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name, load_in_8bit=True, device_map="auto"
        )
        placement = {}
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        # Let the pipeline own device placement. An integer index (0 = first
        # GPU, -1 = CPU) is used so the model and its inputs end up on the
        # same device.
        placement = {"device": 0 if torch.cuda.is_available() else -1}

    # No model_kwargs here: they only feed from_pretrained(), which has
    # already been called above, so the previous max_length/temperature
    # entries had no effect on generation.
    return pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        **placement,
    )

In [None]:
def create_falcon_instruct_small(load_in_8bit=False):
    """Build a text-generation pipeline for tiiuae/falcon-7b-instruct.

    Args:
        load_in_8bit: Value placed under the "load_in_8bit" key of the
            model_kwargs handed to the pipeline.

    Returns:
        A transformers pipeline created with trust_remote_code=True and
        max_new_tokens=100.
    """
    checkpoint = "tiiuae/falcon-7b-instruct"

    # Options passed to the pipeline as model_kwargs.
    loading_options = {
        "device_map": "auto",
        "load_in_8bit": load_in_8bit,
        "max_length": 512,
        "temperature": 0.01,
        "torch_dtype": torch.bfloat16,
    }

    return pipeline(
        task="text-generation",
        model=checkpoint,
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        trust_remote_code=True,
        max_new_tokens=100,
        model_kwargs=loading_options,
    )

In [None]:
# Load the pdf using PyPDFLoader
pdf_path = "YOUR_PDF_FILE_PATH"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split documents and create text snippets: a character-based pass first,
# then a token-based pass over its output.
# NOTE(review): a recorded run of this cell emitted a tokenizer warning
# (1926 > 512 tokens). With k=4 retrieved chunks stuffed into one prompt the
# input can exceed the model's stated maximum; consider smaller chunks or a
# lower k.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base")
texts = text_splitter.split_documents(texts)

# Embedding
# Only one embedding factory exists in this notebook, so it is used directly.
persist_directory = config["persist_directory"]
embedding = create_sbert_mpnet()
vectordb = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory=persist_directory)

# Language Model
# Pick the factory that matches config["llm"], so the model that gets loaded
# always agrees with the prompt selection further down.
llm_factories = {
    LLM_FLAN_T5_BASE: create_flan_t5_base,
    LLM_FALCON_SMALL: create_falcon_instruct_small,
}
if config["llm"] not in llm_factories:
    raise ValueError(
        f"Unsupported LLM {config['llm']!r}; expected one of {sorted(llm_factories)}"
    )
load_in_8bit = config["load_in_8bit"]
llm = llm_factories[config["llm"]](load_in_8bit=load_in_8bit)
hf_llm = HuggingFacePipeline(pipeline=llm)
retriever = vectordb.as_retriever(search_kwargs={"k": 4})
qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="stuff", retriever=retriever)

# Default prompt for flan models; other models keep the chain's own prompt.
if config["llm"] == LLM_FLAN_T5_BASE:
    question_t5_template = """
    context: {context}
    question: {question}
    answer:
    """
    QUESTION_T5_PROMPT = PromptTemplate(
        template=question_t5_template, input_variables=["context", "question"]
    )
    qa.combine_documents_chain.llm_chain.prompt = QUESTION_T5_PROMPT

# Query
question = "YOUR_QUESTION"
qa.combine_documents_chain.verbose = True
# Include the retrieved source documents in the result alongside the answer.
qa.return_source_documents = True
qa({"query": question})

Token indices sequence length is longer than the specified maximum sequence length for this model (1926 > 512). Running this sequence through the model will result in indexing errors




[1m> Entering new StuffDocumentsChain chain...[0m

[1m> Finished chain.[0m


{'query': "what's is icici pru iprotect smart?",
 'result': 'a life insurance policy',
 'source_documents': [Document(page_content='Death Beneﬁt payout options \nICICI Pru iProtect Smart provides the ﬂexibility to take the Death Beneﬁt in a way that meets your ﬁnancial \nrequirement. The Death Beneﬁt payout option has to be selected by you at Policy inception. The Death Beneﬁt can \nbe paid to your beneﬁciary as:\n1. Lump sum: the entire beneﬁt amount is payable as a lump sum\n2. Income: 10% of the beneﬁt amount is payable every year for 10 years. This will be paid in equal monthly \ninstallments in advance at the rate of 0.83333% of Death Beneﬁt Amount. The beneﬁciary can also advance the \nﬁrst year’s income as lump sum. The monthly income will start from the subsequent month for 9 years at the rate \nof 0.80% of the Death Beneﬁt Amount.\n3. Lump sum and Income: The percentage of the Sum Assured to be paid out as lump sum is chosen at inception. \nThe balance Sum Assured will be paid