<a href="https://colab.research.google.com/github/kartiktongaria/PDFChatbot/blob/main/Chat_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install langchain
!pip install pypdf
!pip install tiktoken
!pip install sentence-transformers
!pip install chromadb

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [3]:
# Model identifiers shared by the configuration and the factory functions.
# Embedding model (passed as `model_name` to the embeddings wrapper).
EMB_SBERT_MPNET_BASE: str = "sentence-transformers/all-mpnet-base-v2"
# Seq2seq LLM; this is the value `config["llm"]` selects by default.
LLM_FLAN_T5_BASE: str = "google/flan-t5-base"
# Larger instruct-tuned LLM; same id that the Falcon factory loads.
LLM_FALCON_SMALL: str = "tiiuae/falcon-7b-instruct"

In [4]:
# Run-time settings read by the pipeline cell further down.
config = dict(
    # Directory handed to Chroma as `persist_directory`; None is passed through as-is.
    persist_directory=None,
    # Forwarded to the LLM factory as its `load_in_8bit` argument.
    load_in_8bit=False,
    # Identifier of the embedding model to use.
    embedding=EMB_SBERT_MPNET_BASE,
    # Identifier of the language model to use.
    llm=LLM_FLAN_T5_BASE,
)


In [5]:
def create_sbert_mpnet():
    """Create the sentence-embedding wrapper for ``EMB_SBERT_MPNET_BASE``.

    The embedding model is placed on ``"cuda"`` when
    ``torch.cuda.is_available()`` reports a GPU and on ``"cpu"`` otherwise.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with that device.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return HuggingFaceEmbeddings(
        model_name=EMB_SBERT_MPNET_BASE,
        model_kwargs={"device": target_device},
    )


In [6]:
def create_flan_t5_base(load_in_8bit=False):
    """Build a ``text2text-generation`` pipeline around Flan-T5 base.

    Args:
        load_in_8bit: When True, load the weights 8-bit quantized with
            ``device_map="auto"`` (per the Transformers docs this needs the
            optional ``accelerate`` and ``bitsandbytes`` packages). When
            False (default), load the model normally and move it to CUDA if
            available, otherwise keep it on CPU.

    Returns:
        A ``transformers`` pipeline that generates at most 100 new tokens
        per call.
    """
    model_name = LLM_FLAN_T5_BASE
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    placement_kwargs = {}
    if load_in_8bit:
        # Device placement is delegated to `device_map="auto"`; the model is
        # deliberately not moved with `.to()` and no explicit `device` is
        # given to the pipeline in this branch.
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name, load_in_8bit=True, device_map="auto"
        )
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # Tell the pipeline where the model lives so that input tensors are
        # moved to the same device as the weights.
        placement_kwargs["device"] = device

    return pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        # NOTE(review): `model_kwargs` look like load-time options; because
        # `model` is already instantiated they probably do not influence
        # generation — confirm before relying on max_length/temperature.
        model_kwargs={"max_length": 512, "temperature": 0.},
        **placement_kwargs,
    )

In [7]:
def create_falcon_instruct_small(load_in_8bit=False):
    """Build a ``text-generation`` pipeline for ``tiiuae/falcon-7b-instruct``.

    Args:
        load_in_8bit: Passed through unchanged inside ``model_kwargs``.

    Returns:
        The ``transformers`` pipeline, created with ``trust_remote_code=True``
        and ``max_new_tokens=100``.
    """
    checkpoint = "tiiuae/falcon-7b-instruct"
    falcon_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Options forwarded to the pipeline as `model_kwargs`.
    loading_options = {
        "device_map": "auto",
        "load_in_8bit": load_in_8bit,
        "max_length": 512,
        "temperature": 0.01,
        "torch_dtype": torch.bfloat16,
    }

    return pipeline(
        task="text-generation",
        model=checkpoint,
        tokenizer=falcon_tokenizer,
        trust_remote_code=True,
        max_new_tokens=100,
        model_kwargs=loading_options,
    )

In [17]:
# End-to-end PDF question answering: load the PDF, split it into chunks,
# index the chunks in Chroma, build a RetrievalQA chain and run one query.

# Load the PDF using PyPDFLoader

pdf_path = "ENTER_YOU_PDF_FILE _PATH.pdf"  # <<<----- ENTER YOUR PDF FILE PATH ----->>>

loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split documents and create text snippets: a character-based pass first,
# then a token-based pass over its output.
char_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
char_chunks = char_splitter.split_documents(documents)
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base")
texts = text_splitter.split_documents(char_chunks)
if not texts:
    # Fail early with a clear message instead of inside the vector store.
    raise ValueError(f"No text chunks could be produced from {pdf_path!r}; nothing to index.")

# Embedding: pick the factory that matches config["embedding"].
embedding_factories = {
    EMB_SBERT_MPNET_BASE: create_sbert_mpnet,
}
if config["embedding"] not in embedding_factories:
    raise ValueError(f"Unsupported embedding model: {config['embedding']!r}")
persist_directory = config["persist_directory"]
embedding = embedding_factories[config["embedding"]]()
vectordb = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory=persist_directory)

# Language Model: pick the factory that matches config["llm"], so the model
# that is loaded agrees with the prompt selection below.
llm_factories = {
    LLM_FLAN_T5_BASE: create_flan_t5_base,
    LLM_FALCON_SMALL: create_falcon_instruct_small,
}
if config["llm"] not in llm_factories:
    raise ValueError(f"Unsupported LLM: {config['llm']!r}")
load_in_8bit = config["load_in_8bit"]
llm = llm_factories[config["llm"]](load_in_8bit=load_in_8bit)
hf_llm = HuggingFacePipeline(pipeline=llm)
retriever = vectordb.as_retriever(search_kwargs={"k": 4})
qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="stuff", retriever=retriever)

# Default prompt for flan models
if config["llm"] in [LLM_FLAN_T5_BASE]:
    question_t5_template = """
    context: {context}
    question: {question}
    answer:
    """
    QUESTION_T5_PROMPT = PromptTemplate(
        template=question_t5_template, input_variables=["context", "question"]
    )
    # Replace the chain's default prompt with the context/question/answer template.
    qa.combine_documents_chain.llm_chain.prompt = QUESTION_T5_PROMPT

# Query

question = "ENTER_YOUR_QUESTION"  # <<<----- ENTER YOUR QUESTION ----->>>

qa.combine_documents_chain.verbose = True
qa.return_source_documents = True
# Last expression of the cell, so the chain's result is displayed as output.
qa({"query": question})