In [None]:
!pip install -q -U langchain langchain-huggingface langchain_community chromadb faiss-cpu transformers accelerate bitsandbytes langchain_core bs4 pymupdf

In [None]:
import sys
import pypdf
from langchain_community.document_loaders import WebBaseLoader,PyMuPDFLoader # Data Ingestion
import bs4 # Beautiful Soup for webscraping
from langchain.text_splitter import RecursiveCharacterTextSplitter #Document split and create chunks
from langchain_huggingface import HuggingFaceEmbeddings # Convert Doc into Vectors
from langchain.vectorstores import Chroma # Vector Database to store vectors / docs
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig,pipeline # To load model and tokenizer
import torch
from langchain_huggingface import HuggingFacePipeline # To Create Huggingface pipeline with langchain to create LLM Model
from langchain.chains import RetrievalQA # To make Vector DB as Retriever
from langchain_core.prompts import ChatPromptTemplate,PromptTemplate # To write prompt and template
from langchain.chains.combine_documents import create_stuff_documents_chain # To combine LLM and Prompt and create chain
from langchain.chains import create_retrieval_chain #To combine retriever and document chain for inferencing
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required so the model/tokenizer download further down is authorised - confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Data ingestion: read the research-paper PDF from the Kaggle input dataset.
pdf_path = '/kaggle/input/attention-research-paper/NIPS-2017-attention-is-all-you-need-Paper.pdf'
pdf_loader = PyMuPDFLoader(pdf_path)
text_document = pdf_loader.load()

In [None]:
# Cut the loaded document into chunks of up to 3000 characters,
# with a 100-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=3000,
)
documents = text_splitter.split_documents(text_document)

In [None]:
# Embedding model used to turn each chunk into a vector.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
)

In [None]:
# Create the vector DB: store the document chunks and their embeddings in Chroma.
#
# The store is persisted under `chroma_db`. Calling `Chroma.from_documents` again on an
# already-populated directory appends the same chunks a second time, so re-running this
# cell would make the retriever return duplicated passages. To keep the cell idempotent,
# an existing non-empty store is reopened instead of being rebuilt.
# Delete the `chroma_db` directory to force a rebuild (e.g. after changing the chunking
# parameters or the embedding model).
from pathlib import Path

persist_directory = 'chroma_db'
persist_path = Path(persist_directory)

if persist_path.is_dir() and any(persist_path.iterdir()):
    # Reopen the previously persisted collection with the same embedding function.
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
else:
    db = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory=persist_directory)

In [None]:
# Load the tokenizer and the causal LM with 4-bit (NF4) quantization; compute runs in float16.
model_id = "Shorya22/LLaMA-2-7B"

bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    quantization_config=bnb_config,
)

In [None]:
# Wrap the model and tokenizer in a transformers text-generation pipeline.
# Sampling is enabled with temperature 0.3; each call generates at most 512 new tokens.
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.3,
    max_new_tokens=512,
)

In [None]:
# Create the LangChain LLM by wrapping the transformers pipeline in HuggingFacePipeline
llm= HuggingFacePipeline(pipeline=pipe)

# RAG with the built-in RetrievalQA chain (default prompt, no custom prompt template):

In [None]:
# Expose the vector DB as a retriever and plug it, together with the LLM, into RetrievalQA.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True,
)

In [None]:
# Inferencing
# `Chain.run` is deprecated in LangChain; `invoke` is the supported entry point.
# RetrievalQA reads the question from the 'query' key and returns the generated text
# under 'result', so `result` is still a plain string as before.
query = 'What is attention?'
result = qa.invoke({'query': query})['result']

In [None]:
# Show only the text after the last 'Helpful Answer:' marker
# (the whole string is shown if the marker is absent).
_, _, answer_text = result.rpartition('Helpful Answer:')
print('Answer:', answer_text)

# RAG Pipeline + Prompt Template + LLM Chain:

In [None]:
# Prompt with two placeholders: {context} (retrieved chunks) and {input} (the user question).
template = """
Provide answer in bullet Points.
Always end the answer with "Thanks for asking!".

Context: {context}\n\n\n

Question: {input}

Response:
"""
prompt = PromptTemplate(
    input_variables=['context', 'input'],
    template=template,
)

In [None]:
# Build the document chain: combines the LLM with the prompt defined above.
document_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt,
)

In [None]:
# Create the retriever, then build the retrieval chain by merging it with the LLM document chain.
retriever = db.as_retriever()
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [None]:
# Run the retrieval chain; the question is passed under the 'input' key.
input_question = "What is attention?"
chain_inputs = {'input': input_question}
result = retrieval_chain.invoke(chain_inputs)

In [None]:
# Print the final segment of the generated text, using a run of three newlines as the separator.
answer_segments = result['answer'].split('\n\n\n')
print(answer_segments[-1])