
# Implementation of RAG, using:
* Langchain
* FAISS
* Sentence-transformers
* an LLM: we will provide examples with Flan-alpaca-large, Zephyr-7b-beta and DeciLM-7b



## Install required dependencies
The Langchain library is evolving very fast. If you use a version other than the one pinned below, some pieces of code may need to be changed.

In [None]:
#Install all the dependencies
#langchain, sentence-transformers, torch and transformers carry version constraints;
#pypdf, huggingface_hub and accelerate are installed unpinned.
!pip install  langchain~=0.0.352
!pip install  pypdf
!pip install  sentence-transformers==2.2.2
!pip install  huggingface_hub
!pip install  accelerate
!pip install  torch~=2.1.2
!pip install  transformers~=4.36.2

## Install FAISS
With GPU support, install faiss-gpu\
With CPU only, install faiss-cpu\
Uncomment the line below that fits your hardware and run

In [None]:
#Installs the GPU build of FAISS; on a CPU-only machine, comment this line out
#and uncomment the faiss-cpu line instead.
!pip install faiss-gpu
#!pip install faiss-cpu

## Import modules

In [None]:
#Langchain modules
from langchain import document_loaders as dl
from langchain import embeddings
from langchain import text_splitter as ts
from langchain import vectorstores as vs
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.runnable import RunnableParallel
from langchain.prompts import PromptTemplate
from operator import itemgetter
#Torch + transformers
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
#Other useful modules
import re
import time


## Load the document and chunk it
Upload your document beforehand.\
Use the Langchain text_splitter to make chunks. Each chunk has a `page_content` attribute and a `metadata` attribute, which includes the document name and the page.

In [None]:
#Path of the PDF to index (upload the file beforehand)
document_path ="quantum-mckinsey.pdf"


def split_doc(document_path, chunk_size=500, chunk_overlap=20):
    """Load a PDF and cut it into chunks.

    Args:
        document_path: Path of the PDF file handed to ``PyPDFLoader``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The result of ``split_documents`` applied to the loaded document.
    """
    pages = dl.PyPDFLoader(document_path).load()
    splitter = ts.RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents=pages)

#Chunk the document with the default chunk size and overlap,
#then print every chunk (content and metadata) for inspection
document_splitted = split_doc(document_path)
for doc in document_splitted:
    print(doc)

## Load the embedding model
Firstly, we will store the sentence-transformers model locally and then load it.\
If you are not running on GPU, change the device to cpu.

In [None]:
from sentence_transformers import SentenceTransformer

#Instantiate the embedding model from its "sentence-transformers/all-mpnet-base-v2" identifier
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
#Save the model locally, in a directory named 'sentence-transformers'
model.save('sentence-transformers')
#Drop the reference and clear torch's CUDA cache now that the model is stored on disk
del model
torch.cuda.empty_cache()

In [None]:
def load_embedding_model(device=None):
    """Build the Langchain embedding wrapper for the ``sentence-transformers`` model.

    Args:
        device: Torch device string such as ``'cuda:0'`` or ``'cpu'``. When
            omitted, ``'cuda:0'`` is used if CUDA is available and ``'cpu'``
            otherwise.

    Returns:
        A ``HuggingFaceEmbeddings`` instance created with
        ``model_name="sentence-transformers"`` and embedding normalization
        disabled.
    """
    if device is None:
        # Fall back to CPU instead of failing on machines without a GPU.
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return embeddings.HuggingFaceEmbeddings(
        model_name="sentence-transformers",
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': False},
    )

#Instantiate the embedding model once; this instance is the one passed to create_db
embedding_model_instance = load_embedding_model()

## Create a vector database to store embeddings
We will use FAISS to store the chunks and their embeddings. We also store the metadata.



In [None]:
def create_db(document_splitted, embedding_model_instance):
    """Build a FAISS vector store from the document chunks.

    Args:
        document_splitted: Iterable of chunks exposing ``page_content`` and
            ``metadata`` attributes.
        embedding_model_instance: Embedding model handed to
            ``FAISS.from_texts``.

    Returns:
        The store returned by ``FAISS.from_texts``, or ``None`` when building
        it raised an exception (the exception is printed, not re-raised).
    """
    model_vectorstore = vs.FAISS
    try:
        # Single pass over the chunks; each chunk's metadata is nested under
        # a 'source' key.
        pairs = [
            (chunk.page_content, {'source': chunk.metadata})
            for chunk in document_splitted
        ]
        texts = [text for text, _ in pairs]
        metadatas = [meta for _, meta in pairs]
        return model_vectorstore.from_texts(texts, embedding_model_instance, metadatas)
    except Exception as error:
        print(error)
        return None

db = create_db(document_splitted, embedding_model_instance)
if db is None:
    #create_db prints the underlying error and returns None on failure;
    #stop here with a clear message instead of an AttributeError on None
    raise RuntimeError("Vector store creation failed; see the error printed above.")
#store the db locally for future use
db.save_local('db.index')

# Load the large language model
We will provide here different options of LLMs. If you are running this notebook on Google Colab and do not have enough RAM or GPU memory, use the flan-alpaca-large model. Otherwise, choose between Zephyr-7b-beta and DeciLM-7b.
You can of course use any other LLM available on Huggingface.\
**In any case, only use one of the 3 following models**

## Flan-alpaca-large
https://huggingface.co/declare-lab/flan-alpaca-large

In [None]:
#Save the model locally.
from transformers import AutoModelForSeq2SeqLM
#Load the tokenizer and the seq2seq model (float16 weights) from the "declare-lab/flan-alpaca-large" identifier
tokenizer = AutoTokenizer.from_pretrained("declare-lab/flan-alpaca-large")
model = AutoModelForSeq2SeqLM.from_pretrained("declare-lab/flan-alpaca-large", low_cpu_mem_usage=True, torch_dtype=torch.float16)
#Write the model (max_shard_size "1000MB") and the tokenizer to separate local directories
model.save_pretrained('flan-alpaca-large-model', max_shard_size="1000MB")
tokenizer.save_pretrained('flan-alpaca-large-tokenizer')
#Drop both references and clear torch's CUDA cache
del model
del tokenizer
torch.cuda.empty_cache()

In [None]:
#Create a pipeline with the local version of the model
#The tokenizer and model are read from the 'flan-alpaca-large-tokenizer' and 'flan-alpaca-large-model' directories
tokenizer = AutoTokenizer.from_pretrained("flan-alpaca-large-tokenizer")
model = AutoModelForSeq2SeqLM.from_pretrained("flan-alpaca-large-model", low_cpu_mem_usage=True, torch_dtype=torch.float16)
#text2text-generation pipeline on device "cuda:0" with max_new_tokens=1000
pipe = pipeline(task="text2text-generation", model=model,tokenizer=tokenizer, device="cuda:0", max_new_tokens=1000)

## Zephyr-7b-beta
https://huggingface.co/HuggingFaceH4/zephyr-7b-beta

In [None]:
#Save the model locally.
#Load the tokenizer and the causal LM (float16 weights) from the "HuggingFaceH4/zephyr-7b-beta" identifier
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", low_cpu_mem_usage=True, torch_dtype=torch.float16)
#Write the model (max_shard_size "1000MB") and the tokenizer to separate local directories
model.save_pretrained('zephyr-7b-beta-model', max_shard_size="1000MB")
tokenizer.save_pretrained('zephyr-7b-beta-tokenizer')
#Drop both references and clear torch's CUDA cache
del model
del tokenizer
torch.cuda.empty_cache()

In [None]:
#Create a pipeline with the local version of the model
#The tokenizer and model are read from the 'zephyr-7b-beta-tokenizer' and 'zephyr-7b-beta-model' directories
tokenizer = AutoTokenizer.from_pretrained("zephyr-7b-beta-tokenizer")
model = AutoModelForCausalLM.from_pretrained("zephyr-7b-beta-model", low_cpu_mem_usage=True, torch_dtype=torch.float16)
#text-generation pipeline on device "cuda:0" with max_new_tokens=1000
pipe = pipeline(task="text-generation", model=model,tokenizer=tokenizer, device="cuda:0", max_new_tokens=1000)

## DeciLM-7b
https://huggingface.co/Deci/DeciLM-7B

In [None]:
#Save the model locally.
tokenizer = AutoTokenizer.from_pretrained("Deci/DeciLM-7B")
#No .to(device) here: `device` is not defined in the visible notebook cells (it raised
#a NameError), and this cell only writes the model to disk, like the Flan-alpaca and
#Zephyr save cells. low_cpu_mem_usage=True matches those cells and the DeciLM load cell.
#trust_remote_code=True is kept from the original call.
model = AutoModelForCausalLM.from_pretrained("Deci/DeciLM-7B", low_cpu_mem_usage=True, torch_dtype=torch.float16, trust_remote_code=True)
#Write the model (max_shard_size "1000MB") and the tokenizer to separate local directories
model.save_pretrained('DeciLM-7b-model', max_shard_size="1000MB")
tokenizer.save_pretrained('DeciLM-7b-tokenizer')
#Drop both references and clear torch's CUDA cache
del model
del tokenizer
torch.cuda.empty_cache()

In [None]:
#Create a pipeline with the local version of the model
#The tokenizer and model are read from the 'DeciLM-7b-tokenizer' and 'DeciLM-7b-model' directories;
#trust_remote_code=True is passed both when loading the model and when building the pipeline
tokenizer = AutoTokenizer.from_pretrained("DeciLM-7b-tokenizer")
model = AutoModelForCausalLM.from_pretrained("DeciLM-7b-model", low_cpu_mem_usage=True, torch_dtype=torch.float16, trust_remote_code=True)
#text-generation pipeline on device "cuda:0" with max_new_tokens=1000
pipe = pipeline(task="text-generation", model=model,tokenizer=tokenizer, device="cuda:0", max_new_tokens=1000, trust_remote_code=True)

## Connect the pipeline with Langchain

In [None]:
#Use the pipeline in Langchain
#`pipe` is whichever pipeline was created in one of the three model sections.
#NOTE(review): the pipelines are built without a temperature setting; confirm that
#model_kwargs={'temperature':0} is actually applied when a ready-made pipeline is passed in.
llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0})

# Load a retriever, define prompt template and chains

In [None]:
#Question asked to the RAG pipeline
query = "What is quantum computing?"
#Retriever over the FAISS store, using the "similarity_score_threshold" search type with k=6 and score_threshold=0.01
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 6, 'score_threshold': 0.01})
#Standalone retrieval for inspection; the chain defined later receives the retriever itself, not this list
retrieved_docs = retriever.get_relevant_documents(query)

In [None]:
#Prompt text with two placeholders, {context} and {question}
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
Helpful Answer:"""
#Build the Langchain prompt object from the template string
rag_prompt_custom = PromptTemplate.from_template(template)

In [None]:
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

#First chain: build the prompt from the documents and the question,
#send it to the LLM and parse the output as a string
rag_chain_from_docs = (
    {
        "context": lambda inputs: format_docs(inputs["documents"]),
        "question": itemgetter("question"),
    }
    | rag_prompt_custom
    | llm
    | StrOutputParser()
)

#Second chain: run the retriever and pass the question through in parallel,
#then return the metadata of the retrieved documents together with the answer
rag_chain_with_source = RunnableParallel(
    {"documents": retriever, "question": RunnablePassthrough()}
) | {
    "documents": lambda inputs: [doc.metadata for doc in inputs["documents"]],
    "answer": rag_chain_from_docs,
}

# Query the LLM and postprocess the answer

In [None]:
#Start the timer, then run retrieval + generation through the chain
t0=time.time()
resp = rag_chain_with_source.invoke(query)
if resp['documents']:
    #Replace the run of newlines at the very end of the answer with a single space
    stripped_resp = re.sub(r"\n+$", " ", resp['answer'])
    print(stripped_resp)
    print('Sources',resp['documents'])
    print('Response time:', time.time()-t0)
else:
    print('No documents found')
