In [None]:
# Install or upgrade vLLM quietly (-q) before anything imports it.
# NOTE(review): no version is pinned, so a fresh run may pull a different vLLM release.
%pip install --upgrade vllm -q

In [None]:
# Install the RAG stack quietly (-q): LangChain, its community integrations, the PDF
# parser, the sentence-transformers embedding backend and the Chroma vector store.
# NOTE(review): versions are unpinned; consider pinning them for reproducible runs.
%pip install langchain langchain_community pypdf sentence-transformers chromadb -q

# RAG: Retrieval Augmented Generation

<a target="_blank" href="https://colab.research.google.com/github/juanhuguet/intro_to_llms/blob/main/intro_to_llms/03_local_rag_qa.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

![](https://python.langchain.com/assets/images/rag_indexing-8160f90a90a33253d0154659cf7d453f.png)

![](https://python.langchain.com/assets/images/rag_retrieval_generation-1046a4668d6bb08786ef73c56d4f228a.png)

# Importamos las funcionalidades de langchain

In [None]:
# Core LangChain building blocks. RetrievalQA is exposed by `langchain.chains`;
# `langchain_core` has no `chains` module, so the previous import raised ImportError.
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Third-party integrations (loader, embeddings, LLM wrapper, vector store) are all
# imported from `langchain_community`, the package installed above, instead of the
# deprecated `langchain.*` re-exports.
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import VLLM
from langchain_community.vectorstores import Chroma

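En esta versión todo se ejecuta en local (embeddings de HuggingFace y un LLM servido con vLLM), por lo que no hace falta configurar ninguna variable de entorno con la API key de OpenAI.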

## Preprocesamos el pdf

PyPDF loader se encarga automáticamente de leer y extraer el texto.

El splitter crea `chunks` de aprox. 1000 caracteres a partir del texto, con un overlap de 200 caracteres para no perder información entre fragmentos.

In [None]:
# Read the example policy PDF; PyPDFLoader extracts its text.
loader = PyPDFLoader("data/insurance_policy_example.pdf")

# Split into ~1000-character chunks with a 200-character overlap, as described in the
# text above. The original `...` placeholders are the Ellipsis object, which is not a
# valid size and makes the splitter fail.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Chunked documents ready to be embedded and indexed.
chunks = loader.load_and_split(text_splitter=text_splitter)

### Cargamos el documento en la base de datos vectorial.

Usamos Chroma, que está en memoria y no requiere setup, y calculamos los embeddings en local con un modelo de HuggingFace (`all-MiniLM-L6-v2`).

In [None]:
# Embedding model used to vectorise the chunks (and, later, the queries).
embedder = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

In [None]:
# Populate the vector store with the chunks and their embeddings.
docsearch = Chroma.from_documents(
    documents=chunks,
    embedding=embedder,
)

### Creamos el `retriever`.

Esta pieza permite, dada una query, calcular su embedding y recuperar los k chunks más relevantes para formar el contexto.

In [None]:
# MMR (maximal marginal relevance) retriever over the vector store.
# Search parameters must be passed through `search_kwargs`; given as direct keyword
# arguments to `as_retriever` they are not used as search parameters.
#   k       -> number of chunks returned to build the context
#   fetch_k -> number of candidates fetched before the MMR re-ranking
# 4 and 20 are LangChain's documented defaults; tune them for your document.
# `return_source_documents` is an option of the QA chain (set where the chain is
# built), not of the retriever, so it is not passed here.
retriever = docsearch.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 20},
)

### Creamos el reader

Una vez recuperamos el contexto, lo pasaremos a un LLM. Usaremos Mistral-7B-Instruct-v0.2 cuantizado con AWQ y servido en local con vLLM, que ofrece un buen ratio performance/coste.

In [None]:
# Local reader LLM: an AWQ-quantised Mistral 7B Instruct checkpoint run through vLLM.
reader = VLLM(
    model="TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
    trust_remote_code=True,  # mandatory for hf models
    # Sampling / generation settings.
    temperature=0.3,
    top_p=0.95,
    max_new_tokens=1000,
    stop=["\n\n"],  # stop sequence: a blank line
    # Engine options handed to vLLM; the quantization scheme matches the AWQ checkpoint.
    vllm_kwargs={
        "quantization": "awq",
        "max_model_len": 10000,
    },
)

## Creamos el prompt

Para poder controlar el comportamiento del modelo, creamos un prompt para dar instrucciones

In [None]:
# Instruction prompt for the reader. It declares two placeholders, `{context}` and
# `{question}`; presumably the QA chain fills them with the retrieved chunks and the
# user's query. The model is told to answer `not-in-text` when the context lacks the answer.
prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, don't try to make up an answer and answer `not-in-text`
Answer in english only.

Context:

{context}

Question:

{question}

Answer:"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

## Creamos la cadena de QA

Usamos el wrapper de langchain que se ocupa de orquestar el flujo de datos:

`query` -> `embedding` -> `retrieve context` -> `prompt completion` -> answer

In [None]:
# Assemble the question-answering chain from the retriever, the custom prompt and the
# reader LLM. The "stuff" chain type is used, and the chain is asked to return the
# source documents alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=reader,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

## Prueba del sistema

Vamos a ver cómo responde a la siguiente pregunta:

What are the issues that are not covered by the insurance ?

In [None]:
# Ask the question stated in the section text above. RetrievalQA reads its input from
# the "query" key; the original `...` placeholder is the Ellipsis object, not a question.
question = "What are the issues that are not covered by the insurance ?"
results = qa.invoke({"query": question})

In [None]:
# Show the generated answer, stored by the chain under the "result" key.
print(results["result"])

In [None]:
for  in results["source_documents"]: