<a href="https://colab.research.google.com/github/joshuaalpuerto/ML-guide/blob/main/RAG_hybrid_Mistral_7B_Instruct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies (Colab). Loading a GPTQ-quantized model requires the
# optimum (`pip install optimum`) and auto-gptq (`pip install auto-gptq`) libraries.
!pip install -q optimum --progress-bar off
!pip install -q auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  --progress-bar off # Use cu117 if on CUDA 11.7
# Pin transformers to a specific commit; this revision is needed to make Mistral work.
!pip install -q git+https://github.com/huggingface/transformers.git@72958fcd3c98a7afdc61f953aa58c544ebda2f79 --progress-bar off

# Retrieval stack: LangChain, FAISS (GPU build) and sentence-transformers.
!pip install -qU langchain Faiss-gpu sentence-transformers
!pip install -q jq # for json loader to work
!pip install ctransformers[gptq] #To use CTransformer from langchain and load gptq model

# !pip install -qU trl Py7zr
# rank_bm25 is required by LangChain's BM25Retriever.
# NOTE(review): PyPdf is presumably for PDF loading; no PDF loader appears in the visible cells.
!pip install -q rank_bm25
!pip install -q PyPdf

In [2]:
# Connect to Google Drive: mount it at /content/drive so the dataset stored there
# can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import langchain
from langchain.storage import InMemoryStore
from langchain.cache import InMemoryCache
from langchain.llms import CTransformers

# Generation settings handed to CTransformers through its `config` argument.
config = {'max_new_tokens': 1024, 'temperature': 0.1, 'repetition_penalty': 1.1}

# Hugging Face Hub repo id of the GPTQ-quantized Mistral 7B Instruct model.
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

# `llm` is a notebook-level global; the `inference` helper defined later reads it directly.
llm = CTransformers(model=model_name_or_path,
                    config=config)

# Install an in-memory LLM cache on the langchain module. It lives only as long as the
# kernel does.
langchain.llm_cache = InMemoryCache()

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Downloading model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)8014d/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Downloading (…)quantize_config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

Downloading (…)3b4f88014d/README.md:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading (…)4f88014d/config.json:   0%|          | 0.00/963 [00:00<?, ?B/s]

Downloading (…)8014d/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

In [None]:
from langchain.storage import InMemoryStore
from langchain.embeddings import CacheBackedEmbeddings,HuggingFaceEmbeddings

# Embeddings can be cached on the local file system (LocalFileStore) or in memory;
# a FAISS vector store is used for retrieval.
# store = LocalFileStore("./cache/")

# This notebook uses the in-memory store instead.
# NOTE: we used this as we are more familiar with it
store = InMemoryStore()

embed_model_id="thenlper/gte-large"

# Under the hood, HuggingFaceEmbeddings uses sentence-transformers.
core_embeddings_model = HuggingFaceEmbeddings(model_name=embed_model_id)

# Wrap the model in CacheBackedEmbeddings so that text which was already embedded is read
# back from `store` instead of being embedded again. The model id is used as the cache
# namespace.
# NOTE(review): depending on the LangChain version, query embeddings may not be cached
# by default - confirm against the CacheBackedEmbeddings documentation.
embedder = CacheBackedEmbeddings.from_bytes_store(core_embeddings_model,
                                                  store,
                                                  namespace=embed_model_id)

In [5]:
from langchain.document_loaders import JSONLoader

# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the record's country and answer into the document metadata.

    Passed to JSONLoader as ``metadata_func``. A key that is absent from
    ``record`` is stored as None. ``metadata`` is updated in place and the
    same dict object is returned.
    """
    for field in ("country", "answer"):
        metadata[field] = record.get(field)
    return metadata


# Load the Q&A dataset from the mounted Google Drive.
# `jq_schema='.[]'` iterates over the elements of the top-level JSON value
# (assumed to be an array of Q&A records - TODO confirm against the file).
# Each record's "question" field becomes the document's page content, and
# `metadata_func` attaches the record's country and answer as metadata.
loader = JSONLoader(
    file_path='/content/drive/MyDrive/datasets/qna-clean.json',
    jq_schema='.[]',
    content_key="question",
    metadata_func=metadata_func
)

# One document per record; the answer text is only available via doc.metadata['answer'].
faq_docs = loader.load()

In [6]:
from langchain.vectorstores import FAISS

# Create the FAISS vector store from the FAQ documents, using the cache-backed
# embedder to embed each document's page content (the question text).
vectorstore = FAISS.from_documents(faq_docs, embedder)

In [8]:
# NOT USED

from langchain.retrievers import BM25Retriever

# Sparse (BM25) retriever built over the same FAQ documents.
bm25_retriever = BM25Retriever.from_documents(faq_docs)
# Return the top 5 matches per query.
bm25_retriever.k=5


In [9]:
# NOT USED

from langchain.retrievers import EnsembleRetriever

# Hybrid search: combine the sparse BM25 retriever with a dense FAISS retriever
# (top 5 hits), giving both retrievers equal weight.
faiss_retriever = vectorstore.as_retriever(search_kwargs={"k":5})
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever,faiss_retriever],
                                       weights=[0.5,0.5])

In [52]:
from langchain import PromptTemplate

# Prompt using [INST] ... [/INST] instruction markers. The first [INST] block carries the
# agent guidelines, including the instruction to reply "OUT_OF_CONTEXT" when unsure; the
# second block carries the user's question and the retrieved context.
# `{question}` and `{context}` are the two placeholders filled in by PromptTemplate.
PROMPT_TEMPLATE = """<s> [INST] You act as a helpful immigration and relocation agent, responding to questions based on provided context on behalf of the user's assigned agent.

Follow these guidelines for the answer:
- Only answer information related to immigration, relocation, settle-in, cultural integration or similar fields related queries.
- Summarize the answer and make it clear, concise, grammatically correct and direct like a human agent would
- Don't disclose yourself as AI, automated tool or bot, answer like the user's agent would
- Never suggest contacting outside Jobbatical platform. Rewrite the answer to contact the user's agent instead
- write the answer using english language

If you're unsure about the answer:
- For questions that are specific about their own or family member's immigration case progress, reply with "OUT_OF_CONTEXT" to avoid providing incorrect information
- For unrelated questions which isn't generic to answer, reply with "OUT_OF_CONTEXT" to avoid hallucinating the answer
- If context is not enough to answer the users question reply with "OUT_OF_CONTEXT" [/INST] </s>

[INST] Question: {question}

Context:
{context}

Answer: [/INST]"""

# Names of the placeholders that appear in PROMPT_TEMPLATE.
input_variables = ['context', 'question']
custom_prompt = PromptTemplate(template=PROMPT_TEMPLATE,
                            input_variables=input_variables)

In [41]:
import langchain
from langchain.chains import LLMChain
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.callbacks import StdOutCallbackHandler

# A bit slower but this improves retrieval by using bruteforce similarity(cos_sim) against user query.
def filter_similar_docs_from_query(query, embeddings, retriever, similarity_threshold=0.90):
  """Retrieve documents for `query`, filter them by embedding similarity, and format the answers.

  The base `retriever` is wrapped in a ContextualCompressionRetriever whose compressor is an
  EmbeddingsFilter configured with `embeddings` and `similarity_threshold`.

  Args:
    query: The user's question.
    embeddings: Embedding model handed to the EmbeddingsFilter.
    retriever: Base retriever that supplies the candidate documents.
    similarity_threshold: Threshold handed to the EmbeddingsFilter (default 0.90).

  Returns:
    One "- <answer>" line per surviving document (taken from doc.metadata['answer']),
    joined with newlines; an empty string when no document survives.
  """
  similarity_gate = EmbeddingsFilter(
      embeddings=embeddings,
      similarity_threshold=similarity_threshold,
  )
  gated_retriever = ContextualCompressionRetriever(
      base_compressor=similarity_gate,
      base_retriever=retriever,
  )

  bullet_lines = []
  for hit in gated_retriever.get_relevant_documents(query):
    bullet_lines.append(f"- {hit.metadata['answer']}")
  return "\n".join(bullet_lines)

# Naive top_k retriever (there a chance it gets incorrect context)
def get_documents_as_context(query, retriever):
    """Fetch documents for `query` and format their answers as a bullet list.

    Args:
        query: The user's question, passed to `retriever.get_relevant_documents`.
        retriever: Any object exposing `get_relevant_documents(query)`.

    Returns:
        One "- <answer>" line per returned document (taken from
        doc.metadata['answer']), joined with newlines; an empty string when
        nothing is returned.
    """
    hits = retriever.get_relevant_documents(query)
    return "\n".join(f"- {hit.metadata['answer']}" for hit in hits)

def inference(question, context, prompt):
  """Run the LLM on one question with pre-retrieved context and return its answer.

  Args:
    question: The user's question; fills the prompt's `question` variable.
    context: Retrieved context text; fills the prompt's `context` variable.
    prompt: Prompt template handed to LLMChain; it must accept `context` and `question`.

  Returns:
    Whatever `LLMChain.predict` returns for the formatted prompt (the generated answer).

  Note:
    An unreachable `langchain.debug = True` used to follow the return statement and has
    been dropped. Set it at notebook level before calling this function if LangChain's
    debug output is wanted.
  """
  # Log generated info
  handler = StdOutCallbackHandler()
  # We use LLMChain + manual retrieval because the answers to our questions live in the
  # document metadata; there is no easy way to do that with LangChain's built-in chains.
  llm_chain = LLMChain(
      llm=llm,  # notebook-level model created in the model-loading cell
      prompt=prompt,
      verbose=True,  # prints the formatted prompt when the chain runs
      callbacks=[handler]
  )

  # Use the `question` argument. The previous version passed the notebook global `query`
  # here, so the parameter was silently ignored.
  return llm_chain.predict(context=context, question=question)

# Without hybrid search

In [64]:
%%time

# Example question and the country used to restrict retrieval.
# query = "do you have any update for my visa?"
query = "Me and my wife doesn't marriage certificate yet. Is that a problem?"
country = "Estonia"

# Dense retrieval only (no BM25): ask FAISS for 5 FAQ entries, filtered on the
# `country` metadata field.
retriever = vectorstore.as_retriever(search_kwargs={"k":5, 'filter': {'country': country} })
# Apply the embedding-similarity filter (default threshold 0.90) and join the answers
# of the surviving entries. Note that the raw embedding model is passed here, not the
# cache-backed `embedder`.
context = filter_similar_docs_from_query(query=query, embeddings=core_embeddings_model, retriever=retriever)

inference(question=query, context=context, prompt=custom_prompt)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m<s> [INST] You act as a helpful immigration and relocation agent, responding to questions based on provided context on behalf of the user's assigned agent.

Follow these guidelines for the answer:
- Only answer information related to immigration, relocation, settle-in, cultural integration or similar fields related queries.
- Summarize the answer and make it clear, concise, grammatically correct and direct like a human agent would
- Don't disclose yourself as AI, automated tool or bot, answer like the user's agent would
- Never suggest contacting outside Jobbatical platform. Rewrite the answer to contact the user's agent instead
- write the answer using english language

If you're unsure about the answer:
- For questions that are specific about their own or family member's immigration case progress, reply with "OUT_OF_CONTEXT" to avoid providing incorrect information
- For unrelated questions which isn't g

' OUT_OF_CONTEXT'

## Debugging

> NOTE: You don't need to run this