# Build RAG pipeline using Open Source Large Languages

In the notebook we will build a Chat with Website use cases using Zephyr 7B model

## Installation

In [None]:
!pip install langchain faiss-cpu sentence-transformers chromadb
!pip install -q faiss-cpuw

Collecting langchain
  Downloading langchain-0.1.7-py3-none-any.whl (815 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.9/815.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.4.22-py3-none-any.whl (509 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)


## Import RAG components required to build pipeline

In [None]:
from langchain.llms import HuggingFaceHub
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.chains import RetrievalQA, LLMChain

## Setup HuggingFace Access Token

- Log in to [HuggingFace.co](https://huggingface.co/)
- Click on your profile icon at the top-right corner, then choose [“Settings.”](https://huggingface.co/settings/)
- In the left sidebar, navigate to [“Access Token”](https://huggingface.co/settings/tokens)
- Generate a new access token, assigning it the “write” role.


In [None]:
import os
from getpass import getpass

HF_TOKEN = getpass("HF Token:")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN

HF Token:··········


## External data/document - ETL

In [None]:
# import nest_asyncio

# nest_asyncio.apply()

In [None]:
# WEBSITE_URL = "https://tarunjain.netlify.app/"

In [None]:
# loader = WebBaseLoader(WEBSITE_URL)
# loader.requests_per_second = 1
# docs = loader.aload()

In [None]:
from langchain.document_loaders import HuggingFaceDatasetLoader
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ibis-framework 7.1.0 requires pyarrow<15,>=2, but you have pyarrow 15.0.0 which is incompatible.[0m[31m
[0m

In [None]:
# Specify the dataset name and the column containing the content
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  # or any other column you're interested in

# Create a loader instance
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

# Load the data
data = loader.load()

# Display the first 15 entries
data[:2]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

[Document(page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."', metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}),
 Document(page_content='""', metadata={'instruction': 'Which is a species of fish? Tope or Rope', 'response': 'Tope', 'category': 'classification'})]

## Text Splitting - Chunking

In [None]:
# Create an instance of the RecursiveCharacterTextSplitter class with specific parameters.
# It splits text into chunks of 1000 characters each with a 150-character overlap
#(used in case the document is long, it should splitted into different smaller chunk)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=15)

# 'data' holds the text you want to split, split the text into documents using the text splitter.
docs = text_splitter.split_documents(data)
docs = docs[0:100] #taking 100 docs from the dataset
docs[0]

Document(page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based', metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'})

In [None]:
# chunks[1]

## Embeddings

In [None]:
# embeddings = HuggingFaceInferenceAPIEmbeddings(
#     api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5"
# )

from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    # model_name=modelPath,     # Provide the pre-trained model's path
    # model_kwargs=model_kwargs, # Pass the model configuration options
    # encode_kwargs=encode_kwargs # Pass the encoding options
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Vector Store - FAISS or ChromaDB

In [None]:
!pip install -q faiss-cpu
from langchain.vectorstores import FAISS

In [None]:
# vectorstore = Chroma.from_documents(chunks, embeddings)
vectorstore = FAISS.from_documents(docs, embeddings)

In [None]:
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x7c3f05077ee0>

In [None]:
query = "Where does Tarun work?"
search = vectorstore.similarity_search(query)

In [None]:
search[0].page_content

"romanized: D\\u00e2r es-Sel\\u00e2m, lit.\\u2009'Abode of Peace') or commonly known as Dar, is the"

## Retriever

In [None]:
retriever = vectorstore.as_retriever(
    search_type="mmr", #similarity
    search_kwargs={'k': 4}
)

In [None]:
retriever.get_relevant_documents(query)

[Document(page_content="romanized: D\\u00e2r es-Sel\\u00e2m, lit.\\u2009'Abode of Peace') or commonly known as Dar, is the", metadata={'instruction': "From the passage list down the areas for which Dar es Salaam is Tanzania's most prominent city. List the results in comma separated format.", 'response': 'arts, fashion, media, film, television, finance', 'category': 'information_extraction'}),
 Document(page_content='In July 2010, he returned to Japan and joined the J2 club Giravanz Kitakyushu. He played often as a', metadata={'instruction': 'When was Tomoaki Komorida born?', 'response': 'Tomoaki Komorida was born on July 10,1981.', 'category': 'closed_qa'}),
 Document(page_content='usually taking the form of a swindle that succeeds only if the superior side is', metadata={'instruction': 'If I have more pieces at the time of stalemate, have I won?', 'response': "No. \nStalemate is a drawn position. It doesn't matter who has captured more pieces or is in a winning position", 'category': 

## Large Language Model - Open Source

In [None]:
llm = HuggingFaceHub(
    # repo_id="gpt2",
    repo_id="google-t5/t5-base",
    model_kwargs={"temperature": 0.5, "max_length": 64,"max_new_tokens":512}
)

## Prompt Template and User Input (Augment - Step 2)

In [None]:
query = "Name the projects Tarun has worked on?"

prompt = f"""
 <|system|>
You are an AI assistant that follows instruction extremely well.
Please be truthful and give direct answers
</s>
 <|user|>
 {query}
 </s>
 <|assistant|>
"""

## RAG RetrievalQA chain

In [None]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever)

In [None]:
response = qa.run(prompt)

In [None]:
question = "Who is Thomas Jefferson?"
result = qa.run({"query": question})

In [None]:
result

'Die ursprüngliche Frage ist wie folgt: Wer ist Thomas Jefferson? Die ursprüngliche Frage ist wie folgt: Wer ist Thomas Jefferson?'

## Chain

In [None]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import ChatPromptTemplate

In [None]:
template = """
 <|system|>
You are an AI assistant that follows instruction extremely well.
Please be truthful and give direct answers
</s>
 <|user|>
 {query}
 </s>
 <|assistant|>
"""

In [None]:
prompt = ChatPromptTemplate.from_template(template)

In [None]:
rag_chain = (
    {"context": retriever,  "query": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
response = rag_chain.invoke("Name the projects Tarun has worked on?")

In [None]:
print(response)

realworld [[Datei:Tarun_Assistent.jpg|thumb|Tarun als Assistent]] [[Tarun]] ist ein [[Assistent]] der [[Föderation]].
