# RAG

In [None]:
import os
import re
import time

from dotenv import load_dotenv
from typing import Dict, List
from jinja2 import Template

from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents.base import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_openai.chat_models import ChatOpenAI 

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Load the variables defined in ../src/.env (path relative to the notebook's
# working directory); the `True` output below reports that the load succeeded.
# NOTE(review): presumably this supplies the OpenAI API key used by ChatOpenAI
# further down — confirm against the .env file.
load_dotenv(dotenv_path='../src/.env')

True

## Functions

In [None]:
def load_urls(urls_path: str) -> List[str]:
    """Read a newline-separated list of URLs from a UTF-8 text file.

    Args:
        urls_path: Path of the text file holding one URL per line.

    Returns:
        The lines of the file in their original order, with surrounding
        whitespace removed and empty or whitespace-only lines dropped.
    """
    cleaned: List[str] = []
    with open(urls_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            candidate = raw_line.strip()
            # Skip blank lines so stray empty rows never become URLs.
            if candidate:
                cleaned.append(candidate)
    return cleaned


def load_documents(urls_list: List[str]) -> List[Document]:
    """Fetch every URL and gather the loaded documents into one flat list.

    Args:
        urls_list: URLs to fetch; each one is given its own ``WebBaseLoader``.

    Returns:
        All documents returned by the loaders, concatenated in URL order.
        The total number of documents is also printed.
    """
    # First pass: load each URL (one list of documents per URL).
    per_url_results = []
    for url in urls_list:
        per_url_results.append(WebBaseLoader(url).load())

    # Second pass: flatten the per-URL lists while keeping their order.
    docs_list = []
    for loaded in per_url_results:
        docs_list.extend(loaded)

    print(f"len of documents :{len(docs_list)}")
    return docs_list


def preprocess_documents(documents: List[Document]) -> List[Document]:
    """Trim each document's text and collapse runs of newlines into one.

    The documents are modified in place: ``page_content`` is overwritten on
    every element and the same list object that was passed in is returned.

    Args:
        documents: Documents whose ``page_content`` should be cleaned.

    Returns:
        The input list itself, now holding the cleaned documents.
    """
    newline_run = re.compile(r'\n+')
    for document in documents:
        trimmed = document.page_content.strip()
        document.page_content = newline_run.sub('\n', trimmed)
    return documents


def split_documents(documents: List[Document], chunk_size: int, chunk_overlap: int) -> List[Document]:
    """Split documents into overlapping chunks and report how many were made.

    The splitter is built with
    ``RecursiveCharacterTextSplitter.from_tiktoken_encoder``, so the two size
    arguments are forwarded to a tiktoken-based length function.

    Args:
        documents: Documents to split.
        chunk_size: Chunk size forwarded to the splitter.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter.

    Returns:
        The chunks produced by the splitter. Their count is also printed.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    chunker = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)
    chunks = chunker.split_documents(documents)
    print(f"length of document chunks generated :{len(chunks)}")
    return chunks


def read_template_file(template_path: str) -> str:
    """Read a Jinja2 template file and render it with an empty context.

    Args:
        template_path: Path of the template file to read.

    Returns:
        The text produced by rendering the file's content through
        ``jinja2.Template`` without passing any variables.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 (as load_urls does) instead of relying on the
    # platform default encoding, which would garble non-ASCII prompt text on
    # systems whose locale is not UTF-8.
    with open(template_path, "r", encoding="utf-8") as file:
        template_content = file.read()

    return Template(template_content).render()


def create_prompt_template(template, input_variables: List[str]) -> PromptTemplate:
    """Build a ``PromptTemplate`` from template text and its variable names.

    Args:
        template: Template text handed to ``PromptTemplate`` unchanged.
        input_variables: Names of the variables the template expects.

    Returns:
        The constructed ``PromptTemplate``.
    """
    return PromptTemplate(template=template, input_variables=input_variables)


### Implement the Retriever

In [4]:
# Source URLs for the knowledge base, one per line; load_urls strips each line
# and ignores blank ones.
urls = load_urls('../data/urls.txt')

In [5]:
# Fetch every URL through WebBaseLoader (load_documents prints the total count).
documents = load_documents(urls)

len of documents :8


In [6]:
# NOTE: preprocess_documents rewrites page_content in place and returns the
# same list it was given, so `documents` and `preprocessed_documents` refer to
# the same, already-cleaned objects after this cell runs.
preprocessed_documents = preprocess_documents(documents)

In [7]:
# Chunk the cleaned documents. split_documents builds its splitter with
# from_tiktoken_encoder, so 1024 / 200 are passed to a tiktoken-based length
# function rather than counted as raw characters.
splitted_documents = split_documents(preprocessed_documents, chunk_size=1024, chunk_overlap=200)

length of document chunks generated :156


In [8]:
# Embedding model used to build the vector index below.
# NOTE(review): the model name carries an "en" tag while the sample questions
# in this notebook are written in Spanish — verify retrieval quality or
# consider a multilingual model.
embedding_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|██████████| 5/5 [00:05<00:00,  1.09s/it]


In [9]:
# Build a FAISS vector store from the document chunks, using `embedding_model`
# to embed them.
vectorstore = FAISS.from_documents(
    documents=splitted_documents,
    embedding=embedding_model
)

In [10]:
# Expose the vector store as a retriever; k=5 is passed through as a search
# argument (number of chunks requested per query).
retriever = vectorstore.as_retriever(search_kwargs={"k":5})

In [12]:
# Sample query (in Spanish) against the retriever; the result is stored in
# `retrieved_documents` but not displayed by this cell.
retrieved_documents = retriever.invoke("que necesito para entrar en Tailandia?")

### Implement the Router

In [17]:
# Load the router prompt; read_template_file renders the file through Jinja2
# with no variables and returns the resulting text.
template = read_template_file('../prompts/router_prompt.jinja')

In [20]:
# The router prompt declares a single input variable, `question`, which is
# supplied when the chain is invoked.
prompt = create_prompt_template(template, input_variables=['question'])

In [33]:
# Chat model that answers the router prompt.
# NOTE(review): no temperature is passed, so the library default applies —
# consider pinning it if routing decisions must be reproducible; confirm.
llm = ChatOpenAI(model='gpt-4o-mini')

In [39]:
# Router chain: fill the prompt, call the LLM, then parse the reply as JSON
# (the run below returns {'datasource': 'vectorstore'}).
question_router = prompt | llm | JsonOutputParser()

In [42]:
# Time one call of the router chain from invocation to parsed output.
question = "Que necesito para visitar Vietnam?"

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the wall clock and can jump if the
# system time is adjusted while the request is in flight.
start = time.perf_counter()
print(question_router.invoke({"question": question}))
end = time.perf_counter()

print(f"The time required to generate response by Router Chain in seconds:{end - start}")


{'datasource': 'vectorstore'}
The time required to generate response by Router Chain in seconds:0.48729753494262695
