In [None]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv()

In [5]:
llm = ChatOpenAI(model="gpt-3.5-turbo")

In [7]:
# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://medium.com/mercadolibre-tech/how-do-we-structure-a-data-team-here-at-mercado-libre-e7533f78cfb8",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("pw-post-body-paragraph no np gt nq b nr ns nt nu nv nw nx ny nz oa ob oc od oe of og oh oi oj ok ol gm bj", "pw-post-title gr gs gt be gu gv gw gx gy gz ha hb hc hd he hf hg hh hi hj hk hl hm hn ho hp hq hr hs ht bj", "pw-author-name be st su sv sw bj")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [8]:
### RAG bot

import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai

class RagBot:

    def __init__(self, retriever, model: str = "gpt-3.5-turbo"):
        self._retriever = retriever
        # Wrapping the client instruments the LLM
        self._client = wrap_openai(openai.Client())
        self._model = model

    @traceable()
    def retrieve_docs(self, question):
        return self._retriever.invoke(question)

    @traceable()
    def invoke_llm(self, question, docs):
        response = self._client.chat.completions.create(
            model=self._model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful AI code assistant with expertise in LCEL."
                    " Use the following docs to produce a concise code solution to the user question.\n\n"
                    f"## Docs\n\n{docs}",
                },
                {"role": "user", "content": question},
            ],
        )

        # Evaluators will expect "answer" and "contexts"
        return {
            "answer": response.choices[0].message.content,
            "contexts": [str(doc) for doc in docs],
        }

    @traceable()
    def get_answer(self, question: str):
        docs = self.retrieve_docs(question)
        return self.invoke_llm(question, docs)

rag_bot = RagBot(retriever)

In [9]:
response = rag_bot.get_answer("How many ways are there to structure a data team?")
response["answer"][:150]

'The documents provided offer insights into how Mercado Libre structures its data team but do not provide a definitive number of ways to structure a da'

In [11]:
response = rag_bot.get_answer("What is Data Mesh?")
print(response["answer"])

Data Mesh is a market concept that involves the decentralization of the production of official tables, and at Mercado Libre, it has been broadened to include the decentralization of dashboards and reports production as well. This approach means that if a team wants to make an analytical asset available to the entire company, they can do it by themselves without depending on a central team. This structure ensures better time-to-market and faster responses to business inquiries or queries by aligning technology with an organizational model that supports the autonomy of individual teams in creating and sharing analytical data across the company. Implemented since late 2022 at Mercado Libre, Data Mesh facilitates a more collaborative, effective, and efficient process in handling data and analytics within the organization.


In [12]:
response = rag_bot.get_answer("Who did write this article?")
print(response["answer"])

The article was written by Elissa Suzuki.
