# Objective
Build a system that allows any company employee to consult, in natural language, information from the internal knowledge base (e.g., how the company's main model works, details about motorcycle couriers and services).

# Importing Libraries

In [1]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from IPython.display import Markdown, display
import shutil
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory,ChatMessageHistory
from langchain.chains import ConversationalRetrievalChain
from langchain_huggingface import HuggingFaceEmbeddings

In [31]:
import logging

In [2]:
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import RetrievalQA
from langchain.chains import LLMChain
from langchain_core.runnables import RunnableParallel
from operator import itemgetter

In [3]:
from pydantic import BaseModel, Field
from datetime import datetime
import json
from pathlib import Path
from threading import Thread
import json

# Loading Knowledge Base

In [4]:
def add_metadata(doc, doc_type):
    """Label a loaded document with the knowledge-base category it belongs to.

    Args:
        doc: Object exposing a mutable ``metadata`` mapping.
        doc_type: Category label stored under the ``"doc_type"`` key.

    Returns:
        The same ``doc`` object, updated in place.
    """
    doc.metadata.update(doc_type=doc_type)
    return doc

# Read every Markdown file as UTF-8 instead of the platform's default encoding
text_loader_kwargs = {'encoding': 'utf-8'}

# Each sub-folder of the knowledge base is one document category.
# Plain files sitting next to the folders are skipped (DirectoryLoader only
# accepts directories) and the list is sorted so the load order is the same
# on every run.
folders = sorted(path for path in glob.glob("knowledge-base/*") if os.path.isdir(path))

# Load the Markdown files of each category and tag them with the folder name
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    documents.extend(add_metadata(doc, doc_type) for doc in folder_docs)

# Fail here, with a clear message, rather than later while building the vector store
if not documents:
    raise FileNotFoundError(
        "No Markdown documents found under 'knowledge-base/' - check the working directory"
    )

In [5]:
# Split the loaded documents into overlapping character-based chunks
text_splitter = CharacterTextSplitter(chunk_overlap=160, chunk_size=800)
chunks = text_splitter.split_documents(documents)

# Report how many chunks were produced and which categories were loaded
print(
    f"Total number of chunks: {len(chunks)}",
    f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}",
    sep="\n",
)

Created a chunk of size 1302, which is longer than the specified 800
Created a chunk of size 1318, which is longer than the specified 800
Created a chunk of size 1338, which is longer than the specified 800
Created a chunk of size 890, which is longer than the specified 800
Created a chunk of size 1009, which is longer than the specified 800
Created a chunk of size 1225, which is longer than the specified 800


Total number of chunks: 57
Document types found: {'motoboys', 'model', 'company', 'services'}


# Vector Storage


In [6]:
# Open-source sentence-embedding model, loaded through the Hugging Face integration.
# This object is what the Chroma cells below use to embed the chunks.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

2025-07-20 20:38:00.190397: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-20 20:38:00.193415: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-20 20:38:00.201200: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753054680.217318   11396 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753054680.222104   11396 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753054680.235061   11396 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [7]:
# Directory passed to Chroma as persist_directory for the vector store
db_name = "vector_db"

In [8]:
# Start from an empty collection when a previous run left a store on disk.
# The collection is dropped through Chroma itself instead of deleting the
# directory first: removing the files beforehand makes Chroma re-create an
# empty store only to delete it again, and pulling the files out from under a
# client that is already open in this kernel can break later writes.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [9]:
# Embed every chunk and persist the resulting vectors under db_name
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# The "documents" counted here are the chunks passed in above.
# _collection is an underscore-prefixed attribute of the wrapper, used only to report the count.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 57 documents


In [10]:
# Inspect the stored vectors: how many there are and how many dimensions each has
collection = vectorstore._collection
count = collection.count()

# Fetch a single record together with its embedding; its length is the embedding dimensionality
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 57 vectors with 384 dimensions in the vector store


# Visualizing the Vector Store

In [None]:
# Pre-work: pull everything needed for plotting out of the Chroma collection
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` from the loaded document objects to the plain
# chunk texts; re-run the loading cell before re-running the chunking cell.
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# One colour per knowledge-base category. A mapping with a fallback keeps the
# plot working when a new folder is added to the knowledge base; the previous
# positional list lookup raised ValueError for any unknown category.
doc_type_colors = {'model': 'blue', 'motoboys': 'green', 'services': 'red', 'company': 'orange'}
colors = [doc_type_colors.get(t, 'gray') for t in doc_types]

In [None]:
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding).
# random_state is fixed so the layout is the same on every run.
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per stored chunk, coloured by category.
# The hover text shows the category and the first 100 characters of the chunk.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles use the 2D layout keys. `scene` configures 3D plots only, so
# titles placed there are never rendered on a go.Scatter figure.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show(renderer="iframe_connected")

The figure visualizes the distribution of the high-dimensional chunk embeddings in a 2D space using t-SNE. Each point represents one chunk of a knowledge-base document, and its color indicates the category (folder) the chunk came from. Points that appear closer together are likely to be semantically similar, helping us identify clusters and relationships between categories.

# LangChain

In [22]:
# Model name sent to the chat endpoint; the commented line is an alternative to try
# MODEL = "llama3.2"
MODEL = "phi3"

# Chat client for MODEL. The OpenAI-compatible client is pointed at a server on
# localhost (presumably Ollama, given the port and the api_key value — confirm),
# so api_key looks like a placeholder rather than a real secret.
llm = ChatOpenAI(temperature=0.7, model_name=MODEL, base_url='http://localhost:11434/v1', api_key='ollama')

# Conversation memory, in case we want the LLM to respond considering previous questions.
# NOTE(review): `memory` is not passed to the chain created below, and every call in
# this notebook sends "chat_history": [], so each question is answered without prior context.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key="answer")

# The retriever is an abstraction over the vector store used during RAG; k=3 requests three chunks per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Putting it together: the conversation chain combines the LLM with the retriever
# and is asked to return the retrieved source documents alongside the answer
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever,return_source_documents=True, output_key="answer")

In [12]:
%%time
# Smoke test 1: a stand-alone question about the forecasting model.
# An empty chat history is sent, and the answer is rendered as Markdown.
query = "Please explain the System Flow for Demand Forecasting process in a couple of sentences"
result = conversation_chain.invoke({"question": query,"chat_history": []})
display(Markdown(result["answer"]))

The demand forecast system uses historical sales data from various cities to predict future service slot needs. It employs SARIMA models, considering factors like order volume and seasonality, then identifies top markets while providing scheduling insights for planning purposes.

CPU times: user 94.7 ms, sys: 564 µs, total: 95.2 ms
Wall time: 16.6 s


In [13]:
%%time
# Smoke test 2: a stand-alone question about one person in the knowledge base.
# An empty chat history is sent, and the answer is rendered as Markdown.
query = "Please tell me Diego Fernandes key responsabilities, in English"
result = conversation_chain.invoke({"question": query,"chat_history": []})
display(Markdown(result["answer"]))

Key Responsibilities of Diego Fernandes as a Delivery Driver at Correios:
1. Deliver mail and packages to intended recipients ensuring accurate delivery.
2. Complete necessary paperwork associated with each delivery promptly.
3. Maintain an organized system for handling parcels, potentially contributing positively in the eyes of management based on past feedback regarding his attention to package organization.

CPU times: user 78.5 ms, sys: 1.72 ms, total: 80.2 ms
Wall time: 29.8 s


# Feedback Saving

In [14]:
class FeedbackEntry(BaseModel):
    """One user rating of a chatbot answer; serialized as one line of the feedback log."""
    # Creation time, filled in automatically when the entry is built.
    # NOTE(review): datetime.utcnow returns a naive datetime and is deprecated since
    # Python 3.12 — consider a timezone-aware default such as datetime.now(timezone.utc).
    timestamp: datetime = Field(default_factory=datetime.utcnow)
    # Question the user asked
    user_query: str
    # Answer the chain produced for that question
    model_response: str
    # Rating label; the UI buttons send "thumbs_up" or "thumbs_down"
    feedback: str
    # Text of the chunks retrieved for the answer
    source_documents: list[str]

In [39]:
# JSON Lines file that collects one feedback record per line
FEEDBACK_LOG = Path("feedback_log.jsonl")

def save_feedback_async(entry: "FeedbackEntry"):
    """Append ``entry`` to ``FEEDBACK_LOG`` on a background thread.

    Args:
        entry: Object exposing ``model_dump_json()``; its JSON text becomes one
            line of the log.

    Returns:
        The started ``Thread``. Callers that need the record to be on disk can
        ``join()`` it; callers that do not care may ignore the return value.
    """
    def _save():
        try:
            # Explicit UTF-8 so accented text is written the same way on every
            # platform and matches what the metrics cell reads back.
            with FEEDBACK_LOG.open("a", encoding="utf-8") as f:
                f.write(entry.model_dump_json() + "\n")
        except OSError:
            # Keep the UI alive, but leave a trace of the failed write
            logging.exception("Feedback could not be saved")
        else:
            # Reported only once the record has actually been written
            logging.info("Feedback Saved")

    worker = Thread(target=_save)
    worker.start()
    return worker

In [52]:
# Most recent question/answer pair; register_feedback reads it when a thumbs button is clicked.
# NOTE(review): this is module-level state, so it is shared by everything that calls
# generate_response in this kernel.
last_interaction = {"query": "", "answer": "", "sources": []}

def generate_response(message, history):
    """Answer ``message`` with the RAG chain and update the chat widgets.

    Args:
        message: Text typed by the user.
        history: Chat history as a list of ``{"role", "content"}`` dicts;
            ``None`` is treated as an empty history.

    Returns:
        A 3-tuple ``("", updated_history, rag_list)``: an empty string that
        clears the input box, the history extended with the new user and
        assistant messages, and a numbered list (one per line) with the file
        names of the first two retrieved chunks.
    """
    # .invoke is the call style used by the other cells; calling the chain
    # object directly is the deprecated form.
    result = conversation_chain.invoke({"question": message, "chat_history": []})
    answer = result["answer"]
    source_docs = result.get("source_documents") or []

    # File names of the first two retrieved chunks, for the "Relevant Documents" box
    doc_names = []
    for i, doc in enumerate(source_docs[:2], 1):
        source = doc.metadata.get("source", f"Documento {i}")
        # Treat both separators alike so backslash-separated paths are also
        # reduced to the bare file name.
        filename = source.replace("\\", "/").split("/")[-1]
        doc_names.append(f"{i}. {filename}")
    rag_list = "\n".join(doc_names)

    # Remember this exchange so a later thumbs click can be attributed to it
    last_interaction.update(
        query=message,
        answer=answer,
        sources=[doc.page_content for doc in source_docs],
    )

    # Work on a copy: tolerates a missing history and leaves the caller's list untouched
    history = list(history or [])
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": answer})
    return "", history, rag_list

def register_feedback(feedback_type):
    """Store the user's rating of the most recent answer.

    Args:
        feedback_type: Rating label, e.g. ``"thumbs_up"`` or ``"thumbs_down"``.

    Returns:
        A status message for the "Feedback Status" box.
    """
    # Nothing has been asked or answered yet: writing a record now would add an
    # empty row to the log and distort the metrics computed from it.
    if not (last_interaction["query"] or last_interaction["answer"]):
        return "No answer to rate yet - ask a question first."

    entry = FeedbackEntry(
        user_query=last_interaction["query"],
        model_response=last_interaction["answer"],
        feedback=feedback_type,
        source_documents=last_interaction["sources"]
    )
    save_feedback_async(entry)
    return f"Feedback '{feedback_type}' registered sucessfully."

# Chat Interface - Gradio

In [60]:
# Simplest interface: Gradio's built-in chat widget in front of the RAG chain
def chat(question, history):
    """Return the chain's answer to ``question``.

    ``history`` is supplied by Gradio but not used: each question is sent to
    the chain with an empty chat history.
    """
    chain_inputs = {"question": question, "chat_history": []}
    return conversation_chain.invoke(chain_inputs)["answer"]
view = (
    gr.ChatInterface(chat, type="messages")
    .launch(inbrowser=False)
)

* Running on local URL:  http://127.0.0.1:7873
* To create a public link, set `share=True` in `launch()`.


In [58]:
# Richer user interface: chat on the left, feedback buttons and retrieved
# document names on the right. Components are laid out in the order they are
# created inside the nested Row/Column contexts.
with gr.Blocks() as ui:
    gr.Markdown("## 💬 LLM Deliveries Internal Chat")

    with gr.Row():
        with gr.Column(scale=4):  # Chat area
            chatbot = gr.Chatbot(type="messages")

            with gr.Row():  # Question box and send button
                msg = gr.Textbox(placeholder="Type your question...",  show_label=False, scale=9)
                with gr.Column(scale=1, min_width=60):
                    send_btn = gr.Button("➤", size="sm")

        with gr.Column(scale=1):  # Side panel: feedback and sources
            gr.Markdown("### Feedback")
            thumbs_up = gr.Button("👍 (Good Answer)")
            thumbs_down = gr.Button("👎 (Bad Answer)")
            feedback_status = gr.Textbox(label="Feedback Status")

            gr.Markdown("### Relevant Documents")
            rag_docs = gr.Textbox(lines=6, interactive=False, label=None)

    # Clicking send and pressing Enter trigger the same handler; its three return
    # values go to the input box, the chat window and the documents box.
    send_btn.click(generate_response, inputs=[msg, chatbot], outputs=[msg, chatbot, rag_docs])
    msg.submit(generate_response, inputs=[msg, chatbot], outputs=[msg, chatbot, rag_docs])
    # The feedback buttons take no inputs; each lambda fixes the label that gets recorded
    thumbs_up.click(fn=lambda: register_feedback("thumbs_up"), outputs=feedback_status)
    thumbs_down.click(fn=lambda: register_feedback("thumbs_down"), outputs=feedback_status)


In [59]:
# Start the Gradio server for the feedback-enabled `ui` Blocks app
ui.launch()

* Running on local URL:  http://127.0.0.1:7872
* To create a public link, set `share=True` in `launch()`.




# Metrics

In [63]:
import pandas as pd

In [67]:
# Load the feedback log (JSON Lines: one record per line) into a DataFrame.
# This is the same file name that save_feedback_async appends to.
df = pd.read_json(Path("feedback_log.jsonl"), lines=True)

In [68]:
# Percentage of rated answers that received a thumbs up.
# Labels other than thumbs_up / thumbs_down map to NaN, which mean() ignores.
df["feedback_binario"] = df["feedback"].map({"thumbs_up": 1, "thumbs_down": 0})
correct_answer_rate = df["feedback_binario"].mean() * 100

# Print the metric, with the number of ratings behind it, so the cell shows a result
rated_answers = int(df["feedback_binario"].notna().sum())
print(f"Correct answer rate: {correct_answer_rate:.1f}% ({rated_answers} rated answers)")