# Importing Libraries

In [1]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from IPython.display import Markdown, display

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import shutil

In [3]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory,ChatMessageHistory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

# Loading Knowledge Base

In [4]:
# Helper that tags a loaded document with its knowledge-base category
def add_metadata(doc, doc_type):
    """Record `doc_type` under the "doc_type" key of `doc.metadata`.

    The document is modified in place and the same object is returned,
    so the call can be used inside a comprehension.
    """
    doc.metadata.update({"doc_type": doc_type})
    return doc

# Read every file explicitly as UTF-8 instead of relying on the platform default
text_loader_kwargs = {'encoding': 'utf-8'}

# Each sub-folder of the knowledge base is one document type.
# Keep directories only (a stray top-level file cannot be handed to DirectoryLoader)
# and sort them so the loading order does not depend on the filesystem.
folders = sorted(path for path in glob.glob("knowledge-base/*") if os.path.isdir(path))
if not folders:
    # Fail here with a clear message rather than building an empty vector store later on
    raise FileNotFoundError(
        f"No sub-folders found under 'knowledge-base/' (current directory: {os.getcwd()})"
    )

# Load all markdown files of each folder, tagging every document with its folder name
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    documents.extend(add_metadata(doc, doc_type) for doc in folder_docs)

In [5]:
# Split the loaded documents into overlapping chunks
text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=160)
chunks = text_splitter.split_documents(documents)

# Summarise what was produced: chunk count and the distinct document types
doc_types_found = {doc.metadata['doc_type'] for doc in documents}
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {doc_types_found}")

Created a chunk of size 1009, which is longer than the specified 800
Created a chunk of size 1225, which is longer than the specified 800
Created a chunk of size 890, which is longer than the specified 800
Created a chunk of size 1302, which is longer than the specified 800
Created a chunk of size 1318, which is longer than the specified 800
Created a chunk of size 1338, which is longer than the specified 800


Total number of chunks: 57
Document types found: {'model', 'motoboys', 'company', 'services'}


# Vector Storage

In [6]:
# Open-source embeddings: a sentence-transformers model loaded through
# HuggingFaceEmbeddings (OpenAIEmbeddings is imported above but not used here).
# NOTE(review): the captured output below this cell echoes the constructor line,
# which looks like a deprecation warning for this import path - confirm.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

  embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)


In [7]:
# Directory (relative to the working directory) passed to Chroma as persist_directory
db_name = "vector_db"

In [8]:
# Start from an empty collection when a persisted store already exists.
# The reset goes through Chroma itself rather than deleting the directory:
# removing the files first would leave delete_collection() acting on a freshly
# created empty store, and on a re-run in the same kernel it would pull the
# files out from under a Chroma client that may still be open.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [9]:
# Embed every chunk and persist the result under `db_name`
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)

# Report how many entries ended up in the underlying collection
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

Vectorstore created with 57 documents


In [10]:
# Inspect the stored vectors: how many there are and how long each one is
collection = vectorstore._collection
count = collection.count()

# A single stored embedding is enough to read off the dimensionality
first_entry = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_entry["embeddings"][0]
dimensions = len(sample_embedding)

print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 57 vectors with 384 dimensions in the vector store


# Visualizing the Vector Store

In [14]:
# Prework: pull every stored embedding together with its text and metadata
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` (previously the list of loaded documents) to the
# 'documents' field returned by the collection; the plotting cell slices these
# entries as text for the hover labels.
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# One marker colour per document type. A dict lookup with a fallback keeps the
# plot working if the knowledge base gains a type that is not listed here.
doc_type_colors = {'model': 'blue', 'motoboys': 'green', 'services': 'red', 'company': 'orange'}
colors = [doc_type_colors.get(t, 'gray') for t in doc_types]

In [18]:
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding).
# random_state is fixed so that re-running the cell gives the same layout.
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per stored vector, coloured by document type,
# with the type and the first 100 characters of the text shown on hover
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    # `scene` only configures 3D plots; a 2D scatter takes its axis titles
    # at the top level of the layout.
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show(renderer="iframe_connected")

The figure visualizes the distribution of the high-dimensional embeddings in a 2D space using t-SNE. Each point represents one document chunk from the vector store, and its color indicates the chunk's document type. Points that appear close together are likely to be semantically similar, which helps us identify clusters and relationships between categories.

# LangChain

In [19]:
# Name of the chat model; defined once so the LLM setup below has a single source of truth
MODEL = "llama3.2"

# create a new Chat with the model above, reached through an OpenAI-compatible
# endpoint on localhost (presumably a local Ollama server - confirm it is running)
llm = ChatOpenAI(temperature=0.7, model_name=MODEL, base_url='http://localhost:11434/v1', api_key='ollama')

# set up the conversation memory for the chat; it is stored under 'chat_history'
# and shared by every call made through the chain below
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [None]:
# Smoke test: ask the chain one question and print the answer as plain text

query = "Please explain the System Flow for Demand Forecasting process in a couple of sentences"
result = conversation_chain.invoke({"question": query})
answer_text = result["answer"]
print(answer_text)

In [None]:
# Second question on the same chain (it was built with a conversation memory);
# this time the answer is rendered as Markdown instead of printed
query = "Please tell me all the endpoints used in all process"
result = conversation_chain.invoke({"question": query})
answer_markdown = Markdown(result["answer"])
display(answer_markdown)

# Chat Interface - Gradio

In [20]:
def chat(question, history):
    """Answer `question` through the conversation chain and return the answer text.

    `history` is accepted because the chat interface passes it to the callback,
    but it is not read here: the chain was created with its own memory object.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [21]:
# Build the chat UI around `chat`, then start it and open it in the browser;
# `view` holds whatever launch() returns.
chat_interface = gr.ChatInterface(chat, type="messages")
view = chat_interface.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.
