In [None]:
import os
import glob
import gradio as gr
import numpy as np
import plotly.graph_objects as go
from dotenv import load_dotenv
from sklearn.manifold import TSNE
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [2]:
# Switch LangSmith tracing on for this kernel process
os.environ.update(LANGSMITH_TRACING="true")

# Load settings from the .env file one directory up (presumably including OPENAI_API_KEY).
# Kept as the cell's last expression so its return value is displayed.
load_dotenv("../.env")

True

In [3]:
# Chat model identifier handed to ChatOpenAI
MODEL: str = "gpt-4o-mini"
# Directory in which the Chroma vector store is persisted
db_name: str = "vector_db"

In [4]:
# One sub-folder per document type. Plain files sitting next to the folders are skipped
# because DirectoryLoader only accepts directories, and sorting makes the load order
# repeatable (glob returns entries in arbitrary order).
folders = sorted(path for path in glob.glob("./knowledge-base/*") if os.path.isdir(path))

def add_metadata(doc, doc_type):
    """Tag a document with its type and hand the same object back.

    Args:
        doc: Object exposing a mutable ``metadata`` mapping; it is modified in place.
        doc_type: Value stored under the ``"doc_type"`` metadata key.

    Returns:
        The same ``doc`` instance, so the call can be used inside a comprehension.
    """
    doc.metadata.update(doc_type=doc_type)
    return doc

# Forwarded to TextLoader so every file is decoded as UTF-8
text_loader_kwargs = {"encoding": "utf-8"}

# Load the markdown files of each knowledge-base folder and tag them with the folder name
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    documents.extend(add_metadata(doc, doc_type) for doc in folder_docs)

# Split into overlapping chunks for embedding
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")
# Single quotes inside the f-string: reusing the outer double quote is a SyntaxError
# on Python versions before 3.12.
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")

Created a chunk of size 1088, which is longer than the specified 1000


Total number of chunks: 123
Document types found: {'contracts', 'company', 'products', 'employees'}


In [5]:
embeddings = OpenAIEmbeddings()

# Start from a clean slate: drop the collection left behind by an earlier run
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

# Embed every chunk and persist the result under db_name
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

Vectorstore created with 123 documents


In [6]:
# Underlying Chroma collection (private attribute of the LangChain wrapper)
collection = vectorstore._collection
count = collection.count()

# A single stored vector is enough to read off the embedding dimensionality
sample = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 123 vectors with 1,536 dimensions in the vector store


In [7]:
# Pull every stored vector together with its text and metadata
result = collection.get(include=["embeddings", "documents", "metadatas"])
vectors = np.array(result["embeddings"])
# NOTE: this rebinds `documents` from the loaded document objects to the stored chunk texts;
# re-run the loading cell before anything that needs the original objects again.
documents = result["documents"]
metadatas = result["metadatas"]
doc_types = [metadata["doc_type"] for metadata in metadatas]

# Marker colour per document type. Types come from folder names on disk, so an
# unexpected folder gets a neutral colour instead of aborting the cell.
color_by_doc_type = {"products": "blue", "employees": "green", "contracts": "red", "company": "orange"}
colors = [color_by_doc_type.get(t, "gray") for t in doc_types]

In [8]:
# Project the embeddings down to 2D with t-SNE (seeded via random_state)
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# One marker per chunk, coloured by document type, with a text preview on hover
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles go directly on the layout: `scene` only configures 3D traces,
# so titles placed there never show up on this 2D scatter.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

In [9]:
# Chat model used by the chains in this notebook
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Retriever view over the vector store used during RAG; k is the number of chunks fetched per query
retriever = vectorstore.as_retriever(search_kwargs=dict(k=25))

In [10]:
# Contextualize question
# Step 1: rewrite the latest user message into a standalone question using the chat history
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])
history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)

# Answer question
# Step 2: answer from the retrieved documents, which are substituted for {context}
qa_system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "\n\n{context}"
)
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# Full pipeline: history-aware retrieval feeding the answering chain
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

# Keep ONE history object for the whole notebook session. The factory passed below is
# called on every invocation, so constructing a new ChatMessageHistory inside it would
# hand each question an empty history and follow-up questions would lose their context.
# The history is therefore shared by every caller of this chain; re-running this cell
# starts a fresh conversation.
chat_history_store = ChatMessageHistory()

conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    lambda: chat_history_store,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [11]:
# Two-turn smoke test: the second question refers back to the person named in the first
first_question = "Who is Avery Lancaster?"
second_question = "What did Avery do before?"

ai_msg_1 = conversational_rag_chain.invoke(dict(input=first_question))
print(ai_msg_1["answer"])

ai_msg_2 = conversational_rag_chain.invoke(dict(input=second_question))
print(ai_msg_2["answer"])

Avery Lancaster is the Co-Founder and Chief Executive Officer (CEO) of Insurellm, an insurance tech company she co-founded in 2015. Born on March 15, 1985, she is based in San Francisco, California. Under her leadership, Insurellm has become a leading provider in the insurance technology sector. Avery has a background in product management and analytics, having previously worked as a Senior Product Manager at Innovate Insurance Solutions and as a Business Analyst at Edge Analytics. She is known for her innovative leadership strategies, commitment to diversity and inclusion, and community engagement efforts, particularly in financial literacy programs.
Before co-founding Insurellm in 2015, Avery Lancaster worked as a Senior Product Manager at Innovate Insurance Solutions from 2013 to 2015, where she developed groundbreaking insurance products aimed at the tech sector. Prior to that, she was a Business Analyst at Edge Analytics from 2010 to 2013, focusing on market trends and consumer pr

In [12]:
# Wrapping it all into a function
def chat(question, history):
    """Stream the RAG chain's answer to ``question`` as a growing string.

    Args:
        question: The user's latest message, sent to the chain as its ``"input"``.
        history: Accepted to match the chat-callback signature; not used here.

    Yields:
        The answer text accumulated so far, once for every chunk the chain
        streams (chunks without an ``"answer"`` part repeat the current text).
    """
    partial_answer = ""
    for piece in conversational_rag_chain.stream({"input": question}):
        answer_delta = piece.get("answer")
        if answer_delta:
            partial_answer += answer_delta
        yield partial_answer

# Build the chat UI around `chat` and start the local Gradio server
chat_ui = gr.ChatInterface(fn=chat, type="messages")
chat_ui.launch()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


