In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
import pandas as pd

In [2]:
# imports for langchain
from langchain_community.document_loaders import DirectoryLoader, UnstructuredWordDocumentLoader,UnstructuredExcelLoader, TextLoader, PyPDFLoader
from langchain_community.chat_models import ChatOllama
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import numpy as np
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [3]:
load_dotenv(override=True)
api_key = os.getenv("OPENROUTER_API_KEY")

In [4]:
# price is a factor for our company, so we're going to use a low cost model

MODEL = "mistralai/mistral-small-3.2-24b-instruct:free"
db_name = "ken_vector_db"

In [5]:

# --- Configuration ---
# Set the path to the folder containing your files
folder = "my_knowledge_base/"

# --- Install Required Libraries ---
# You'll need to install the loaders for each file type.
# For this example, you would run:
# pip install pypdf unstructured python-docx

# --- Create Loaders for Each File Type ---
# 1. Loader for Markdown (.md) and Text (.txt) files
print("Loading Markdown and text files...")


txt_loader = DirectoryLoader(
    folder,
    glob="**/*.txt",
    loader_cls=TextLoader
)

md_loader = DirectoryLoader(
    folder,
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"}
)

# 2. Loader for PDF files
print("Loading PDF files...")
pdf_loader = DirectoryLoader(
    folder,
    glob="**/*.pdf",
    loader_cls=PyPDFLoader
)

# 3. Loader for Word documents (.docx)
print("Loading Word documents...")
docx_loader = DirectoryLoader(
    folder,
    glob="**/*.docx",
    loader_cls=UnstructuredWordDocumentLoader
)

#4. Loader for Word Excel documents (.xlsx)

# print("Loading Excel documents...")
# xlsx_loader = DirectoryLoader(
#     folder,
#     glob="**/*.xlsx",
#     loader_cls=UnstructuredExcelLoader
# )

print("Loading Excel documents...")
xlsx_files = glob.glob(os.path.join(folder, "**/*.xlsx"), recursive=True)
pandas_docs = []

for xlsx_file in xlsx_files:
    print(f"Processing: {os.path.basename(xlsx_file)}")
    df = pd.read_excel(xlsx_file)
    
    # Method 1: Create structured text for each row
    for index, row in df.iterrows():
        # Create a nicely formatted text representation of each row
        content = ""
        for col in df.columns:
            if pd.notna(row[col]):  # Only include non-null values
                content += f"{col}: {row[col]}\n"
        
        # Create a document for each row
        doc = Document(
            page_content=content.strip(),
            metadata={
                "source": xlsx_file,
                "row_index": index
            }
        )
        pandas_docs.append(doc)

# --- Load Documents from All Sources ---
# Initialize a list to hold all documents
all_documents = []

# Load documents from each loader and add them to the list
all_documents.extend(txt_loader.load())
all_documents.extend(md_loader.load())
all_documents.extend(pdf_loader.load())
all_documents.extend(docx_loader.load())
#all_documents.extend(xlsx_loader.load())
all_documents.extend(pandas_docs) 

print(f"Finished loading. Found a total of {len(all_documents)} documents.")

# Now, 'all_documents' contains all the content from your files,
# ready to be processed for your knowledge worker.

Loading Markdown and text files...
Loading PDF files...
Loading Word documents...
Loading Excel documents...
Processing: Hotel_Inventory_Tracking.xlsx
Finished loading. Found a total of 701 documents.


In [6]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(all_documents)

Created a chunk of size 2924, which is longer than the specified 1000


In [7]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Delete if already exists

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Vectorstore created with 747 documents


In [8]:
# Get one vector and find how many dimensions it has

collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 384 dimensions


In [15]:
from langchain_core.callbacks import StdOutCallbackHandler # For debugging purposes
#For online inference
llm = ChatOpenAI(
    model=MODEL,    # or any model OpenRouter supports
    api_key=api_key,        # note: it's 'api_key' not 'openai_api_key'
    base_url="https://openrouter.ai/api/v1",   # note: it's 'base_url' not 'openai_api_base'
    max_tokens=1000   # instead of default 100000
)

# # For local inference (No need to spend money)
# llm = ChatOllama(model="llama3.2:1b")


# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()#search_kwargs={"k": 25}

# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)#,
                                                          #callbacks=[StdOutCallbackHandler()])

query = "Tell me about yam portioning"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
#print("\nAnswer:", answer)

In [16]:
# Wrapping in a function - note that history isn't used, as the memory is in the conversation_chain

def chat(message, history):
    result = conversation_chain.invoke({"question": message})
    return result["answer"]

In [17]:
# And in Gradio:
view = gr.ChatInterface(chat, type="messages")
view.launch(share=True)

* Running on local URL:  http://127.0.0.1:7862

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


