This is an example of how to process log files in a simple RAG (retrieval-augmented generation) system.

In [0]:
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from pathlib import Path
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Path to your logs directory
input_dir = Path("failures_ds")

# Step 1: Load all .log files recursively.
# Paths are sorted so documents (and the chunks derived from them) are
# produced in the same order on every run; rglob() does not guarantee one.
documents = []
for log_path in sorted(input_dir.rglob("*.log")):
    # errors="replace" keeps a stray non-UTF-8 byte in one log from aborting
    # the whole load with UnicodeDecodeError.
    content = log_path.read_text(encoding="utf-8", errors="replace").strip()
    if content:  # skip empty / whitespace-only logs
        documents.append(Document(
            page_content=content,
            metadata={"source": str(log_path.relative_to(input_dir))}  # path relative to input_dir
        ))

print(f"Loaded {len(documents)} log documents.")

# Step 2: Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 3: Split the logs into chunks and create the Chroma vectorstore.
# CharacterTextSplitter splits on "\n\n" by default; log files rarely contain
# blank lines, so with the default separator each file would stay a single
# oversized chunk. Splitting on single newlines lets chunk_size and
# chunk_overlap actually take effect.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

# NOTE(review): from_documents presumably adds to whatever is already stored
# under db_path, so re-running this cell against an existing directory may
# duplicate chunks -- confirm, and clear the directory first if needed.
db_path = "chroma_failures_ds"
vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding_model, persist_directory=db_path)
vectorstore.persist()
print(f"✅ Vectorstore created with {vectorstore._collection.count()} documents at {db_path}")

Display the stored embeddings in 2D to understand what was saved in Chroma.

In [0]:
# Step 1: Load the Chroma DB
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np

# Re-open the persisted Chroma collection from disk.
persist_path = "chroma_failures_ds"
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory=persist_path, embedding_function=embedding_model)

# Fetch the stored vectors together with their text and metadata
# (embeddings are only returned when explicitly requested via `include`).
result = vectorstore.get(include=['embeddings', 'metadatas', 'documents'])
all_docs = result['documents']
all_metas = result['metadatas']
all_embeddings = result['embeddings']

# Convert to a numpy array and report its shape.
X = np.array(all_embeddings)
print("Shape of X:", X.shape)

# t-SNE needs at least two samples: perplexity must be > 0 and < n_samples.
# Fail early with a clear message instead of an opaque scikit-learn
# parameter error when the collection is empty or holds a single vector.
n_samples = X.shape[0] if X.ndim == 2 else 0
if n_samples < 2:
    raise ValueError(
        f"Need at least 2 stored embeddings for t-SNE, found {n_samples} "
        f"in '{persist_path}'."
    )

# Cap perplexity below the number of samples; fixed seed for a reproducible layout.
X_2d = TSNE(n_components=2, perplexity=min(30, n_samples - 1), random_state=42).fit_transform(X)

# Prepare Plotly data. Entries without metadata or text are tolerated so a
# single incomplete record does not break the whole plot.
sources = [(meta or {}).get('source', 'unknown') for meta in all_metas]
texts = [(doc or '')[:200] for doc in all_docs]  # short preview shown on hover
df_data = {
    "x": X_2d[:, 0],
    "y": X_2d[:, 1],
    "source": sources,
    "preview": texts,
}

# Plot: one point per stored chunk, coloured by originating log file.
fig = px.scatter(df_data, x="x", y="y", color="source", hover_data=["preview"])
fig.update_layout(title="2D Visualization of Chroma Embeddings", width=1000, height=700)
fig.show()