In [1]:
import ollama
import faiss
import numpy as np
import json
import os
from bs4 import BeautifulSoup

In [2]:

INDEX_FILE = "vector_store.index"
META_FILE = "metadata.json"
EMBED_MODEL="nomic-embed-text:latest"


In [3]:

# Load HTML File
html_file_path = "./generated_kba/OPS-1024_Troubleshooting_Article.html"
article_name = "OPS-1024_Troubleshooting_Article"

with open(html_file_path, "r", encoding="utf-8") as f:
    html_content = f.read()

# Extract Clean Text from HTML
soup = BeautifulSoup(html_content, "html.parser")

# Remove scripts/styles
for tag in soup(["script", "style"]):
    tag.decompose()

clean_text = soup.get_text(separator=" ", strip=True)

print("Extracted text length:", len(clean_text))


Extracted text length: 4341


In [4]:
# Create Embedding

def chunk_text(text, chunk_size=500, overlap=50):
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunks.append(text[i:i+chunk_size])
    return chunks

chunks = chunk_text(clean_text)

embedding_response = []

for chunk in chunks:
    response = ollama.embeddings(
        model=EMBED_MODEL,
        prompt=chunk
    )
    embedding_response.append(response["embedding"])


embedding = np.array(embedding_response).astype("float32")


In [5]:

# Validate embedding shape
if len(embedding.shape) == 1:
    embedding = np.expand_dims(embedding, axis=0)

# Normalize for cosine similarity

faiss.normalize_L2(embedding)

dimension = embedding.shape[1]

# Create / Load FAISS Index

if os.path.exists(INDEX_FILE):
    index = faiss.read_index(INDEX_FILE)

    # Safety check for dimension mismatch
    if index.d != dimension:
        raise ValueError(
            f"Embedding dimension mismatch! "
            f"Index dimension = {index.d}, "
            f"New embedding dimension = {dimension}"
        )
else:
    index = faiss.IndexFlatIP(dimension)

# Add embedding
index.add(embedding)

# Save index
faiss.write_index(index, INDEX_FILE)

# Store Metadata PER CHUNK

if os.path.exists(META_FILE):
    with open(META_FILE, "r") as f:
        metadata = json.load(f)
else:
    metadata = []

for i, chunk in enumerate(chunks):
    metadata.append({
        "article_name": article_name,
        "source_file": html_file_path,
        "chunk_id": i,
        "chunk_text": chunk
    })

with open(META_FILE, "w") as f:
    json.dump(metadata, f, indent=2)

print("HTML article embedded and stored successfully.")
print("Total vectors in index:", index.ntotal)
print("Total metadata records:", len(metadata))

# CRITICAL CHECK
assert index.ntotal == len(metadata), "Index and metadata count mismatch!"

HTML article embedded and stored successfully.
Total vectors in index: 170
Total metadata records: 170


In [9]:

# Load FAISS index FIRST
index = faiss.read_index(INDEX_FILE)

if index.ntotal == 0:
    raise ValueError("FAISS index is empty!")

dimension = index.d  # get dimension directly from index

# Create embedding
new_ticket = "Service disruption triggered by full root filesystem it went down entirely due to insufficient disk space on its root filesystem. This resulted in slow SSH connections"

embedding_response = ollama.embeddings(
    model=EMBED_MODEL,
    prompt=new_ticket
)

query_vector = np.array(
    [embedding_response["embedding"]],
    dtype="float32"
)
# print("Query embedding shape:", query_vector)

# Validate dimension
if query_vector.shape[1] != dimension:
    raise ValueError(
        f"Dimension mismatch! Query dim={query_vector.shape[1]}, Index dim={dimension}"
    )

# Normalize query (REQUIRED for cosine similarity)
faiss.normalize_L2(query_vector)

# Search
distances, indices = index.search(query_vector, k=1)

score = float(distances[0][0])
match_index = int(indices[0][0])

# Load metadata
with open(META_FILE, "r") as f:
    metadata = json.load(f)

if match_index >= len(metadata):
    raise ValueError("Metadata mismatch with FAISS index!")

matched_article = metadata[match_index]
print("\nMost Similar KBA Found:")
print("Article Name:", matched_article["article_name"])
print("Source File:", matched_article["source_file"])
print("Cosine Similarity Score:", round(score, 4)*100, "%")
print("Total vectors in index:", index.ntotal)


Most Similar KBA Found:
Article Name: OPS-1024_Troubleshooting_Article
Source File: generated_kba/OPS-1024_Troubleshooting_Article.html
Cosine Similarity Score: 82.39 %
Total vectors in index: 170
