In [156]:
import ollama
import faiss
import numpy as np
import json
import os
from bs4 import BeautifulSoup
from numpy.char import index

In [157]:
INDEX_FILE = "vector_store.index"
META_FILE = "metadata.json"
EMBED_MODEL="nomic-embed-text:latest"
KBA_FOLDER = "generated_kba"


In [158]:
def get_article_path(KBA_FOLDER):
    data = []
    for filename in os.listdir(KBA_FOLDER):

        if filename.endswith(".html"):
            file_path = os.path.join(KBA_FOLDER, filename)
            article_name = os.path.splitext(filename)[0]
            data.append([article_name, file_path])
    return data

In [159]:
def get_article_data(html_file_path):
    with open(html_file_path, "r", encoding="utf-8") as f:
        html_content = f.read()

    # Extract Clean Text from HTML
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove scripts/styles
    for tag in soup(["script", "style"]):
        tag.decompose()

    clean_text = soup.get_text(separator=" ", strip=True)
    return clean_text

In [160]:
def chunk_text(text, chunk_size=500, overlap=50):
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunks.append(text[i:i+chunk_size])
    return chunks

In [161]:
# Create Embedding
def generate_embedding_response(chunks):
    embedding_response = []

    for chunk in chunks:
        response = ollama.embeddings(
            model=EMBED_MODEL,
            prompt=chunk
        )
        embedding_response.append(response["embedding"])


    embedding = np.array(embedding_response).astype("float32")
    return embedding


In [162]:
# Store Metadata PER CHUNK
def store_metadata(article_name, html_file_path, chunks):
        
    if os.path.exists(META_FILE):
        with open(META_FILE, "r") as f:
            metadata = json.load(f)
    else:
        metadata = []

    for i, chunk in enumerate(chunks):
        metadata.append({
            "article_name": article_name,
            "source_file": html_file_path,
            "chunk_id": i,
            "chunk_text": chunk
        })

    with open(META_FILE, "w") as f:
        json.dump(metadata, f, indent=2)

    return metadata

In [163]:
def store_embeddings(embedding):
    # Validate embedding shape
    if len(embedding.shape) == 1:
        embedding = np.expand_dims(embedding, axis=0)

    # Normalize for cosine similarity
    faiss.normalize_L2(embedding)

    dimension = embedding.shape[1]

    # Create / Load FAISS Index

    if os.path.exists(INDEX_FILE):
        index = faiss.read_index(INDEX_FILE)

        # ðŸ”Ž Safety check for dimension mismatch
        if index.d != dimension:
            raise ValueError(
                f"Embedding dimension mismatch! "
                f"Index dimension = {index.d}, "
                f"New embedding dimension = {dimension}"
            )
    else:
        index = faiss.IndexFlatIP(dimension)

    # Add embedding
    index.add(embedding)

    # Save index
    faiss.write_index(index, INDEX_FILE)
    return index




In [164]:
def search_similar_kba(new_ticket):

    # Load FAISS index
    index = faiss.read_index(INDEX_FILE)
    if index.ntotal == 0:
        raise ValueError("FAISS index is empty!")
    
    dimension = index.d  # get dimension directly from index
    #print(f"FAISS index loaded with dimension: {dimension} and total vectors: {index.ntotal}")  # Debug: Check index details
    
    #Create embedding
    query_vector = generate_embedding_response([new_ticket])
    #print(f"Generated query embedding with shape: {query_vector.shape}") 
    
    # Validate dimension
    if query_vector.shape[1] != dimension:
        raise ValueError(
            f"Dimension mismatch! Query dim={query_vector.shape[1]}, Index dim={dimension}"
        )

    # Normalize query (REQUIRED for cosine similarity)
    faiss.normalize_L2(query_vector)

    # Search
    #distances, indices = index.search(query_vector, k=1)
    top_k = 1
    distances, indices = index.search(query_vector, k=top_k)  # k=1 for top match
   
    # score = float(distances[0][0])
    # match_index = int(indices[0][0])

    # Load metadata
    with open(META_FILE, "r") as f:
        metadata = json.load(f)

    if indices[0][0] >= len(metadata):
        raise ValueError("Metadata mismatch with FAISS index!")
    
    for i in range(top_k):
        matched_article = metadata[indices[0][i]]
        score = float(distances[0][i])
        # print(matched_article, score)
        print("\nMost Similar KBA Found:")
        print("Article Name:", matched_article["article_name"])
        print("Source File:", matched_article["source_file"])
        print("Cosine Similarity Score:", round(score, 4)*100, "%")
        print("Total vectors in index:", index.ntotal)

In [165]:
def main():
    article_paths = get_article_path(KBA_FOLDER)
    for article_path in article_paths:
        print(f"Processing article: {article_path[1]}")
        clean_data = get_article_data(article_path[1])
        #print("Extracted text length:", len(clean_data))
        chunks = chunk_text(clean_data)
        #print(f"Total chunks created: {len(chunks)}")
        embedding_response = generate_embedding_response(chunks)
        #print(f"Embedding shape: {embedding_response.shape}")  # Debug: Check embedding shape
        index = store_embeddings(embedding_response)
        #print(f"Index total vectors after adding: {index.ntotal}")  # Debug: Check index count
        metadata = store_metadata(article_path[0], article_path[1], chunks)
        print(" HTML article embedded and stored successfully.")
        #print("Total metadata records:", len(metadata))
    # CRITICAL CHECK
    assert index.ntotal == len(metadata), "Index and metadata count mismatch!"
    

if __name__ == "__main__":
    main()

Processing article: generated_kba/OPS-1052_Troubleshooting_Article.html
 HTML article embedded and stored successfully.
Processing article: generated_kba/OPS-1060_Troubleshooting_Article.html
 HTML article embedded and stored successfully.
Processing article: generated_kba/OPS-1031_Troubleshooting_Article.html
 HTML article embedded and stored successfully.
Processing article: generated_kba/OPS-1024_Troubleshooting_Article.html
 HTML article embedded and stored successfully.
Processing article: generated_kba/OPS-1045_Troubleshooting_Article.html
 HTML article embedded and stored successfully.


In [166]:
# Test Search with a new ticket
new_ticket = "Troubleshooting Article Troubleshooting Article \u2013 OPS-1031 OPS-1031 Unable to Access Staging Server Disk Full Error Issue Summary The staging environment was inaccessible due to a \"No space left on device\" error in the /var directory, resulting in failed CI deployments. Description A failure occurred in the CI/CD pipeline where attempts to deploy new versions of applications into the staging environment were unsuccessful. The issue stemmed from reaching"
search_similar_kba(new_ticket)
    


Most Similar KBA Found:
Article Name: OPS-1031_Troubleshooting_Article
Source File: generated_kba/OPS-1031_Troubleshooting_Article.html
Cosine Similarity Score: 97.75 %
Total vectors in index: 340


In [171]:
# ADVANCED SEARCH WITH ARTICLE AGGREGATION

from collections import defaultdict

def search_similar_kba_new(new_ticket):

    index = faiss.read_index(INDEX_FILE)

    if index.ntotal == 0:
        raise ValueError("FAISS index is empty!")

    dimension = index.d

    query_vector = generate_embedding_response([new_ticket])

    if query_vector.shape[1] != dimension:
        raise ValueError(
            f"Dimension mismatch! Query dim={query_vector.shape[1]}, Index dim={dimension}"
        )

    faiss.normalize_L2(query_vector)

    # Get top 5 chunk matches
    distances, indices = index.search(query_vector, k=5)

    with open(META_FILE, "r") as f:
        metadata = json.load(f)

    # GROUP BY ARTICLE
    article_scores = defaultdict(float)
    article_chunks = defaultdict(list)

    for i in range(len(indices[0])):

        chunk_index = int(indices[0][i])
        score = float(distances[0][i])

        if chunk_index >= len(metadata):
            continue

        article_name = metadata[chunk_index]["article_name"]

        # Add score to article group
        article_scores[article_name] += score

        # Track chunk details
        article_chunks[article_name].append({
            "chunk_index": chunk_index,
            "chunk_score": score
        })

    # Compare Article Scores
    sorted_articles = sorted(
        article_scores.items(),
        key=lambda x: x[1],
        reverse=True
    )

    print("\n Aggregated Article Ranking:\n")

    for article, total_score in sorted_articles:

        print(f"Article: {article}")
        print(f"Total Combined Score: {round(total_score, 4)}")
        print("Matching Chunks:")

        for chunk in article_chunks[article]:
            print(f"  - Chunk {chunk['chunk_index']} | Score: {round(chunk['chunk_score'], 4)}")


    # Best Article

    best_article = sorted_articles[0]

    print("\n FINAL BEST MATCH")
    print("Article:", best_article[0])
    print("Final Aggregated Score:", round(best_article[1], 4))

In [172]:
# Test Search with a new ticket
new_ticket = "Troubleshooting Article Troubleshooting Article \u2013 OPS-1031 OPS-1031 Unable to Access Staging Server Disk Full Error Issue Summary The staging environment was inaccessible due to a \"No space left on device\" error in the /var directory, resulting in failed CI deployments. Description A failure occurred in the CI/CD pipeline where attempts to deploy new versions of applications into the staging environment were unsuccessful. The issue stemmed from reaching"
search_similar_kba_new(new_ticket)


 Aggregated Article Ranking:

Article: OPS-1031_Troubleshooting_Article
Total Combined Score: 4.8875
Matching Chunks:
  - Chunk 195 | Score: 0.9775
  - Chunk 135 | Score: 0.9775
  - Chunk 95 | Score: 0.9775
  - Chunk 55 | Score: 0.9775
  - Chunk 15 | Score: 0.9775

 FINAL BEST MATCH
Article: OPS-1031_Troubleshooting_Article
Final Aggregated Score: 4.8875
