In [1]:
# LIBRARIES #
import os
import io
import requests
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader
from langchain_experimental.text_splitter import SemanticChunker
import numpy as np
import faiss

In [2]:
# CREATING THE MODEL #
# Load variables from a local .env file (if any) into the process environment
# so the API key can be read from there instead of being written in the notebook.
load_dotenv()

# Chat model shared by every later cell that queries the LLM.
llm = ChatOpenAI(api_key=os.environ.get("OPENAI_API_KEY"), model="o3-mini")

In [37]:
# CSV FILES #
# Locations of the two input tables (absolute paths on the author's machine).
onedrive_link1 = r"C:/Users/lgian/OneDrive - James Cook University/Data/Sample_Spare_Parts2.csv"
onedrive_link2 = r"C:/Users/lgian/OneDrive - James Cook University/Data/Downtime_History_Data.csv"

# Spare-parts table, read with pandas' default encoding.
non_generic_parts = pd.read_csv(filepath_or_buffer=onedrive_link1)

# Downtime history table; this file is decoded as cp1252.
downtime_history = pd.read_csv(filepath_or_buffer=onedrive_link2, encoding="cp1252")

In [38]:
# RETRIEVAL #
# Convert each part to a string: "<CAT Part>: <SAP Material Description>"
parts_list = non_generic_parts.apply(
    lambda row: f"{row['CAT Part']}: {row['SAP Material Description']}", axis = 1).tolist()

# Convert each downtime record to one colon-separated string of its fields
failure_list = downtime_history.apply(
    lambda row: f"{row['Order']}: {row['Notification']}: {row['Order Type']}: {row['Order Long Text Description']}: {row['Notification Long Text Description']}: {row['Sort Field']}: {row['Total Costs']}: {row['Total Work Hours ']}", axis = 1).tolist()

# Splitter used to break long descriptions into smaller chunks
text_splitter = SemanticChunker(OpenAIEmbeddings())


def _chunk_texts(texts, label):
    """Split every string in `texts` into chunks and index the chunks by key.

    Parameters
    ----------
    texts : list of str
        One string per record (e.g. `parts_list` or `failure_list`).
    label : str
        Prefix used for keys and sources ("part" or "failure").

    Returns
    -------
    (dict, list)
        kv_store maps "<label>_<i>_chunk_<j>" to
        {"text": <chunk>, "source": "<label>_<i>"}, where i is the position of
        the record in `texts` and j the position of the chunk within it.
        chunks holds the chunk strings in the same order as the keys were
        inserted, so the two can be zipped together later.
    """
    kv_store = {}
    chunks = []
    for i, text in enumerate(texts):
        for j, chunk in enumerate(text_splitter.split_text(text)):
            kv_store[f"{label}_{i}_chunk_{j}"] = {
                "text": chunk,
                "source": f"{label}_{i}"
            }
            chunks.append(chunk)
    return kv_store, chunks


# Chunk the prepared row strings. Enumerating the DataFrames themselves would
# walk over their column labels, not over the rows, so the lists built above
# are what must be split here.
parts_kv_store, parts_chunks = _chunk_texts(parts_list, "part")
failure_kv_store, failure_chunks = _chunk_texts(failure_list, "failure")


In [39]:
# CLUSTERING #
embedding_model = OpenAIEmbeddings()
k = 5  # Number of clusters


def _embed_float32(texts):
    """Embed `texts` with `embedding_model` and return the vectors as a float32 array."""
    return np.asarray(embedding_model.embed_documents(texts), dtype="float32")


def _train_kmeans(vectors):
    """Train a FAISS k-means model (k clusters, 20 iterations, verbose) on `vectors`."""
    model = faiss.Kmeans(d=vectors.shape[1], k=k, niter=20, verbose=True)
    model.train(vectors)
    return model


# Embed the chunks of both data sets; the second axis is the embedding size.
embedding_parts_vectors = _embed_float32(parts_chunks)
parts_dimension = embedding_parts_vectors.shape[1]

embedding_failure_vectors = _embed_float32(failure_chunks)
failure_dimension = embedding_failure_vectors.shape[1]

# One k-means model per data set
kmeans_parts = _train_kmeans(embedding_parts_vectors)
kmeans_failure = _train_kmeans(embedding_failure_vectors)

# The index exposed by each trained model is searched below to find, for a
# vector, its nearest centroid.
parts_index = kmeans_parts.index
failure_index = kmeans_failure.index

# Nearest centroid (top-1) for every vector: the returned ids are the cluster
# label of each vector, one row per vector.
parts_distances, parts_cluster_ids = parts_index.search(embedding_parts_vectors, 1)
failure_distances, failure_cluster_ids = failure_index.search(embedding_failure_vectors, 1)

# Collapse the (n, 1) id arrays to 1-D
parts_cluster_ids = parts_cluster_ids.ravel()
failure_cluster_ids = failure_cluster_ids.ravel()


In [40]:
## VECTORSTORES ##
# One metadata dict per chunk: its cluster ID, its key in the key-value store
# (for exact retrieval) and the record it came from. Cluster ids and store
# entries are paired positionally, relying on the store's insertion order.
parts_metadatas = [
    {"cluster": int(cluster_id), "key": chunk_key, "source": entry["source"]}
    for cluster_id, (chunk_key, entry) in zip(parts_cluster_ids, parts_kv_store.items())
]

failure_metadatas = [
    {"cluster": int(cluster_id), "key": chunk_key, "source": entry["source"]}
    for cluster_id, (chunk_key, entry) in zip(failure_cluster_ids, failure_kv_store.items())
]

# Build the FAISS stores, attaching the metadata (with cluster IDs) to each chunk
parts_vectorstore = FAISS.from_texts(parts_chunks, embedding_model, metadatas=parts_metadatas)

failure_vectorstore = FAISS.from_texts(failure_chunks, embedding_model, metadatas=failure_metadatas)

In [27]:
# READING PDFs #
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`, concatenated.

    Pages are joined in order with no separator; a page whose extraction
    yields nothing (None or an empty string) contributes an empty string.
    """
    page_texts = (page.extract_text() or "" for page in PdfReader(pdf_path).pages)
    return "".join(page_texts)

# Short name -> file path for every PDF that is indexed
pdf_files = {
    "KXE_brochure": r"C:/Users/lgian/OneDrive - James Cook University/Data/998KXE_Brochure.pdf",
    "XE_brochure": r"C:/Users/lgian/OneDrive - James Cook University/Data/998XE_Brochure.pdf",
    "XE_specs": r"C:/Users/lgian/OneDrive - James Cook University/Data/998XE_Specifications.pdf",
}

# Full text of each PDF, keyed by the same short name
pdf_texts = {}
for name, path in pdf_files.items():
    pdf_texts[name] = extract_text_from_pdf(path)

# Splitter used to cut each PDF text into chunks
splitter = SemanticChunker(OpenAIEmbeddings())

# Key-value store for exact retrieval later:
#   key   = "<pdf name>_chunk_<j>"
#   value = chunk text + the name of the PDF it came from
# `chunks` keeps the chunk texts in the same order as the keys are inserted.
pdf_kv_store = {}
chunks = []
for name, text in pdf_texts.items():
    for j, chunk in enumerate(splitter.split_text(text)):
        pdf_kv_store[f"{name}_chunk_{j}"] = {"text": chunk, "source": name}
        chunks.append(chunk)

# Embed the chunks (float32 array) and cluster them with FAISS k-means
embedding_pdf_vectors = np.asarray(embedding_model.embed_documents(chunks), dtype="float32")
pdf_dimension = embedding_pdf_vectors.shape[1]
kmeans_pdf = faiss.Kmeans(d=pdf_dimension, k=k, niter=20, verbose=True)
kmeans_pdf.train(embedding_pdf_vectors)

# Nearest centroid (top-1) for every chunk vector, collapsed to a 1-D id array
pdf_index = kmeans_pdf.index
pdf_distances, pdf_cluster_ids = pdf_index.search(embedding_pdf_vectors, 1)
pdf_cluster_ids = pdf_cluster_ids.ravel()

# Per-chunk metadata: cluster ID, key-value-store key and source PDF name
metadatas = [
    {"cluster": int(cluster_id), "key": chunk_key, "source": entry["source"]}
    for cluster_id, (chunk_key, entry) in zip(pdf_cluster_ids, pdf_kv_store.items())
]

pdf_vectorstore = FAISS.from_texts(chunks, embedding_model, metadatas=metadatas)

In [41]:
# PROMPTING #
# Template text for the failure -> part mapping request. Placeholders:
#   {parts_context}         candidate part numbers and descriptions
#   {KXE_brochure_context}  \
#   {XE_brochure_context}    > extra reference context
#   {XE_specs_context}      /
#   {failure_context}       the failure description to map
# NOTE(review): the template asks for ONLY the PartNumber and, in its last
# line, also for the thinking process; confirm which of the two is wanted.
_mapping_template_text = """
You are a reliability engineer. 
You have the following list of part numbers and descriptions:

{parts_context}

Task: For the given failure description, return the most relevant PartNumber.
If none of the part numbers seem to fit, return "Unknown".

Use this extra context to make informed decisions: 
{KXE_brochure_context}, {XE_brochure_context}, {XE_specs_context}

Failure: {failure_context}
Answer with ONLY the PartNumber or "Unknown".

Please output your thinking process as well as your final answer.
"""

prompt_template = ChatPromptTemplate.from_template(_mapping_template_text)

In [42]:
# RETRIEVER #
# Top-3 similarity retrievers over the PDF chunks and over the spare-part chunks.
retriever_pdf = pdf_vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
parts_retriever = parts_vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# The query is named explicitly. It used to be the bare name `text`, which is
# never assigned in this cell: it held whatever value the most recently run
# loop in some other cell had left behind, so the retrieved context depended
# on cell execution order. The first failure description is used instead.
# NOTE: the contexts below are therefore computed once, from a single query.
# If every failure needs its own context, call the two retrievers with that
# failure's text inside the per-failure loop.
context_query = failure_list[0] if failure_list else None

if context_query is None:
    # Nothing to map: skip retrieval and leave both contexts empty.
    relevant_pdf_chunks = []
    relevant_parts_chunks = []
else:
    relevant_pdf_chunks = retriever_pdf.get_relevant_documents(context_query)
    relevant_parts_chunks = parts_retriever.get_relevant_documents(context_query)

# One "<chunk key>: <chunk text>" line per retrieved document
pdf_context = "\n".join([f"{doc.metadata['key']}: {doc.page_content}" for doc in relevant_pdf_chunks])
parts_context = "\n".join([f"{doc.metadata['key']}: {doc.page_content}" for doc in relevant_parts_chunks])

In [43]:
# FAILURE MAPPING #
# For every failure description: retrieve the most similar part and PDF chunks
# for THAT failure, fill the prompt, ask the LLM, and record its answer.
failure_to_part = []
spare_parts = []
pdf = []


def _format_docs(docs):
    """Render documents as one "<chunk key>: <chunk text>" line each."""
    return "\n".join(f"{doc.metadata['key']}: {doc.page_content}" for doc in docs)


for text in failure_list:
    # Retrieval happens inside the loop so the context matches the current
    # failure; a context computed once before the loop would be identical for
    # every failure.
    failure_parts_docs = parts_retriever.get_relevant_documents(text)
    failure_pdf_docs = retriever_pdf.get_relevant_documents(text)

    # Route each retrieved PDF chunk to the placeholder of the PDF it came
    # from (its "source" metadata) so the same text is not sent three times.
    # A PDF with no chunk among the retrieved ones gets an empty context.
    pdf_context_by_source = {
        source: _format_docs([doc for doc in failure_pdf_docs if doc.metadata["source"] == source])
        for source in ("KXE_brochure", "XE_brochure", "XE_specs")
    }

    # Prepare prompt
    prompt = prompt_template.format(
        parts_context=_format_docs(failure_parts_docs),
        failure_context=text,
        KXE_brochure_context=pdf_context_by_source["KXE_brochure"],
        XE_brochure_context=pdf_context_by_source["XE_brochure"],
        XE_specs_context=pdf_context_by_source["XE_specs"],
    )

    # Get LLM response
    response = llm.predict(prompt)

    failure_to_part.append({
        "Failure": text,
        "Mapped Part": response
    })


In [44]:
# One row per failure, with columns "Failure" and "Mapped Part"
similarity_search = pd.DataFrame(data=failure_to_part)

# Write the table to an Excel workbook, leaving out the DataFrame index column
similarity_search.to_excel(
    r"C:/Users/lgian/OneDrive - James Cook University/Data/Key_Value_Results.xlsx",
    index=False,
)
