In [1]:
# LIBRARIES #
import os
import io
import requests
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import tqdm

In [2]:
# CREATING THE MODEL #
load_dotenv()
llm = ChatOpenAI(
    model="o3-mini",
    api_key=os.getenv("OPENAI_API_KEY")
)

In [3]:
# READING PDFs #
# Dictionary of PDF names and paths
pdf_paths = {
    "KXE_brochure": r"C:/Users/lgian/OneDrive - James Cook University/Data/998KXE_Brochure.pdf",
    "XE_brochure": r"C:/Users/lgian/OneDrive - James Cook University/Data/998XE_Brochure.pdf",
    "XE_specs": r"C:/Users/lgian/OneDrive - James Cook University/Data/998XE_Specifications.pdf",
}

# Extract text from PDFs
def extract_text(pdf_path):
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text() or ""
    return text

pdf_texts = {label: extract_text(path) for label, path in pdf_paths.items()}

# Split texts into chunks with labels
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts, labels = [], []

for label, text in pdf_texts.items():
    chunks = splitter.split_text(text)
    texts.extend(chunks)
    labels.extend([label] * len(chunks))


# Create embeddings + store in FAISS
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
X = [embeddings_model.embed_query(t) for t in tqdm.tqdm(texts, desc="Embedding PDF chunks")]
X = np.array(X)
y = np.array(labels)

clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)

# Build FAISS vector stores for each PDF
vectorstores = {}
for label, text in pdf_texts.items():
    chunks = splitter.split_text(text)
    vectorstores[label] = FAISS.from_texts(chunks, embeddings_model)

Embedding PDF chunks: 100%|██████████| 135/135 [01:12<00:00,  1.87it/s]


In [4]:
# CSV FILES #
onedrive_link1 = r"C:/Users/lgian/OneDrive - James Cook University/Data/Cleaned_Non_Generic_998KXE_Machine_Parts.csv"
non_generic_parts = pd.read_csv(onedrive_link1)

onedrive_link2 = r"C:/Users/lgian/OneDrive - James Cook University/Data/Order_and_Notification_Data.csv"
downtime_history = pd.read_csv(onedrive_link2, encoding = "cp1252")

In [5]:
# RETRIEVAL #
# Convert each part to a string
parts_list = non_generic_parts.apply(
    lambda row: f"{row['CAT Part']}: {row['SAP Material Description']}", axis = 1).tolist()

failure_list = downtime_history.apply(
    lambda row: f"{row['Order']}: {row['Notification']}: {row['Order Type']}: {row['Order Long Text Description']}: {row['Notification Long Text Description']}: {row['Sort field']}: {row['Total Costs']}: {row['Total Work Hours ']}", axis = 1).tolist()


In [6]:
# PROMPTING #
prompt_template = ChatPromptTemplate.from_template(
"""
You are a reliability engineer. 
You have the following list of part numbers and descriptions:

{parts_context}

Task: For the given failure description, return the most relevant PartNumber.
If none of the part numbers seem to fit, return "Unknown".

Use this extra context to make informed decisions: 
{KXE_brochure_context}, {XE_brochure_context}, {XE_specs_context}

Failure: {failure_context}
Answer with ONLY the PartNumber or "Unknown".

Please output your thinking process as well as your final answer.
"""
)

In [None]:
# Process each failure
failure_to_part = []

for failure in failure_list:
    failure_emb = embeddings_model.embed_query(failure)
    predicted_label = clf.predict([failure_emb])[0]

    # Retrieve top 4 relevant chunks from predicted PDF
    retriever = vectorstores[predicted_label].as_retriever(search_type="similarity", search_kwargs={"k": 3})
    relevant_docs = retriever.get_relevant_documents(failure)
    context_snippets = "\n".join([doc.page_content for doc in relevant_docs])

    # Feed into LLM prompt
    long_response = prompt_template.format(
        parts_context=parts_list,
        failure_context=failure,
        KXE_brochure_context=context_snippets if "KXE_brochure" in predicted_label else "",
        XE_brochure_context=context_snippets if "XE_brochure" in predicted_label else "",
        XE_specs_context=context_snippets if "XE_specs" in predicted_label else "",
    )

    response = llm.predict(long_response)

    failure_to_part.append({
        "FailureDescription": failure,
        "Response": response
    })

# Save results
failure_to_part_df = pd.DataFrame(failure_to_part)
failure_to_part_df.to_csv("classified_failures_with_predictions.csv", index=False)
print("✅ Classification complete! Saved to 'classified_failures_with_predictions.csv'")

  relevant_docs = retriever.get_relevant_documents(failure)
  response = llm.predict(long_response)


Mapping saved!
