In [1]:
# LIBRARIES #
import os
import io
import requests
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader

In [4]:
# CREATING THE MODEL #
load_dotenv()
llm = ChatOpenAI(
    model="o3-mini",
    api_key=os.getenv("OPENAI_API_KEY")
)


In [5]:
# CSV FILES #
onedrive_link1 = r"C:/Users/lgian/OneDrive - James Cook University/Data/Sample_Spare_Parts.csv"
non_generic_parts = pd.read_csv(onedrive_link1)

onedrive_link2 = r"C:/Users/lgian/OneDrive - James Cook University/Data/Sample_Failures.csv"
downtime_history = pd.read_csv(onedrive_link2, encoding = "cp1252")

In [8]:
# RETRIEVAL #
# Convert each part to a string
parts_list = non_generic_parts.apply(
    lambda row: f"{row['CAT Part']}: {row['SAP Material Description']}", axis = 1).tolist()

failure_list = downtime_history.apply(
    lambda row: f"{row['Order ']}: {row['Notification']}: {row['Order Type']}: {row['Order Long Text Description']}: {row['Notification Long Text Description']}: {row['Sort Field']}: {row['Total Costs']}: {row['Total Work Hours ']}", axis = 1).tolist()

# Split long descriptions into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
parts_chunks = []
for text in parts_list:
    parts_chunks.extend(text_splitter.split_text(text))

failure_chunks = []
for text in failure_list:
    failure_chunks.extend(text_splitter.split_text(text))

In [9]:
# EMBEDDINGS #
embeddings = OpenAIEmbeddings()
parts_vectorstore = FAISS.from_texts(parts_chunks, embeddings)
failure_vectorstore = FAISS.from_texts(failure_chunks, embeddings)

In [None]:
# READING PDFs #
def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text() or ""  # Avoid None
    return text


# Dictionary of PDF names and paths
pdf_files = {
    "KXE_brochure": r"C:/Users/lgian/OneDrive - James Cook University/Data/998KXE_Brochure.pdf",
    "XE_brochure": r"C:/Users/lgian/OneDrive - James Cook University/Data/998XE_Brochure.pdf",
    "XE_specs": r"C:/Users/lgian/OneDrive - James Cook University/Data/998XE_Specifications.pdf",
}

# Extract text for each file
pdf_texts = {name: extract_text_from_pdf(path) for name, path in pdf_files.items()}

# Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

chunks = []
for name, text in pdf_texts.items():
    file_chunks = splitter.split_text(text)
    # Tag chunks with filename if you like
    chunks.extend([f"{name}: {chunk}" for chunk in file_chunks])

# Create embeddings + store in FAISS
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # or large
vectorstore = FAISS.from_texts(chunks, embeddings)

# Make retriever
retriever_pdf = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [13]:
# PROMPTING #
prompt_template = ChatPromptTemplate.from_template(
"""
You are a reliability engineer. 
You have the following list of part numbers and descriptions:

{parts_context}

Task: For the given failure description, return the most relevant PartNumber.
If none of the part numbers seem to fit, return "Unknown".

Use this extra context to make informed decisions: 
{KXE_brochure_context}, {XE_brochure_context}, {XE_specs_context}

Failure: {failure_context}
Answer with ONLY the PartNumber or "Unknown".

Please output your thinking process as well as your final answer.
"""
)

In [14]:
# RETRIEVER #
# Create a retriever from your vectorstore
parts_retriever = parts_vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [15]:
# FAILURE MAPPING #
failure_to_part = []

for text in failure_list:
    # Get top matching parts
    top_parts = parts_retriever.get_relevant_documents(text)
    parts_context = "\n".join([doc.page_content for doc in top_parts])
    
    # Prepare prompt
    prompt = prompt_template.format(
        parts_context=parts_context, 
        failure_context=text, 
        KXE_brochure_context="\n".join([doc.page_content for doc in retriever_pdf.get_relevant_documents("998KXE brochure")]),
        XE_brochure_context="\n".join([doc.page_content for doc in retriever_pdf.get_relevant_documents("998XE brochure")]),
        XE_specs_context="\n".join([doc.page_content for doc in retriever_pdf.get_relevant_documents("998XE specs")])
    )
    
    # Get LLM response
    response = llm.predict(prompt)
    
    failure_to_part.append({
        "Failure": text,
        "Mapped Part": response
    })

# Convert to DataFrame
mapping_df = pd.DataFrame(failure_to_part)

  top_parts = parts_retriever.get_relevant_documents(text)
  response = llm.predict(prompt)


In [16]:
mapping_df.to_excel(r"C:/Users/lgian/OneDrive - James Cook University/Data/Sample_Failure_Mappingggg.xlsx", index=False)
print("Mapping saved!")

Mapping saved!
