In [2]:
import pandas as pd

# Specify the Excel file name (update this to your file name).
# This is a relative path, so it is resolved against the notebook's working directory.
excel_file = 'dataset.xlsx'

# Read the Excel file into a DataFrame; later cells refer to it as `df`.
df = pd.read_excel(excel_file)

In [3]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Dataset_Number              23 non-null     int64  
 1   Responsible_For             23 non-null     object 
 2   Supporting                  21 non-null     object 
 3   Designing                   23 non-null     object 
 4   Gold_Responsible_For        23 non-null     object 
 5   Gold_Supporting             20 non-null     object 
 6   Gold_Designing              22 non-null     object 
 7   LLM_Output_Responsible_For  0 non-null      float64
 8   LLM_Output_Supporting       0 non-null      float64
 9   LLM_Output_Designing        0 non-null      float64
dtypes: float64(3), int64(1), object(6)
memory usage: 1.9+ KB


# Clean the data using an LLM

In [None]:
import os
import time
import pandas as pd
from pydantic import BaseModel, Field
from langchain_openai import AzureChatOpenAI
from langchain.schema import HumanMessage

# Define a model for the cleaned ISO processes output.
# This class is the schema handed to llm.with_structured_output(...) below, so
# LLM responses are returned as instances of it.
class CleanedISOProcesses(BaseModel):
    # The only field: every cleaned process name in one comma separated string.
    cleaned_processes: str = Field(
        description="A comma separated list of cleaned ISO processes"
    )

# Initialize the LangChain LLM using Azure OpenAI parameters.
# The API key and endpoint are read from the AZURE_API_KEY / AZURE_ENDPOINT
# environment variables; os.environ.get returns None when a variable is unset.

llm = AzureChatOpenAI(
    model="gpt-4o-mini",
    openai_api_key=os.environ.get("AZURE_API_KEY"),
    azure_endpoint=os.environ.get("AZURE_ENDPOINT"),
    api_version="2024-02-15-preview",
    # NOTE(review): a non-zero temperature presumably means repeated runs can
    # return different cleaned text for the same input - confirm this is intended.
    temperature=0.5
)

# Create a version of the LLM that returns structured output using our CleanedISOProcesses model.
structured_llm = llm.with_structured_output(CleanedISOProcesses)

In [9]:
# Define the valid ISO processes in our system.
iso_processes_reference = """
acquisition process
supply process
Life cycle model management process
Infrastructure management process
Portfolio management process
Human resource management process
Quality management process
Knowledge management process
Project planning process
Project assessment and control process
Decision management process
Risk management process
Configuration management process
Information management process
Measurement process
Quality assurance process
Business or mission analysis process
Validation process
Stakeholder needs and requirements definition process
System requirements definition process
System architecture definition process
Design definition process
System analysis process
Implementation process
Integration process
Verification process
Transition process
Operation process
Maintenance process
Disposal process
"""

# 'df' must already exist at this point (e.g. loaded with pd.read_excel).
# Add one output column per gold column, initialised to None; the cleaning
# loop further down fills them in row by row.
for _cleaned_column in (
    "cleaned_Gold_Responsible_For",
    "cleaned_Gold_Supporting",
    "cleaned_Gold_Designing",
):
    df[_cleaned_column] = None

# Define a helper function to clean a given text input.
def clean_iso_processes(original_text: str):
    """Ask the LLM to map free-text input onto the valid ISO process names.

    Args:
        original_text: Raw user text from one cell. Anything that is not a
            string (``None``, a NaN float from an empty cell, a number, ...)
            is treated the same as empty text.

    Returns:
        The ``cleaned_processes`` string of the structured LLM response, or
        ``None`` when there is no usable text to clean.

    Raises:
        Any exception raised by ``structured_llm.invoke`` propagates to the
        caller; nothing is caught here.
    """
    # Missing cells arrive from pandas as float NaN, which is truthy and has no
    # .strip(); check the type first so such values return None instead of
    # raising AttributeError.
    if not isinstance(original_text, str) or original_text.strip() == "":
        return None

    prompt = f"""
The following text is user input for an ISO process: "{original_text}".
The valid ISO processes in our system are:
{iso_processes_reference}

Note: Users may input abbreviations or misspellings. For example:
- "snrd" may be used to represent "Stakeholder needs and requirements definition process".
- "ver" may represent "Verification process".
- "imp" may represent "Implementation process".
- "srd" may represent "System requirements definition process".
Users may also use other unusual abbreviations.

Please extract and clean the input to output a comma separated list of ISO processes that match the valid ones. Correct any spelling mistakes, abbreviations, or inconsistencies.
"""
    # Invoke the LLM with the prompt and return the cleaned processes.
    result = structured_llm.invoke(prompt)
    return result.cleaned_processes

# Iterate through each row and clean the relevant columns.
# The three gold columns are processed identically, so they share one inner
# loop; each result goes to the matching "cleaned_<column>" column.
gold_columns = ["Gold_Responsible_For", "Gold_Supporting", "Gold_Designing"]

for idx, row in df.iterrows():
    for column in gold_columns:
        cleaned_column = f"cleaned_{column}"
        original_text = row[column]

        # Only non-blank strings are sent to the LLM. The type check also
        # covers NaN/None (empty cells) and stray non-string values, which
        # would otherwise crash on .strip() outside the try block.
        if isinstance(original_text, str) and original_text.strip() != "":
            try:
                cleaned = clean_iso_processes(original_text)
                df.at[idx, cleaned_column] = cleaned
            except Exception as e:
                # Best effort: report the failure and carry on with the next
                # cell; the output cell keeps its previous value.
                print(f"Error processing row {idx} for {column}: {e}")
        else:
            df.at[idx, cleaned_column] = None

    # Optional: sleep briefly to avoid rate limits.
    time.sleep(2)

# Save the updated DataFrame to an Excel file.
output_excel_file = 'cleaned_output.xlsx'
df.to_excel(output_excel_file, index=False)
print(f"Results saved to {output_excel_file}")

Results saved to cleaned_output.xlsx


In [6]:
# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Dataset_Number,Responsible_For,Supporting,Designing,Gold_Responsible_For,Gold_Supporting,Gold_Designing,LLM_Output_Responsible_For,LLM_Output_Supporting,LLM_Output_Designing,cleaned_Gold_Responsible_For,cleaned_Gold_Supporting,cleaned_Gold_Designing
0,1,1. Formal compliance verification for the Volo...,1. Assisting the Avionics Engineering Team in ...,1. Certification strategy design: Defining how...,"verification process, Project planning process...","risk management process, life cycle model mana...","System architecture definition process, Stakeh...",,,,"verification process, project planning process...","risk management process, life cycle model mana...","System architecture definition process, Stakeh..."
1,2,1. Führung und Weiterentwicklung des siebenköp...,1. Zusammenarbeit mit anderen Abteilungen (z. ...,1. Design von Testprozessen: Entwicklung von s...,"Human resource management process, Verificatio...","Life cycle model management process, Human res...",Verification process; Validation process; Qual...,,,,"Human resource management process, Verificatio...","Life cycle model management process, Human res...","Verification process, Validation process, Qual..."
2,3,1. Performing testing and validation of ADAS h...,1. Benchmarking activities: Assisting in compe...,1. Defining use cases and test scenarios in co...,"Validation process,\nQuality assurance process...","Knowledge management process, Information mana...",Verification process; Validation process Verif...,,,,"Validation process,Quality assurance process,I...","Knowledge management process, Information mana...","Verification process, Validation process, Veri..."
3,4,\n1. Erstellung von Testplänen und Testprozedu...,1. Zusammenarbeit mit internen Teams (z. B. En...,1. Design von Testprozessen: Entwicklung stand...,"1. Verification, Validation\n2 Verification, V...","1. no mapping\n2 Verification, Validation (may...","1 Verification, Validation (maybe Integration)...",,,,"Verification process, Validation process, Supp...","Verification process, Validation process, Inte...","Verification process, Validation process, Inte..."
4,5,1. Fachliche und disziplinarische Führung des ...,1. Zusammenarbeit mit Entwicklungsabteilungen ...,1. Entwicklung von Methoden zur Produktqualitä...,Information management process; Project planni...,nformation management process\nRisk management...,Quality assurance process\nVerification & Vali...,,,,"Information management process, Project planni...","Information management process,Risk management...","Quality assurance process, Information managem..."
5,6,1. Preparation of technical repercussion sheet...,1. Interface management with stakeholders (e.g...,1. Implementation of change management process...,Life cycle model management process; Configura...,Stakeholder needs and requirements definition ...,Implementation process Configuration managemen...,,,,"Life cycle model management process, Configura...",Stakeholder needs and requirements definition ...,"Implementation process, Configuration manageme..."
6,7,1. Technical and timely implementation of cont...,"1. Acting as the interface between clients, su...","1. Optimizing site management workflows (e.g.,...",. Integration process\n1. project planning pro...,. Traqnsition process\n1. Project assessment a...,1. Transition process\n1. operation process\n1...,,,,"Integration process, Project planning process,...","Project assessment and control process, Qualit...","Transition process, Operation process, Impleme..."
7,8,1. Lead programs to secure and optimize IoT-en...,1. Act as a bridge between technical teams (e....,1. Design resilience frameworks for renewable ...,1. design definition process\ndesgin definitio...,hart to map\n1. Risk management process 1. ha...,1. systems analysis process 1. systesm design ...,,,,"Design definition process, Operation process","Risk management process, Project planning process","system analysis process, system design process..."
8,9,"1. Requirement Gathering & Analysis: Elicit, d...",1. Stakeholder Coordination: Act as a liaison ...,1. Designing System Requirements: Develop stru...,1. system definition requirement process 1. 2....,. SNRD\n1. project planning process\n1. knowle...,1. system architecture definition process\n1. ...,,,,"system requirements definition process, knowle...","project planning process,knowledge management ...","System architecture definition process, Design..."
9,10,1. Develop the safety case for autonomous trai...,"1. Coordinate with external partners (e.g., th...",1. Innovate safety methodologies for obstacle ...,. system analysis process\n1. hard to map\n1. ...,1. SNRD\n1. validation process\n1. hard to map...,1. supply process integration process\n1. syst...,,,,"system analysis process, operation process, co...","Validation process,System analysis process","supply process, system architecture definition..."


In [7]:

# Save the updated DataFrame to an Excel file.
# NOTE(review): the cleaning cell already writes `df` to this same path, so
# this cell looks redundant unless `df` was changed in between - confirm.
output_excel_file = 'cleaned_output.xlsx'
df.to_excel(output_excel_file, index=False)
print(f"Results saved to {output_excel_file}")

Results saved to cleaned_output.xlsx


In [10]:
import pandas as pd

# Load the Excel file
file_path = "cleaned_output.xlsx"  # Update with the correct file path
df = pd.read_excel(file_path)

# Empty cells come back from Excel as NaN, and a plain astype(str) would put
# the literal text "nan" into the summary. Render missing values as an empty
# string instead; every other value is converted with str().
responsible_text = df["cleaned_Gold_Responsible_For"].map(
    lambda value: "" if pd.isna(value) else str(value)
)
supporting_text = df["cleaned_Gold_Supporting"].map(
    lambda value: "" if pd.isna(value) else str(value)
)
designing_text = df["cleaned_Gold_Designing"].map(
    lambda value: "" if pd.isna(value) else str(value)
)

# Create the new column by concatenating existing columns with formatting.
# Each role goes on its own line; the joins are written with explicit "+"
# rather than relying on adjacent string literals being merged.
df["Gold_Output"] = (
    "User is Responsible For ISO Processes: " + responsible_text + "\n"
    + "User is Supporting: " + supporting_text + "\n"
    + "User is Designing: " + designing_text
)

# Save the updated DataFrame back to an Excel file
output_file_path = "updated_cleaned_output.xlsx"
df.to_excel(output_file_path, index=False)

print(f"Updated file saved as {output_file_path}")


Updated file saved as updated_cleaned_output.xlsx
