# Inclus AI – Demo Outlier Detection

In [1]:
import os

import numpy as np
import pandas as pd
from openai import AzureOpenAI

In [2]:
# Create the Azure OpenAI client used for the chat-completion request below.
# The key and endpoint are read from the environment so that real secrets never
# have to be typed into (and saved with) the notebook. The original placeholder
# strings are kept as fallbacks, so nothing changes when the variables are unset.
client = AzureOpenAI(
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "API KEY"),
    api_version="2024-02-15-preview",
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "END POINT"),
)

In [5]:
# Load the raw export; the path is resolved relative to the working directory.
df = pd.read_excel("inclusData.xlsx")

# Narrow the frame to the three columns used to build the prompt text.
df2 = df.loc[:, ['Item', 'Comment', 'Dimension']]

# Keep only the rows that actually have a comment.
df_subset = df2[df2['Comment'].notna()]

In [6]:
# Render each remaining row as one "ITEM: ..., Comment: ..., Dimension: ..." line.
formatted_strings = [
    f"ITEM: {row['Item']}, Comment: {row['Comment']}, Dimension: {row['Dimension']}"
    for _, row in df_subset.iterrows()
]

# Stack the lines into a single newline-separated block of text.
long_string = '\n'.join(formatted_strings)

In [3]:
# The question sent in the 'user' message. It is assembled from adjacent string
# literals purely for readability; the resulting text is one single line.
benchmark_questions_strict = (
    'Give impact or likelyhood for each comment (based on the Dimension) '
    'in scale 0-5 (number can have decimals), '
    'print the comment and add the assesment after it, '
    'print the risk the comment is related'
)

print(benchmark_questions_strict)

Give impact or likelyhood for each comment (based on the Dimension) in scale 0-5 (number can have decimals), print the comment and add the assesment after it, print the risk the comment is related


In [8]:
# Build the text for the 'system' message. It consists of the role instruction,
# background on the company and the data, scoring examples, the required output
# format and, as its final part, the comment data held in 'long_string' (the data
# is injected here, but it could be presented elsewhere in the conversation).
prompt_engineering_majick = 'The assistant is a risk analyst.'
company_context = 'Inclus is a Finnish scaleup company that provides a platform for doing collaborative risk analysis.'
data_context = 'The data that is provided below is gathered from multiple people within Inclus and concerns multiple risk events. Each risk event is described in a header above the individual assessments. Treat each comment equally.'
examples = 'HERES EXAMPLE OF COMMENTS AND VALUATIONS: Impact 0: "This feature has no significant impact on user experience." "The change will have negligible impact on system performance."Impact 1:"Minor improvements may have a slight impact on customer satisfaction.""The update is expected to have a minimal impact on overall sales."Impact 2:"This decision could have a moderate impact on project timelines.""Changes in marketing strategy may have a moderate impact on brand visibility."Impact 3:"The proposed changes will have a noticeable impact on production efficiency.""Increased competition may have a significant impact on market share."Impact 4:"The new feature is expected to have a substantial impact on user engagement.""Market trends suggest a high impact on pricing strategies."Impact 5:"The security breach had a severe impact on customer trust.""Natural disasters can have a catastrophic impact on supply chain operations."'
formatting = 'Give all the results in format: ***risk type***comment***valuation (only the number), example: ***Outsourcing risks***We havent always been very good at selecting our partners***3.0 '

# The instruction parts, in prompt order, without the data itself.
instruction_parts = [prompt_engineering_majick, company_context, data_context, examples, formatting]

# Full prompt parts: the instructions followed by the data.
context_for_GPT = instruction_parts + [long_string]

# Echo the instructions only; the data is left out of the printout.
print('\n'.join(instruction_parts))

# Used later as the content of the {"role": "system"} entry in 'messages'.
system_content = '\n'.join(context_for_GPT)

The assistant is a risk analyst.
Inclus is a Finnish scaleup company that provides a platform for doing collaborative risk analysis.
The data that is provided below is gathered from multiple people within Inclus and concerns multiple risk events. Each risk event is described in a header above the individual assessments. Treat each comment equally.
HERES EXAMPLE OF COMMENTS AND VALUATIONS: Impact 0: "This feature has no significant impact on user experience." "The change will have negligible impact on system performance."Impact 1:"Minor improvements may have a slight impact on customer satisfaction.""The update is expected to have a minimal impact on overall sales."Impact 2:"This decision could have a moderate impact on project timelines.""Changes in marketing strategy may have a moderate impact on brand visibility."Impact 3:"The proposed changes will have a noticeable impact on production efficiency.""Increased competition may have a significant impact on market share."Impact 4:"The ne

In [6]:
# Conversation sent to the model: the system prompt (instructions plus data)
# followed by the strict benchmark question as the user turn.
chat_messages = [
    {"role": "system", "content": system_content},
    {"role": "user", "content": benchmark_questions_strict},
]

response = client.chat.completions.create(model="gpt-4-turbo", messages=chat_messages)

In [None]:
def _parse_risk_entry(entry):
    """Parse one ``***risk***comment***valuation`` entry from the model reply.

    Args:
        entry: One chunk of the reply text.

    Returns:
        A ``(risk, description, impact)`` tuple with ``impact`` as a float, or
        ``None`` when the chunk has fewer than three ``***``-separated fields or
        its valuation field cannot be converted to a number.
    """
    parts = entry.split("***")
    # Text before the first '***' lands in parts[0]; the three fields follow it.
    if len(parts) < 4:
        return None
    try:
        impact = float(parts[3].strip())
    except ValueError:
        return None
    return parts[1].strip(), parts[2].strip(), impact


# Text returned by the model. 'content' may be None, so fall back to an empty
# string instead of failing on .strip().
risk_data = response.choices[0].message.content or ""

# Split the reply into individual entries (separated by blank lines) and ignore
# chunks that are empty.
risk_entries = [chunk for chunk in risk_data.strip().split("\n\n") if chunk.strip()]

# The last entry is only unreliable when the reply was cut off by the token
# limit. Dropping it unconditionally would silently lose a valid assessment.
if risk_entries and response.choices[0].finish_reason == "length":
    risk_entries = risk_entries[:-1]

# Parse every entry; chunks that do not follow the requested format (for example
# an introduction or closing remark added by the model) are skipped, not fatal.
parsed_entries = []
skipped_entries = []
for entry in risk_entries:
    parsed = _parse_risk_entry(entry)
    if parsed is None:
        skipped_entries.append(entry)
    else:
        parsed_entries.append(parsed)

if skipped_entries:
    print(f"Skipped {len(skipped_entries)} entries that did not match the ***risk***comment***valuation format")

# Component lists, kept under their original names.
risks = [risk for risk, _, _ in parsed_entries]
descriptions = [description for _, description, _ in parsed_entries]
impacts = [impact for _, _, impact in parsed_entries]

# Create a DataFrame from the extracted components
risk_df = pd.DataFrame({
    "Risk": risks,
    "Description": descriptions,
    "Impact": impacts
})

# Display the DataFrame
risk_df

In [None]:
# Align the key column name with the source data before joining.
risk_df = risk_df.rename(columns={"Description": "Comment"})

# Join the model's assessments to the original rows on the 'Comment' text, keep
# only rows that received a model valuation, then select and relabel the
# columns used for the comparison.
merged_df = (
    pd.merge(risk_df, df, on="Comment", how="outer")
    .dropna(subset=['Impact'])
    .loc[:, ['Item', 'Dimension', 'Answer', 'Comment', 'Impact']]
    .rename(columns={"Answer": "Actual Answer", "Impact": "GPT Answer"})
)

# Show the comparison table
print(merged_df)


In [None]:
# Minimum absolute gap between the model's score and the recorded answer for a
# row to be reported as an outlier.
OUTLIER_THRESHOLD = 1

# Make both score columns float. Values that cannot be read as numbers become
# NaN instead of raising, so one stray text answer does not abort the analysis.
for score_column in ('GPT Answer', 'Actual Answer'):
    merged_df[score_column] = pd.to_numeric(merged_df[score_column], errors='coerce').astype(float)

# Rows whose gap is NaN (a missing or non-numeric score) compare as False and
# are therefore never reported as outliers.
score_gap = (merged_df['GPT Answer'] - merged_df['Actual Answer']).abs()
outliers = merged_df[score_gap >= OUTLIER_THRESHOLD]