In [None]:
import os

import pandas as pd
from openai import AzureOpenAI

In [None]:
# Azure OpenAI client used for every chat completion request in this notebook.
# The API key and the endpoint are read from the environment so that they never end up
# in the notebook file or in version control. Before starting the kernel, set:
#   AZURE_OPENAI_API_KEY   - the resource key
#   AZURE_OPENAI_ENDPOINT  - something like "https://######.openai.azure.com/"
# NOTE(review): the key that used to be hard-coded in this cell should be treated as
# leaked and rotated.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],  # a KeyError here means the variable is not set
    api_version="2024-02-15-preview",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
)

In [None]:
# Load the survey export (keep the xlsx shared by Juha in the same folder as this .ipynb file)
# and sort it straight away, with missing values placed last. Sorting beforehand can be useful,
# e.g. to list the risk categories or comments with the highest likelihood first.
df = pd.read_excel("inclusData.xlsx").sort_values(
    by=['Item', 'Dimension', 'Comment', 'Answer'],
    na_position='last',
)

# Wide layout: one row per (Item, Participant ID), with the Answer and Comment of every
# Dimension as columns, so likelihood & impact sit on the same row when looping by risk.
df_wide = df.pivot(
    index=['Item', 'Participant ID'],
    columns='Dimension',
    values=['Answer', 'Comment'],
)

#%%
# This part turns the data into text for the prompt: for each dimension the numeric answer
# and the free-text comment are combined into one natural-language string.
# Remember '\t' for a tab and '\n' for a line break.

dimensions = df['Dimension'].unique()  # likelihood & impact in this case

for dimension in dimensions:
    # Text that introduces the comment: 'Likelyhood' comments are presented as explanations,
    # comments of any other dimension as mitigations.
    if dimension == 'Likelyhood':
        comment_label = '. Explanation: '
    else:
        comment_label = '. Mitigation: '

    # e.g. 'Likelyhood 4.0'
    rating_text = dimension + ' ' + df_wide[('Answer', dimension)].astype(str)

    # The comment column is deliberately not cast to str: add .astype(str) only if 'nan'
    # comments and their ratings should be kept in the prompt text.
    # (Likelyhood, 4.0, reasons XYZ) |--> 'Likelyhood 4.0. Explanation: reasons XYZ'
    df_wide[('Rating and comment', dimension)] = rating_text + comment_label + df_wide[('Comment', dimension)]

# Keep only the rows that have no missing value in any column.
df = df.dropna()

# Render every remaining row as one line of prompt data. Fields are picked by position:
# columns 4, 5, 6 and 7 are printed as risk type, dimension, answer and comment.
risk_data = ''.join(
    f'Risk type:  {row[4]}, Dimension: {row[5]}, Answer: {row[6]}, Comment: {row[7]} \n'
    for row in df.values
)

# Per-item summary table. For every dimension it holds the mean answer over all rows, the mean
# answer over rows that have a comment, and the number of such rows; the last two columns hold
# the mean of the row-wise likelihood x impact product (all rows / rows with both comments).
col_names = ['Item']
for dimension in dimensions:
    col_names.extend([
        f'Avg {dimension}, all',
        f'Avg {dimension}, given not nan comments',
        f'Number of comments, {dimension}',
    ])
col_names.extend(['likelihood x impact, all', 'likelihood x impact, given not nan comments'])

likelihood_answers = ('Answer', 'Likelyhood')
impact_answers = ('Answer', 'Impact')

risk_aggregates = []
for item, group in df_wide.groupby('Item'):
    summary_row = [item]

    # Same order as the column names: average, average given a comment, comment count.
    for dimension in dimensions:
        answer_col = ('Answer', dimension)
        commented = group.dropna(subset=[('Comment', dimension)])
        summary_row.extend([
            group[answer_col].mean(),
            commented[answer_col].mean(),
            commented.shape[0],
        ])

    # Multiply row by row first, then average the products.
    both_commented = group.dropna(subset=[('Comment', 'Likelyhood'), ('Comment', 'Impact')])
    product_all = group[likelihood_answers] * group[impact_answers]
    product_commented = both_commented[likelihood_answers] * both_commented[impact_answers]
    summary_row.extend([product_all.mean(), product_commented.mean()])

    risk_aggregates.append(summary_row)

df_aggregate = pd.DataFrame(risk_aggregates, columns=col_names)

# Product of the averages (as opposed to the average of the products computed above).
df_aggregate['avg likelihood x avg impact, all'] = (
    df_aggregate['Avg Impact, all'] * df_aggregate['Avg Likelyhood, all']
)
df_aggregate['avg likelihood x avg impact, given not nan comments'] = (
    df_aggregate['Avg Likelyhood, given not nan comments']
    * df_aggregate['Avg Impact, given not nan comments']
)
df_aggregate['Number of comments, combined'] = (
    df_aggregate['Number of comments, Likelyhood'] + df_aggregate['Number of comments, Impact']
)
df_aggregate

In [None]:
# Building blocks of the 'system' message. The data in 'risk_data' is injected into the same
# message as the instructions, although it could also be presented elsewhere.
prompt_engineering_majick = 'The assistant is an expert in risk analyst.'
task_context = 'The assistant will be asked questions about risk data (about the company Inclus) that is provided in this message and the assistant will try to answer using only the provided data. If assistant is not sure in the answer, it can return text "None". The assistant would be provided a list with indexes, risk type, dimension, answer for the dimension and comment. The number given in the answer tells about the impact of the dimension, 5 corresponds to the biggest impact and 1 corresponds to the smallest impact. It should answer on posed benchmark question. Do not add any additional text except for required output format. Assistant needs to take the impacts into account, when providing the analysis.'
company_context = 'Inclus is a Finnish scaleup company that provides a platform for doing collaborative risk analysis.'
data_context = 'The data that is provided below is gathered from multiple people within Inclus and concerns multiple risk events. Each risk event is described in a header above the individual assessments.'
expected_output = 'HERE IS AN INSTRUCTION ON THE EXPECTED OUTPUT: Provide only several paragraphs of high-quality text that correctly describe important features of the data, such as significant risks and high-quality comments.'

# Order matters: instructions first, the raw risk data last.
context_for_GPT = [
    prompt_engineering_majick,
    task_context,
    company_context,
    data_context,
    expected_output,
    risk_data,
]

# One line-break-separated string; it becomes the content of the
# {"role": "system", "content": system_content} entry of 'messages' later.
system_content = '\n'.join(context_for_GPT)


# The question sent as the 'user' message.
benchmark_question = 'Write an executive summary of the results of the risk analysis. Do not add any conclusion and comments or provide any additional data. The output should be solely based on the provided data. Use mainly the comments that are written by experts and that have a strong argument. For any conclusion made, provide all comments used in assesments as the source and write the sources underneath the summary. The amount of csources used should be at least 45.'

In [None]:
# Conversation sent to the model: the system context (instructions + data) followed by
# the first strict benchmark question as the user turn.
chat_messages = [
    {"role": "system", "content": system_content},
    {"role": "user", "content": benchmark_question},
]

response = client.chat.completions.create(model="gpt-4-turbo", messages=chat_messages)

# Print the complete response as JSON first, then only the text of the first choice.
print(response.model_dump_json(indent=2))
print(response.choices[0].message.content)