# Part 2: Choose your adventure
In this part you can explore different ways of improving the overall performance of the identification of decision reasons:

- a) better model --- in this section you can test models other than LLAMA and compare the performance between them
- b) improve prompt --- in this section you can edit our prompt, or even write your own from scratch
- c) other reasons --- in this section you can test your own decision reasons



# Environment Setup

## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive (the working directory set later lives below this path)
from google.colab import drive
drive.mount("/content/drive")

MessageError: Error: credential propagation was unsuccessful

## Import the packages and set the working directory

In [None]:
import sys
import os
import re
import textwrap
from IPython.display import display, HTML
import pandas as pd
from huggingface_hub import InferenceClient

# the code below installs huggingface hub if it's missing
if 'google.colab' in sys.modules:  # If in Google Colab environment

    # Installing requisite packages
    !pip install huggingface_hub &> /dev/null

# this sets the working directory to the exercises folder
base_path = '/content/drive/My Drive/llms_egproc/exercises/'
os.chdir(base_path)

## Import the functions we used in exercise 1 from the llm_funs.py file

In [None]:
# Add the exercises directory to the module search path so that llm_funs.py can be imported
sys.path.append(base_path)

# custom functions from ex1
from llm_funs import generate_prompt, extract_confidence, wrap_text, show_verbal_reports_in_range

## Set the access token

In [None]:
# Prefer a token supplied through the HF_TOKEN environment variable so that
# personal credentials never have to be typed into the notebook; the literal
# below is only the fallback used when that variable is not set.
# NOTE(review): a token stored in plain text in a shared notebook should be
# treated as leaked -- rotate it and remove the fallback when possible.
API_TOKEN = os.environ.get('HF_TOKEN', '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')

## Read in data

In [None]:
# read in decision problems, decision reasons, and verbal reports
decision_problems = pd.read_csv('data/decision_problems.csv', encoding = 'utf-8')
decision_reasons = pd.read_csv('data/decision_reasons.csv', encoding = 'utf-8')
verbal_reports = pd.read_csv('data/verbal_reports.csv', encoding = 'utf-8')

# merge verbal reports with decision problems (one row per verbal report,
# carrying the columns of the matching decision problem)
problems_reports = pd.merge(decision_problems, verbal_reports, on = 'problem_id')

# the prompt
prompt_path = 'prompts/prompt_v1.txt'

# Open the file and read its contents. The encoding is stated explicitly, as
# for the CSV files above, so that non-ASCII characters in the prompt do not
# depend on the platform's default encoding.
with open(prompt_path, 'r', encoding = 'utf-8') as file:
    prompt_base = file.read()

## Load the functions from ex1

In [None]:
# function for constructing the full prompt
def generate_prompt(prompt, decision_problem, decision_reason, verbal_report):
    """
    Fill the placeholders of a prompt template.

    Args:
        prompt: Template text containing the placeholders DECISION_PROBLEM,
            DECISION_REASON and VERBAL_REPORT (each may occur any number of times).
        decision_problem: Text inserted for DECISION_PROBLEM.
        decision_reason: Text inserted for DECISION_REASON.
        verbal_report: Text inserted for VERBAL_REPORT.

    Returns:
        The template with every placeholder replaced by its value.
    """
    replacements = {
        "DECISION_PROBLEM": decision_problem,
        "DECISION_REASON": decision_reason,
        "VERBAL_REPORT": verbal_report,
    }

    # Substitute in a single pass over the template. Chaining str.replace calls
    # would re-scan already inserted text, so a decision problem that happens to
    # contain e.g. "VERBAL_REPORT" would be overwritten by a later step.
    # A function is used as replacement so the inserted text is taken literally
    # (no backslash/group-reference interpretation).
    return re.sub(
        "DECISION_PROBLEM|DECISION_REASON|VERBAL_REPORT",
        lambda match: replacements[match.group(0)],
        prompt,
    )

# Function for extracting confidence assessments
def extract_confidence(s):
    """
    Return the first integer in `s` that is wrapped in @ symbols.

    Accepts one or more @ on each side and optional whitespace around the
    digits (e.g. "@85@", "@@ 85 @@"). Returns None when no such pattern exists.
    """
    # one or more @, the digits (optionally padded by whitespace), one or more @
    found = re.search(r'@+(\s*\d+\s*)@+', s)
    if not found:
        return None

    # the captured group may still carry the padding whitespace
    digits = found.group(1).strip()
    return int(digits)

# Function to wrap text
def wrap_text(text, width=100):
    """Wrap `text` to the given width and join the resulting lines with HTML <br> tags."""
    wrapped_lines = textwrap.wrap(text, width)
    line_break = "<br>"
    return line_break.join(wrapped_lines)

# display data frames in HTML
def disp_tab(dd):
    """Show the data frame `dd` as an HTML table in the notebook output."""
    # escape=False keeps markup inside the cells (such as <br> line breaks) as HTML
    table_markup = dd.to_html(escape=False)
    rendered = HTML(table_markup)
    return display(rendered)

# Function to show verbal reports with assigned numbers in a specified range
def show_verbal_reports_in_range(data, reason, min_confidence, max_confidence):
    """
    Shows verbal reports for which the model assigned a confidence within the specified range.

    Args:
        data: Data frame with the columns 'decision_problem', 'verbal_report',
            'choice' and the confidence column named by `reason`.
        reason: Name of the column holding the confidence values.
        min_confidence: Lower bound of the range (inclusive).
        max_confidence: Upper bound of the range (inclusive).

    Returns:
        The result of disp_tab, which displays the filtered rows as an HTML table.
    """
    # rows whose confidence lies inside the inclusive range
    in_range = (data[reason] >= min_confidence) & (data[reason] <= max_confidence)

    # Take an explicit copy of the selected rows and columns: assigning into a
    # filtered slice triggers pandas' SettingWithCopyWarning.
    columns = ['decision_problem', 'verbal_report', 'choice', reason]
    filtered_data = data.loc[in_range, columns].copy()

    # wrap the text for nicer display
    filtered_data['verbal_report'] = filtered_data['verbal_report'].apply(wrap_text)
    filtered_data['decision_problem'] = filtered_data['decision_problem'].apply(lambda x: wrap_text(x, width=40))

    return disp_tab(filtered_data)

# Adventure A: Better model

## Select a decision reason for comparison of models

In [None]:
# Set up the prompts for a decision reason of your choice
# here are the available reasons
disp_tab(decision_reasons)

### Generate prompts with the selected reason

In [None]:
selected_reason = 'maximum outcome' # change to your reason of choice

# look up the description stored for the selected reason name
reason_rows = decision_reasons['decision reason name'] == selected_reason
selected_description = decision_reasons.loc[reason_rows, 'decision reason description'].values[0]

# Build one filled prompt per verbal report: the base prompt combined with the
# report's decision problem, the selected reason description, and the report itself
filled_prompts = [
    generate_prompt(
        prompt_base,
        row['decision_problem'],
        selected_description,  # Use the selected description
        row['verbal_report']
    )
    for _, row in problems_reports.iterrows()
]

In [None]:
# inspect the first filled prompt to check that the placeholders were replaced
print(filled_prompts[0])

## Run the analysis with the model from exercise 1

In [None]:
LLM1_version = "meta-llama/Meta-Llama-3-70B-Instruct"
LLM1 = InferenceClient(model = LLM1_version, token = API_TOKEN)

# full text responses of model 1, in the order of filled_prompts
LLM1_results = []

# name of the column that receives the extracted confidence values
llm1_res_col = 'llm1_confidence_res'

# create the column empty; it is filled row by row in the loop below
problems_reports[llm1_res_col] = None

# Query the model once per prompt, keep the full response, and store the
# confidence value extracted from it in the row with the same index
for i, prompt in enumerate(filled_prompts):

    # response from LLAMA
    LLM1_response = LLM1.text_generation(prompt, max_new_tokens = 4000)
    LLM1_results.append(LLM1_response)

    # extract_confidence yields None when the response has no @number@ pattern
    problems_reports.at[i, llm1_res_col] = extract_confidence(LLM1_response)

    # monitor progress (current index / last index)
    print(f"{i}/{problems_reports.shape[0]-1}")

## Run the analysis with another model
We propose to use the recently upgraded version of the model used in ex1 and above

In [None]:
LLM2_version = "meta-llama/Meta-Llama-3.1-70B-Instruct"  # most recent updated LLAMA family
LLM2 = InferenceClient(model = LLM2_version, token = API_TOKEN)

# full text responses of model 2, in the order of filled_prompts
LLM2_results = []

# name of the column that receives the extracted confidence values
llm2_res_col = 'llm2_confidence_res'

# create the column empty; it is filled row by row in the loop below
problems_reports[llm2_res_col] = None

# Query the model once per prompt, keep the full response, and store the
# confidence value extracted from it in the row with the same index
for i, prompt in enumerate(filled_prompts):

    # response from LLAMA
    LLM2_response = LLM2.text_generation(prompt, max_new_tokens = 4000)
    LLM2_results.append(LLM2_response)

    # extract_confidence yields None when the response has no @number@ pattern
    problems_reports.at[i, llm2_res_col] = extract_confidence(LLM2_response)

    # monitor progress (current index / last index)
    print(f"{i}/{problems_reports.shape[0]-1}")

## Compare the results from both models
First, high confidence reports from model 1

In [None]:
# Show verbal reports for which the reason was assessed to be used with high confidence, i.e., between 80 and 100
show_verbal_reports_in_range(problems_reports, 'llm1_confidence_res', 80, 100) # or use 0, 100 to display the entire data frame

Second, high confidence reports from model 2

In [None]:
# Show verbal reports for which the reason was assessed to be used with high confidence, i.e., between 80 and 100
# (same range as for model 1, so that the two tables are directly comparable)
show_verbal_reports_in_range(problems_reports, 'llm2_confidence_res', 80, 100) # or use 0, 100 to display the entire data frame

You can also compare the full text of the model 'thinking' directly, for corresponding verbal reports:

(change the index number in both code snippets to compare responses corresponding to the same verbal report)

In [None]:
print(LLM1_results[1]) # full response of model 1 for the verbal report at index 1

In [None]:
print(LLM2_results[1]) # full response of model 2 for the verbal report at index 1

## Other ways of comparing the models
Think of other ways of comparing the model performance and explore them. Feel free to ask us questions on how to set up what you have in mind.

# Adventure B: improve prompt

- or make it worse (e.g., remove the chain-of-thought instructions)
- see how the model reacts to changes in the prompts


In [None]:
# We should use the same model for both prompts to compare the differences a prompt can make:
LLM_version = "meta-llama/Meta-Llama-3-70B-Instruct"
LLM = InferenceClient(model = LLM_version, token = API_TOKEN)

## Also let's fix the decision reason

In [None]:
# Set up the prompts for a decision reason of your choice
# here are the available reasons
disp_tab(decision_reasons)

Unnamed: 0,decision reason name,decision reason description
0,minimum outcome,The reason considers the minimum outcome of each lottery. The reason prefers the lottery with the more favorable minimum outcome.
1,maximum outcome,The reason considers the maximum outcome of each lottery. The reason prefers the lottery with the more favorable maximum outcome.
2,most likely outcome,"The reason considers the most probable outcomes, that is the outcomes with the highest probability of each lottery. If there is more than one outcome with the highest probability, the more favorable one is considered. The reason prefers the lottery with the more favorable most probable outcome."
3,least likely outcome,"The reason considers the least probable outcomes, that is the outcomes with the lowest probability of each lottery. If there is more than one outcome with the lowest probability, the more favorable one is considered. The reason prefers the lottery with the more favorable least probable outcome."
4,zero outcome probability,The reason considers the probability of a zero outcome of each lottery. The reason prefers the lottery with the probability of the zero outcome that is more favorable in the context of the possible outcomes.
5,sure outcome,"The reason considers the presence of a sure outcome, that is an outcome with 100% probability, of each lottery. The reason prefers the lottery with or without the sure outcome, depending on whether the sure outcome is a favorable outcome in the context of all possible outcomes."
6,expected value,The reason considers the expected value of each lottery. The reason prefers the lottery with the higher expected value.
7,loss aversion,"The reason considers the outcomes of each lottery. Losses are considered more important than gains. The reason prefers the lottery with the more favorable loss or, if losses are identical, the lower loss probability."
8,regret,The reason considers the outcomes of each lottery. The sum of all pairwise differences of outcomes between the lotteries is considered important. The reason prefers the lottery with the more favorable sum of outcome differences.


In [None]:
# set the decision reason
selected_reason = 'maximum outcome' # change to your reason of choice

# look up the description stored for the selected reason name
reason_rows = decision_reasons['decision reason name'] == selected_reason
selected_description = decision_reasons.loc[reason_rows, 'decision reason description'].values[0]

## First get model responses for our prompt


In [None]:
# Build one filled prompt per verbal report from our base prompt (prompt 1):
# the report's decision problem, the selected reason description, and the report itself
filled_prompts_p1 = [
    generate_prompt(
        prompt_base,
        row['decision_problem'],
        selected_description,  # Use the selected description
        row['verbal_report']
    )
    for _, row in problems_reports.iterrows()
]

## Second, run the model with our prompt
While you wait for the results, you can start working on your own prompt in the next code snippet.

In [None]:
# list for storing the full model responses obtained with prompt 1
LLM_P1_results = []

# column name for storage of the confidence values from prompt 1
llm_P1_res_col = 'llm_P1_confidence_res'

# new column in the problems_reports data set for storing the confidence assessments
problems_reports[llm_P1_res_col] = None

# run the analysis
# Iterate over the list of prompts, get responses, and extract numerical estimates and add them to the data set with problems and reports
for i, prompt in enumerate(filled_prompts_p1):

    # response from LLAMA -- use the client set up for this adventure (LLM),
    # so the cell does not depend on Adventure A having been run
    LLM_P1_response = LLM.text_generation(prompt, max_new_tokens = 4000)
    LLM_P1_results.append(LLM_P1_response) # keep the full response text

    # extract the confidence value from THIS response (not from a response
    # variable left over from an earlier cell)
    confidence_assesment = extract_confidence(LLM_P1_response)

    # confidence value into the data
    problems_reports.at[i, llm_P1_res_col] = confidence_assesment

    # monitor progress
    print(str(i) + '/' + str(problems_reports.shape[0]-1))

0/15
1/15


## Prepare your own prompt
Keep in mind that the function `generate_prompt` requires that the base prompt contains the placeholders for the decision problem, decision reason, and verbal report in the following forms: DECISION_PROBLEM, DECISION_REASON, VERBAL_REPORT.

The rest can be changed as you please.

In [None]:
# Editable prompt template for this adventure.
# Keep two conventions when rewriting the text:
#   - the placeholders DECISION_PROBLEM, DECISION_REASON and VERBAL_REPORT are
#     what generate_prompt replaces with the actual texts;
#   - extract_confidence looks for a number wrapped in one or more @ symbols,
#     so the output-format instructions must keep asking for that form.
# NOTE(review): the <|...|> markers are presumably the chat-format special
# tokens expected by the Llama 3 instruct models -- confirm before changing them.
prompt_base_p2 = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a decision analyst who accurately identifies whether decision reasons are present or absent in verbal reports of people written after they made a choice between two monetary lotteries of a decision problem.

Available information —
A decision problem poses a choice between two lotteries, A and B, offering different monetary outcomes with different probabilities.

A decision reason specifies a rule to decide which of the two lotteries is preferred by the reason. The decision reason prefers A or B or is indifferent between the lotteries.

A verbal report written by an individual describes, in retrospect, the individual’s deliberation process used to choose one of the lotteries of the decision problem.


Task description —
Your task is to assess, based on the verbal report, whether the individual used the reason to make the decision.
The wording in the verbal report does not need to match the decision reason verbatim;
consider other wordings but make sure that the essence of the reason is clearly reflected by the verbal report.
Perform your analysis in three steps.

Step 1: Asses if the decision reason can be applied to the decision problem.
Evaluate whether the information relevant to the decision reason can be derived from the lotteries' outcomes and probabilities and summarize this information.
Proceed to Step 2.

Step 2: Assess the verbal report.
First, evaluate and summarize the outcome and probability information considered by the individual.
Second, evaluate and summarize the individual’s justification for the choice.
Focus on the described deliberation process and ignore information about the individual’s final choice.
Proceed to Step 3.

Step 3: Assess confidence in the decision reason’s use.
First, compare the outcome and probability information relevant to the decision reason and those considered by the individual.
Second, compare the decision reason’s rule to the individual’s justification for the choice.
Based on these two comparisons, return a value between 0 (certainly not used) and 100 (certainly used), reflecting your confidence that the individual used the decision reason to make the decision.


Output format —
Return the results of your assessment in the following format.
Return the confidence value by inserting it between two @@ symbols.
Only insert numbers between 0 (certainly not used) and 100 (certainly used).

Here is a template for the output format:
Confidence: @ insert confidence value @

<|eot_id|><|start_header_id|>user<|end_header_id|>

Consider the following decision problem, decision reason, and verbal report:


Decision problem ---
DECISION_PROBLEM


Decision reason ---
DECISION_REASON


Verbal report ---
VERBAL_REPORT


Task ---
Perform the confidence assessment step-by-step. Closely follow the steps previously outlined. Describe your reasoning before you arrive at an answer.
In the end, provide your confidence assessment that the decision reason was used by the individual using the specified output format.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

## Generate new prompts based on your prompt P2 for the previously set decision reason

In [None]:
# Build one filled prompt per verbal report from your own base prompt (prompt 2):
# the report's decision problem, the selected reason description, and the report itself
filled_prompts_p2 = [
    generate_prompt(
        prompt_base_p2, # here we are now passing your prompt
        row['decision_problem'],
        selected_description,  # Use the selected description
        row['verbal_report']
    )
    for _, row in problems_reports.iterrows()
]

## Run the model

In [None]:
# list for storing the full model responses obtained with prompt 2
LLM_P2_results = []

# column name for storage of the confidence values from prompt 2
llm_P2_res_col = 'llm_P2_confidence_res'

# new column in the problems_reports data set for storing the confidence assessments
problems_reports[llm_P2_res_col] = None

# run the analysis
# Iterate over the list of prompts, get responses, and extract numerical estimates and add them to the data set with problems and reports
for i, prompt in enumerate(filled_prompts_p2):

    # response from LLAMA -- use the client set up for this adventure (LLM),
    # so the cell does not depend on Adventure A having been run
    LLM_P2_response = LLM.text_generation(prompt, max_new_tokens = 4000)
    LLM_P2_results.append(LLM_P2_response) # keep the full response text

    # extract the confidence value from the response
    confidence_assesment = extract_confidence(LLM_P2_response)

    # confidence value into the data
    problems_reports.at[i, llm_P2_res_col] = confidence_assesment

    # monitor progress
    print(str(i) + '/' + str(problems_reports.shape[0]-1))

0/15
1/15


## Compare results
First have a look at results generated with our prompt

In [None]:
# Show verbal reports for which the reason was assessed to be used with high confidence, i.e., between 80 and 100
show_verbal_reports_in_range(problems_reports, 'llm_P1_confidence_res', 80, 100) # or use 0, 100 to display the entire data frame

Now have a look at the results generated with your prompt

In [None]:
# Show verbal reports for which the reason was assessed to be used with high confidence, i.e., between 80 and 100
show_verbal_reports_in_range(problems_reports, 'llm_P2_confidence_res', 80, 100) # or use 0, 100 to display the entire data frame

You can also compare the full text of the model 'thinking' directly, for corresponding verbal reports:

(change the index number in both code snippets to compare responses corresponding to the same verbal report)

In [None]:
print(LLM_P1_results[1]) # full LLM response at index 1, based on prompt P1 (ours)

In [None]:
print(LLM_P2_results[1]) # full LLM response at index 1, based on prompt P2 (yours)

# Adventure C: other reasons

In this adventure you can test your own reasons.

In [None]:
# Let's set up the InferenceClient first (model name plus access token).
LLM_version = "meta-llama/Meta-Llama-3-70B-Instruct"
LLM = InferenceClient(model = LLM_version, token = API_TOKEN)

Have a look at our reasons for inspiration

In [None]:
# Set up the prompts for a decision reason of your choice
# here are the available reasons
disp_tab(decision_reasons)

Now create your own reason by filling in the code in the next snippet

In [None]:
# set the decision reason
new_reason = 'regret' # change to the name of your reason --- this will be used as a column name in the problems_reports data for storing the confidence assessments

# now add a description --- this description will be used in the prompt!
new_reason_description = 'The reason considers the outcomes of each lottery. The sum of all pairwise differences of outcomes between the lotteries is considered important. The reason prefers the lottery with the more favorable sum of outcome differences.'

Generate the prompts with your reason

In [None]:
# Build one filled prompt per verbal report from our base prompt:
# the report's decision problem, the description of your new reason, and the report itself
filled_prompts = [
    generate_prompt(
        prompt_base,
        row['decision_problem'],
        new_reason_description,  # Use the description that you provided above
        row['verbal_report']
    )
    for _, row in problems_reports.iterrows()
]

In [None]:
# inspect the filled prompt at index 1 to check that your reason description was inserted
print(filled_prompts[1])

## Run the model

In [None]:
# full text responses of the model, in the order of filled_prompts
LLM_results = []

# the confidence values go into a column named after `new_reason`;
# create it empty and fill it row by row in the loop below
problems_reports[new_reason] = None

# Query the model once per prompt, keep the full response, and store the
# confidence value extracted from it in the row with the same index
for i, prompt in enumerate(filled_prompts):

    # response from LLAMA
    LLM_response = LLM.text_generation(prompt, max_new_tokens = 4000)
    LLM_results.append(LLM_response)

    # extract_confidence yields None when the response has no @number@ pattern
    problems_reports.at[i, new_reason] = extract_confidence(LLM_response)

    # monitor progress (current index / last index)
    print(f"{i}/{problems_reports.shape[0]-1}")

## Check the results!

In [None]:
# Show verbal reports for which the reason was assessed to be used with high confidence, i.e., between 80 and 100
show_verbal_reports_in_range(problems_reports, new_reason, 80, 100) # or use 0, 100 to display the entire data frame

In [None]:
print(LLM_results[7]) # full LLM response for the verbal report at index 7

# Clean the notebook (optional)

In [None]:
# Clear all variables (IPython magic; -f skips the confirmation prompt)
%reset -f

# Clear all outputs
from IPython.display import clear_output
clear_output()

# Restart runtime
# os is imported again because the %reset above removed it from the namespace.
# os._exit ends the Python process immediately, without normal cleanup;
# NOTE(review): presumably Colab then starts a fresh runtime -- confirm.
import os
os._exit(00)