### Import

In [1]:
import os
# Expose only GPU index 7 to CUDA libraries loaded later in this kernel.
# NOTE(review): the index is machine-specific, and this presumably has to run
# before any CUDA-using import initialises the driver — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [2]:
import regex
import sys
import json
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime

In [3]:
# Add the parent directory to the import path; presumably that is where the
# `model`, `metric` and `dataset` packages imported below live — verify.
sys.path.append("../")

In [4]:
class EmptyArgs:
    """Attribute-less placeholder object.

    An instance is passed as ``args=`` to the task constructors in this
    notebook; attributes can be attached to it after creation.
    """


args = EmptyArgs()

In [5]:
from model.init import seed_everything

# Seed handed to the project's seed_everything helper (it echoes the value).
num_seed = 42
seed_everything(seed=num_seed)

seed everything: 42


In [6]:
from metric.generation import calc_metrics_gen, print_metrics_gen

### Config

In [7]:
# Passed as `bootstrap=` to task.evaluate_by_model inside evaluate().
num_bootstrap = 1000
# Directory holding the per-task, per-prompt-mode *.performance.json files.
path_dir_performance = "performance"
# Prompt modes iterated by evaluate() and print_performance().
list_prompt_mode = ["direct", "cot", "direct-5-shot"]

In [8]:
# list_model = [
#     "Llama-3.3-70B-Instruct",
#     # "MeLLaMA-70B-chat",
#     "Mistral-Large-Instruct-2411",
#     "Phi-3.5-MoE-instruct",
#     "Yi-1.5-34B-Chat-16K",
# ]

In [9]:
def evaluate(task):
    """Evaluate every configured prompt mode for ``task`` and persist the results.

    For each entry of the module-level ``list_prompt_mode`` this calls
    ``task.evaluate_by_model`` with ``bootstrap=num_bootstrap`` and writes the
    returned dictionary to
    ``{path_dir_performance}/{task.name}.{prompt_mode}.performance.json``.

    Args:
        task: Task object exposing ``name`` and ``evaluate_by_model``.

    Returns:
        dict: Maps each prompt mode to the value returned by
        ``task.evaluate_by_model`` for that mode.
    """
    # Create the output directory up front: otherwise a fresh checkout fails
    # with FileNotFoundError only after the first (long) evaluation finished.
    os.makedirs(path_dir_performance, exist_ok=True)

    dict_prompt_model_performance = {}
    for prompt_mode in list_prompt_mode:
        dict_model_performance = task.evaluate_by_model(prompt_mode=prompt_mode, bootstrap=num_bootstrap)
        # To restrict the evaluation to a subset of models, use instead:
        # dict_model_performance = task.evaluate_by_model(prompt_mode=prompt_mode, model_name=list_model, bootstrap=num_bootstrap)
        path_file_performance = f"{path_dir_performance}/{task.name}.{prompt_mode}.performance.json"
        # Explicit UTF-8 so the file matches the cells that read it back with
        # encoding='utf-8', independent of the platform's default encoding.
        with open(path_file_performance, 'w', encoding='utf-8') as f:
            json.dump(dict_model_performance, f, indent=4)
        dict_prompt_model_performance[prompt_mode] = dict_model_performance
    return dict_prompt_model_performance

In [10]:
def print_performance(dict_prompt_model_performance):
    """Print the formatted metrics of every prompt mode and return them.

    Args:
        dict_prompt_model_performance: Mapping from prompt mode to the
            performance dictionary accepted by ``print_metrics_gen``; it must
            contain every entry of the module-level ``list_prompt_mode``.

    Returns:
        dict: Maps each prompt mode to the string produced by
        ``print_metrics_gen`` for that mode.
    """
    separator = "==============================="
    formatted_by_mode = {}
    for mode in list_prompt_mode:
        # Format before printing the header, so the call order is unchanged.
        formatted = print_metrics_gen(dict_prompt_model_performance[mode])
        print("Prompt Mode:", mode)
        print(formatted, separator, sep="\n")
        formatted_by_mode[mode] = formatted
    return formatted_by_mode

### 84.MedDG

In [11]:
from dataset.generation import Task_gen_MedDG

In [12]:
# Build the MedDG generation task straight from its identifier, so `task`
# is never bound to a plain string first.
task = Task_gen_MedDG(args=args, task='84.MedDG')

Load 84.MedDG data: train: 73530, val: 0, test: 2747


In [12]:
# Same work as the loop previously inlined here: evaluate() runs every entry
# of list_prompt_mode, writes one *.performance.json per mode and returns the
# {prompt_mode: performance} mapping. Using the shared helper keeps this
# section consistent with the other task sections.
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [08:27<00:00, 14.92s/it]
100%|██████████| 34/34 [11:32<00:00, 20.36s/it]
100%|██████████| 34/34 [09:36<00:00, 16.96s/it]


In [13]:
# Format and print the stored metrics for the "direct" prompt mode.
str_metrics = print_metrics_gen(dict_prompt_model_performance['direct'])
print(str_metrics)

12.51 [12.50, 12.52]; 22.55 [22.54, 22.57]; 66.49 [66.48, 66.50]; 0.00 [0.00, 0.00]; 12.17 [12.16, 12.18]; 21.72 [21.71, 21.73]; 65.80 [65.80, 65.81]; 0.00 [0.00, 0.00]; 12.45 [12.45, 12.46]; 19.15 [19.14, 19.16]; 65.08 [65.08, 65.09]; 0.00 [0.00, 0.00]; 12.96 [12.96, 12.97]; 20.20 [20.19, 20.21]; 65.55 [65.55, 65.56]; 0.00 [0.00, 0.00]; 9.89 [9.88, 9.90]; 19.62 [19.61, 19.63]; 64.47 [64.46, 64.47]; 0.00 [0.00, 0.00]; 11.05 [11.04, 11.05]; 16.24 [16.23, 16.25]; 63.37 [63.36, 63.37]; 0.00 [0.00, 0.00]; 11.37 [11.36, 11.37]; 16.21 [16.21, 16.22]; 64.05 [64.04, 64.05]; 0.00 [0.00, 0.00]; 8.58 [8.58, 8.58]; 8.80 [8.80, 8.81]; 59.89 [59.88, 59.89]; 0.00 [0.00, 0.00]; 7.73 [7.73, 7.73]; 5.13 [5.12, 5.13]; 53.97 [53.96, 53.98]; 0.00 [0.00, 0.00]; 7.95 [7.95, 7.95]; 5.84 [5.84, 5.85]; 55.35 [55.35, 55.36]; 0.00 [0.00, 0.00]; 10.52 [10.51, 10.53]; 17.01 [17.00, 17.03]; 63.42 [63.42, 63.43]; 0.00 [0.00, 0.00]; 10.73 [10.72, 10.74]; 20.90 [20.88, 20.91]; 64.81 [64.80, 64.82]; 0.00 [0.00, 0.00]; 6

In [14]:
# Format and print the stored metrics for the "cot" prompt mode.
str_metrics = print_metrics_gen(dict_prompt_model_performance['cot'])
print(str_metrics)

10.11 [10.10, 10.11]; 12.48 [12.47, 12.49]; 61.39 [61.39, 61.40]; 0.00 [0.00, 0.00]; 9.52 [9.52, 9.53]; 10.35 [10.34, 10.36]; 59.93 [59.93, 59.94]; 0.00 [0.00, 0.00]; 11.62 [11.61, 11.62]; 17.18 [17.17, 17.19]; 64.15 [64.14, 64.15]; 0.00 [0.00, 0.00]; 11.47 [11.47, 11.48]; 16.59 [16.58, 16.59]; 64.15 [64.15, 64.15]; 0.00 [0.00, 0.00]; 7.85 [7.85, 7.85]; 6.35 [6.35, 6.35]; 57.21 [57.20, 57.21]; 0.00 [0.00, 0.00]; 10.59 [10.59, 10.60]; 14.51 [14.50, 14.52]; 62.65 [62.64, 62.65]; 0.00 [0.00, 0.00]; 10.41 [10.41, 10.42]; 14.11 [14.10, 14.11]; 62.80 [62.80, 62.81]; 0.00 [0.00, 0.00]; 7.44 [7.44, 7.45]; 4.42 [4.41, 4.42]; 55.27 [55.27, 55.28]; 0.00 [0.00, 0.00]; 7.84 [7.83, 7.84]; 5.47 [5.46, 5.47]; 54.63 [54.63, 54.64]; 0.00 [0.00, 0.00]; 8.17 [8.17, 8.17]; 6.58 [6.57, 6.58]; 56.39 [56.38, 56.39]; 0.00 [0.00, 0.00]; 10.24 [10.24, 10.25]; 13.26 [13.25, 13.27]; 61.45 [61.44, 61.45]; 0.00 [0.00, 0.00]; 11.02 [11.01, 11.03]; 20.85 [20.84, 20.87]; 64.86 [64.85, 64.86]; 0.00 [0.00, 0.00]; 6.71 [6

In [15]:
# Format and print the stored metrics for the "direct-5-shot" prompt mode.
str_metrics = print_metrics_gen(dict_prompt_model_performance['direct-5-shot'])
print(str_metrics)

12.54 [12.53, 12.55]; 22.24 [22.23, 22.25]; 66.67 [66.67, 66.68]; 0.00 [0.00, 0.00]; 12.76 [12.75, 12.77]; 22.20 [22.19, 22.21]; 66.35 [66.34, 66.36]; 0.00 [0.00, 0.00]; 12.66 [12.65, 12.67]; 22.22 [22.21, 22.23]; 66.75 [66.74, 66.75]; 0.00 [0.00, 0.00]; 13.62 [13.61, 13.62]; 23.18 [23.17, 23.19]; 67.26 [67.25, 67.26]; 0.00 [0.00, 0.00]; 9.69 [9.69, 9.70]; 20.21 [20.20, 20.22]; 65.04 [65.04, 65.05]; 0.00 [0.00, 0.00]; 10.37 [10.36, 10.37]; 20.09 [20.08, 20.10]; 66.05 [66.04, 66.06]; 0.00 [0.00, 0.00]; 13.97 [13.97, 13.98]; 21.95 [21.94, 21.96]; 66.65 [66.64, 66.65]; 0.00 [0.00, 0.00]; 8.74 [8.74, 8.74]; 9.20 [9.20, 9.21]; 60.14 [60.14, 60.14]; 0.00 [0.00, 0.00]; 7.74 [7.74, 7.74]; 5.23 [5.23, 5.23]; 54.99 [54.99, 55.00]; 0.00 [0.00, 0.00]; 7.63 [7.63, 7.63]; 4.77 [4.76, 4.77]; 53.90 [53.89, 53.90]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.02 [0.02, 0.02]; 99.96 [99.96, 99.96]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.02 [0.02, 0.02]; 99.96 [99.96, 99.96]; 7.95 [7.95, 7

#### Check wrong format

In [16]:
# Ask the task for its result files, then display the entry for one model
# (shown below as a list of result-file paths).
dict_model_result = task.search_result_by_model()
model_one = 'Llama-3.1-70B-Instruct'
dict_model_result[model_one]

['result/84.MedDG/Llama-3.1-70B-Instruct/84.MedDG-direct-greedy-42.result.json']

In [17]:
# Load the first result file of the selected model.
result_one = dict_model_result[model_one][0]
# Explicit UTF-8, like the other JSON reads in this notebook: without it the
# decoding depends on the platform's default encoding.
with open(result_one, 'r', encoding='utf-8') as f:
    list_dict_result = json.load(f)

In [18]:
# Pull labels and predictions out of the loaded result records via the task's
# own accessors (presumably references vs. model outputs — confirm).
list_label = task.get_label(list_dict_result)
list_pred = task.get_pred(list_dict_result)

In [19]:
# Score the predictions against the labels. calc_metrics_gen returns a pair
# (presumably averaged and per-sample metrics — confirm); only the first
# element is displayed here.
dict_metrics_avg, dict_metrics_sample = calc_metrics_gen(list_label, list_pred, lang=task.language)
dict_metrics_avg

{'bleu': 9.952680117936573,
 'rouge': 20.207461132084305,
 'meteor': 0.0,
 'bertscore': 65.5563459684513}

### 23.cMedQA

In [19]:
from dataset.generation import Task_gen_cMedQA

In [20]:
# Build the cMedQA generation task straight from its identifier, so `task`
# is never bound to a plain string first.
task = Task_gen_cMedQA(args=args, task='23.cMedQA')

Load 23.cMedQA data: train: 162517, val: 6132, test: 6184
Notice: this task has multiple reference labels


In [12]:
# Compute the metrics for every prompt mode and write them to disk.
dict_prompt_model_performance = evaluate(task)
# Alternative (kept for reference): reload the JSON files written by an
# earlier evaluate() run instead of recomputing the metrics.
# dict_prompt_model_performance = {}
# for prompt_mode in list_prompt_mode:
#     path_file_performance = f"{path_dir_performance}/{task.name}.{prompt_mode}.performance.json"
#     with open(path_file_performance, 'r', encoding='utf-8') as f:
#         dict_model_performance = json.load(f)
#     dict_prompt_model_performance[prompt_mode] = dict_model_performance

100%|██████████| 34/34 [55:47<00:00, 98.46s/it] 
100%|██████████| 34/34 [1:14:17<00:00, 131.10s/it]
  9%|▉         | 3/34 [04:11<42:37, 82.48s/it]   

In [24]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
7.96 [7.96, 7.97]; 17.30 [17.29, 17.30]; 64.26 [64.26, 64.27]; 0.00 [0.00, 0.00]; 11.32 [11.31, 11.32]; 19.07 [19.07, 19.08]; 64.82 [64.82, 64.83]; 0.00 [0.00, 0.00]; 14.36 [14.36, 14.37]; 19.75 [19.74, 19.75]; 65.26 [65.26, 65.26]; 0.00 [0.00, 0.00]; 15.27 [15.27, 15.28]; 21.20 [21.19, 21.20]; 66.36 [66.36, 66.36]; 0.00 [0.00, 0.00]; 0.98 [0.98, 0.99]; 10.42 [10.41, 10.42]; 54.50 [54.50, 54.51]; 0.00 [0.00, 0.00]; 11.98 [11.97, 11.98]; 18.57 [18.57, 18.58]; 64.73 [64.72, 64.73]; 0.00 [0.00, 0.00]; 15.13 [15.13, 15.14]; 20.69 [20.68, 20.69]; 65.99 [65.99, 65.99]; 0.00 [0.00, 0.00]; 10.16 [10.16, 10.16]; 11.64 [11.63, 11.64]; 61.25 [61.25, 61.25]; 0.00 [0.00, 0.00]; 8.31 [8.31, 8.32]; 6.64 [6.64, 6.64]; 52.72 [52.72, 52.73]; 0.00 [0.00, 0.00]; 8.16 [8.16, 8.16]; 6.10 [6.10, 6.11]; 52.44 [52.43, 52.44]; 0.00 [0.00, 0.00]; 8.83 [8.83, 8.84]; 15.46 [15.46, 15.47]; 62.06 [62.05, 62.06]; 0.00 [0.00, 0.00]; 11.21 [11.21, 11.22]; 17.92 [17.92, 17.93]; 64.24 [64.24, 64.24]; 

### 29.EHRQA.qa

In [11]:
from dataset.generation import Task_gen_EHRQA_qa

In [12]:
# Build the EHRQA question-answering task straight from its identifier, so
# `task` is never bound to a plain string first.
task = Task_gen_EHRQA_qa(args=args, task='29.EHRQA.qa')

Load 29.EHRQA.qa data: train: 38015, val: 4431, test: 5097
Notice: this task has multiple reference labels


In [None]:
# Compute the metrics for every prompt mode and write them to disk.
dict_prompt_model_performance = evaluate(task)
# Alternative (kept for reference): reload the JSON files written by an
# earlier evaluate() run instead of recomputing the metrics.
# dict_prompt_model_performance = {}
# for prompt_mode in list_prompt_mode:
#     path_file_performance = f"{path_dir_performance}/{task.name}.{prompt_mode}.performance.json"
#     with open(path_file_performance, 'r', encoding='utf-8') as f:
#         dict_model_performance = json.load(f)
#     dict_prompt_model_performance[prompt_mode] = dict_model_performance

In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
13.63 [13.62, 13.63]; 18.36 [18.35, 18.36]; 65.36 [65.35, 65.36]; 0.00 [0.00, 0.00]; 10.12 [10.12, 10.13]; 10.39 [10.38, 10.40]; 60.75 [60.75, 60.76]; 0.37 [0.36, 0.37]; 13.55 [13.55, 13.56]; 17.94 [17.93, 17.94]; 65.19 [65.19, 65.19]; 0.00 [0.00, 0.00];
Prompt Mode: cot
12.96 [12.95, 12.96]; 17.41 [17.40, 17.41]; 64.57 [64.56, 64.57]; 0.00 [0.00, 0.00]; 9.61 [9.61, 9.62]; 9.17 [9.16, 9.17]; 60.18 [60.18, 60.19]; 0.31 [0.31, 0.32]; 13.20 [13.20, 13.21]; 17.79 [17.79, 17.80]; 64.80 [64.80, 64.81]; 0.00 [0.00, 0.00];
Prompt Mode: direct-5-shot
14.64 [14.63, 14.64]; 20.52 [20.52, 20.53]; 66.55 [66.55, 66.56]; 0.00 [0.00, 0.00]; 6.98 [6.98, 6.98]; 1.65 [1.64, 1.65]; 49.42 [49.41, 49.43]; 0.06 [0.06, 0.06]; 14.85 [14.84, 14.86]; 18.98 [18.97, 18.99]; 65.75 [65.75, 65.76]; 0.00 [0.00, 0.00];


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
7.82 [7.81, 7.83]; 17.79 [17.78, 17.80]; 64.85 [64.84, 64.85]; 0.00 [0.00, 0.00]; 8.17 [8.17, 8.18]; 18.56 [18.56, 18.57]; 65.09 [65.09, 65.10]; 0.00 [0.00, 0.00]; 10.66 [10.66, 10.67]; 17.91 [17.90, 17.91]; 64.84 [64.83, 64.84]; 0.00 [0.00, 0.00]; 10.81 [10.81, 10.82]; 19.07 [19.06, 19.07]; 65.66 [65.65, 65.66]; 0.00 [0.00, 0.00]; 1.89 [1.89, 1.90]; 10.92 [10.91, 10.93]; 55.20 [55.20, 55.21]; 0.00 [0.00, 0.00]; 9.26 [9.26, 9.27]; 17.57 [17.57, 17.58]; 64.63 [64.63, 64.64]; 0.00 [0.00, 0.00]; 11.40 [11.40, 11.41]; 18.55 [18.54, 18.55]; 65.23 [65.23, 65.23]; 0.00 [0.00, 0.00]; 9.93 [9.93, 9.94]; 12.25 [12.24, 12.25]; 61.94 [61.94, 61.95]; 0.00 [0.00, 0.00]; 9.25 [9.25, 9.25]; 9.46 [9.46, 9.46]; 54.29 [54.29, 54.30]; 0.00 [0.00, 0.00]; 8.95 [8.95, 8.95]; 9.08 [9.07, 9.08]; 56.19 [56.18, 56.19]; 0.00 [0.00, 0.00]; 6.14 [6.13, 6.14]; 13.03 [13.03, 13.04]; 61.10 [61.10, 61.11]; 0.00 [0.00, 0.00]; 7.91 [7.90, 7.91]; 15.03 [15.02, 15.04]; 62.45 [62.45, 62.46]; 0.00 [0.00, 

### 86.IMCS-V2-MRG

In [None]:
from dataset.generation import Task_gen_IMCS_V2_MRG

In [None]:
# Build the IMCS-V2-MRG generation task straight from its identifier, so
# `task` is never bound to a plain string first.
task = Task_gen_IMCS_V2_MRG(args=args, task='86.IMCS-V2-MRG')

Load 86.IMCS-V2-MRG data: train: 2472, val: 0, test: 833
Notice: this task has multiple reference labels


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [11:31<00:00, 20.34s/it]
100%|██████████| 34/34 [13:31<00:00, 23.85s/it]
100%|██████████| 34/34 [08:24<00:00, 14.84s/it]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
34.98 [34.96, 35.00]; 43.52 [43.50, 43.54]; 76.56 [76.55, 76.57]; 0.00 [0.00, 0.00]; 35.34 [35.32, 35.36]; 43.87 [43.85, 43.89]; 76.56 [76.55, 76.57]; 0.00 [0.00, 0.00]; 26.96 [26.94, 26.97]; 36.73 [36.71, 36.74]; 73.81 [73.81, 73.82]; 0.00 [0.00, 0.00]; 33.18 [33.16, 33.20]; 41.99 [41.97, 42.01]; 75.90 [75.89, 75.91]; 0.00 [0.00, 0.00]; 23.88 [23.85, 23.90]; 30.21 [30.17, 30.24]; 70.28 [70.27, 70.30]; 0.00 [0.00, 0.00]; 24.51 [24.50, 24.53]; 32.74 [32.73, 32.76]; 72.17 [72.16, 72.17]; 0.00 [0.00, 0.00]; 30.94 [30.92, 30.96]; 39.55 [39.53, 39.56]; 74.69 [74.68, 74.69]; 0.00 [0.00, 0.00]; 16.94 [16.93, 16.95]; 25.59 [25.58, 25.61]; 69.19 [69.18, 69.19]; 0.00 [0.00, 0.00]; 12.16 [12.15, 12.17]; 14.78 [14.77, 14.80]; 61.09 [61.08, 61.10]; 0.00 [0.00, 0.00]; 13.07 [13.06, 13.08]; 16.55 [16.54, 16.57]; 62.65 [62.65, 62.66]; 0.00 [0.00, 0.00]; 24.55 [24.53, 24.57]; 30.76 [30.73, 30.78]; 69.42 [69.41, 69.43]; 0.00 [0.00, 0.00]; 23.63 [23.60, 23.66]; 31.64 [31.61, 31.67]; 7

### 103.icliniq-10k

In [35]:
from dataset.generation import Task_gen_icliniq

In [36]:
# Build the icliniq-10k generation task straight from its identifier, so
# `task` is never bound to a plain string first.
task = Task_gen_icliniq(args=args, task='103.icliniq-10k')

Load 103.icliniq-10k data: train: 5855, val: 732, test: 733


In [None]:
dict_prompt_model_performance = evaluate(task)

  0%|          | 0/34 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 1/34 [00:18<10:19, 18.76s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  6%|▌         | 2/34 [00:30<07:43, 14.50s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infer

In [38]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
5.20 [5.19, 5.21]; 10.72 [10.71, 10.73]; 84.14 [84.14, 84.15]; 0.00 [0.00, 0.00]; 6.75 [6.74, 6.76]; 11.97 [11.96, 11.98]; 84.39 [84.38, 84.39]; 0.00 [0.00, 0.00]; 9.03 [9.03, 9.04]; 12.48 [12.47, 12.49]; 83.25 [83.25, 83.25]; 0.00 [0.00, 0.00]; 9.89 [9.88, 9.89]; 13.92 [13.91, 13.93]; 84.19 [84.19, 84.20]; 0.00 [0.00, 0.00]; 6.03 [6.02, 6.05]; 9.81 [9.79, 9.82]; 82.89 [82.88, 82.89]; 0.00 [0.00, 0.00]; 9.34 [9.34, 9.35]; 12.68 [12.67, 12.69]; 83.41 [83.40, 83.41]; 0.00 [0.00, 0.00]; 10.10 [10.09, 10.10]; 13.83 [13.82, 13.84]; 84.10 [84.10, 84.10]; 0.00 [0.00, 0.00]; 8.56 [8.56, 8.56]; 10.43 [10.42, 10.44]; 81.61 [81.61, 81.62]; 0.00 [0.00, 0.00]; 7.90 [7.90, 7.90]; 6.52 [6.51, 6.53]; 77.07 [77.07, 77.08]; 0.00 [0.00, 0.00]; 7.90 [7.90, 7.91]; 7.00 [6.99, 7.00]; 77.62 [77.61, 77.62]; 0.00 [0.00, 0.00]; 10.21 [10.20, 10.22]; 15.16 [15.15, 15.17]; 84.37 [84.36, 84.37]; 0.00 [0.00, 0.00]; 6.74 [6.73, 6.76]; 10.52 [10.51, 10.54]; 82.23 [82.22, 82.23]; 0.00 [0.00, 0.00];

### 104.HealthCareMagic-100k

In [15]:
from dataset.generation import Task_gen_HealthCareMagic

In [16]:
# Build the HealthCareMagic-100k generation task straight from its
# identifier, so `task` is never bound to a plain string first.
task = Task_gen_HealthCareMagic(args=args, task='104.HealthCareMagic-100k')

Load 104.HealthCareMagic-100k data: train: 89592, val: 11205, test: 11199


In [None]:
# Re-enable the next line to recompute the metrics instead of loading them.
# dict_prompt_model_performance = evaluate(task)
# Load the per-prompt-mode performance JSON files from the same paths that
# evaluate() writes to.
dict_prompt_model_performance = {}
for prompt_mode in list_prompt_mode:
    path_file_performance = f"{path_dir_performance}/{task.name}.{prompt_mode}.performance.json"
    with open(path_file_performance, 'r', encoding='utf-8') as f:
        dict_model_performance = json.load(f)
    dict_prompt_model_performance[prompt_mode] = dict_model_performance

In [42]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
4.05 [4.05, 4.05]; 10.62 [10.62, 10.62]; 83.79 [83.79, 83.79]; 0.00 [0.00, 0.00]; 5.45 [5.44, 5.45]; 11.73 [11.73, 11.74]; 84.01 [84.01, 84.01]; 0.00 [0.00, 0.00]; 9.24 [9.24, 9.24]; 12.94 [12.94, 12.95]; 82.87 [82.86, 82.87]; 0.00 [0.00, 0.00]; 10.05 [10.05, 10.05]; 14.30 [14.30, 14.30]; 83.91 [83.91, 83.91]; 0.00 [0.00, 0.00]; 6.69 [6.68, 6.69]; 10.97 [10.97, 10.97]; 82.89 [82.89, 82.89]; 0.00 [0.00, 0.00]; 9.42 [9.42, 9.42]; 13.12 [13.12, 13.12]; 83.07 [83.07, 83.07]; 0.00 [0.00, 0.00]; 10.31 [10.30, 10.31]; 14.43 [14.43, 14.43]; 83.83 [83.83, 83.83]; 0.00 [0.00, 0.00]; 8.70 [8.70, 8.70]; 11.31 [11.30, 11.31]; 81.39 [81.39, 81.39]; 0.00 [0.00, 0.00]; 7.88 [7.88, 7.89]; 6.82 [6.82, 6.83]; 76.57 [76.57, 76.58]; 0.00 [0.00, 0.00]; 7.59 [7.58, 7.59]; 6.35 [6.35, 6.35]; 76.26 [76.26, 76.26]; 0.00 [0.00, 0.00]; 8.50 [8.50, 8.50]; 13.92 [13.92, 13.92]; 84.01 [84.01, 84.02]; 0.00 [0.00, 0.00]; 5.98 [5.98, 5.98]; 10.72 [10.72, 10.73]; 82.75 [82.75, 82.76]; 0.00 [0.00, 0.0

### 107.MIMIC-IV BHC

In [None]:
from dataset.generation import Task_gen_mimic_iv_BHC

In [None]:
# Build the MIMIC-IV BHC generation task straight from its identifier, so
# `task` is never bound to a plain string first.
task = Task_gen_mimic_iv_BHC(args=args, task='107.MIMIC-IV BHC')

Load 107.MIMIC-IV BHC data: train: 10, val: 1000, test: 1000


In [None]:
dict_prompt_model_performance = evaluate(task)

  0%|          | 0/4 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 25%|██▌       | 1/4 [00:52<02:38, 52.85s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 50%|█████     | 2/4 [01:12<01:06, 33.31s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferenc

In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
5.74 [5.73, 5.75]; 18.52 [18.51, 18.53]; 83.53 [83.52, 83.53]; 0.00 [0.00, 0.00]; 5.98 [5.97, 5.98]; 7.98 [7.97, 8.00]; 77.76 [77.75, 77.77]; 0.00 [0.00, 0.00]; 5.30 [5.29, 5.31]; 16.53 [16.52, 16.54]; 83.07 [83.06, 83.07]; 0.00 [0.00, 0.00];
Prompt Mode: cot
5.16 [5.15, 5.17]; 17.70 [17.69, 17.71]; 83.32 [83.32, 83.32]; 0.00 [0.00, 0.00]; 7.05 [7.04, 7.05]; 7.62 [7.61, 7.64]; 77.69 [77.69, 77.70]; 0.00 [0.00, 0.00]; 3.56 [3.55, 3.57]; 15.46 [15.46, 15.47]; 83.28 [83.28, 83.28]; 0.00 [0.00, 0.00];
Prompt Mode: direct-5-shot
6.83 [6.82, 6.84]; 19.56 [19.55, 19.57]; 83.76 [83.76, 83.76]; 0.00 [0.00, 0.00]; 6.29 [6.28, 6.31]; 17.95 [17.94, 17.96]; 83.47 [83.46, 83.47]; 0.00 [0.00, 0.00]; 9.34 [9.33, 9.34]; 9.63 [9.62, 9.64]; 80.70 [80.69, 80.70]; 0.00 [0.00, 0.00];


## Question-Answering

### 91-2.CAS.evidence

In [10]:
from dataset.generation import Task_gen_CAS_evidence

In [11]:
# Build the CAS evidence task straight from its identifier, so `task` is
# never bound to a plain string first.
task = Task_gen_CAS_evidence(args=args, task="91-2.CAS.evidence")

Load 91-2.CAS.evidence data: train: 20, val: 0, test: 696


In [12]:
dict_prompt_model_performance = evaluate(task)

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [02:30<00:00,  4.43s/it]
100%|██████████| 34/34 [02:21<00:00,  4.15s/it]
100%|██████████| 34/34 [02:13<00:00,  3.92s/it]


In [13]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
32.64 [32.59, 32.69]; 45.52 [45.47, 45.57]; 81.09 [81.07, 81.12]; 0.28 [0.27, 0.30]; 33.65 [33.60, 33.69]; 46.80 [46.75, 46.85]; 81.88 [81.86, 81.91]; 0.00 [0.00, 0.00]; 29.20 [29.15, 29.24]; 39.18 [39.13, 39.23]; 79.02 [78.99, 79.04]; 0.00 [0.00, 0.00]; 30.58 [30.53, 30.62]; 41.06 [41.01, 41.11]; 78.94 [78.92, 78.97]; 0.00 [0.00, 0.00]; 7.84 [7.81, 7.86]; 13.96 [13.93, 14.00]; 56.23 [56.18, 56.29]; 37.48 [37.36, 37.59]; 20.67 [20.63, 20.71]; 30.19 [30.14, 30.24]; 73.58 [73.55, 73.61]; 6.63 [6.57, 6.69]; 31.67 [31.62, 31.72]; 43.24 [43.19, 43.29]; 80.22 [80.20, 80.25]; 0.00 [0.00, 0.00]; 28.64 [28.59, 28.68]; 36.46 [36.41, 36.51]; 76.01 [75.98, 76.04]; 0.57 [0.55, 0.59]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 16.17 [16.11, 16.22]; 72.43 [72.33, 72.53]; 0.13 [0.12, 0.14]; 0.16 [0.15, 0.17]; 0.54 [0.52, 0.55]; 99.27 [99.25, 99.29]; 0.68 [0.67, 0.69]; 0.84 [0.83, 0.86]; 3.22 [3.19, 3.26]; 95.68 [95.63, 95.73]; 19.59 [19.55, 19.63]; 28.50 [28.45, 28.55]; 66.12 [66.06, 66

## End

In [14]:
print('Done.')

Done.
