### Import

In [2]:
import regex
import sys
import json
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime

In [3]:
sys.path.append("../")

In [4]:
class EmptyArgs:
    def __init__(self):
        pass

args = EmptyArgs()

In [5]:
from model.init import seed_everything

num_seed = 42
seed_everything(seed=num_seed)

seed everything: 42


In [6]:
from dataset.process import process_text_clean

In [7]:
from metric.extraction import calc_metrics_ext, print_metrics_ext
from metric.extraction import calc_metrics_ext_qa, print_metrics_ext_qa

### Config

In [8]:
num_bootstrap = 1000
path_dir_performance = "performance"
list_prompt_mode = ["direct", "cot", "direct-5-shot"]

In [9]:
def evaluate(task):
    dict_prompt_model_performance = {}
    for prompt_mode in list_prompt_mode:
        dict_model_performance = task.evaluate_by_model(prompt_mode=prompt_mode, bootstrap=num_bootstrap)
        path_file_performance = f"{path_dir_performance}/{task.name}.{prompt_mode}.performance.json"
        with open(path_file_performance, 'w') as f:
            json.dump(dict_model_performance, f, indent=4)
        dict_prompt_model_performance[prompt_mode] = dict_model_performance
    return dict_prompt_model_performance

In [10]:
def print_performance(dict_prompt_model_performance, flag_print_missing=False):
    dict_mode_performance = {}
    for prompt_mode in list_prompt_mode:
        str_metrics = print_metrics_ext(dict_prompt_model_performance[prompt_mode], flag_print_missing=flag_print_missing)
        print("Prompt Mode:", prompt_mode)
        print(str_metrics)
        print("===============================")
        dict_mode_performance[prompt_mode] = str_metrics
    return dict_mode_performance

In [11]:
def print_performance_ext_qa(dict_prompt_model_performance, flag_print_missing=False):
    dict_mode_performance = {}
    for prompt_mode in list_prompt_mode:
        str_metrics = print_metrics_ext_qa(dict_prompt_model_performance[prompt_mode], flag_print_missing=flag_print_missing)
        print("Prompt Mode:", prompt_mode)
        print(str_metrics)
        print("===============================")
        dict_mode_performance[prompt_mode] = str_metrics
    return dict_mode_performance

### 1-2.ADE-ADE relation

In [12]:
from dataset.extraction import Task_ext_ADE_ADE_relation

In [13]:
task = '1-2.ADE-ADE relation'
task = Task_ext_ADE_ADE_relation(args=args, task=task)

Load 1-2.ADE-ADE relation data: train: 3416, val: 427, test: 428


In [14]:
dict_prompt_model_performance = {}
for prompt_mode in list_prompt_mode:
    dict_model_performance = task.evaluate_by_model(prompt_mode=prompt_mode, bootstrap=num_bootstrap)
    path_file_performance = f"{path_dir_performance}/{task.name}.{prompt_mode}.performance.json"
    with open(path_file_performance, 'w') as f:
        json.dump(dict_model_performance, f, indent=4)
    dict_prompt_model_performance[prompt_mode] = dict_model_performance

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [00:10<00:00,  3.25it/s]
100%|██████████| 34/34 [00:09<00:00,  3.47it/s]
100%|██████████| 34/34 [00:10<00:00,  3.33it/s]


In [15]:
str_metrics = print_metrics_ext(dict_prompt_model_performance['direct'])
print(str_metrics)

77.70 [77.57, 77.83]; 57.76 [57.60, 57.92]; 0.69 [0.66, 0.71]; 90.14 [90.05, 90.22]; 66.44 [66.28, 66.60]; 0.48 [0.46, 0.50]; 81.29 [81.19, 81.39]; 38.83 [38.69, 38.97]; 15.41 [15.30, 15.52]; 89.35 [89.26, 89.44]; 55.33 [55.17, 55.49]; 3.29 [3.23, 3.34]; 58.56 [58.42, 58.70]; 15.93 [15.82, 16.03]; 7.72 [7.64, 7.80]; 78.12 [78.01, 78.24]; 45.02 [44.86, 45.17]; 8.22 [8.13, 8.30]; 88.14 [88.05, 88.23]; 59.70 [59.54, 59.85]; 2.59 [2.54, 2.64]; 68.45 [68.33, 68.57]; 40.57 [40.42, 40.71]; 0.00 [0.00, 0.00]; 0.85 [0.82, 0.89]; 1.29 [1.24, 1.34]; 22.75 [22.63, 22.88]; 38.17 [38.03, 38.31]; 24.51 [24.38, 24.64]; 53.50 [53.34, 53.65]; 47.55 [47.40, 47.70]; 26.94 [26.80, 27.07]; 3.48 [3.42, 3.53]; 90.59 [90.51, 90.67]; 64.63 [64.50, 64.76]; 0.94 [0.91, 0.97]; 2.10 [2.04, 2.15]; 0.25 [0.24, 0.27]; 86.21 [86.11, 86.32]; 89.01 [88.92, 89.10]; 67.08 [66.96, 67.21]; 0.94 [0.91, 0.97]; 74.06 [73.96, 74.17]; 36.94 [36.81, 37.07]; 0.00 [0.00, 0.00]; 37.51 [37.32, 37.69]; 15.53 [15.41, 15.64]; 15.20 [15.1

In [16]:
str_metric = print_metrics_ext(dict_prompt_model_performance['cot'])
print(str_metric)

71.28 [71.14, 71.41]; 44.08 [43.92, 44.23]; 3.96 [3.90, 4.02]; 83.14 [83.02, 83.26]; 50.12 [49.95, 50.29]; 1.42 [1.38, 1.46]; 83.76 [83.65, 83.87]; 28.39 [28.24, 28.54]; 0.93 [0.90, 0.96]; 75.84 [75.72, 75.96]; 25.79 [25.67, 25.92]; 2.73 [2.69, 2.78]; 20.94 [20.82, 21.06]; 6.15 [6.08, 6.22]; 25.26 [25.13, 25.39]; 70.99 [70.85, 71.12]; 22.43 [22.31, 22.55]; 2.33 [2.29, 2.37]; 81.44 [81.33, 81.55]; 50.29 [50.13, 50.44]; 0.94 [0.91, 0.97]; 0.43 [0.41, 0.45]; 0.86 [0.82, 0.91]; 0.00 [0.00, 0.00]; 1.27 [1.24, 1.30]; 0.71 [0.69, 0.74]; 16.60 [16.49, 16.71]; 19.37 [19.24, 19.51]; 13.08 [12.95, 13.22]; 60.52 [60.37, 60.67]; 78.55 [78.42, 78.67]; 43.67 [43.52, 43.82]; 0.69 [0.66, 0.71]; 88.83 [88.74, 88.91]; 65.24 [65.10, 65.38]; 3.76 [3.70, 3.82]; 1.73 [1.69, 1.78]; 0.00 [0.00, 0.00]; 82.94 [82.83, 83.06]; 79.81 [79.70, 79.93]; 38.22 [38.08, 38.36]; 4.44 [4.38, 4.50]; 69.65 [69.43, 69.87]; 35.91 [35.76, 36.06]; 3.25 [3.20, 3.30]; 41.62 [41.45, 41.80]; 17.03 [16.93, 17.14]; 6.77 [6.69, 6.85]; 7

In [None]:
str_metric = print_metrics_ext(dict_prompt_model_performance['direct-5-shot'])
print(str_metric)

90.18 [90.09, 90.26]; 65.01 [64.85, 65.17]; 0.69 [0.67, 0.72]; 91.23 [91.14, 91.31]; 68.72 [68.58, 68.87]; 0.00 [0.00, 0.00]; 86.88 [86.78, 86.97]; 51.35 [51.19, 51.51]; 6.84 [6.76, 6.92]; 87.89 [87.79, 87.99]; 51.11 [50.95, 51.27]; 0.69 [0.67, 0.72]; 80.10 [79.97, 80.23]; 45.67 [45.51, 45.82]; 0.00 [0.00, 0.00]; 71.17 [71.02, 71.31]; 44.13 [43.98, 44.29]; 2.29 [2.24, 2.33]; 89.95 [89.86, 90.05]; 53.49 [53.33, 53.65]; 0.71 [0.68, 0.74]; 55.63 [55.49, 55.77]; 32.31 [32.18, 32.44]; 0.23 [0.22, 0.25]; 74.96 [74.85, 75.06]; 49.32 [49.18, 49.45]; 0.00 [0.00, 0.00]; 62.39 [62.32, 62.45]; 43.91 [43.82, 44.01]; 0.00 [0.00, 0.00]; 50.33 [50.25, 50.41]; 29.57 [29.49, 29.65]; 0.22 [0.20, 0.23]; 47.97 [47.91, 48.03]; 31.55 [31.47, 31.62]; 1.41 [1.38, 1.45]; 75.88 [75.74, 76.02]; 22.34 [22.21, 22.46]; 0.00 [0.00, 0.00]; 72.18 [72.04, 72.32]; 47.16 [47.01, 47.31]; 0.00 [0.00, 0.00]; 28.79 [28.66, 28.92]; 20.57 [20.46, 20.68]; 0.00 [0.00, 0.00]; 19.30 [19.19, 19.41]; 9.80 [9.73, 9.87]; 18.80 [18.68, 

#### Check wrong format

In [None]:
# sort the dict_prompt_model_performance['cot] by ['num_failed_ratio']['mean']
dict_model_performance = dict_prompt_model_performance['cot']
list_model_failed = []
for model_name, dict_performance in dict_model_performance.items():
    list_model_failed.append([model_name, round(dict_performance['num_failed_ratio']['mean'], 2)])
sorted_list_model_failed = sorted(list_model_failed, key=lambda x: x[1], reverse=True)
sorted_list_model_failed

[['BioMistral-7B', 96.75],
 ['Llama3-OpenBioLLM-8B', 82.92],
 ['meditron-70b', 60.46],
 ['Llama-3.2-1B-Instruct', 25.16],
 ['meditron-7b', 16.58],
 ['Qwen2.5-7B-Instruct', 14.06],
 ['Yi-1.5-9B-Chat-16K', 10.77],
 ['Qwen2.5-3B-Instruct', 10.5],
 ['Llama-3.1-8B-UltraMedical', 6.71],
 ['Yi-1.5-34B-Chat-16K', 6.6],
 ['Llama3-OpenBioLLM-70B', 4.46],
 ['gemma-2-9b-it', 3.97],
 ['MeLLaMA-70B-chat', 3.76],
 ['MMed-Llama-3-8B', 3.28],
 ['Athene-V2-Chat', 3.23],
 ['Llama-3.1-70B-Instruct', 2.85],
 ['Qwen2.5-72B-Instruct', 2.38],
 ['Llama-3.2-3B-Instruct', 2.32],
 ['Qwen2.5-1.5B-Instruct', 1.89],
 ['gpt-4o', 1.62],
 ['gemma-2-27b-it', 1.42],
 ['Ministral-8B-Instruct-2410', 1.38],
 ['Llama-3.3-70B-Instruct', 0.95],
 ['Llama-3-70B-UltraMedical', 0.95],
 ['Llama-3.1-8B-Instruct', 0.94],
 ['MeLLaMA-13B-chat', 0.7],
 ['Mistral-Large-Instruct-2411', 0.69],
 ['Mistral-Small-Instruct-2409', 0.68],
 ['Phi-4', 0.48],
 ['QwQ-32B-Preview', 0.46],
 ['gpt-35-turbo', 0.24],
 ['Llama-3.1-Nemotron-70B-Instruct-HF

In [None]:
model_one = 'gemma-2-9b-it'
prompt_mode = 'cot'
dict_model_result = task.search_result_by_model(prompt_mode=prompt_mode)
result_one = dict_model_result[model_one][0]
print(result_one)
with open(result_one, 'r') as f:
    list_dict_result = json.load(f)

result/1-2.ADE-ADE relation/gemma-2-9b-it/1-2.ADE-ADE relation-cot-greedy-42.result.json


In [None]:
list_label = task.get_label(list_dict_result, prompt_mode=prompt_mode)
list_pred = task.get_pred(list_dict_result, prompt_mode=prompt_mode)
for idx, (pred, dict_result) in enumerate(zip(list_pred, list_dict_result)):
    if pred==-1:
        print(f"{idx} - Input: {dict_result['input']}")
        print("------------------------------------")
        print("Output:", dict_result['pred'])
        print("====================================\n")

In [None]:
list_pred, _ = task.get_pred_none(list_pred, list_label)

### 1-3.ADE-drug dosage

In [None]:
from dataset.extraction import Task_ext_ADE_Drug_dosage

In [None]:
task = '1-3.ADE-Drug dosage'
task = Task_ext_ADE_Drug_dosage(args=args, task=task)

Load 1-3.ADE-Drug dosage data: train: 20, val: 0, test: 193


In [None]:
dict_prompt_model_performance = evaluate(task)

  3%|▎         | 1/34 [00:00<00:04,  7.40it/s]

100%|██████████| 34/34 [00:04<00:00,  6.87it/s]
100%|██████████| 34/34 [00:04<00:00,  7.15it/s]
100%|██████████| 34/34 [00:04<00:00,  7.14it/s]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
92.13 [92.04, 92.23]; 53.90 [53.70, 54.11]; 0.53 [0.50, 0.57]; 93.32 [93.24, 93.40]; 52.03 [51.82, 52.23]; 0.00 [0.00, 0.00]; 58.25 [58.04, 58.46]; 36.41 [36.21, 36.61]; 14.52 [14.37, 14.68]; 89.18 [89.05, 89.30]; 46.84 [46.63, 47.06]; 4.13 [4.04, 4.21]; 52.57 [52.32, 52.82]; 12.63 [12.51, 12.76]; 2.58 [2.51, 2.65]; 67.91 [67.72, 68.11]; 30.38 [30.18, 30.58]; 17.98 [17.82, 18.15]; 89.88 [89.75, 90.00]; 52.68 [52.47, 52.88]; 2.04 [1.97, 2.10]; 78.30 [78.12, 78.49]; 29.27 [29.08, 29.46]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 27.89 [27.68, 28.09]; 48.58 [48.42, 48.73]; 28.02 [27.87, 28.16]; 40.33 [40.11, 40.56]; 49.94 [49.71, 50.16]; 27.66 [27.48, 27.85]; 2.01 [1.95, 2.07]; 92.56 [92.47, 92.64]; 63.79 [63.59, 63.99]; 1.03 [0.98, 1.08]; 0.77 [0.73, 0.82]; 0.00 [0.00, 0.00]; 81.48 [81.31, 81.65]; 82.19 [82.03, 82.36]; 43.98 [43.76, 44.20]; 0.55 [0.51, 0.58]; 61.38 [61.21, 61.56]; 36.19 [36.01, 36.36]; 0.00 [0.00, 0.00]; 23.36 [23.17, 23.55]; 10.22 [10.

### 7.Cantemist.NER

In [None]:
from dataset.extraction import Task_ext_Cantemist_NER

In [None]:
task = '7.Cantemist.NER'
task = Task_ext_Cantemist_NER(args=args, task=task)

Load 7.Cantemist.NER data: train: 501, val: 499, test: 300


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [00:16<00:00,  2.12it/s]
100%|██████████| 34/34 [00:16<00:00,  2.06it/s]
100%|██████████| 34/34 [00:19<00:00,  1.73it/s]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
14.40 [14.35, 14.45]; 14.40 [14.35, 14.45]; 0.00 [0.00, 0.00]; 16.51 [16.46, 16.56]; 16.51 [16.46, 16.56]; 0.00 [0.00, 0.00]; 21.67 [21.61, 21.72]; 21.67 [21.61, 21.72]; 1.68 [1.63, 1.72]; 21.77 [21.72, 21.82]; 21.77 [21.72, 21.82]; 0.00 [0.00, 0.00]; 0.05 [0.05, 0.06]; 0.00 [0.00, 0.00]; 43.36 [43.18, 43.54]; 11.66 [11.62, 11.71]; 11.39 [11.35, 11.43]; 0.00 [0.00, 0.00]; 21.59 [21.51, 21.67]; 21.59 [21.51, 21.67]; 0.00 [0.00, 0.00]; 12.79 [12.75, 12.83]; 12.68 [12.63, 12.72]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 23.75 [23.59, 23.90]; 0.06 [0.06, 0.06]; 0.00 [0.00, 0.00]; 32.26 [32.09, 32.43]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 96.98 [96.92, 97.05]; 14.77 [14.71, 14.82]; 14.77 [14.71, 14.82]; 0.67 [0.64, 0.70]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 53.85 [53.67, 54.03]; 0.27 [0.26, 0.28]; 0.27 [0.26, 0.28]; 0.00 [0.00, 0.00]; 6.34 [6.29, 6.38]; 6.20 [6.15, 6.24]; 20.03 [19.89, 20.17]; 11.19 [11.15, 11.24]; 10.67 [10.63, 10.71]; 6.16 [6.07, 6.2

### 7.Cantemist.Norm

In [None]:
from dataset.extraction import Task_ext_Cantemist_Norm

In [None]:
task = '7.Cantemist.Norm'
task = Task_ext_Cantemist_Norm(args=args, task=task)

Load 7.Cantemist.Norm data: train: 501, val: 499, test: 300


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [00:15<00:00,  2.19it/s]
100%|██████████| 34/34 [00:15<00:00,  2.22it/s]
100%|██████████| 34/34 [00:19<00:00,  1.79it/s]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
19.91 [19.86, 19.97]; 0.94 [0.93, 0.96]; 0.00 [0.00, 0.00]; 21.48 [21.42, 21.54]; 5.50 [5.46, 5.53]; 0.66 [0.63, 0.69]; 17.80 [17.74, 17.86]; 4.41 [4.38, 4.44]; 22.82 [22.67, 22.97]; 20.14 [20.08, 20.21]; 10.00 [9.96, 10.05]; 0.96 [0.93, 1.00]; 0.15 [0.14, 0.15]; 0.00 [0.00, 0.00]; 17.34 [17.21, 17.48]; 7.69 [7.65, 7.72]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 19.47 [19.39, 19.55]; 10.04 [10.00, 10.09]; 0.00 [0.00, 0.00]; 4.19 [4.15, 4.22]; 1.25 [1.23, 1.26]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 39.05 [38.88, 39.23]; 0.12 [0.12, 0.13]; 0.00 [0.00, 0.00]; 34.90 [34.73, 35.07]; 1.22 [1.20, 1.24]; 0.06 [0.05, 0.06]; 31.03 [30.86, 31.19]; 5.89 [5.85, 5.93]; 2.49 [2.47, 2.51]; 32.49 [32.32, 32.66]; 0.07 [0.07, 0.08]; 0.07 [0.07, 0.08]; 63.82 [63.65, 64.00]; 4.89 [4.85, 4.94]; 2.15 [2.12, 2.17]; 2.35 [2.29, 2.40]; 5.93 [5.89, 5.97]; 0.87 [0.85, 0.88]; 42.04 [41.86, 42.22]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 20.00 [19.86, 20.15]; 15.72 [15.66, 15.77];

### 9.CHIP-CDEE

In [None]:
from dataset.extraction import Task_ext_CHIP_CDEE

In [None]:
task = '9.CHIP-CDEE'
task = Task_ext_CHIP_CDEE(args=args, task=task)

Load 9.CHIP-CDEE data: train: 1587, val: 0, test: 384


In [None]:
dict_prompt_model_performance = evaluate(task)

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [00:26<00:00,  1.28it/s]
100%|██████████| 34/34 [00:15<00:00,  2.25it/s]
100%|██████████| 34/34 [00:09<00:00,  3.43it/s]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
39.06 [38.94, 39.18]; 17.48 [17.38, 17.58]; 17.72 [17.60, 17.84]; 46.81 [46.70, 46.92]; 21.78 [21.67, 21.88]; 14.37 [14.25, 14.48]; 17.95 [17.80, 18.11]; 11.17 [11.06, 11.29]; 63.58 [63.43, 63.73]; 45.49 [45.37, 45.62]; 24.71 [24.60, 24.81]; 24.96 [24.83, 25.10]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 98.70 [98.66, 98.73]; 0.51 [0.50, 0.53]; 0.09 [0.09, 0.10]; 92.48 [92.39, 92.56]; 45.82 [45.69, 45.94]; 24.82 [24.71, 24.93]; 29.18 [29.03, 29.33]; 39.43 [39.31, 39.56]; 20.73 [20.63, 20.84]; 30.76 [30.61, 30.90]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 100.00 [100.00, 100.00]; 0.51 [0.50, 0.53]; 0.09 [0.08, 0.10]; 96.37 [96.31, 96.43]; 3.06 [2.99, 3.14]; 0.09 [0.08, 0.09]; 68.23 [68.08, 68.39]; 35.70 [35.56, 35.84]; 15.88 [15.78, 15.98]; 33.06 [32.92, 33.21]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 69.81 [69.66, 69.96]; 19.69 [19.51, 19.86]; 7.88 [7.79, 7.97]; 10.45 [10.36, 10.54]; 3.99 [3.91, 4.07]; 0.00 [0.00, 0.00]; 73.53 [73.39, 73.67]; 1.55 [1.49, 1.60]; 0.00 [0.00, 0

#### Check wrong format

In [None]:
model_one = 'Llama-3.3-70B-Instruct'
prompt_mode = 'direct'
dict_model_result = task.search_result_by_model(prompt_mode=prompt_mode)
result_one = dict_model_result[model_one][0]
print(result_one)
with open(result_one, 'r') as f:
    list_dict_result = json.load(f)

result/9.CHIP-CDEE/Llama-3.3-70B-Instruct/9.CHIP-CDEE-direct-greedy-42.result.json


In [None]:
list_pred_str = [ dict_data['pred'] for dict_data in list_dict_result]
list_pred = task.get_pred(list_dict_result)
list_label = task.get_label(list_dict_result)
print(len(list_pred), len(list_label), sum([1 for pred in list_pred if pred==[-1]]))

384 384 112


In [None]:
for idx, (pred, dict_result) in enumerate(zip(list_pred, list_dict_result)):
    if pred==[-1]:
        print(f"{idx} - Input:\n{dict_result['input']}")
        print("------------------------------------")
        print(f"Output:\n{dict_result['output']}")
        print("------------------------------------")
        print(f"Pred:\n{dict_result['pred']}")
        print("====================================\n")

1 - Input:
本次发病以来患者精神食欲稍差,大小便如常,睡眠较差,体重无明显下降。
------------------------------------
Output:
subject: 精神, description: [稍差], location: [], status: 肯定;
subject: 食欲, description: [稍差], location: [], status: 肯定;
subject: 大小便, description: [正常], location: [], status: 肯定;
subject: 睡眠, description: [较差], location: [], status: 肯定;
subject: 体重下降, description: [明显], location: [], status: 否定;
------------------------------------
Pred:
subject: 精神, description: 稍差, location: [], status: 肯定;
subject: 食欲, description: 稍差, location: [], status: 肯定;
subject: 大小便, description: 如常, location: [], status: 肯定;
subject: 睡眠, description: 较差, location: [], status: 肯定;
subject: 体重, description: 无明显下降, location: [], status: 肯定;

3 - Input:
2020-7-26于某医院某院区就诊，行电子阴道镜：宫颈癌？妇科彩超提示：宫颈处减弱回声，约7.6×4.0cm；宫腔内积液。
------------------------------------
Output:
subject: 癌, description: [], location: [宫颈], status: 不确定;
subject: 回声, description: [减弱], location: [宫颈处], status: 肯定;
subject: 积液, description: [], location: [宫腔内], sta

### 28.MIE

In [None]:
from dataset.extraction import Task_ext_MIE

In [None]:
task = "28.MIE"
task = Task_ext_MIE(args=args, task=task)

Load 28.MIE data: train: 11261, val: 2380, test: 2235


In [None]:
dict_prompt_model_performance = evaluate(task)

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [02:01<00:00,  3.56s/it]
100%|██████████| 34/34 [02:07<00:00,  3.75s/it]
100%|██████████| 34/34 [01:03<00:00,  1.87s/it]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
32.03 [31.99, 32.07]; 16.96 [16.93, 16.99]; 0.00 [0.00, 0.00]; 39.33 [39.29, 39.36]; 22.15 [22.12, 22.18]; 0.00 [0.00, 0.00]; 24.90 [24.85, 24.94]; 8.14 [8.11, 8.17]; 28.90 [28.84, 28.96]; 42.93 [42.89, 42.96]; 18.99 [18.96, 19.02]; 0.00 [0.00, 0.00]; 6.45 [6.42, 6.48]; 0.00 [0.00, 0.00]; 44.49 [44.42, 44.55]; 19.94 [19.91, 19.97]; 1.81 [1.80, 1.82]; 0.09 [0.08, 0.09]; 38.67 [38.63, 38.70]; 23.11 [23.08, 23.14]; 0.00 [0.00, 0.00]; 32.56 [32.54, 32.59]; 16.72 [16.69, 16.74]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 17.93 [17.88, 17.98]; 3.31 [3.29, 3.33]; 0.40 [0.39, 0.40]; 13.61 [13.57, 13.66]; 22.54 [22.50, 22.57]; 6.13 [6.11, 6.15]; 7.57 [7.54, 7.61]; 41.69 [41.65, 41.73]; 22.63 [22.60, 22.67]; 0.00 [0.00, 0.00]; 0.02 [0.02, 0.02]; 0.00 [0.00, 0.00]; 16.44 [16.39, 16.49]; 30.08 [30.04, 30.12]; 18.14 [18.10, 18.17]; 0.18 [0.17, 0.18]; 17.96 [17.93, 17.99]; 1.73 [1.72, 1.74]; 0.00 [0.00, 0.00]; 8.76 [8.73, 8.79]; 1.79 [1.78, 1.81]; 10.72 [10.68, 10.7

### 31.Ex4CDS

In [None]:
from dataset.extraction import Task_ext_Ex4CDS

In [None]:
task = "31.Ex4CDS"
task = Task_ext_Ex4CDS(args=args, task=task)

Load 31.Ex4CDS data: train: 20, val: 0, test: 411


In [None]:
dict_prompt_model_performance = evaluate(task)

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [00:13<00:00,  2.55it/s]
100%|██████████| 34/34 [00:13<00:00,  2.43it/s]
100%|██████████| 34/34 [00:12<00:00,  2.64it/s]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
44.80 [44.71, 44.88]; 33.12 [33.04, 33.19]; 0.00 [0.00, 0.00]; 55.61 [55.54, 55.68]; 42.81 [42.74, 42.88]; 0.00 [0.00, 0.00]; 16.38 [16.29, 16.47]; 11.72 [11.65, 11.79]; 0.75 [0.73, 0.78]; 52.98 [52.91, 53.05]; 43.33 [43.26, 43.40]; 0.00 [0.00, 0.00]; 2.51 [2.47, 2.55]; 0.49 [0.48, 0.50]; 10.74 [10.64, 10.83]; 35.40 [35.32, 35.48]; 14.71 [14.66, 14.76]; 0.00 [0.00, 0.00]; 57.12 [57.06, 57.17]; 45.61 [45.55, 45.68]; 0.00 [0.00, 0.00]; 45.61 [45.54, 45.67]; 34.35 [34.29, 34.42]; 0.00 [0.00, 0.00]; 0.18 [0.17, 0.18]; 0.00 [0.00, 0.00]; 30.72 [30.58, 30.86]; 18.52 [18.44, 18.60]; 7.65 [7.61, 7.69]; 28.96 [28.82, 29.10]; 7.45 [7.40, 7.50]; 4.10 [4.06, 4.13]; 0.00 [0.00, 0.00]; 41.13 [41.05, 41.21]; 26.35 [26.29, 26.42]; 0.00 [0.00, 0.00]; 0.05 [0.05, 0.06]; 0.00 [0.00, 0.00]; 24.83 [24.70, 24.96]; 5.26 [5.19, 5.32]; 4.25 [4.20, 4.30]; 1.72 [1.68, 1.76]; 25.04 [24.96, 25.12]; 13.79 [13.73, 13.84]; 4.65 [4.58, 4.71]; 0.72 [0.70, 0.73]; 0.38 [0.37, 0.39]; 24.95 [24.82, 25.0

#### Check wrong format

In [None]:
model_one = 'gpt-4o'
prompt_mode = 'direct'
dict_model_result = task.search_result_by_model(prompt_mode=prompt_mode)
result_one = dict_model_result[model_one][0]
print(result_one)
with open(result_one, 'r') as f:
    list_dict_result = json.load(f)

result/31.Ex4CDS/gpt-4o/31.Ex4CDS-direct-greedy-42.result.json


In [None]:
list_pred_str = [ dict_data['pred'] for dict_data in list_dict_result]
list_pred = task.get_pred(list_dict_result)
list_label = task.get_label(list_dict_result)
print(len(list_pred), len(list_label), sum([1 for pred in list_pred if pred==[-1]]))

411 411 0


In [None]:
for idx, (pred, dict_result) in enumerate(zip(list_pred, list_dict_result)):
    if pred==[-1]:
        print(f"{idx} - Input:\n{dict_result['input']}")
        print("------------------------------------")
        print(f"Output:\n{dict_result['output']}")
        print("------------------------------------")
        print(f"Pred:\n{dict_result['pred']}")
        print("====================================\n")

In [None]:
list_fine = []
for preds in list_label:
    flag_factuality = False
    flag_progression = False
    for pred in preds:
        factuality = pred["factuality"]
        progression = pred["progression"]
        if factuality!="none":
            flag_factuality = True
        if progression!="none":
            flag_progression = True
    if flag_factuality:
        list_fine.append([flag_factuality, flag_progression])

In [None]:
len(list_fine), len(list_fine)/len(list_label)

(244, 0.5936739659367397)

In [None]:
# count the number of factuality
dict_factuality = {}
for preds in list_label:
    for pred in preds:
        factuality = pred["factuality"]
        if factuality not in dict_factuality:
            dict_factuality[factuality] = 0
        dict_factuality[factuality] += 1
dict_factuality

{'none': 2652,
 'negative': 202,
 'speculated': 91,
 'possible_future': 78,
 'minor': 27,
 'unlikely': 13}

In [None]:
# count the number of factuality
dict_factuality = {}
for preds in list_label:
    for pred in preds:
        factuality = pred["progression"]
        if factuality not in dict_factuality:
            dict_factuality[factuality] = 0
        dict_factuality[factuality] += 1
dict_factuality

{'none': 1401,
 'decrease_risk_factor': 247,
 'decrease_symptom': 353,
 'risk_factor': 68,
 'increase_risk_factor': 555,
 'increase_symptom': 372,
 'symptom': 65,
 'decrease': 2}

In [None]:
# count the number of factuality
dict_factuality = {}
for preds in list_pred:
    for pred in preds:
        factuality = pred["factuality"]
        if factuality not in dict_factuality:
            dict_factuality[factuality] = 0
        dict_factuality[factuality] += 1
dict_factuality

{'minor': 62,
 'none': 315,
 'negative': 258,
 'positive': 1883,
 'speculated': 50,
 'possible_future': 102,
 'unlikely': 28,
 'unklare': 1}

In [None]:
# count the number of factuality
dict_factuality = {}
for preds in list_pred:
    for pred in preds:
        factuality = pred["progression"]
        if factuality not in dict_factuality:
            dict_factuality[factuality] = 0
        dict_factuality[factuality] += 1
dict_factuality

{'none': 1410,
 'increase_risk_factor': 1026,
 'decrease_risk_factor': 225,
 'conclusion': 37,
 'decrease_symptom': 1}

### 43.IMCS-V2-NER

In [None]:
from dataset.extraction import Task_ext_IMCS_V2_NER

In [None]:
task = '43.IMCS-V2-NER'
task = Task_ext_IMCS_V2_NER(args=args, task=task)

Load 43.IMCS-V2-NER data: train: 7130, val: 0, test: 2374


In [None]:
dict_prompt_model_performance = evaluate(task)

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [00:55<00:00,  1.64s/it]
100%|██████████| 34/34 [00:56<00:00,  1.66s/it]
100%|██████████| 34/34 [00:54<00:00,  1.61s/it]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
70.21 [70.18, 70.24]; 58.42 [58.39, 58.46]; 0.00 [0.00, 0.00]; 70.57 [70.54, 70.60]; 62.08 [62.04, 62.12]; 0.00 [0.00, 0.00]; 59.65 [59.61, 59.69]; 30.54 [30.49, 30.59]; 2.65 [2.63, 2.67]; 72.93 [72.90, 72.96]; 41.54 [41.49, 41.60]; 0.00 [0.00, 0.00]; 0.16 [0.16, 0.16]; 0.03 [0.03, 0.03]; 0.73 [0.72, 0.74]; 50.80 [50.76, 50.84]; 36.92 [36.88, 36.96]; 0.00 [0.00, 0.00]; 72.85 [72.82, 72.89]; 65.18 [65.14, 65.22]; 0.00 [0.00, 0.00]; 64.05 [64.02, 64.08]; 56.45 [56.41, 56.48]; 0.00 [0.00, 0.00]; 0.02 [0.02, 0.02]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 22.46 [22.43, 22.49]; 17.88 [17.85, 17.90]; 0.76 [0.74, 0.77]; 34.16 [34.12, 34.20]; 27.79 [27.75, 27.83]; 0.38 [0.37, 0.39]; 64.35 [64.31, 64.39]; 56.24 [56.20, 56.28]; 0.00 [0.00, 0.00]; 2.52 [2.51, 2.54]; 1.57 [1.56, 1.58]; 1.39 [1.37, 1.40]; 43.05 [43.01, 43.10]; 32.03 [31.98, 32.08]; 0.00 [0.00, 0.00]; 23.96 [23.93, 23.99]; 11.51 [11.48, 11.53]; 0.13 [0.12, 0.13]; 46.83 [46.79, 46.88]; 35.19 [35.15, 35.23]; 4.63 [4.6

### 83.CHIP-MDCFNPC

In [None]:
from dataset.extraction import Task_ext_CHIP_MDCFNPC

In [None]:
task = '83.CHIP-MDCFNPC'
task = Task_ext_CHIP_MDCFNPC(args=args, task=task)

Load 83.CHIP-MDCFNPC data: train: 59304, val: 0, test: 11785


In [None]:
dict_prompt_model_performance = evaluate(task)

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [04:22<00:00,  7.71s/it]
100%|██████████| 34/34 [04:27<00:00,  7.85s/it]
100%|██████████| 34/34 [04:10<00:00,  7.37s/it]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
42.52 [42.50, 42.55]; 14.30 [14.28, 14.32]; 0.01 [0.01, 0.01]; 57.40 [57.37, 57.42]; 34.19 [34.16, 34.22]; 0.02 [0.02, 0.02]; 33.57 [33.55, 33.59]; 15.30 [15.28, 15.32]; 19.50 [19.48, 19.53]; 55.11 [55.09, 55.14]; 35.59 [35.56, 35.62]; 0.03 [0.03, 0.03]; 0.71 [0.71, 0.71]; 0.25 [0.25, 0.25]; 1.66 [1.65, 1.66]; 16.33 [16.31, 16.34]; 6.71 [6.70, 6.73]; 0.01 [0.01, 0.01]; 54.15 [54.13, 54.17]; 33.40 [33.38, 33.43]; 0.00 [0.00, 0.00]; 27.58 [27.56, 27.60]; 15.26 [15.24, 15.27]; 0.00 [0.00, 0.00]; 0.02 [0.02, 0.02]; 0.00 [0.00, 0.00]; 2.11 [2.10, 2.12]; 0.02 [0.02, 0.02]; 0.01 [0.00, 0.01]; 4.32 [4.31, 4.33]; 19.18 [19.16, 19.20]; 7.37 [7.35, 7.38]; 0.32 [0.32, 0.33]; 50.57 [50.54, 50.59]; 29.26 [29.24, 29.29]; 8.49 [8.48, 8.51]; 0.38 [0.38, 0.39]; 0.23 [0.22, 0.23]; 2.02 [2.01, 2.03]; 34.44 [34.42, 34.47]; 21.74 [21.71, 21.76]; 0.00 [0.00, 0.00]; 10.18 [10.16, 10.19]; 4.33 [4.33, 4.34]; 9.39 [9.37, 9.41]; 1.31 [1.30, 1.31]; 0.38 [0.38, 0.38]; 3.46 [3.45, 3.47]; 53.77 [5

### 85.IMCS-V2-SR

In [None]:
from dataset.extraction import Task_ext_IMCS_V2_SR

In [None]:
task = "85.IMCS-V2-SR"
task = Task_ext_IMCS_V2_SR(args=args, task=task)

Load 85.IMCS-V2-SR data: train: 2466, val: 0, test: 832


In [None]:
dict_prompt_model_performance = evaluate(task)

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [00:27<00:00,  1.24it/s]
100%|██████████| 34/34 [00:25<00:00,  1.34it/s]
100%|██████████| 34/34 [00:24<00:00,  1.40it/s]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
36.32 [36.28, 36.36]; 29.13 [29.09, 29.17]; 0.00 [0.00, 0.00]; 39.61 [39.57, 39.65]; 32.05 [32.02, 32.09]; 0.00 [0.00, 0.00]; 29.53 [29.48, 29.57]; 23.58 [23.54, 23.62]; 0.00 [0.00, 0.00]; 26.29 [26.24, 26.34]; 20.52 [20.48, 20.57]; 0.00 [0.00, 0.00]; 6.43 [6.40, 6.46]; 3.24 [3.22, 3.26]; 14.69 [14.61, 14.77]; 16.68 [16.64, 16.71]; 12.95 [12.92, 12.97]; 0.13 [0.12, 0.14]; 27.13 [27.06, 27.20]; 21.82 [21.76, 21.88]; 0.00 [0.00, 0.00]; 0.03 [0.03, 0.03]; 0.03 [0.02, 0.03]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 22.65 [22.56, 22.74]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.25 [0.24, 0.26]; 6.71 [6.68, 6.74]; 3.36 [3.34, 3.38]; 7.06 [7.01, 7.12]; 24.37 [24.32, 24.41]; 19.94 [19.90, 19.99]; 0.00 [0.00, 0.00]; 0.49 [0.48, 0.50]; 0.39 [0.38, 0.40]; 1.32 [1.29, 1.34]; 1.82 [1.79, 1.84]; 1.50 [1.48, 1.52]; 0.00 [0.00, 0.00]; 20.23 [20.19, 20.27]; 13.64 [13.61, 13.67]; 0.12 [0.11, 0.13]; 13.82 [13.78, 13.86]; 9.94 [9.91, 9.97]; 0.00 [0.00, 0.00]; 5.12 [5.09,

### 91-1.CAS-label

In [None]:
from dataset.extraction import Task_ext_CAS_label

In [None]:
task = "91-1.CAS.label"
task = Task_ext_CAS_label(args=args, task=task)

Load 91-1.CAS.label data: train: 20, val: 0, test: 696


In [None]:
dict_prompt_model_performance = evaluate(task)

  0%|          | 0/34 [00:00<?, ?it/s]

100%|██████████| 34/34 [00:20<00:00,  1.63it/s]
100%|██████████| 34/34 [00:25<00:00,  1.36it/s]
100%|██████████| 34/34 [00:20<00:00,  1.63it/s]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
88.54 [88.46, 88.61]; 45.22 [45.10, 45.34]; 1.87 [1.84, 1.90]; 90.03 [89.96, 90.11]; 45.23 [45.11, 45.35]; 0.00 [0.00, 0.00]; 64.25 [64.14, 64.36]; 29.06 [28.94, 29.18]; 45.26 [45.14, 45.37]; 61.50 [61.39, 61.61]; 33.26 [33.15, 33.37]; 0.00 [0.00, 0.00]; 59.40 [59.28, 59.52]; 5.47 [5.41, 5.53]; 47.05 [46.93, 47.17]; 22.81 [22.69, 22.93]; 9.81 [9.72, 9.90]; 83.85 [83.76, 83.93]; 84.91 [84.82, 84.99]; 47.26 [47.14, 47.38]; 0.00 [0.00, 0.00]; 53.68 [53.55, 53.80]; 31.50 [31.39, 31.61]; 17.12 [17.03, 17.21]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 50.70 [50.58, 50.82]; 2.16 [2.13, 2.18]; 0.26 [0.25, 0.27]; 3.00 [2.96, 3.04]; 80.31 [80.22, 80.40]; 33.98 [33.86, 34.09]; 13.92 [13.84, 14.00]; 77.77 [77.67, 77.87]; 36.21 [36.10, 36.33]; 6.49 [6.43, 6.55]; 0.14 [0.14, 0.15]; 0.00 [0.00, 0.00]; 2.29 [2.25, 2.32]; 88.80 [88.72, 88.87]; 42.88 [42.77, 43.00]; 0.00 [0.00, 0.00]; 30.02 [29.91, 30.13]; 7.19 [7.14, 7.25]; 12.23 [12.15, 12.30]; 3.92 [3.89, 3.96]; 1.69 [1.67, 1.72]; 4.1

### 96.RuCCoN.NER

In [None]:
from dataset.extraction import Task_ext_RuCCoN_NER

In [None]:
task = "96.RuCCoN.NER"
task = Task_ext_RuCCoN_NER(args=args, task=task)

Load 96.RuCCoN.NER data: train: 20, val: 0, test: 854


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [00:42<00:00,  1.26s/it]
100%|██████████| 34/34 [00:45<00:00,  1.35s/it]
100%|██████████| 34/34 [00:40<00:00,  1.20s/it]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
27.54 [27.50, 27.58]; 25.30 [25.26, 25.34]; 0.00 [0.00, 0.00]; 32.20 [32.16, 32.24]; 29.23 [29.19, 29.27]; 0.00 [0.00, 0.00]; 27.43 [27.39, 27.46]; 23.88 [23.84, 23.91]; 0.48 [0.46, 0.49]; 32.68 [32.63, 32.72]; 30.08 [30.04, 30.12]; 0.00 [0.00, 0.00]; 0.25 [0.25, 0.26]; 0.04 [0.04, 0.04]; 9.90 [9.84, 9.97]; 24.83 [24.79, 24.87]; 15.44 [15.40, 15.48]; 0.00 [0.00, 0.00]; 32.89 [32.85, 32.93]; 30.63 [30.59, 30.67]; 0.00 [0.00, 0.00]; 30.18 [30.14, 30.22]; 24.79 [24.75, 24.83]; 0.00 [0.00, 0.00]; 0.15 [0.14, 0.15]; 0.00 [0.00, 0.00]; 44.46 [44.36, 44.56]; 7.24 [7.20, 7.27]; 5.98 [5.95, 6.01]; 5.53 [5.48, 5.58]; 3.85 [3.82, 3.87]; 2.44 [2.42, 2.45]; 5.63 [5.58, 5.68]; 18.21 [18.16, 18.25]; 16.46 [16.41, 16.50]; 0.12 [0.11, 0.12]; 0.79 [0.77, 0.81]; 0.42 [0.41, 0.43]; 38.45 [38.35, 38.56]; 28.02 [27.98, 28.07]; 25.51 [25.47, 25.56]; 0.00 [0.00, 0.00]; 12.10 [12.06, 12.14]; 9.50 [9.47, 9.54]; 6.34 [6.28, 6.39]; 20.88 [20.84, 20.93]; 16.13 [16.09, 16.16]; 3.43 [3.40, 3.47];

### 96.RuCCoN.NER_Nor

In [None]:
from dataset.extraction import Task_ext_RuCCoN_NER_Nor

In [None]:
task = "96.RuCCoN.NER_Nor"
task = Task_ext_RuCCoN_NER_Nor(args=args, task=task)

Load 96.RuCCoN.NER_Nor data: train: 20, val: 0, test: 854


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [09:34<00:00, 16.90s/it]  
100%|██████████| 34/34 [04:34<00:00,  8.06s/it]
100%|██████████| 34/34 [00:42<00:00,  1.25s/it]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
25.39 [25.34, 25.44]; 0.39 [0.38, 0.39]; 0.00 [0.00, 0.00]; 28.34 [28.29, 28.38]; 9.72 [9.69, 9.74]; 0.00 [0.00, 0.00]; 28.98 [28.94, 29.02]; 0.72 [0.71, 0.73]; 0.83 [0.81, 0.85]; 33.09 [33.04, 33.13]; 12.49 [12.47, 12.51]; 0.00 [0.00, 0.00]; 0.33 [0.32, 0.34]; 0.01 [0.01, 0.01]; 41.57 [41.47, 41.68]; 18.72 [18.68, 18.76]; 0.00 [0.00, 0.00]; 0.22 [0.21, 0.23]; 33.37 [33.33, 33.41]; 13.00 [12.98, 13.03]; 0.00 [0.00, 0.00]; 10.16 [10.12, 10.20]; 0.00 [0.00, 0.00]; 1.99 [1.96, 2.02]; 0.03 [0.03, 0.04]; 0.00 [0.00, 0.00]; 39.12 [39.01, 39.22]; 3.26 [3.24, 3.28]; 0.25 [0.24, 0.25]; 9.63 [9.57, 9.69]; 2.48 [2.46, 2.50]; 0.49 [0.48, 0.50]; 14.65 [14.57, 14.72]; 11.62 [11.58, 11.67]; 0.43 [0.43, 0.44]; 0.35 [0.34, 0.36]; 0.70 [0.68, 0.71]; 0.00 [0.00, 0.00]; 20.56 [20.48, 20.65]; 17.03 [16.97, 17.09]; 0.74 [0.73, 0.75]; 0.00 [0.00, 0.00]; 12.35 [12.31, 12.39]; 3.77 [3.75, 3.79]; 13.90 [13.82, 13.97]; 18.54 [18.49, 18.59]; 0.00 [0.00, 0.00]; 5.63 [5.58, 5.67]; 30.71 [30.66, 

### 98.BRONCO150.NER_Nor

In [None]:
from dataset.extraction import Task_ext_BRONCO150_NER_Nor

In [None]:
task = "98.BRONCO150.NER_Nor"
task = Task_ext_BRONCO150_NER_Nor(args=args, task=task)

Load 98.BRONCO150.NER_Nor data: train: 20, val: 0, test: 880


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [02:51<00:00,  5.04s/it]
100%|██████████| 34/34 [00:50<00:00,  1.49s/it]
100%|██████████| 34/34 [01:48<00:00,  3.18s/it]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
43.88 [43.83, 43.92]; 0.04 [0.04, 0.04]; 0.00 [0.00, 0.00]; 49.80 [49.76, 49.84]; 0.07 [0.07, 0.07]; 0.00 [0.00, 0.00]; 36.08 [36.02, 36.14]; 0.33 [0.33, 0.34]; 5.00 [4.95, 5.05]; 58.29 [58.25, 58.32]; 0.14 [0.14, 0.15]; 0.00 [0.00, 0.00]; 2.87 [2.84, 2.90]; 0.38 [0.38, 0.39]; 59.81 [59.71, 59.92]; 40.41 [40.36, 40.45]; 0.54 [0.53, 0.55]; 0.00 [0.00, 0.00]; 58.46 [58.42, 58.49]; 0.01 [0.01, 0.01]; 0.00 [0.00, 0.00]; 41.66 [41.62, 41.70]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.85 [0.84, 0.86]; 0.00 [0.00, 0.00]; 40.84 [40.74, 40.94]; 5.92 [5.89, 5.95]; 0.72 [0.71, 0.73]; 6.27 [6.22, 6.32]; 9.22 [9.18, 9.25]; 0.82 [0.82, 0.83]; 3.89 [3.85, 3.94]; 27.56 [27.50, 27.61]; 4.21 [4.19, 4.23]; 0.12 [0.11, 0.12]; 1.04 [1.03, 1.06]; 0.02 [0.02, 0.02]; 58.28 [58.18, 58.38]; 14.38 [14.32, 14.45]; 0.41 [0.40, 0.42]; 0.22 [0.21, 0.23]; 24.80 [24.75, 24.85]; 3.52 [3.50, 3.54]; 0.23 [0.22, 0.24]; 12.03 [11.99, 12.08]; 0.00 [0.00, 0.00]; 5.14 [5.10, 5.19]; 46.80 [46.76, 46.85]; 0.00

### 98.BRONCO150.NER_status

In [None]:
from dataset.extraction import Task_ext_BRONCO150_NER_status

In [None]:
task = "98.BRONCO150.NER_status"
task = Task_ext_BRONCO150_NER_status(args=args, task=task)

Load 98.BRONCO150.NER_status data: train: 20, val: 0, test: 880


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [03:20<00:00,  5.90s/it]
100%|██████████| 34/34 [00:43<00:00,  1.27s/it]
100%|██████████| 34/34 [01:07<00:00,  1.98s/it]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
45.87 [45.83, 45.92]; 38.31 [38.28, 38.35]; 0.11 [0.11, 0.12]; 54.24 [54.20, 54.28]; 45.38 [45.34, 45.41]; 0.00 [0.00, 0.00]; 36.98 [36.93, 37.04]; 13.05 [13.01, 13.09]; 0.12 [0.11, 0.12]; 56.10 [56.06, 56.14]; 48.66 [48.62, 48.70]; 0.00 [0.00, 0.00]; 2.41 [2.39, 2.43]; 0.16 [0.15, 0.16]; 66.68 [66.58, 66.78]; 42.47 [42.43, 42.51]; 14.55 [14.51, 14.58]; 0.00 [0.00, 0.00]; 57.28 [57.24, 57.32]; 50.10 [50.06, 50.14]; 0.00 [0.00, 0.00]; 42.20 [42.16, 42.24]; 28.64 [28.59, 28.68]; 0.00 [0.00, 0.00]; 0.62 [0.61, 0.62]; 0.00 [0.00, 0.00]; 57.01 [56.91, 57.11]; 13.66 [13.61, 13.70]; 9.37 [9.34, 9.41]; 7.61 [7.56, 7.67]; 9.88 [9.85, 9.91]; 3.37 [3.35, 3.38]; 7.40 [7.34, 7.45]; 35.99 [35.94, 36.04]; 28.16 [28.12, 28.21]; 2.18 [2.15, 2.22]; 2.97 [2.95, 3.00]; 1.96 [1.94, 1.98]; 28.45 [28.35, 28.54]; 41.89 [41.84, 41.94]; 35.29 [35.24, 35.34]; 0.00 [0.00, 0.00]; 23.03 [22.98, 23.07]; 10.10 [10.08, 10.13]; 0.00 [0.00, 0.00]; 6.12 [6.08, 6.15]; 0.72 [0.71, 0.73]; 7.58 [7.53, 7.6

### 99.CARDIO:DE

In [None]:
from dataset.extraction import Task_ext_CARDIO_DE

In [None]:
task = "99.CARDIO:DE"
task = Task_ext_CARDIO_DE(args=args, task=task)

Load 99.CARDIO:DE data: train: 19, val: 0, test: 370


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [00:31<00:00,  1.08it/s]
100%|██████████| 34/34 [00:33<00:00,  1.02it/s]
100%|██████████| 34/34 [00:27<00:00,  1.25it/s]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
48.38 [48.30, 48.47]; 26.77 [26.72, 26.83]; 0.00 [0.00, 0.00]; 43.54 [43.47, 43.61]; 21.62 [21.57, 21.67]; 0.00 [0.00, 0.00]; 56.80 [56.73, 56.86]; 33.14 [33.09, 33.20]; 0.00 [0.00, 0.00]; 57.28 [57.23, 57.32]; 41.96 [41.92, 42.00]; 0.00 [0.00, 0.00]; 0.02 [0.02, 0.02]; 0.00 [0.00, 0.00]; 24.28 [24.13, 24.42]; 40.36 [40.30, 40.41]; 9.31 [9.28, 9.35]; 0.00 [0.00, 0.00]; 52.63 [52.56, 52.70]; 39.54 [39.48, 39.60]; 0.00 [0.00, 0.00]; 52.94 [52.88, 52.99]; 33.53 [33.48, 33.59]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.78 [0.75, 0.81]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 3.79 [3.72, 3.85]; 0.88 [0.87, 0.90]; 0.02 [0.02, 0.02]; 3.56 [3.50, 3.62]; 23.08 [23.00, 23.16]; 8.51 [8.47, 8.56]; 1.61 [1.57, 1.65]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 47.37 [47.21, 47.54]; 0.22 [0.21, 0.23]; 0.13 [0.12, 0.13]; 0.00 [0.00, 0.00]; 9.19 [9.12, 9.25]; 4.83 [4.79, 4.87]; 0.00 [0.00, 0.00]; 35.74 [35.67, 35.81]; 20.12 [20.07, 20.17]; 0.00 [0.00, 0.00]; 39.53 [39.44, 

### 100.GraSSCo_PHI

In [12]:
from dataset.extraction import Task_ext_GraSSCo_PHI

In [13]:
task = "100.GraSSCo_PHI"
task = Task_ext_GraSSCo_PHI(args=args, task=task)

Load 100.GraSSCo_PHI data: train: 20, val: 0, test: 329


In [14]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [00:10<00:00,  3.19it/s]
100%|██████████| 34/34 [00:10<00:00,  3.26it/s]
100%|██████████| 34/34 [00:09<00:00,  3.55it/s]


In [15]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
52.18 [52.04, 52.33]; 46.52 [46.38, 46.66]; 0.00 [0.00, 0.00]; 58.16 [58.01, 58.30]; 55.02 [54.88, 55.16]; 0.00 [0.00, 0.00]; 23.12 [23.00, 23.24]; 17.24 [17.16, 17.33]; 5.10 [5.02, 5.17]; 56.51 [56.38, 56.63]; 54.57 [54.45, 54.69]; 0.00 [0.00, 0.00]; 0.05 [0.05, 0.06]; 0.00 [0.00, 0.00]; 2.76 [2.71, 2.82]; 18.04 [17.97, 18.12]; 12.91 [12.86, 12.97]; 0.00 [0.00, 0.00]; 45.20 [45.08, 45.32]; 43.48 [43.36, 43.59]; 0.00 [0.00, 0.00]; 39.38 [39.28, 39.48]; 31.54 [31.44, 31.63]; 0.00 [0.00, 0.00]; 0.24 [0.23, 0.25]; 0.00 [0.00, 0.00]; 70.81 [70.66, 70.97]; 8.25 [8.18, 8.32]; 7.20 [7.14, 7.26]; 21.66 [21.52, 21.80]; 12.14 [12.08, 12.21]; 8.32 [8.27, 8.37]; 0.00 [0.00, 0.00]; 34.84 [34.73, 34.96]; 32.86 [32.75, 32.97]; 0.30 [0.28, 0.32]; 0.41 [0.40, 0.42]; 0.00 [0.00, 0.00]; 41.88 [41.71, 42.05]; 22.54 [22.46, 22.63]; 20.96 [20.88, 21.04]; 1.24 [1.21, 1.28]; 11.40 [11.34, 11.47]; 8.96 [8.90, 9.01]; 5.77 [5.69, 5.85]; 9.09 [9.03, 9.14]; 6.71 [6.67, 6.76]; 12.48 [12.36, 12.5

#### Check wrong format

In [29]:
model_one = 'gemma-2-9b-it'
prompt_mode = 'cot'
dict_model_result = task.search_result_by_model(prompt_mode=prompt_mode)
result_one = dict_model_result[model_one][0]
print(result_one)
with open(result_one, 'r') as f:
    list_dict_result = json.load(f)

result/100.GraSSCo_PHI/gemma-2-9b-it/100.GraSSCo_PHI-cot-greedy-42.result.json


In [30]:
task.list_pattern

['entity: (.+), type: (.+);',
 'entity:(.+), type:(.+);',
 '.*:\\s*(.+),\\s*.*:\\s*([^;]+)',
 '(.+):([^;]+)']

In [31]:
list_list_dict_entity = []
for dict_data in list_dict_result:
    list_dict_entity = []
    list_line = process_text_clean(dict_data["output"]).split(task.sep_event)
    list_line = [line.strip() for line in list_line if line.strip() != ""]
    for idx_line, line_one in enumerate(list_line):
        result = regex.search(task.list_pattern[0], line_one, regex.IGNORECASE)
        # If the entity is found, extract the entity and the features
        # The group(1)-group(num_subject) is the subject, the group(num_subject+1)-group(num_subject+num_feature) is the features
        dict_entity = {}
        for idx, field_subject in enumerate(task.list_field_subject):
            dict_entity[field_subject] = result.group(idx + 1).strip()
        # Extract the features
        for idx, field_feature in enumerate(task.list_field_feature):
            dict_entity[field_feature] = result.group(
                idx + task.num_subject + 1
            ).strip()
        list_dict_entity.append(dict_entity)
    list_list_dict_entity.append(list_dict_entity)

AttributeError: 'NoneType' object has no attribute 'group'

In [33]:
line_one

'entity: oktober'

In [32]:
for idx_data, dict_data in enumerate(task.dataset_test):
    list_line = dict_data["output"].split(task.sep_event)
    list_line = [line.strip() for line in list_line if line.strip() != ""]
    for idx, line in enumerate(list_line):
        if ";" not in line:
            print(f"{dict_data['id']}-{idx} - {line}")
            print("------------------------------------")

In [22]:
list_line[0]

'entity: oktober'

In [None]:
list_line

['entity: oktober', '2012, type: date;']

In [None]:
for idx_result, dict_result in enumerate(list_dict_result):
    # 计算\n的数量
    num_newline = dict_result["output"].count("\n")+1
    # 计算;的数量
    num_semicolon = dict_result["output"].count(";")
    if num_newline!=num_semicolon:
        print(f"{idx_result} - Output: {dict_result['output']}")
        print("====================================\n")
        break

10 - Output: entity: Oktober
2012, type: DATE;



In [None]:
dict_result["output"]

'entity: Oktober\n2012, type: DATE;'

In [None]:
# list_label = task.get_label(list_dict_result, prompt_mode=prompt_mode)
list_pred = task.get_pred(list_dict_result, prompt_mode=prompt_mode)
for idx, (pred, dict_result) in enumerate(zip(list_pred, list_dict_result)):
    if pred==-1:
        print(f"{idx} - Input: {dict_result['input']}")
        print("------------------------------------")
        print("Output:", dict_result['pred'])
        print("====================================\n")

In [None]:
list_pred

[[{'entity': 'dr. blasenstein', 'type': 'name_doctor'},
  {'entity': 'chefarzt', 'type': 'profession'}],
 [{'entity': 'manuela beuerle', 'type': 'name_patient'},
  {'entity': '19.04.2020', 'type': 'date'},
  {'entity': '09.02.22', 'type': 'date'}],
 [{'entity': 'pd dr. elisabeth bamberger', 'type': 'name_doctor'},
  {'entity': 'l 4 li.', 'type': 'id'},
  {'entity': 'l 4/5 re.', 'type': 'id'},
  {'entity': '30°', 'type': 'age'},
  {'entity': '30°', 'type': 'age'}],
 [{'entity': 'm. boeck', 'type': 'name_patient'},
  {'entity': '1.2.2000', 'type': 'date'},
  {'entity': '3.5.2028', 'type': 'date'},
  {'entity': '8.5.2028', 'type': 'date'},
  {'entity': '28', 'type': 'age'}],
 [{'entity': 'tvt', 'type': '...'},
  {'entity': 'marcumar', 'type': '...'},
  {'entity': 'medroxyprogesteron', 'type': '...'},
  {'entity': '3', 'type': 'age'},
  {'entity': '2033', 'type': 'id'},
  {'entity': 'dm typ 1', 'type': '...'}],
 [{'entity': 'floristin', 'type': 'profession'},
  {'entity': 'partnerin', 'typ

In [None]:
list_pred, _ = task.get_pred_none(list_pred, list_label)

### 101.IFMIR.NER

In [None]:
from dataset.extraction import Task_ext_IFMIR_NER

In [None]:
task = "101.IFMIR.NER"
task = Task_ext_IFMIR_NER(args=args, task=task)

Load 101.IFMIR.NER data: train: 46721, val: 5834, test: 5748


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [03:15<00:00,  5.75s/it]
100%|██████████| 34/34 [03:48<00:00,  6.73s/it]
100%|██████████| 34/34 [02:42<00:00,  4.78s/it]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
37.82 [37.80, 37.84]; 31.09 [31.07, 31.11]; 0.07 [0.07, 0.07]; 41.15 [41.12, 41.17]; 35.00 [34.98, 35.02]; 0.00 [0.00, 0.00]; 29.66 [29.64, 29.68]; 20.46 [20.44, 20.48]; 4.94 [4.93, 4.96]; 43.14 [43.12, 43.16]; 37.08 [37.06, 37.10]; 0.00 [0.00, 0.00]; 0.20 [0.20, 0.20]; 0.05 [0.05, 0.05]; 8.32 [8.30, 8.34]; 27.94 [27.91, 27.96]; 13.90 [13.88, 13.91]; 0.00 [0.00, 0.00]; 43.21 [43.19, 43.23]; 36.05 [36.03, 36.07]; 0.00 [0.00, 0.00]; 34.46 [34.44, 34.48]; 21.28 [21.27, 21.30]; 0.00 [0.00, 0.00]; 0.02 [0.02, 0.02]; 0.00 [0.00, 0.00]; 61.30 [61.26, 61.34]; 17.47 [17.45, 17.48]; 11.59 [11.58, 11.61]; 13.95 [13.93, 13.98]; 12.20 [12.18, 12.22]; 6.41 [6.40, 6.42]; 0.28 [0.27, 0.28]; 32.90 [32.87, 32.92]; 27.30 [27.27, 27.32]; 0.00 [0.00, 0.00]; 0.79 [0.78, 0.80]; 0.44 [0.44, 0.45]; 72.22 [72.18, 72.26]; 24.14 [24.11, 24.16]; 18.85 [18.83, 18.87]; 0.02 [0.02, 0.02]; 18.44 [18.43, 18.46]; 11.01 [11.00, 11.03]; 5.78 [5.76, 5.80]; 10.73 [10.72, 10.75]; 6.29 [6.28, 6.30]; 13.91 

### 101.IFMIR.NER_factuality

In [None]:
from dataset.extraction import Task_ext_IFMIR_NER_factuality

In [None]:
task = "101.IFMIR.NER_factuality"
task = Task_ext_IFMIR_NER_factuality(args=args, task=task)

Load 101.IFMIR.NER_factuality data: train: 46721, val: 5834, test: 5748


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [28:31<00:00, 50.34s/it]  
100%|██████████| 34/34 [04:15<00:00,  7.53s/it]
100%|██████████| 34/34 [03:30<00:00,  6.20s/it]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
38.48 [38.46, 38.50]; 15.55 [15.53, 15.56]; 0.02 [0.02, 0.02]; 41.89 [41.87, 41.91]; 22.26 [22.24, 22.27]; 0.00 [0.00, 0.00]; 28.28 [28.25, 28.31]; 2.02 [2.01, 2.03]; 0.70 [0.69, 0.71]; 39.69 [39.67, 39.71]; 17.98 [17.96, 17.99]; 0.02 [0.02, 0.02]; 1.73 [1.73, 1.74]; 0.07 [0.07, 0.07]; 79.28 [79.25, 79.31]; 27.69 [27.66, 27.71]; 2.53 [2.52, 2.54]; 0.62 [0.61, 0.62]; 40.90 [40.88, 40.92]; 17.65 [17.64, 17.67]; 0.00 [0.00, 0.00]; 27.86 [27.85, 27.88]; 3.49 [3.49, 3.50]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 87.81 [87.78, 87.83]; 18.76 [18.74, 18.78]; 7.23 [7.22, 7.24]; 24.59 [24.56, 24.63]; 12.67 [12.65, 12.69]; 3.04 [3.03, 3.05]; 1.72 [1.70, 1.73]; 31.22 [31.19, 31.24]; 13.28 [13.27, 13.30]; 0.02 [0.02, 0.02]; 0.32 [0.32, 0.33]; 0.11 [0.11, 0.11]; 50.97 [50.93, 51.01]; 27.99 [27.97, 28.02]; 13.96 [13.94, 13.97]; 0.02 [0.02, 0.02]; 22.20 [22.18, 22.22]; 3.84 [3.84, 3.85]; 6.52 [6.50, 6.54]; 1.01 [1.00, 1.01]; 0.17 [0.17, 0.17]; 78.88 [78.84, 78.91];

### 102.iCorpus

In [None]:
from dataset.extraction import Task_ext_iCorpus

In [None]:
task = "102.iCorpus"
task = Task_ext_iCorpus(args=args, task=task)

Load 102.iCorpus data: train: 1749, val: 219, test: 219


In [None]:
dict_prompt_model_performance = evaluate(task)

100%|██████████| 34/34 [00:10<00:00,  3.33it/s]
100%|██████████| 34/34 [00:10<00:00,  3.16it/s]
100%|██████████| 34/34 [00:11<00:00,  2.92it/s]


In [None]:
dict_mode_performance = print_performance(dict_prompt_model_performance)

Prompt Mode: direct
27.83 [27.74, 27.92]; 23.12 [23.04, 23.21]; 0.91 [0.87, 0.95]; 40.47 [40.35, 40.58]; 34.82 [34.71, 34.93]; 0.00 [0.00, 0.00]; 23.06 [22.93, 23.19]; 15.96 [15.86, 16.07]; 0.00 [0.00, 0.00]; 37.47 [37.35, 37.60]; 33.34 [33.22, 33.47]; 0.00 [0.00, 0.00]; 0.34 [0.33, 0.35]; 0.00 [0.00, 0.00]; 5.07 [4.98, 5.16]; 24.37 [24.30, 24.44]; 9.27 [9.22, 9.31]; 0.00 [0.00, 0.00]; 38.95 [38.82, 39.08]; 34.69 [34.56, 34.82]; 0.00 [0.00, 0.00]; 34.39 [34.29, 34.49]; 29.03 [28.94, 29.13]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 0.00 [0.00, 0.00]; 82.21 [82.05, 82.36]; 1.30 [1.28, 1.32]; 0.44 [0.43, 0.45]; 22.39 [22.22, 22.56]; 13.73 [13.63, 13.83]; 6.19 [6.14, 6.24]; 0.00 [0.00, 0.00]; 27.54 [27.43, 27.66]; 20.19 [20.09, 20.29]; 0.00 [0.00, 0.00]; 0.07 [0.07, 0.08]; 0.00 [0.00, 0.00]; 47.06 [46.85, 47.27]; 10.16 [10.06, 10.26]; 8.55 [8.46, 8.64]; 3.21 [3.13, 3.28]; 8.57 [8.51, 8.64]; 4.15 [4.12, 4.19]; 15.16 [15.00, 15.31]; 4.64 [4.58, 4.70]; 2.60 [2.57, 2.64]; 25.77 [25.58, 25.95]; 25

## End

In [None]:
print('Done.')

Done.
