In [1]:
import os
import re
import pandas as pd
import numpy as np

In [2]:
def postprocess(df, col):
    """Clean every raw value of ``df[col]`` and return the results as a list.

    Each value is converted to ``str`` and then, in order:
      1. newline characters are removed;
      2. everything from the first long run of spaces onwards is dropped;
      3. everything from the first ``'Note'`` onwards is dropped;
      4. the value is replaced by ``''`` when characters 1-3 are ``\"\"\"``.

    Args:
        df: mapping-like object indexable by ``col`` (e.g. a DataFrame).
        col: key of the column holding the raw text.

    Returns:
        list[str]: one cleaned string per value, in the original order.
    """
    def _clean(raw):
        text = re.sub('\n', '', str(raw))
        text = text.split('                ')[0]
        text = text.split('Note')[0]
        if text[1:4] == '"""':
            return ''
        return text

    return [_clean(raw) for raw in df[col]]

def extract_between_markers(input_string):
    """Return every substring enclosed by an ``@@`` ... ``##`` marker pair.

    Matching is non-greedy and does not cross line breaks; the markers
    themselves are not part of the returned strings.
    """
    return [hit.group(1) for hit in re.finditer(r'@@(.*?)##', input_string)]


def computeTermEvalMetrics(extracted_terms, gold_df):
    """Compute set-based precision, recall and F-score (in percent).

    Both inputs are lower-cased, empty strings are dropped and duplicates are
    collapsed before comparison, so each distinct term counts once.

    Args:
        extracted_terms: iterable of predicted term strings.
        gold_df: iterable of gold-standard term strings (the callers in this
            file pass a plain list).

    Returns:
        tuple[float, float, float]: ``(precision, recall, fscore)``, each
        rounded to one decimal. A metric whose denominator would be zero
        (no predictions, empty gold set, or no overlap) is reported as 0.0
        instead of raising ``ZeroDivisionError``.

    Side effects:
        Prints ``"<precision> & <recall> & <fscore>"``.
    """
    # Lower-case because the gold standard is lower case.
    extracted_set = {term.lower() for term in extracted_terms} - {''}
    gold_set = {term.lower() for term in gold_df} - {''}
    true_pos = extracted_set & gold_set

    recall = round(len(true_pos) * 100 / len(gold_set), 1) if gold_set else 0.0
    precision = round(len(true_pos) * 100 / len(extracted_set), 1) if extracted_set else 0.0
    # The F-score is intentionally derived from the already-rounded values so
    # that reported numbers stay identical to earlier runs.
    if precision + recall > 0:
        fscore = round(2 * (precision * recall) / (precision + recall), 1)
    else:
        fscore = 0.0

    print(str(precision) + ' & ' + str(recall) + ' & ' + str(fscore))
    return precision, recall, fscore

def eval_template3(df, col, gold_path):
    """Score marker-style output (``@@term##``) against a gold term file.

    Args:
        df: frame holding the raw model output.
        col: name of the output column in ``df``.
        gold_path: path to a header-less, tab-separated file whose first
            column contains the gold terms.

    Returns:
        tuple[float, float, float]: ``(precision, recall, fscore)`` as
        produced by ``computeTermEvalMetrics`` (which also prints them).
        Returned so the function is usable the same way as
        ``eval_template1``.
    """
    predictions = []
    for cleaned in postprocess(df, col):
        predictions.extend(extract_between_markers(cleaned))

    gold_list = pd.read_csv(gold_path, header=None, delimiter='\t')[0].tolist()
    return computeTermEvalMetrics(predictions, gold_list)

def extract_entities(words, labels):
    """Collect the word spans described by a sequence of B/I/O-style tags.

    ``words`` and ``labels`` are walked in parallel (the longer one is
    truncated). A tag starting with ``'B'`` opens a new span; a tag starting
    with ``'I'`` whose suffix (``tag[2:]``) equals the open span's suffix
    appends the word to it; any other tag closes the open span.

    Returns:
        list[str]: the spans, words joined by single spaces and stripped.
    """
    found = []
    span_text = ''
    span_type = ''

    for token, tag in zip(words, labels):
        if tag.startswith('B'):
            # A new span begins: emit whatever was open before it.
            if span_text:
                found.append(span_text.strip())
            span_text, span_type = token, tag[2:]
            continue

        if tag.startswith('I') and span_type == tag[2:]:
            span_text = span_text + ' ' + token
            continue

        # Any other tag terminates the span that is currently open.
        if span_text:
            found.append(span_text.strip())
            span_text, span_type = '', ''

    # Emit a span that runs up to the end of the sequence.
    if span_text:
        found.append(span_text.strip())

    return found


def filter_text(text):
    """Return ``text`` with every character other than B, I, O or space removed."""
    keep = frozenset(('B', 'I', 'O', ' '))
    kept = []
    for ch in text:
        if ch in keep:
            kept.append(ch)
    return ''.join(kept)

def eval_template1(df, col, gold_list):
    """Score IOB-tagged model output (template 1) against a gold term list.

    For every row the tag string is cut to ``2 * len(words)`` characters,
    reduced to the characters ``B``, ``I``, ``O`` and space, split on spaces
    and aligned with the row's words to recover the predicted terms.

    Args:
        df: frame with a ``words`` column (string form of a word list) and
            the output column ``col``.
        col: name of the column holding the raw model output.
        gold_list: iterable of gold-standard terms.

    Returns:
        tuple[float, float, float]: ``(precision, recall, fscore)`` from
        ``computeTermEvalMetrics`` (which also prints them).
    """
    # Remove the quote characters around the individual tags.
    cleaned = [re.sub("'", "", x).strip() for x in postprocess(df, col)]

    predictions = []
    for raw_words, tag_text in zip(df.words, cleaned):
        # SECURITY NOTE: eval() executes whatever expression the CSV cell
        # contains. Only run this on trusted result files; if the column is
        # always a plain list literal, ast.literal_eval would be the safe
        # replacement. Parsed once per row and reused below.
        words = eval(raw_words)
        labels = tag_text[:len(words) * 2]

        # NOTE(review): the original condition also tested
        # labels.startswith("'I"), but every apostrophe has already been
        # stripped above, so that test could never be true. If outputs that
        # start with an I tag were meant to fall back to all-O, that needs a
        # deliberate change - it would alter the reported scores.
        if labels == '':
            labels = 'O ' * len(words)

        tags = filter_text(labels).split(' ')
        predictions.extend(extract_entities(words, tags))

    return computeTermEvalMetrics(predictions, gold_list)

## LLAMA 7B

In [3]:
def evaluate_prompts(df, format, lang, ver, gold_list):
    """Evaluate one prompt template's output column against a gold term list.

    The template is selected by the first of ``"1"``, ``"2"``, ``"3"`` found
    as a substring of ``format``:
      * ``"1"`` - IOB tag output, scored via ``eval_template1``;
      * ``"2"`` - a list literal following ``'Output: '``;
      * ``"3"`` - terms wrapped in ``@@ ... ##`` following ``'Output: '``.

    Args:
        df: frame holding the model output (and ``words`` for template 1).
        format: name of the output column in ``df``; also selects the template.
        lang: language code; not used by the computation.
        ver: gold-standard variant; not used by the computation.
        gold_list: iterable of gold-standard terms.

    Returns:
        tuple[float, float, float]: ``(precision, recall, fscore)``. The
        values were previously computed and then discarded.

    Raises:
        Exception: if ``format`` contains none of ``"1"``, ``"2"``, ``"3"``.
    """
    if "1" in format:
        print("#"*50)
        print("#1. Extracted IOB format")
        precision, recall, fscore = eval_template1(df, format, gold_list)

    elif "2" in format:
        print("#"*50)
        print("#2. Extracted candidate term list")
        candidate_terms = []
        for x in df[format]:
            text = str(x)
            # The three branches below hard-code outputs that are matched
            # verbatim and replaced by a fixed term list instead of being
            # parsed by the generic branch.
            ## 13B ANN
            if "[global, proteomics, pathway, analysis, pressure-overload-induced, heart, failure, mitochondrial-targeted, peptides]" in text:
                candidate_terms.extend(["global", "proteomics", "pathway", "analysis", "pressure-overload-induced", "heart", "failure", "mitochondrial-targeted", "peptides"])
            ## 13B NES
            elif '[ "diuretics", "comorbidities", "6-MWT", "NYHA class III", "age > 65 years" ]' in text:
                candidate_terms.extend([ "diuretics", "comorbidities", "6-MWT", "NYHA class III", "age > 65 years" ])
            elif '["patient-level regression", "30-day risk-adjusted mortality", "readmissions", "costs", "volume groups", "patient", "physician", "hospital characteristics"]' in text:
                candidate_terms.extend(["patient-level regression", "30-day risk-adjusted mortality", "readmissions", "costs", "volume groups", "patient", "physician", "hospital characteristics"])
            else:
                # Take what follows the first 'Output: ' up to the next
                # '[INST' marker; the split is computed once and reused.
                parts = text.split('Output: ')
                payload = parts[1].split('[INST')[0].strip() if len(parts) > 1 else ''
                # SECURITY NOTE: eval() runs model-generated text as Python.
                # Only use on trusted, already-inspected result files;
                # ast.literal_eval is the safe choice if every payload is a
                # plain list literal.
                term = eval(payload) if len(payload) > 1 else []
                candidate_terms.extend(term)
        precision, recall, fscore = computeTermEvalMetrics(candidate_terms, gold_list)

    elif "3" in format:
        print("#"*50)
        print("#3. Masking terms")
        ### RAW
        candidate_terms = []
        for x in df[format]:
            parts = str(x).split('Output: ')
            sent = parts[1].split('[INST]')[0].strip() if len(parts) > 1 else ""
            candidate_terms.extend(extract_between_markers(sent))

        ### PROCESSED
        # Keep only the text after the last '@@' in case of nested openers.
        candidate_terms_list = [term.split('@@')[-1] for term in candidate_terms]

        precision, recall, fscore = computeTermEvalMetrics(candidate_terms_list, gold_list)

    else:
        raise Exception("Format not supported")

    return precision, recall, fscore

In [4]:
# Llama-2 7B result files (one frame per language / gold variant / template)
# and the gold-standard term lists used to score them.
csv_directory = '../results/llama2_results/llama_en_htfl_7b'


def _read_7b_result(file_name):
    """Load one result CSV located in ``csv_directory``."""
    return pd.read_csv(os.path.join(csv_directory, file_name))


def _read_gold_terms(tsv_path):
    """Return the first column of a header-less, tab-separated file as a list."""
    return pd.read_csv(tsv_path, header=None, delimiter='\t')[0].tolist()


# English
llama_en_htfl_7b_ann_tem1 = _read_7b_result('llama_en_htfl_7b_ann_tem1.csv')
llama_en_htfl_7b_ann_tem2 = _read_7b_result('llama_en_htfl_7b_ann_tem2.csv')
llama_en_htfl_7b_ann_tem3 = _read_7b_result('llama_en_htfl_7b_ann_tem3.csv')

llama_en_htfl_7b_nes_tem1 = _read_7b_result('llama_en_htfl_7b_nes_tem1.csv')
llama_en_htfl_7b_nes_tem2 = _read_7b_result('llama_en_htfl_7b_nes_tem2.csv')
llama_en_htfl_7b_nes_tem3 = _read_7b_result('llama_en_htfl_7b_nes_tem3.csv')

# French
llama_fr_htfl_7b_ann_tem1 = _read_7b_result('llama_fr_htfl_7b_ann_tem1.csv')
llama_fr_htfl_7b_ann_tem2 = _read_7b_result('llama_fr_htfl_7b_ann_tem2.csv')
llama_fr_htfl_7b_ann_tem3 = _read_7b_result('llama_fr_htfl_7b_ann_tem3.csv')

llama_fr_htfl_7b_nes_tem1 = _read_7b_result('llama_fr_htfl_7b_nes_tem1.csv')
llama_fr_htfl_7b_nes_tem2 = _read_7b_result('llama_fr_htfl_7b_nes_tem2.csv')
llama_fr_htfl_7b_nes_tem3 = _read_7b_result('llama_fr_htfl_7b_nes_tem3.csv')

# Dutch
llama_nl_htfl_7b_ann_tem1 = _read_7b_result('llama_nl_htfl_7b_ann_tem1.csv')
llama_nl_htfl_7b_ann_tem2 = _read_7b_result('llama_nl_htfl_7b_ann_tem2.csv')
llama_nl_htfl_7b_ann_tem3 = _read_7b_result('llama_nl_htfl_7b_ann_tem3.csv')

llama_nl_htfl_7b_nes_tem1 = _read_7b_result('llama_nl_htfl_7b_nes_tem1.csv')
llama_nl_htfl_7b_nes_tem2 = _read_7b_result('llama_nl_htfl_7b_nes_tem2.csv')
llama_nl_htfl_7b_nes_tem3 = _read_7b_result('llama_nl_htfl_7b_nes_tem3.csv')

# Gold-standard term lists, read relative to the working directory.
gold_en_ann = _read_gold_terms('htfl_en_terms.tsv')
gold_en_nes = _read_gold_terms('htfl_en_terms_nes.tsv')

gold_fr_ann = _read_gold_terms('htfl_fr_terms.tsv')
gold_fr_nes = _read_gold_terms('htfl_fr_terms_nes.tsv')

gold_nl_ann = _read_gold_terms('htfl_nl_terms.tsv')
gold_nl_nes = _read_gold_terms('htfl_nl_terms_nes.tsv')

In [5]:

# English, Llama-2 7B: all three templates for the ANN and NES gold lists.
# The template-1 ANN call was missing from this cell even though the saved
# output below starts with a "#1. Extracted IOB format" block and contains
# six result blocks; it is restored so the cell reproduces that output.
evaluate_prompts(llama_en_htfl_7b_ann_tem1, 'llama_output1ann', 'en', 'ann', gold_en_ann)
evaluate_prompts(llama_en_htfl_7b_ann_tem2, 'llama_output2ann', 'en', 'ann', gold_en_ann)
evaluate_prompts(llama_en_htfl_7b_ann_tem3, 'llama_output3ann', 'en', 'ann', gold_en_ann)
evaluate_prompts(llama_en_htfl_7b_nes_tem1, 'llama_output1nes', 'en', 'nes', gold_en_nes)
evaluate_prompts(llama_en_htfl_7b_nes_tem2, 'llama_output2nes', 'en', 'nes', gold_en_nes)
evaluate_prompts(llama_en_htfl_7b_nes_tem3, 'llama_output3nes', 'en', 'nes', gold_en_nes)

##################################################
#1. Extracted IOB format
12.4 & 4.8 & 6.9
##################################################
#2. Extracted candidate term list
40.4 & 62.6 & 49.1
##################################################
#3. Masking terms
40.3 & 26.8 & 32.2
##################################################
#1. Extracted IOB format
17.3 & 7.3 & 10.3
##################################################
#2. Extracted candidate term list
42.9 & 63.4 & 51.2
##################################################
#3. Masking terms
45.0 & 32.5 & 37.7


In [6]:
# French, Llama-2 7B: all three templates for the ANN and NES gold lists,
# evaluated in the same order as the printed results.
for _frame, _column, _version, _gold in (
    (llama_fr_htfl_7b_ann_tem1, 'llama_output1ann', 'ann', gold_fr_ann),
    (llama_fr_htfl_7b_ann_tem2, 'llama_output2ann', 'ann', gold_fr_ann),
    (llama_fr_htfl_7b_ann_tem3, 'llama_output3ann', 'ann', gold_fr_ann),
    (llama_fr_htfl_7b_nes_tem1, 'llama_output1nes', 'nes', gold_fr_nes),
    (llama_fr_htfl_7b_nes_tem2, 'llama_output2nes', 'nes', gold_fr_nes),
    (llama_fr_htfl_7b_nes_tem3, 'llama_output3nes', 'nes', gold_fr_nes),
):
    evaluate_prompts(_frame, _column, 'fr', _version, _gold)

##################################################
#1. Extracted IOB format
7.5 & 9.3 & 8.3
##################################################
#2. Extracted candidate term list
36.3 & 59.2 & 45.0
##################################################
#3. Masking terms
58.5 & 23.4 & 33.4
##################################################
#1. Extracted IOB format
8.4 & 11.0 & 9.5
##################################################
#2. Extracted candidate term list
36.0 & 61.6 & 45.4
##################################################
#3. Masking terms
52.1 & 34.5 & 41.5


In [7]:
# Dutch, Llama-2 7B: the first five runs need no preparation.
for _frame, _column, _version, _gold in (
    (llama_nl_htfl_7b_ann_tem1, 'llama_output1ann', 'ann', gold_nl_ann),
    (llama_nl_htfl_7b_ann_tem2, 'llama_output2ann', 'ann', gold_nl_ann),
    (llama_nl_htfl_7b_ann_tem3, 'llama_output3ann', 'ann', gold_nl_ann),
    (llama_nl_htfl_7b_nes_tem1, 'llama_output1nes', 'nes', gold_nl_nes),
    (llama_nl_htfl_7b_nes_tem2, 'llama_output2nes', 'nes', gold_nl_nes),
):
    evaluate_prompts(_frame, _column, 'nl', _version, _gold)

# The template-3 NES frame is given a 'llama_output3nes' column that is a copy
# of its 'llama_output1nes' column, so that the column name selects template 3.
# NOTE(review): presumably this CSV was written with the template-1 column
# name - confirm against the file that produced it.
llama_nl_htfl_7b_nes_tem3['llama_output3nes'] = llama_nl_htfl_7b_nes_tem3['llama_output1nes'].copy()
evaluate_prompts(llama_nl_htfl_7b_nes_tem3, 'llama_output3nes', 'nl', 'nes', gold_nl_nes)

##################################################
#1. Extracted IOB format
19.2 & 14.4 & 16.5
##################################################
#2. Extracted candidate term list
40.4 & 73.1 & 52.0
##################################################
#3. Masking terms
53.8 & 41.6 & 46.9
##################################################
#1. Extracted IOB format
16.6 & 23.8 & 19.6
##################################################
#2. Extracted candidate term list
40.3 & 75.6 & 52.6
##################################################
#3. Masking terms
48.8 & 52.3 & 50.5


## LLAMA 13B

In [8]:
# Llama-2 13B result files (English only): one frame per gold variant / template.
directory = '../results/llama2_results/llama_en_htfl_13b'


def _read_13b_result(file_name):
    """Load one result CSV located in ``directory``."""
    return pd.read_csv(os.path.join(directory, file_name))


llama_en_htfl_13b_ann_tem1 = _read_13b_result('llama_en_htfl_13b_ann_tem1.csv')
llama_en_htfl_13b_ann_tem2 = _read_13b_result('llama_en_htfl_13b_ann_tem2.csv')
llama_en_htfl_13b_ann_tem3 = _read_13b_result('llama_en_htfl_13b_ann_tem3.csv')

llama_en_htfl_13b_nes_tem1 = _read_13b_result('llama_en_htfl_13b_nes_tem1.csv')
llama_en_htfl_13b_nes_tem2 = _read_13b_result('llama_en_htfl_13b_nes_tem2.csv')
llama_en_htfl_13b_nes_tem3 = _read_13b_result('llama_en_htfl_13b_nes_tem3.csv')

In [9]:
# English, Llama-2 13B: all three templates for the ANN and NES gold lists,
# evaluated in the same order as the printed results.
for _frame, _column, _version, _gold in (
    (llama_en_htfl_13b_ann_tem1, 'llama_output1ann', 'ann', gold_en_ann),
    (llama_en_htfl_13b_ann_tem2, 'llama_output2ann', 'ann', gold_en_ann),
    (llama_en_htfl_13b_ann_tem3, 'llama_output3ann', 'ann', gold_en_ann),
    (llama_en_htfl_13b_nes_tem1, 'llama_output1nes', 'nes', gold_en_nes),
    (llama_en_htfl_13b_nes_tem2, 'llama_output2nes', 'nes', gold_en_nes),
    (llama_en_htfl_13b_nes_tem3, 'llama_output3nes', 'nes', gold_en_nes),
):
    evaluate_prompts(_frame, _column, 'en', _version, _gold)

##################################################
#1. Extracted IOB format
12.1 & 1.7 & 3.0
##################################################
#2. Extracted candidate term list
35.0 & 63.4 & 45.1
##################################################
#3. Masking terms
40.0 & 36.9 & 38.4
##################################################
#1. Extracted IOB format
25.9 & 2.4 & 4.4
##################################################
#2. Extracted candidate term list
38.4 & 66.1 & 48.6
##################################################
#3. Masking terms
40.3 & 47.5 & 43.6
