### Load data

In [None]:
import os
import glob
import random, os
import pandas as pd
import time

# Directory containing the gold-standard PICO test annotations.
dataset = './dataset/PICO/test/'
# Build the glob pattern with os.path.join so it is valid on every platform.
search_pattern = os.path.join(dataset, '*.bio')

# Gather every matching file, rewriting the platform path separator as '/'
# so downstream code can always split paths on a forward slash.
files = [path.replace(os.sep, '/') for path in glob.glob(search_pattern)]
print(files)

### Getting results from GPTs

In [None]:
import openai
import os

def get_output(prompt, GPT):
    """Send `prompt` to the selected OpenAI model and return the generated text.

    Args:
        prompt: Full prompt string to send to the model.
        GPT: Model selector. ``3.5`` selects ``gpt-3.5-turbo-1106`` and ``4``
            selects ``gpt-4`` (both through the chat endpoint);
            ``'instruct'`` selects ``gpt-3.5-turbo-instruct`` through the
            completion endpoint.

    Returns:
        The text of the first choice in the API response. The text is also
        printed.

    Raises:
        ValueError: If `GPT` is not one of the supported selectors. This is
            checked before any API call is made.
    """
    # selector -> (model name, whether the chat endpoint is used)
    supported = {
        3.5: ('gpt-3.5-turbo-1106', True),
        4: ('gpt-4', True),
        'instruct': ('gpt-3.5-turbo-instruct', False),
    }
    try:
        model, is_chat = supported[GPT]
    except (KeyError, TypeError):
        # Fail with a clear message instead of an UnboundLocalError on `result`.
        raise ValueError(
            f"Unsupported GPT selector {GPT!r}; expected one of 3.5, 4 or 'instruct'"
        ) from None

    # Placeholder: the API key has to be filled in before running.
    openai.api_key = ''

    if is_chat:
        message = openai.ChatCompletion.create(
            model=model,
            temperature=0,  # deterministic decoding
            messages=[{"role": "user", "content": prompt}],
        )
        result = message['choices'][0]['message']['content']
    else:
        # NOTE(review): no max_tokens is passed to the completion endpoint;
        # confirm that its default length limit does not truncate the output.
        message = openai.Completion.create(
            model=model,
            prompt=prompt,
            temperature=0,
        )
        result = message['choices'][0]['text']

    print(result)
    return result

In [None]:
def create_prompt(input_text, prompt_type='base'):
    """Build the PICO annotation prompt for `input_text`.

    Args:
        input_text: The text the model should mark up.
        prompt_type: Which optional section to add after the task description:
            ``'guide'`` adds the entity recognition guide; ``'1shot'``,
            ``'3shot'`` and ``'5shot'`` add that many worked examples under an
            ``### Examples`` header. Any other value (including the default
            ``'base'``) adds nothing.

    Returns:
        The complete prompt string, ending with the input text and an empty
        ``### Output Text:`` section for the model to fill in.
    """
    # Worked examples shared by all few-shot variants: (input, analysis).
    # An N-shot prompt uses the first N entries.
    examples = (
        ("Acupuncture with sham device twice a week for six weeks or placebo pill once a day for eight weeks .",
         "In this example, 'Acupuncture with sham device' is an intervention entity, 'placebo pill' is a control entity."),
        ("Comparison of participants who remained on placebo continued beyond the run - in period to the end of the study .",
         "In this example, 'placebo' is a control entity."),
        ("Arm pain measured on a 10 point pain .",
         "In this example, 'Arm pain' and 'a 10 point pain' are outcome entities."),
        ("Plerixafor plus granulocyte colony versus placebo plus granulocyte colony",
         "In this example, 'Plerixafor plus granulocyte colony' is a intervention entity, 'placebo plus granulocyte colony' is a control entity"),
        ("Effect of coenzyme Q10 in patients with hormonally untreated carcinoma of the prostate",
         "In this example, 'coenzyme Q10' is a intervention entity, 'patients with hormonally untreated carcinoma of the prostate' is a participant entity"),
    )
    shot_counts = {'1shot': 1, '3shot': 3, '5shot': 5}

    # Initial part of the prompt that describes the task
    prompt = '''### Task
You are a skilled medical expert. Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'Participant', 'Intervention', 'Control', and 'Outcomes'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Markup Format
Use <span class="participant"> to denote a participant entity.
Use <span class="intervention"> to denote a intervention entity.
Use <span class="control"> to denote a control entity.
Use <span class="outcome"> to denote a outcome entity.
Leave the text as it is if no such entities are found.
'''

    if prompt_type == 'guide':
        prompt += '''
### Entity Recognition Guide
'Participant' refers to the descriptions of participants involved in a medical study, including details about their recruitment process and the characteristics or requirements they needed to meet for inclusion. These descriptions typically encompass various relevant factors such as age, gender, sample size, medical diagnoses or conditions, treatment locations, and other specific details pertinent to the study being conducted. These population descriptors provide important context and help define the target group under investigation.
'Intervention' refers to the proposed treatment or approach being administered to the participants. While interventions commonly refer to medical treatments in the medical literature, it's important to note that interventions can also encompass non-medical approaches, such as educational courses or musical therapies. The intervention is the specific action or method being implemented with the aim of addressing or influencing the condition or problem being studied.
'Control' refers to the comparison or control treatment utilized in many studies. The control group serves as a baseline for comparison to evaluate the effectiveness of the intervention in terms of the desired outcomes. In some cases, the control group may receive a placebo treatment, which is an inactive substance or sham procedure that mimics the appearance of the actual intervention but lacks the active components. Alternatively, the control group may receive no treatment at all. These control treatments are implemented to provide a reference point for assessing the impact of the intervention and determining its efficacy in relation to the desired outcomes.
'Outcome' refers to the measurements or observations used to assess the effectiveness of the treatment in individuals participating in a trial. Outcomes are often described by specifying the specific score, scale, measurement tool, or clinical test utilized to evaluate the desired outcome. In clinical trials, researchers compare outcomes between two or more groups of patients, each receiving a different treatment. These outcomes serve as measurable indicators to determine whether the treatment has produced the intended effect or achieved the desired result. By comparing outcomes across different treatment groups, researchers can assess the comparative effectiveness of the interventions being studied.
'''
    # few-shot learning
    elif prompt_type in shot_counts:
        shots = examples[:shot_counts[prompt_type]]
        # Every few-shot variant gets the same "### Examples" header; the
        # examples themselves are separated by one blank line.
        prompt += '\n### Examples\n' + '\n'.join(
            f'Example Input {number}: {text}\nAnalysis {number}: {analysis}\n'
            for number, (text, analysis) in enumerate(shots, start=1)
        )

    # Add input text to be annotated
    prompt += f'''
### Input Text: {input_text}
### Output Text:
'''
    return prompt

In [None]:
def run(GPT, prompt_type, max_retries=None, retry_delay=1.0):
    """Annotate every file in the global `files` list and save the model output.

    For each file, the first tab-separated field of every line is joined with
    spaces to form the input text, a prompt is built with `create_prompt`, and
    the response from `get_output` is written to
    ``./output/PICO/GPT-{GPT}/{prompt_type}/{file_name}.html``. This is the
    directory `get_performance` reads predictions from.

    Args:
        GPT: Model selector passed to `get_output`; also used in the output path.
        prompt_type: Prompt variant passed to `create_prompt`; also used in the
            output path.
        max_retries: Maximum number of retries per file after a failed model
            call. ``None`` (default) retries until the call succeeds.
        retry_delay: Base delay in seconds between retries. The delay doubles
            after each failure and is capped at 60 seconds.

    Raises:
        Exception: Re-raises the last error from `get_output` once
            `max_retries` is exhausted.
    """
    # Keep this in sync with the prediction path used by get_performance.
    dir_path = f'./output/PICO/GPT-{GPT}/{prompt_type}/'
    os.makedirs(dir_path, exist_ok=True)

    for file in files:
        with open(file, 'r') as f_read:
            text = ' '.join(line.split('\t')[0] for line in f_read.read().splitlines())
        file_name = file.split('/')[-1].split('.')[0]

        # The prompt does not change between retries, so build it once.
        prompt = create_prompt(text, prompt_type)

        failures = 0
        while True:
            try:
                # Only the model call is retried; a failure while writing the
                # file is not transient and should not trigger another call.
                output = get_output(prompt, GPT)
                break
            except Exception as e:
                failures += 1
                print(e)
                if max_retries is not None and failures > max_retries:
                    raise
                # Exponential backoff so a failing endpoint is not hammered.
                time.sleep(min(retry_delay * 2 ** min(failures - 1, 10), 60.0))

        with open(f'{dir_path}{file_name}.html', 'w') as f_write:
            f_write.write(output)

In [None]:
# Model selector and prompt variant for this generation run.
GPT = 4
prompt_type = 'base'  # task description only: no guide and no examples
# Query the model for every test file and save the responses.
run(GPT, prompt_type)

### Evaluation

In [None]:
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString, Tag
import spacy

# spaCy pipeline used by html2bio to tokenize the text inside tagged spans.
py_nlp = spacy.load ("en_core_web_lg")

In [None]:
def _strip_response_wrappers(html):
    """Return `html` without leading answer markers and <body>/<p> wrappers."""
    # Drop everything up to and including each marker, plus the single
    # character that follows it. Markers are matched case-insensitively, so
    # they must be written in lower case here.
    for marker in ('***output***', 'output:', 'output text', '***highlighted text***'):
        lowered = html.lower()
        if marker in lowered:
            html = html[lowered.index(marker) + len(marker) + 1:]

    # Keep only the content of the first <body> and then of the first <p>.
    # A missing closing tag keeps the rest of the text instead of raising.
    for open_tag, close_tag in (('<body>', '</body>'), ('<p>', '</p>')):
        start = html.find(open_tag)
        if start == -1:
            continue
        start += len(open_tag)
        end = html.find(close_tag, start)
        html = html[start:] if end == -1 else html[start:end]
    return html


def html2bio(html_path):
    """Convert a model-generated HTML annotation file into BIO-tagged lines.

    Top-level text outside any tag is split on whitespace and tagged ``O``.
    The text of each top-level tag is tokenized with `py_nlp`; when the tag's
    first class is ``participant``, ``intervention``, ``control`` or
    ``outcome`` the first token gets ``B-P``/``B-I``/``B-C``/``B-O`` and the
    remaining tokens the matching ``I-`` label. Tokens of any other tag are
    tagged ``O``.

    Args:
        html_path: Path of the file holding the model response.

    Returns:
        A list of strings of the form ``"<token>\\t<tag>\\n"``.
    """
    labels = {'participant': 'P', 'intervention': 'I', 'control': 'C', 'outcome': 'O'}

    with open(html_path) as f:
        html = f.read()

    # Parse the cleaned response using BeautifulSoup
    soup = bs(_strip_response_wrappers(html), "html.parser")

    bio_format = []
    for child in soup.children:
        if isinstance(child, NavigableString):
            # Plain text between tags: every word is outside any entity.
            bio_format.extend(f"{word}\tO\n" for word in child.split())
        elif isinstance(child, Tag):
            words = py_nlp(child.get_text())
            if len(words) == 0:
                continue
            try:
                entity = child.attrs['class'][0]
            except (KeyError, IndexError):
                # Tag without a class attribute, or with an empty one.
                entity = None
            suffix = labels.get(entity)
            for position, word in enumerate(words):
                if suffix is None:
                    tag = 'O'
                elif position == 0:
                    tag = f"B-{suffix}"  # first token: beginning
                else:
                    tag = f"I-{suffix}"  # second to last: inside
                bio_format.append(f"{word}\t{tag}\n")
    return bio_format

In [None]:
import re

def split_sub_sentences(output_text):
    """Split a model response into one entity sub-sentence per PICO type.

    The response is expected to contain sections introduced by
    "<Type> entities are", where <Type> is Participant, Intervention, Control
    or Outcome (case-insensitive). Sections may be separated by spaces or by
    line breaks.

    Args:
        output_text: Raw text produced by the model.

    Returns:
        A list of four strings in the order
        [participant, intervention, control, outcome]. Each string is the text
        following the section introduction, stripped of surrounding whitespace
        and trailing commas; it is '' when the section is absent. If a type
        occurs more than once, the last occurrence wins.
    """
    # Each section runs lazily up to the next "<Type> entities are" or the end
    # of the text. The lookahead is non-capturing so only two groups exist.
    pattern = (
        r'(Participant|Intervention|Control|Outcome)\s+entities\s+are\s*(.*?)'
        r'(?=(?:Participant|Intervention|Control|Outcome)\s+entities\s+are|\Z)'
    )
    positions = {'participant': 0, 'intervention': 1, 'control': 2, 'outcome': 3}

    sub_sentences = [''] * 4
    # DOTALL lets a section be followed by (or span) a line break; without it
    # every section that is not on the last line fails to match and is lost.
    for match in re.finditer(pattern, output_text, flags=re.IGNORECASE | re.DOTALL):
        position = positions.get(match.group(1).lower())
        if position is None:
            continue
        sub_sentences[position] = match.group(2).strip().rstrip(',')  # Remove trailing comma

    return sub_sentences


def txt2bio(output_path, ori_tokens):
    """Convert a plain-text model response into BIO-tagged lines.

    The response is split into per-type entity lists with
    `split_sub_sentences`; each list is split on ", " or ". ". Every entity
    phrase is then searched for in `ori_tokens` as a contiguous run of tokens,
    and each occurrence whose tokens are all still untagged is labelled
    ``B-<type>`` followed by ``I-<type>``. Types are processed in the order
    P, I, C, O, so earlier types take precedence when phrases overlap.

    Args:
        output_path: Path of the file holding the model response.
        ori_tokens: Original tokens of the document, in order.

    Returns:
        A list with one ``"<token>\\t<tag>\\n"`` string per token in
        `ori_tokens`. All tags are ``O`` when the response is just "none".
    """
    # read the result file
    with open(output_path, 'r') as file:
        output_text = file.read()

    # Start with every token outside any entity.
    entity_list = ['O'] * len(ori_tokens)

    # A response of "none" (ignoring case and surrounding whitespace) means
    # that there is nothing to annotate.
    if output_text.strip().lower() != 'none':
        sub_sentences = split_sub_sentences(output_text)

        for entity_type, sub_sentence in zip(('P', 'I', 'C', 'O'), sub_sentences):
            for raw_entity in re.split(r', |\. ', sub_sentence):
                # Normalise whitespace so stray spaces or line breaks in the
                # response do not prevent a match against the token sequence.
                entity_words = raw_entity.rstrip('.').split()
                if not entity_words:
                    continue
                entity = ' '.join(entity_words)
                span = len(entity_words)

                for start in range(len(ori_tokens) - span + 1):
                    if ' '.join(ori_tokens[start:start + span]) != entity:
                        continue
                    # Skip occurrences that overlap an already tagged token,
                    # otherwise an existing entity would be partly overwritten
                    # and the BIO sequence would become malformed.
                    if any(tag != 'O' for tag in entity_list[start:start + span]):
                        continue
                    entity_list[start] = f"B-{entity_type}"
                    for inside in range(start + 1, start + span):
                        entity_list[inside] = f"I-{entity_type}"

    # Same line format as produced by html2bio.
    return [f"{token}\t{entity}\n" for token, entity in zip(ori_tokens, entity_list)]

In [None]:
def get_performance(GPT, prompt_type, html=True):
    """Align predictions with the gold files, write a merged file and score it.

    For every gold file in the global `files` list the matching prediction in
    ``./output/PICO/GPT-{GPT}/{prompt_type}/`` is converted to BIO lines
    (`html2bio` for ``.html`` files, `txt2bio` for ``.output`` files). Each
    gold token is then given a predicted tag, and all
    ``token<TAB>gold<TAB>predicted`` lines are written to
    ``merged_gold_pred.bio`` in the same directory. Finally
    ``evaluate_pico.py`` is run on that file.

    Args:
        GPT: Model tag used to build the prediction directory name.
        prompt_type: Prompt variant used to build the prediction directory name.
        html: If True, read ``{file_name}.html`` predictions; otherwise read
            ``{file_name}.output`` predictions.

    Note:
        The last line is an IPython shell escape, so this function only works
        inside IPython/Jupyter.
    """
    all_tags = []    # predicted labels
    all_tokens = []  # all tokens
    gold_tags = []   # golden labels

    for file in files:  # files are the golden standard target
        file_name = file.split('/')[-1].split('.')[0]
        with open(file) as f_gold:
            lines = f_gold.readlines()
            # First tab-separated field of each line is the token, the last one the gold tag.
            tokens = [line.strip().split('\t')[0] for line in lines]  # tokens: e.g., ['ABC', 'is', 'a', '70', ...]
            tags = [line.strip().split('\t')[-1] for line in lines]   # golden: e.g., ['o',   'o',  'o', 'I', ...]

            if html: 
                prediction = f'./output/PICO/GPT-{GPT}/{prompt_type}/{file_name}.html'
                bio_2 = html2bio(prediction)  # e.g., ['Renal\tB-P\n', 'cell\tI-P\n', 'carcinoma\tI-P\n', 'is\tO\n', ...]
            else:
                prediction = f'./output/PICO/GPT-{GPT}/{prompt_type}/{file_name}.output'
                bio_2 = txt2bio(prediction, tokens)

            all_tokens += tokens

            for i, token in enumerate(tokens):
                if token != '':
                    match = False
                    # Scan the predicted lines from index i back to 0 and take the
                    # first one whose token contains, or is contained in, the gold token.
                    for i2 in range(i,-1,-1):
                        try:
                            # extract token and the corresponding predicted label, e.g., ['Renal', 'B-P']
                            token_2, tag_2 = bio_2[i2].strip().split('\t') 
                        except:
                            # Index past the end of the predictions, or a line that
                            # does not split into exactly two fields: skip it.
                            token_2, tag_2 = None, None
                        
                        if token_2!=None:
                            if token in token_2 or token_2 in token:
                                match = True
                                break

                    # No predicted token could be aligned with this gold token.
                    if not match:
                        tag_2 = 'O'
                else:
                    # Empty token (blank line in the gold file): keep an empty prediction.
                    tag_2 = ''
                
                gold_tags.append(tags[i])
                all_tags.append(tag_2)

            ##print(gold_tags, all_tags)
                
    # Write token, gold tag and predicted tag per line; empty tokens become blank lines.
    with open(f'./output/PICO/GPT-{GPT}/{prompt_type}/merged_gold_pred.bio','w') as fg:
        for i, (token, gold_tag, all_tag) in enumerate(zip(all_tokens, gold_tags, all_tags)):
            if token!='':
                fg.write(f'{token}\t{gold_tag}\t{all_tag}\n')
            else:
                fg.write(f'\n')
    # IPython shell escape: run the external scorer on the merged file.
    !python ./evaluate_pico.py -lf ./output/PICO/GPT-{GPT}/{prompt_type}/merged_gold_pred.bio

In [None]:
# Select which results to evaluate: get_performance reads predictions from
# ./output/PICO/GPT-{GPT}/{prompt_type}/.
GPT = 'ul2'
prompt_type = 'guide'

# Score the predictions; html=False means the '.output' text files are
# parsed with txt2bio instead of the '.html' files with html2bio.
get_performance(GPT, prompt_type, False) 