In [None]:
from openai import OpenAI
import pandas as pd
import re
import os
import spacy
# Load the "en_core_web_lg" spaCy pipeline once at start-up; the processing loop
# further down calls it to tokenize the text found inside entity tags.
py_nlp = spacy.load("en_core_web_lg")

from bs4 import BeautifulSoup, NavigableString, Tag
# Take the API key from the OPENAI_API_KEY environment variable so that no
# credential is ever stored in (or committed with) the notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

##### Define prompt #####
# Instruction text sent to the chat model with every request. It lists the
# entity classes to recognize and tells the model to wrap each entity in an
# HTML <span> whose class attribute names the entity type.
# NOTE(review): the text contains typos ("clincal", "setence", "complain").
# They are left untouched here because any edit to the prompt can change the
# model output and therefore previously recorded results - confirm before fixing.
prompt = '''
- Task
Your task is to generate HTML format output from an input text, marking up specific entities related to clincal healthcare which are in doctor's note or clinical note. 
The entities need to be recognized are: 'cc', 'hpi.location', 'hpi.quality', 'hpi.severity', 'hpi.duration', 'hpi.timing', 'hpi.context', 'hpi.modifyingFactors', 'hpi.assocSignsAndSymptoms', 'pastHistory','familyHistory', 'socialHistory'.
If a setence has negation words, entities might not need to be identified
Use HTML <span> tags to mark up these entities. Each <span> tag must have a class attribute indicating the entity types


- Entity markup guide
Use <span class="cc"> to denote a chief complain entity in the clinical note
Use <span class="hpi.location"> to denote an entity related to the location of a symptom or condition in the history of present illness.
Use <span class="hpi.quality"> to denote an entity related to the quality or character of a symptom in the history of present illness.
Use <span class="hpi.severity"> to denote an entity related to the severity of a symptom or condition in the history of present illness.
Use <span class="hpi.duration"> to denote an entity related to how long a symptom or condition has been present in the history of present illness.
Use <span class="hpi.timing"> to denote an entity related to the timing or frequency of a symptom in the history of present illness.
Use <span class="hpi.context"> to denote an entity related to the context or circumstances surrounding a symptom or condition in the history of present illness.
Use <span class="hpi.modifyingFactors"> to denote an entity related to factors that make a symptom better or worse in the history of present illness.
Use <span class="hpi.assocSignsAndSymptoms"> to denote an entity related to associated signs and symptoms present with the condition.
Use <span class="pastHistory"> to denote an entity related to the patient's past medical history.
Use <span class="familyHistory"> to denote an entity related to the patient's family history.
Use <span class="socialHistory"> to denote an entity related to the patient's social history, such as lifestyle or habits.
Leave the text as it is if no such entities are found. 
'''

### DEFINE REQUIRED FUNCTION

In [None]:
#Read bio file and split into sentences and labels
def split_bio_file(file_content):
    """Split the text of a BIO/IOB file into parallel token and tag lists.

    Every non-blank line must hold exactly two whitespace-separated fields,
    the token followed by its tag. Blank (or whitespace-only) lines mark the
    end of a sentence; consecutive blank lines never yield empty sentences.

    Args:
        file_content: Full text of the file as a single string.

    Returns:
        A tuple ``(sentences, labels)``. ``sentences[i]`` is the list of
        tokens of the i-th sentence and ``labels[i]`` the list of tags at
        the same positions.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
            The message names the 1-based line number and shows the line.
    """
    sentences = []
    labels = []
    sentence = []
    label = []

    for line_no, line in enumerate(file_content.splitlines(), start=1):
        fields = line.split()

        if not fields:  # blank line indicates end of a sentence
            if sentence:  # avoid appending empty lists
                sentences.append(sentence)
                labels.append(label)
                sentence = []
                label = []
            continue

        if len(fields) != 2:
            # Fail with the location instead of a bare tuple-unpacking error,
            # so a malformed annotation file can be fixed quickly.
            raise ValueError(
                f"line {line_no}: expected '<token> <tag>' but found "
                f"{len(fields)} field(s): {line!r}"
            )

        word, tag = fields
        sentence.append(word)
        label.append(tag)

    # Append last sentence if the file does not end with a blank line
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels


### API REQUEST AND EXTRACT RESULTS

In [None]:
# Entity classes the model is asked to mark up. A tag whose first class is not
# in this list is not treated as an entity. Loop-invariant, so defined once.
entity_list = ['cc',
               'hpi.location',
               'hpi.quality',
               'hpi.severity',
               'hpi.duration',
               'hpi.timing',
               'hpi.context',
               'hpi.modifyingFactors',
               'hpi.assocSignsAndSymptoms',
               'pastHistory',
               'familyHistory',
               'socialHistory']


def _strip_code_fence(text):
    """Remove a surrounding Markdown code fence (``` or ```html) from text.

    str.strip("```html") must not be used for this: its argument is a set of
    characters, so it also eats leading/trailing 'h', 't', 'm' and 'l' that
    belong to the actual content (e.g. "the patient" -> "e patient").
    """
    text = text.strip()
    text = re.sub(r"^```(?:html\b)?\s*", "", text)
    text = re.sub(r"\s*```$", "", text)
    return text


def _node_to_bio(node, bio_lines):
    """Append the tab-separated BIO lines for one parsed HTML node to bio_lines.

    Text nodes are split on whitespace and tagged 'O'. A tag whose first class
    is a known entity is tokenized with spaCy and tagged B-/I-<entity>. Any
    other tag (for example a wrapping <p>) is descended into, so entity spans
    nested inside it are still recognized.
    """
    if isinstance(node, NavigableString):
        for word in node.split():
            bio_lines.append(f"{word}\tO\n")
        return

    if not isinstance(node, Tag):
        return

    # html.parser yields a list for 'class'; tolerate a plain string or a
    # missing attribute instead of hiding every error behind a bare except.
    classes = node.get("class") or []
    if isinstance(classes, str):
        classes = classes.split()
    entity = classes[0] if classes else None

    if entity in entity_list:
        # Skip whitespace-only tokens: they would produce BIO lines with an
        # empty token column and could even receive the B- tag.
        tokens = [tok.text for tok in py_nlp(node.get_text()) if not tok.is_space]
        for position, token in enumerate(tokens):
            prefix = "B" if position == 0 else "I"
            bio_lines.append(f"{token}\t{prefix}-{entity}\n")
    else:
        for inner in node.children:
            _node_to_bio(inner, bio_lines)


# Make sure the output directory exists before any result is written.
os.makedirs('./results', exist_ok=True)

# Sorted for a reproducible processing order; entries that are not .iob files
# (checkpoint folders, OS metadata files, ...) are skipped.
for s in sorted(os.listdir('./bio_data/')):
    if not s.endswith('.iob'):
        continue
    sample = s[:-4]
    with open(f"bio_data/{sample}.iob", "r") as f:
        file_content = f.read()
    sentences, labels = split_bio_file(file_content)

    bio_output = []

    for sent in sentences:
        sent = " ".join(sent)

        #Call OpenAI API - GPT-4o model
        # NOTE(review): the instructions are sent with the "assistant" role;
        # instruction text normally belongs in the system message. Left as is
        # because changing it may alter the model output - confirm.
        response = client.chat.completions.create(
            model="gpt-4o",
            temperature=0,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "assistant", "content": prompt},
                {"role": "user", "content": f"### Input text: {sent}\n### Output text:"},
            ]
        )

        # content can be None (e.g. on a refusal); treat that as empty output.
        output = response.choices[0].message.content or ""
        output = output.replace("### Input text:", "").replace("### Output text:", "")
        output = _strip_code_fence(output)

        # Parse HTML using BeautifulSoup and convert every node to BIO lines
        soup = BeautifulSoup(output, "html.parser")
        for child in soup.children:
            _node_to_bio(child, bio_output)

    #Writing to text file once per sample, after all its sentences are processed
    with open(f'./results/{sample}', 'w') as f:
        f.writelines(bio_output)
