In [1]:
import os
from bs4 import BeautifulSoup
from copy import deepcopy, copy
import tiktoken
from pprint import pprint
from IPython.display import clear_output

# Locations used by this notebook, given as relative paths.
RAW_XML_DIR = 'abstract_and_results_xml_files'
# Directory handed to read_xml_directory when the articles are loaded.
NO_ATTR_XML_DIR = 'no_attributes_xml_files'
# NOTE(review): named *_DIR but the value ends in .json, so this looks like a file path - confirm.
DATASET_DIR = 'annotated_rct_dataset.json'

In [2]:
def remove_html_body(soup):
    """
    Strip the html and body wrapper tags from a parsed document, in place.

    The lxml parser inserts these wrappers on its own, so they carry no
    information from the source file.

    Args:
    soup: BeautifulSoup object

    Returns:
    soup: the same BeautifulSoup object, with the html and body tags unwrapped
    """
    # Look both wrappers up before touching the tree, then unwrap whichever exist.
    wrappers = (soup.html, soup.body)
    for wrapper in wrappers:
        if wrapper is None:
            continue
        wrapper.unwrap()

    return soup

def read_xml_directory(directory):
    """
    Parse every ``.xml`` file in a directory into a BeautifulSoup object.

    Args:
    directory: path of the directory to scan (sub-directories are not visited)

    Returns:
    dict mapping the integer pmcid taken from each filename to its parsed
    BeautifulSoup object, with the html and body wrapper tags removed

    Raises:
    IndexError / ValueError: if an ``.xml`` filename does not contain a 'C'
    followed by an integer before its first '.'
    """
    soups = dict()
    # os.listdir returns names in arbitrary order; sort so the dictionary is
    # filled in a reproducible order from run to run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".xml"):
            continue

        # The pmcid is the text after the first 'C' in the part of the name
        # before the first '.' (e.g. a name such as 'PMC115849.xml' -> 115849).
        pmcid = int(filename.split('.')[0].split('C')[1])
        filepath = os.path.join(directory, filename)

        # Read raw bytes instead of text: text mode decodes with the platform's
        # default encoding, which can fail or garble non-ASCII characters.
        # Given bytes, Beautiful Soup detects the document encoding itself.
        with open(filepath, 'rb') as file:
            markup = file.read()

        soup = BeautifulSoup(markup, 'lxml')

        # Remove the html and body tags (done in place)
        remove_html_body(soup)

        soups[pmcid] = soup

    return soups

def convert_to_int(value):
    """Turn a value into an int, dropping comma separators when its text form contains any"""
    has_commas = ',' in str(value)
    if not has_commas:
        return int(value)
    # Only values whose text contains a comma reach this point
    return int(value.replace(',', ''))

def count_tokens(soup, encoding):
    """Serialize the soup with str() and return how many tokens the encoding yields for it"""
    markup = str(soup)
    token_ids = encoding.encode(markup)
    return len(token_ids)

def condition():
    """
    Manual stand-in for the model's relevance check: ask the user at the prompt.

    Takes no arguments; the chunk being judged is whatever the caller has
    just displayed.

    Returns:
    True if the typed answer is "y" (ignoring case and surrounding
    whitespace), False for anything else
    """
    # Prompt the language model is meant to receive in place of this manual check:
    #
    # CHATGPT PROMPT:
    #
    # You are an expert on medical randomized controlled trials. You are trying to extract any relevant values for meta-analysis: intervention events, intervention group size, comparator events, comparator group size, intervention mean, intervention standard deviation, comparator mean, comparator standard deviation. Output only "y" if any of these values exists within the given chunk, output only "n" if the chunk contains none of these relevant values. Do not provide any explanation
    #
    # Intervention: {INTERVENTION}
    # Comparator: {COMPARATOR}
    # Outcome: {OUTCOME}
    #
    # Chunk:
    answer = input('Is the model gonna return y or n (y/n) ')
    # Normalize so that "Y" or "y " is not silently treated as "no"
    return answer.strip().lower() == 'y'

def concatenate_soups(soup_list):
    """Build one new soup whose children are copies of the given soups, in order"""
    merged = BeautifulSoup("", 'lxml')
    # map() is lazy, so each soup is copied right before it is appended
    for duplicate in map(copy, soup_list):
        merged.append(duplicate)
    return merged

def chunk_xml(xml_element, min_tokens, condition, encoding=None):
    """
    Interactively reduce an XML element to the list of relevant chunks it contains.

    Every direct child is displayed and ``condition()`` is asked whether it is
    relevant. Irrelevant children are dropped. A relevant child is kept whole
    when it is a ``p`` tag, a tag whose name contains 'table', a bare text
    node, or smaller than ``min_tokens``; any other relevant child is chunked
    further, recursively.

    Args:
    xml_element: element whose ``.contents`` are walked
    min_tokens: token count below which a relevant chunk is kept whole
    condition: zero-argument callable returning True when the chunk that was
        just displayed is relevant
    encoding: tokenizer passed to count_tokens. When omitted, the
        notebook-level ``encoding`` variable is used, as earlier callers rely on.

    Returns:
    list of the kept chunks (deep copies of the originals), in document order

    Raises:
    NameError: if ``encoding`` is omitted and no global ``encoding`` exists
    """
    if encoding is None:
        # Backward compatibility: this function used to read the tokenizer
        # from a global variable, so fall back to it when none is passed.
        try:
            encoding = globals()['encoding']
        except KeyError:
            raise NameError("name 'encoding' is not defined") from None

    keep_chunks = []

    def process_chunk(chunk):
        """
        Show one chunk, ask whether it is relevant, then keep it, split it further, or drop it.
        """
        # Work on a copy so kept chunks are detached from the source tree
        chunk = deepcopy(chunk)
        pprint(chunk)
        relevant = condition()
        clear_output()

        # Discard the chunk if the condition is false
        if not relevant:
            return

        # Text nodes may expose no usable tag name; treat that as "not a tag"
        name = getattr(chunk, 'name', None)
        is_p_tag = name == 'p'
        is_table = isinstance(name, str) and 'table' in name
        # Bare text nodes have no children to recurse into
        is_leaf = getattr(chunk, 'contents', None) is None

        if is_p_tag or is_table or is_leaf:
            keep_chunks.append(chunk)
        elif count_tokens(chunk, encoding) >= min_tokens:
            # Large container: chunk it further, recursively
            keep_chunks.extend(chunk_xml(chunk, min_tokens, condition, encoding))
        else:
            # Small relevant chunk: keep the chunk itself. (extend() here would
            # add its children, or the characters of a string, instead.)
            keep_chunks.append(chunk)

    # Iterate through the children of the XML element
    for child in xml_element.contents:
        process_chunk(child)

    # Return the kept chunks as a list
    return keep_chunks

def combine_chunks(soup_list, max_length, count_tokens, encoding):
    """
    Greedily pack consecutive soups into containers of at most ``max_length`` tokens.

    Soups are taken in order. A container is closed as soon as adding the next
    soup would push the sum of the individual token counts past ``max_length``.
    A single soup that is larger than ``max_length`` on its own is not split;
    it ends up alone in a container that exceeds the limit.

    Args:
    soup_list: iterable of soup elements to combine
    max_length: token budget per combined chunk
    count_tokens: callable ``(soup, encoding) -> int``
    encoding: tokenizer forwarded to ``count_tokens``

    Returns:
    list of new BeautifulSoup containers holding copies of the input soups;
    the inputs themselves are left untouched
    """
    final_chunks = []
    current_chunk = BeautifulSoup("", 'lxml')
    current_length = 0

    for soup in soup_list:
        soup_length = count_tokens(soup, encoding)

        # Close the current container when this soup would overflow it
        # (an empty container is never emitted).
        if current_length > 0 and current_length + soup_length > max_length:
            final_chunks.append(current_chunk)
            # Start a fresh container. Reusing the soup itself as the container
            # would nest every later element inside that tag.
            current_chunk = BeautifulSoup("", 'lxml')
            current_length = 0

        # Append a copy so the caller's elements are neither moved nor modified
        current_chunk.append(copy(soup))
        current_length += soup_length

    # After the loop, add the last chunk if it's not empty
    if current_length > 0:
        final_chunks.append(current_chunk)

    return final_chunks

In [46]:
# Load every article found in NO_ATTR_XML_DIR, keyed by integer pmcid.
soups = read_xml_directory(NO_ATTR_XML_DIR)
# Tokenizer used for all token counts below; chunk_xml also reads this global.
encoding = tiktoken.get_encoding("cl100k_base")
# Single article used as the working example for the rest of the notebook.
test_soup = soups[115849]
print(count_tokens(test_soup, encoding))
# Interactive step: condition() calls input() for each chunk visited, so this
# cell waits for a typed answer every time. 250 is min_tokens.
chunks = chunk_xml(test_soup, 250, condition)
# Pack the kept chunks together with max_length=2000 tokens.
condensed_chunks = combine_chunks(chunks, 2000, count_tokens, encoding)

In [47]:
def __remove_style_tags(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Return a copy of the soup with inline formatting tags unwrapped.

    The tags sup, sub, italic, bold and underline are unwrapped on a copy made
    with copy(); the copy is what gets returned.

    Args:
    soup: BeautifulSoup object

    Returns:
    soup: copied BeautifulSoup object without the style tags
    """
    cleaned = copy(soup)
    style_tags = cleaned.find_all(["sup", "sub", "italic", "bold", "underline"])
    for style_tag in style_tags:
        style_tag.unwrap()
    return cleaned

In [53]:
# Sanity check: searching the cleaned copy for 'bold' tags should find none (empty list).
__remove_style_tags(test_soup).find_all('bold')

[]

In [14]:
# Print each combined chunk followed by its token count.
for chunk in condensed_chunks:
    print(chunk)
    print(count_tokens(chunk, encoding))

<p>Four months after the intervention, an increase in the mean scores of total PA (p&lt;0.001, ES=4.77), physical fitness tests including flexibility (p&lt;0.001, ES=1.59), muscular endurance (p&lt;0.001, ES=2.0), cardiorespiratory endurance (p&lt;0.001, ES=0.51), and a decrease in mean scores of agility test (p&lt;0.001, ES= − 0.51) and sedentary behavior (p&lt;0.01, ES=− 0.74) was observed in the intervention group compared to the control group. The intervention group reported an increase in the scores of intrinsic motivation (p&lt;0.001, ES=3.34), identified regulation (p&lt;0.001, ES= 1.28), perceptions of competence (p&lt;0.001, ES=0.81) and autonomy (p&lt;0.001, ES=2.01), enjoyment (p&lt;0.001, ES=0.98) and health motives (p&lt;0.01, ES=0.19), health care climate (p&lt;0.001, ES=4.6), and a decreased score of external regulation (p&lt;0.01, ES=−0.55) and amotivation (p&lt;0.01, ES= −0.56) over time, compared to the control group.</p><p>Seventy women in reproductive age were rando

In [15]:
# Token count of every combined chunk, for comparison with the max_length (2000) given to combine_chunks.
[count_tokens(chunk, encoding) for chunk in condensed_chunks]

[827, 2067]

In [16]:
# Token count of the outcome-type (binary vs. continuous) prompt template.
# The literal is passed as-is, so the placeholder text itself is counted.
count_tokens("""Do NOT provide an explanation.
      **QUESTION:** Is the outcome of {{outcome}} from a randomized controlled trial a binary or continuous type? 
      (A) binary
      (B) continuous
      (C) unknown - there is insufficient information to make any inference
      **ANSWER:** ( """, encoding)

63

In [17]:
# Token count of the 2x2 contingency-table (events / group_size) prompt template.
# The literal is passed as-is, so the {placeholders} are counted as text and
# the article itself is not included in this number.
count_tokens("""Article: {abstract_and_results_xml}

      Based on the given trial article, produce a 2x2 contingency table in YAML format for the following Intervention, Comparator, and Outcome: 
      Intervention: {intervention}
      Comparator: {comparator}
      Outcome: {outcome}

      The YAML format should include the fields "events" and "group_size" for only "intervention" and "comparator" but not "outcome". Example:
      intervention:
          events: NUMBER
          group_size: NUMBER
      comparator:
          events: NUMBER
          group_size: NUMBER

      Only produce YAML response. Do NOT provide an explanation. If any of the numerical information is unavailable or not extractable or not easy to calculate, please say "x".
      If there are numerical data for pre and post-intervention, choose the post-intervention data. If there are multiple timeframes for the outcome, choose the one closest to the outcome timepoint of interest or the very last one.

      YAML:""", encoding)

205

In [18]:
# Token count of the continuous-outcome (mean / standard_deviation / group_size) prompt template.
# The literal is passed as-is, so the {placeholders} are counted as text and
# the article itself is not included in this number.
count_tokens("""Article: {abstract_and_results_xml}

      Based on the given trial article, what is the table of mean outcome and standard deviation in YAML format for the following Intervention, Comparator, and Outcome?
      Intervention: {intervention}
      Comparator: {comparator}
      Outcome: {outcome}
      
      Include the total size of each group for Intervention and Comparator. 
      The YAML format should include the fields "mean", "standard_deviation", and "group_size" for only "intervention" and "comparator" but not "outcome". Example:
      intervention:
          mean: NUMBER
          standard_deviation: NUMBER
          group_size: NUMBER
      comparator:
          mean: NUMBER
          standard_deviation: NUMBER
          group_size: NUMBER

      Only produce YAML response. Do NOT provide an explanation. If any of the numerical information is unavailable or not extractable or not easy to calculate, please say "x".
      If there are numerical data for pre and post-intervention, choose the post-intervention data. If there are multiple timeframes for the outcome, choose the one closest to the outcome timepoint of interest or the very last one.

      YAML:""", encoding)

240