In [178]:
import json
import os
from bs4 import BeautifulSoup
from copy import deepcopy
import tiktoken
from pprint import pprint
from IPython.display import clear_output
from copy import copy

# Folder scanned for *.xml files, and path of the annotated dataset
# (a JSON file, despite the _DIR suffix in its name).
XML_DIR = 'abstract_and_results_xml_files'
DATASET_DIR = 'annotated_rct_dataset.json'

# JSON text is UTF-8 by specification; pin it so loading does not depend on
# the platform's default locale encoding.
with open(DATASET_DIR, encoding='utf-8') as f:
    meta_data = json.load(f)

In [31]:
def remove_html_body(soup):
    """Strip the <html> and <body> wrapper tags from a BeautifulSoup object.

    The tree is modified in place (the wrappers' children are kept) and the
    same soup object is returned for convenience.
    """
    # Both wrappers (added by lxml) are looked up before the tree is touched,
    # then whichever ones exist are unwrapped, html first.
    for wrapper in (soup.html, soup.body):
        if wrapper is not None:
            wrapper.unwrap()

    return soup

In [32]:
def read_xml_directory(directory):
    """Parse every ``.xml`` file in ``directory`` into a BeautifulSoup object.

    The integer key for each file is taken from its name: the text after the
    first 'C' and before the first '.', e.g. ``PMC5498715.xml`` -> 5498715.
    A ``.xml`` file whose name does not fit that pattern raises
    IndexError/ValueError from the parsing expression.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        dict mapping int pmcid -> BeautifulSoup object with the html/body
        wrapper tags already removed.
    """
    soups = dict()
    # os.listdir order is arbitrary; sorting keeps the dict's insertion order
    # reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".xml"):
            continue

        # Get the pmcid from the filename
        pmcid = int(filename.split('.')[0].split('C')[1])
        filepath = os.path.join(directory, filename)

        # Read raw bytes and let BeautifulSoup detect the document encoding,
        # rather than decoding with the platform's default text encoding.
        with open(filepath, 'rb') as file:
            soup = BeautifulSoup(file.read(), 'lxml')

        remove_html_body(soup)
        soups[pmcid] = soup

    return soups

In [80]:
def find_abstract(soup):
    """Return the first <abstract> element found in the soup, or None.

    Note that the element itself is returned, not its text.
    """
    # A missing (or otherwise falsy) lookup result is normalised to None.
    return soup.find('abstract') or None
    
def find_non_abstract(soup):
    """Return a new soup holding everything from ``soup`` except the abstract.

    The input is serialised and parsed again, so the soup passed in is left
    untouched; only the fresh copy has its <abstract> element removed.
    """
    without_abstract = BeautifulSoup(str(soup), 'lxml')
    # Re-parsing with lxml wraps the document again, so strip the wrappers.
    remove_html_body(without_abstract)

    abstract_tag = without_abstract.find('abstract')
    if not abstract_tag:
        return without_abstract

    # decompose() removes the element from the copied tree.
    abstract_tag.decompose()
    return without_abstract

In [181]:
# Parse every .xml file under XML_DIR; the resulting dict is keyed by integer pmcid.
soups = read_xml_directory(XML_DIR)

In [36]:
# Tokenizer used for the token counts in this notebook.
encoding = tiktoken.get_encoding("cl100k_base")

In [82]:
def convert_to_int(value):
    """Convert ``value`` to int, dropping thousands-separator commas first.

    Commas are only stripped when the string form of ``value`` contains one;
    otherwise the value is handed to ``int()`` unchanged.
    """
    has_comma = ',' in str(value)
    cleaned = value.replace(',', '') if has_comma else value
    return int(cleaned)

In [83]:
def find_meta_data(meta_data, pmcid):
    """Return every record in ``meta_data`` whose pmcid matches.

    Args:
        meta_data: Iterable of dict records, each with a 'pmcid' entry that
            ``convert_to_int`` can turn into an int.
        pmcid: A single integer id, or a list (tuple/set also accepted) of
            integer ids.

    Returns:
        list of the matching records, in their original order.

    Raises:
        ValueError: if ``pmcid`` is neither an integer nor a collection.
    """
    # bool is a subclass of int; keep rejecting it as the exact-type check did.
    if isinstance(pmcid, bool):
        raise ValueError('pmcid must be an integer or a list of integers')

    if isinstance(pmcid, (list, tuple, set, frozenset)):
        try:
            # Set membership avoids rescanning the id list for every record.
            wanted = set(pmcid)
        except TypeError:
            # Unhashable entries: fall back to plain sequence membership.
            wanted = pmcid
    elif isinstance(pmcid, int):
        wanted = {pmcid}
    else:
        raise ValueError('pmcid must be an integer or a list of integers')

    return [element for element in meta_data if convert_to_int(element['pmcid']) in wanted]
    
# find_meta_data(meta_data, [2667135, 5498715])

In [93]:
def count_tokens(soup, encoding=None):
    """Return the number of tokens in the string form of ``soup``.

    Args:
        soup: Any object; it is serialised with ``str()`` before encoding, so
            for a soup object the markup is counted as well as the text.
        encoding: Object with an ``encode(text)`` method returning a sequence
            of tokens. Optional: when omitted, the "cl100k_base" tiktoken
            encoding (the one this notebook uses) is looked up, so callers
            that pass only the soup also work.

    Returns:
        int: length of the encoded token sequence.
    """
    if encoding is None:
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(str(soup)))

In [212]:
def condition():
    """Manual stand-in for the relevance check: ask the operator on stdin.

    Takes no arguments. Returns True only when the typed reply is exactly 'y'.
    """
    # The bare string below is not used at runtime; it is kept as a reference
    # copy of the prompt text.
    """
    You are an expert on medical randomized controlled trials. You are trying to extract any relevant values for meta-analysis: intervention events, intervention group size, comparator events, comparator group size, intervention mean, intervention standard deviation, comparator mean, comparator standard deviation. Output only "y" if any of these values exists within the given chunk, output only "n" if the chunk contains none of these relevant values. Do not provide any explanation

Intervention: Motivational interviewing through self-determination theory sessions
Comparator: Standard education session
Outcome: Scores of external regulation

Chunk:
    """
    reply = input('Is the model gonna return y or n (y/n) ')
    return reply == 'y'

In [190]:
def concatenate_soups(soup_list):
    """Build one new soup containing a copy of each item in ``soup_list``.

    Items are appended in the order given; the originals are copied, not
    moved, so they stay where they are in their own trees.
    """
    merged = BeautifulSoup("", 'lxml')
    # map() is lazy, so each item is copied right before it is appended.
    for duplicate in map(copy, soup_list):
        merged.append(duplicate)
    return merged

In [239]:
def chunk_xml(xml_element, min_tokens, condition):
    """
    Prune an XML tree down to the chunks judged relevant.

    Every child of ``xml_element`` is pretty-printed, then ``condition`` is
    called (with no arguments) to decide whether it is relevant:

    * not relevant                    -> discarded
    * relevant table element          -> kept whole, whatever its size
    * relevant and < ``min_tokens``   -> kept whole
    * relevant and >= ``min_tokens``  -> split further by recursing into its children
    * relevant text node              -> kept whole (nothing to recurse into)

    Whitespace-only text nodes are skipped without calling ``condition``.

    Args:
        xml_element: Element whose ``.children`` are examined.
        min_tokens: Token count at or above which a relevant non-table element
            is split into its children instead of being kept whole.
        condition: Zero-argument callable returning a truthy value when the
            chunk just displayed is relevant.

    Returns:
        A new soup object (built by ``concatenate_soups``) holding copies of
        the kept chunks in document order.
    """

    def collect(element):
        """Return the list of kept chunks found beneath ``element``."""
        kept = []
        for child in element.children:
            # Text nodes expose no tag name; getattr covers them either way.
            tag_name = getattr(child, 'name', None)
            is_text = tag_name is None

            # Blank text between tags carries no information: don't ask about it.
            if is_text and not str(child).strip():
                continue

            pprint(child)
            relevant = condition()
            clear_output()

            # discard the chunk if the condition is false
            if not relevant:
                continue

            # Substring match, so tags such as 'table-wrap' count as tables too.
            is_table = (not is_text) and 'table' in tag_name

            # count_tokens needs the tokenizer; use the notebook-level `encoding`.
            if is_table or is_text or count_tokens(child, encoding) < min_tokens:
                kept.append(child)
            else:
                # Relevant but too large: chunk it further, recursively.
                kept.extend(collect(child))
        return kept

    # Return the list of chunks as a single soup object
    return concatenate_soups(collect(xml_element))

In [240]:
# Single parsed article (pmcid 5498715) used to try out the chunker.
test_soup = soups[5498715]

In [241]:
# Interactive: `condition` prompts on stdin for each candidate chunk; 500 is the min_tokens threshold.
chunks = chunk_xml(test_soup, 500, condition)

In [242]:
# Token count of the serialised kept chunks.
len(encoding.encode(str(chunks)))

6932