In [1]:
import os
from bs4 import BeautifulSoup
from copy import deepcopy, copy
import tiktoken
from pprint import pprint
from IPython.display import clear_output

RAW_XML_DIR = 'abstract_and_results_xml_files'
NO_ATTR_XML_DIR = 'no_attributes_xml_files'
DATASET_DIR = 'annotated_rct_dataset.json'

In [2]:
def remove_html_body(soup):
    """
    Remove the html and body tags from the soup object. 
    This is necessary because the lxml parser adds these tags automatically.
    
    Args:
    soup: BeautifulSoup object

    Returns:
    soup: BeautifulSoup object with html and body tags removed
    """
    html_tag = soup.html
    body_tag = soup.body

    # Unwrap the unnecessary tags that are added by lxml parser
    if html_tag is not None:
        html_tag.unwrap()
    if body_tag is not None:
        body_tag.unwrap()

    return soup

def read_xml_directory(directory):
    """ Read all the XML files in the directory and return a dictionary with pmcid as the key and BeautifulSoup object as the value"""
    soups = {}
    for filename in os.listdir(directory):
        if filename.endswith(".xml"):
            # Get the pmcid from the filename
            pmcid = int(filename.split('.')[0].split('C')[1])
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r') as file:
                soup = BeautifulSoup(file.read(), 'lxml')

                # Remove the html and body tags
                remove_html_body(soup)

                # Add the soup to the dictionary with pmcid as the key
                soups[pmcid] = soup

    return soups

def convert_to_int(value):
    """Convert str value to an integer"""
    if ',' in str(value):
        return int(value.replace(',', ''))
    else:
        return int(value)

def count_tokens(soup, encoding):
    """Given a soup object, return the number of tokens in the text"""
    return len(encoding.encode(str(soup)))

def condition():
    """Given a chunk of text, return True if the chunk meets the condition"""
    """
    CHATGPT PROMPT:

    You are an expert on medical randomized controlled trials. You are trying to extract any relevant values for meta-analysis: intervention events, intervention group size, comparator events, comparator group size, intervention mean, intervention standard deviation, comparator mean, comparator standard deviation. Output only "y" if any of these values exists within the given chunk, output only "n" if the chunk contains none of these relevant values. Do not provide any explanation

    Intervention: {INTERVENTION}
    Comparator: {COMPARATOR}
    Outcome: {OUTCOME}

    Chunk:
    """
    return input('Is the model gonna return y or n (y/n) ') == 'y'

def concatenate_soups(soup_list):
    """Concatenate a list of soup objects into a single soup object"""
    new_soup = BeautifulSoup("", 'lxml')
    for soup in soup_list:
        new_soup.append(copy(soup))
    return new_soup

def chunk_xml(xml_element, min_tokens, condition):
    """
    Chunk the XML element into smaller parts based on the specified condition and minimum number of tokens for a valid chunk.
    """
    keep_chunks = []

    def process_chunk(chunk):
        """
        Process a chunk: If the condition is true and chunk length is greater than min_tokens, further chunk it recursively.
        """

        # Check if the condition is true for the chunk and chunk length is greater than min_tokens
        chunk = deepcopy(chunk)
        pprint(chunk)
        relevant = condition()

        is_p_tag = chunk.name == 'p'

        try: is_table = 'table' in chunk.name
        except TypeError: is_table = False

        clear_output()

        if (is_table and relevant) or (is_p_tag and relevant):
            keep_chunks.append(chunk)

        elif count_tokens(chunk, encoding) >= min_tokens and relevant and not is_table:
            # Chunk it further, recursively
            keep_chunks.extend(chunk_xml(chunk, min_tokens, condition))

        # if the chunk is too small and the condition is true, keep it    
        elif count_tokens(chunk, encoding) < min_tokens and relevant:
            keep_chunks.extend(chunk)

        # discard the chunk if the condition is false

    # Iterate through the children of the XML element
    for child in xml_element.contents:
        # Process the chunk
        process_chunk(child)  

    # Return the list of chunks as a single soup object
    return keep_chunks

def combine_chunks(soup_list, max_length, count_tokens, encoding):
    final_chunks = []
    current_chunk = BeautifulSoup("", 'lxml')
    current_length = 0

    for soup in soup_list:
        soup_length = count_tokens(soup, encoding)
        if current_length + soup_length > max_length:
            # If adding this soup would exceed max_length, finish the current chunk
            if current_length > 0:  # Avoid adding empty chunks
                final_chunks.append(current_chunk)
            # Start a new chunk with the current soup
            current_chunk = soup
            current_length = soup_length
        else:
            # If adding this soup wouldn't exceed max_length, add it to the current chunk
            current_chunk.append(soup)
            current_length += soup_length

    # After the loop, add the last chunk if it's not empty
    if current_length > 0:
        final_chunks.append(current_chunk)

    return final_chunks

In [46]:
soups = read_xml_directory(NO_ATTR_XML_DIR)
encoding = tiktoken.get_encoding("cl100k_base")
test_soup = soups[115849]
print(count_tokens(test_soup, encoding))
chunks = chunk_xml(test_soup, 250, condition)
condensed_chunks = combine_chunks(chunks, 2000, count_tokens, encoding)

In [47]:
def __remove_style_tags(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Remove the style tags from the soup object.
    
    Args:
    soup: BeautifulSoup object
    
    Returns:
    soup: BeautifulSoup object
    """
    soup = copy(soup)
    for tag in soup.find_all(["sup", "sub", "italic", "bold", "underline"]):
        tag.unwrap()
    return soup

In [53]:
__remove_style_tags(test_soup).find_all('bold')

[]

In [14]:
for chunk in condensed_chunks:
    print(chunk)
    print(count_tokens(chunk, encoding))

<p>Four months after the intervention, an increase in the mean scores of total PA (p&lt;0.001, ES=4.77), physical fitness tests including flexibility (p&lt;0.001, ES=1.59), muscular endurance (p&lt;0.001, ES=2.0), cardiorespiratory endurance (p&lt;0.001, ES=0.51), and a decrease in mean scores of agility test (p&lt;0.001, ES= − 0.51) and sedentary behavior (p&lt;0.01, ES=− 0.74) was observed in the intervention group compared to the control group. The intervention group reported an increase in the scores of intrinsic motivation (p&lt;0.001, ES=3.34), identified regulation (p&lt;0.001, ES= 1.28), perceptions of competence (p&lt;0.001, ES=0.81) and autonomy (p&lt;0.001, ES=2.01), enjoyment (p&lt;0.001, ES=0.98) and health motives (p&lt;0.01, ES=0.19), health care climate (p&lt;0.001, ES=4.6), and a decreased score of external regulation (p&lt;0.01, ES=−0.55) and amotivation (p&lt;0.01, ES= −0.56) over time, compared to the control group.</p><p>Seventy women in reproductive age were rando

In [15]:
[count_tokens(chunk, encoding) for chunk in condensed_chunks]

[827, 2067]

In [16]:
count_tokens("""Do NOT provide an explanation.
      **QUESTION:** Is the outcome of {{outcome}} from a randomized controlled trial a binary or continuous type? 
      (A) binary
      (B) continuous
      (C) unknown - there is insufficient information to make any inference
      **ANSWER:** ( """, encoding)

63

In [17]:
count_tokens("""Article: {abstract_and_results_xml}

      Based on the given trial article, produce a 2x2 contingency table in YAML format for the following Intervention, Comparator, and Outcome: 
      Intervention: {intervention}
      Comparator: {comparator}
      Outcome: {outcome}

      The YAML format should include the fields "events" and "group_size" for only "intervention" and "comparator" but not "outcome". Example:
      intervention:
          events: NUMBER
          group_size: NUMBER
      comparator:
          events: NUMBER
          group_size: NUMBER

      Only produce YAML response. Do NOT provide an explanation. If any of the numerical information is unavailable or not extractable or not easy to calculate, please say "x".
      If there are numerical data for pre and post-intervention, choose the post-intervention data. If there are multiple timeframes for the outcome, choose the one closest to the outcome timepoint of interest or the very last one.

      YAML:""", encoding)

205

In [18]:
count_tokens("""Article: {abstract_and_results_xml}

      Based on the given trial article, what is the table of mean outcome and standard deviation in YAML format for the following Intervention, Comparator, and Outcome?
      Intervention: {intervention}
      Comparator: {comparator}
      Outcome: {outcome}
      
      Include the total size of each group for Intervention and Comparator. 
      The YAML format should include the fields "mean", "standard_deviation", and "group_size" for only "intervention" and "comparator" but not "outcome". Example:
      intervention:
          mean: NUMBER
          standard_deviation: NUMBER
          group_size: NUMBER
      comparator:
          mean: NUMBER
          standard_deviation: NUMBER
          group_size: NUMBER

      Only produce YAML response. Do NOT provide an explanation. If any of the numerical information is unavailable or not extractable or not easy to calculate, please say "x".
      If there are numerical data for pre and post-intervention, choose the post-intervention data. If there are multiple timeframes for the outcome, choose the one closest to the outcome timepoint of interest or the very last one.

      YAML:""", encoding)

240

In [74]:
from bs4 import BeautifulSoup, Tag
from copy import deepcopy, copy
from typing import List

# MIN_CHUNK_TOKENS = 250 # Doesn't seem to be needed????

# This class is responsible for chunking the input based on the max tokens.
# Majority of the code was implemented by David Pogrebitskiy (@pogrebitskiy)
class InputChunker:
    def __init__(self, model) -> None:
        self.model = model  # model object for GPT models or other models (HuggingFace)

    def __remove_style_tags(self, soup: BeautifulSoup, tags: list) -> BeautifulSoup:
        """
        Remove the style tags from the soup object.

        Args:
        soup: BeautifulSoup object
        tags: list

        Returns:
        soup: BeautifulSoup object
        """
        # Copy the soup and unwrap the styling tags specified in the list
        soup = deepcopy(soup)
        for tag in soup.find_all(tags):
            tag.unwrap()
        return soup

    def __preprocess_xml(self, xml_string: str, remove_tags: list = None) -> BeautifulSoup:
        """
        Preprocess the xml string by converting to a BeautifulSoup object and removing the styling tags.

        Args:
        xml_string: string

        Returns:
        soup: BeautifulSoup object
        """
        if remove_tags is None:
            remove_tags = ["bold", "italic", "underline", "sup", "sub", "xref"]
        soup = self.__convert_xml_string_to_soup(xml_string)
        soup = self.__remove_style_tags(soup, remove_tags)

        return soup

    def __remove_html_body(self, soup_object: BeautifulSoup) -> BeautifulSoup:
        """
        Remove the html and body tags from the soup object. 
        This is necessary because the lxml parser adds these tags automatically.
        
        Args:
        soup: BeautifulSoup object

        Returns:
        soup: BeautifulSoup object with html and body tags removed
        """
        html_tag = soup_object.html
        body_tag = soup_object.body

        # Unwrap the unnecessary tags that are added by lxml parser
        if html_tag is not None:
            html_tag.unwrap()
        if body_tag is not None:
            body_tag.unwrap()

        return soup_object

    def __convert_xml_string_to_soup(self, xml_string: str) -> BeautifulSoup:
        """
        Convert the xml string to a BeautifulSoup object.
        
        Args:
        xml_string: string
        
        Returns:
        soup: BeautifulSoup object
        """
        soup = BeautifulSoup(xml_string, "lxml")

        # Remove the html and body tags
        return self.__remove_html_body(soup)

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in the text.
        
        Args:
        text: string
        
        Returns:
        token_count: integer
        """
        encoded = self.model.encode_text(text)
        encoded_length = len(encoded)
        return encoded_length

    def __split_table(self, table: BeautifulSoup) -> List[BeautifulSoup]:
        """
        Extract the header and footer, spit the body table rows in half, and return the two tables (first and second halves).

        Args:
        table: BeautifulSoup object

        Returns:
        list of BeautifulSoup objects
        """

        # Copy the input table
        table_copy = deepcopy(table)

        # Keep track of the header
        header = BeautifulSoup("", 'lxml')
        [header.append(copy(tag)) for tag in table_copy.find_all(('label', 'caption'))]
        header.append(copy(table_copy.find('thead')))

        # Keep track of the footer
        footer = table_copy.find('table-wrap-foot')

        # Find all rows in the table
        all_rows = table_copy.find('tbody').find_all('tr', recursive=False)
        num_rows = len(all_rows)

        # Split the rows in half
        first_half = all_rows[:num_rows // 2]
        second_half = all_rows[num_rows // 2:]

        # Create the first table
        first_table = BeautifulSoup("", 'lxml')
        first_table.append(copy(header))
        first_tbody = first_table.new_tag('tbody')
        [first_tbody.append(copy(row)) for row in first_half]
        first_table.append(copy(first_tbody))
        if footer:
            first_table.append(copy(footer))
        first_table_wrap = first_table.new_tag('table-wrap')
        first_table_wrap.append(copy(first_table))

        # Create the second table
        second_table = BeautifulSoup("", 'lxml')
        second_table.append(copy(header))
        second_tbody = second_table.new_tag('tbody')
        [second_tbody.append(copy(row)) for row in second_half]
        second_table.append(copy(second_tbody))
        if footer:
            second_table.append(copy(footer))
        second_table_wrap = second_table.new_tag('table-wrap')
        second_table_wrap.append(copy(second_table))

        return [first_table_wrap, second_table_wrap]

    def __create_xml_chunks(self, xml_soup_element: BeautifulSoup, max_tokens: int) -> List[str]:
        """
        Chunk the xml soup element to minimum chunk size if chunk is too large.

        Args:
        xml_soup_element: BeautifulSoup object
        max_tokens: integer

        Returns:
        keep_chunks: list
        """
        keep_chunks = []

        def process_chunk(xml: BeautifulSoup) -> None:
            """
            Process the chunk.

            Returns:
            None
            """
            chunk = deepcopy(xml)
            print(chunk)
            chunk_token_size = self.count_tokens(str(chunk))

            # we perform special logic for tables if they are too long
            is_table = isinstance(chunk, Tag) and chunk.name == 'table-wrap'

            # If the chunk is a table but it's too large, split it in half and add the two tables to the list
            if is_table and chunk_token_size > max_tokens:
                print("SPLITTING TABLE")
                keep_chunks.extend(self.__split_table(chunk))

            # If the chunk is a table and not too big, append it to the list
            elif is_table and chunk_token_size <= max_tokens:
                keep_chunks.append(chunk)

            # If the chunk isn't smaller than the minimum chunk size, chunk it further
            elif chunk_token_size > max_tokens:
                # Chunk it further, recursively
                keep_chunks.extend(self.__create_xml_chunks(chunk, max_tokens))

            elif chunk_token_size <= max_tokens:
                # if the chunk is too small and the condition is true, keep it
                keep_chunks.append(chunk)

            else:
                # If the chunk is not relevant, don't keep it
                pass
        
        for child in xml_soup_element.contents:
            # If the child is a tag, process the chunk
            if isinstance(child, Tag):
                process_chunk(child)
            # If it's anything else, append it and don't chunk further
            else:
                keep_chunks.append(xml_soup_element)
                continue

        return keep_chunks

    def __combine_xml_chunks(self, chunks_list: List[BeautifulSoup], max_tokens: int) -> List[str]:
        """
        Combine the chunks based on the max tokens.

        Args:
        chunks_list: list
        max_tokens: integer

        Returns:
        final_chunks: list
        """
        final_chunks = []
        current_chunk = ""
        current_length = 0

        for soup in chunks_list:
            soup_length = self.count_tokens(str(soup))
            # If the soup is too long, print ERROR.
            # This should not happen ideally, but if it does, we should know about it.
            if soup_length > max_tokens:
                print(str(soup))
                print(f"ERROR - chunk to combine is too long: {soup_length} tokens")
                continue
            # If adding this soup would exceed max_length, finish the current chunk
            if current_length + soup_length > max_tokens:
                if current_length > 0:  # Avoid adding empty chunks
                    chunk_to_add = {
                        "chunk": current_chunk,
                        "token_size": current_length
                    }
                    final_chunks.append(chunk_to_add)
                    current_chunk = ""  # Reset the current chunk
                    current_length = 0  # Reset the current length
                # Start a new chunk with the current soup
                current_chunk += str(soup)
                current_length += soup_length
            else:
                # If adding this soup wouldn't exceed max_length, add it to the current chunk
                current_chunk += str(soup)
                current_length += soup_length

        # After the loop, add the current_chunk if it's not empty
        if current_length > 0:
            chunk_to_add = {
                "chunk": current_chunk,
                "token_size": current_length
            }
            final_chunks.append(chunk_to_add)

        return final_chunks

    def get_chunked_input(self, xml_string: str, max_chunk_token_size: int) -> List[str]:
        """
        Split a text into chunks of ~max_num_tokens tokens, based on xml tag boundaries.
        
        Args:
        xml_string: string
        max_chunk_token_size: integer
        
        Returns:
        chunked_input: A list of text chunks
        """
        soup = self.__preprocess_xml(xml_string)
        xml_chunks_list = self.__create_xml_chunks(soup, max_chunk_token_size)
        condensed_chunks_list = self.__combine_xml_chunks(xml_chunks_list, max_chunk_token_size)
        return condensed_chunks_list

In [75]:
soups = read_xml_directory(NO_ATTR_XML_DIR)

import tiktoken

class Model():
    def __init__(self) -> None:
        super().__init__()
        self.encoder = tiktoken.get_encoding("cl100k_base")

    def get_context_length(self) -> int:
        return 4080
    
    def encode_text(self, text: str) -> str:
        """
        This method encodes the text

        :param text: text to encode

        :return encoded text
        """
        return self.encoder.encode(text)
    
input_chunker = InputChunker(Model())
keysList = [*soups]
for key in keysList[:100]:
    soup = soups.get(key)
    print(f"key: {key}")
    print(input_chunker.count_tokens(str(soup)))
    list = input_chunker.get_chunked_input(str(soup), 1700)
    print(f"full chunk list length: {len(list)}")
    for chunk in list:
        print(chunk['token_size'])
        print(chunk['chunk'])
        print('---')

key: 2667135
8106
<abstract><sec><title>Objectives</title><p>Administering outpatient parenteral antimicrobial therapy in the community setting (CoPAT) is becoming more common with the increasing emphasis on controlling costs. However, few controlled trials have evaluated this treatment modality.</p></sec><sec><title>Methods</title><p>Using data from a recent randomized trial comparing daptomycin with standard therapy (semi-synthetic penicillin or vancomycin, each with initial low-dose gentamicin) for Staphylococcus aureus bacteraemia and infective endocarditis (SAB/IE), patient characteristics and outcomes were evaluated. Patients receiving their full course of therapy in the hospital setting were compared with those who received some portion outside of the hospital (CoPAT).</p></sec><sec><title>Results</title><p>Among the 200 patients, 51.5% received CoPAT. These patients were generally younger (median age 50 versus 54 years, P = 0.028). In the CoPAT group, there tended to be fewer p

ValueError: Cannot insert None into a tag.

In [73]:
text = "<p>As shown in the final row of Table 1, the composite z-scores (full battery) were −0.25 (SD 0.71) for sham stimulation, 0.13 (SD 0.82) for left anodal stimulation, and 0.11 (SD 0.81) for right anodal stimulation, which repeated measures ANOVA showed to be significantly different (F2,22 = 12.85, p = 0.0002). Post hoc, paired-sample t tests revealed that performances associated with both left anodal active stimulation (t11 = 5.4, p = 0.0002) and right anodal active stimulation (t11 = 3.57, p = 0.004) were better than performances associated with sham tDCS. For composite z-scores (full battery), Cohen’s ds effect sizes were 0.50 for left anodal stimulation and 0.47 for right anodal stimulation. There was no difference in overall WM performance between the two active stimulation conditions (t11 = 0.26, p = 0.796). To assess the role of session order on full-battery composite z-scores, we compared them across sessions. We found composite z-scores (full battery) of −0.03 (SD 0.81) for session 1, −0.06 (SD 0.75) for session 2, and 0.09 (SD 0.83) for session 3. Repeated measures ANOVA showed no significant practice effects across sessions (F2,22 = 0.88, p = 0.428).<table-wrap><label>Table 1</label><caption><p>Behavioral data and statistics: Composite z-score (online) is the mean of four z-scores: spatial span backward maximum length, digit span backward maximum length, online letter n-back accuracy, and online spatial n-back accuracy. Composite z-score (offline) is the mean of three z-scores: offline letter n-back accuracy, offline spatial n-back accuracy, and BTA raw score. Composite z-score (full battery) is the mean of five z-scores: spatial span backward maximum length, digit span backward maximum length, letter n-back accuracy (mean of online and offline percentages), spatial n-back accuracy (mean of online and offline percentages), and BTA raw score. Also included are means and standard deviations for 1-back, 2-back, and 3-back for letter and spatial n-back, both online and offline</p></caption><table><thead><tr><th></th><th>Sham</th><th>Left anodal</th><th>Right anodal</th><th>Repeated measures ANOVA F(2,22)</th><th>Friedman’s Test χ2</th><th>p value                                     </th></tr></thead><tbody><tr><td>WMS-III, spatial span backward, longest span (blocks)</td><td>4.8 (0.9)</td><td>6.2 (1.8)</td><td>6.0 (0.9)</td><td></td><td>9.24</td><td>0.01</td></tr><tr><td>WMS-III, digit span backward, longest span (digits)</td><td>5.3 (1.5)</td><td>5.5 (1.6)</td><td>5.4 (2.0)</td><td>0.12</td><td></td><td>0.891</td></tr><tr><td>Spatial n-back accuracy (online)</td><td>62.4% (25.0%)</td><td>68.6% (22.1%)</td><td>66.7% (23.2%)</td><td></td><td>3.17</td><td>0.205</td></tr><tr><td> 1-back</td><td>74.7% (23.1%)</td><td>80.0% (19.9%)</td><td>79.6% (21.4%)</td><td></td><td></td><td></td></tr><tr><td> 2-back</td><td>61.4% (27.9%)</td><td>68.5% (26.4%)</td><td>66.2% (25.5%)</td><td></td><td></td><td></td></tr><tr><td> 3-back</td><td>49.7% (25.8%)</td><td>55.6% (22.9%)</td><td>52.7% (25.0%)</td><td></td><td></td><td></td></tr><tr><td>Letter n-back accuracy (online)</td><td>69.6% (30.2%)</td><td>76.2% (22.5%)</td><td>76.3% (26.8%)</td><td></td><td>0.5</td><td>0.779</td></tr><tr><td> 1-back</td><td>77.3% (26.6%)</td><td>82.9% (19.9%)</td><td>84.8% (21.7%)</td><td></td><td></td><td></td></tr><tr><td> 2-back</td><td>70.4% (32.4%)</td><td>76.0% (26.0%)</td><td>76.2% (29.3%)</td><td></td><td></td><td></td></tr><tr><td> 3-back</td><td>61.1% (33.1%)</td><td>69.7% (24.2%)</td><td>68.1% (29.7%)</td><td></td><td></td><td></td></tr><tr><td>Composite z-score (online)</td><td>−0.25 (0.74)</td><td>0.15 (0.87)</td><td>0.09 (0.81)</td><td>7.68</td><td></td><td>0.003</td></tr><tr><td>Spatial n-back accuracy (offline)</td><td>63.9% (24.9%)</td><td>70.6% (20.9%)</td><td>67.7% (24.2)</td><td>0.55</td><td></td><td>0.758</td></tr><tr><td> 1-back</td><td>76.2% (21.9%)</td><td>85.1% (20.2%)</td><td>78.4% (21.7%)</td><td></td><td></td><td></td></tr><tr><td> 2-back</td><td>63.2% (29.0%)</td><td>70.5% (24.3%)</td><td>69.8% (26.8%)</td><td></td><td></td><td></td></tr><tr><td> 3-back</td><td>52.5% (26.5%)</td><td>56.3% (21.8%)</td><td>54.6% (26.0)</td><td></td><td></td><td></td></tr><tr><td>Letter n-back accuracy (offline)</td><td>72.3% (29.8%)</td><td>78.5% (22.5%)</td><td>76.5% (25.9%)</td><td></td><td>0.17</td><td>0.92</td></tr><tr><td> 1-back</td><td>80.7% (23.2%)</td><td>86.1% (20.7%)</td><td>84.3% (21.6%)</td><td></td><td></td><td></td></tr><tr><td> 2-back</td><td>71.1% (33.2%)</td><td>77.8% (22.3%)</td><td>77.8% (27.2%)</td><td></td><td></td><td></td></tr><tr><td> 3-back</td><td>65.1% (33.6%)</td><td>71.6% (26.2%)</td><td>67.4% (30.0)</td><td></td><td></td><td></td></tr><tr><td>Brief Test of Attention (offline)</td><td>6.8 (2.5)</td><td>7.4 (2.2)</td><td>7.9 (2.5)</td><td></td><td>7.09</td><td>0.029</td></tr><tr><td>Composite z-score (offline)</td><td>−0.18 (0.92)</td><td>0.09 (0.77)</td><td>0.09 (0.93)</td><td></td><td>7.17</td><td>0.028</td></tr><tr><td>Spatial n-back accuracy combined (mean of online and offline percentages)</td><td>63.2% (24.7%)</td><td>69.6% (20.7%)</td><td>67.3% (23.6%)</td><td></td><td>0.67</td><td>0.717</td></tr><tr><td>Letter n-back accuracy combined (mean of online and offline percentages)</td><td>71.0% (29.8%)</td><td>77.4% (22.1%)</td><td>76.4% (26.3%)</td><td></td><td>0.17</td><td>0.92</td></tr><tr><td>Composite z-score (full battery)</td><td>−0.25 (0.71)</td><td>0.13 (0.82)</td><td>0.11 (0.81)</td><td>12.85</td><td></td><td>0.0002</td></tr></tbody></table></table-wrap></p>"


print(input_chunker.count_tokens(str(text)))
list = input_chunker.get_chunked_input(str(text), 1700)
print(f"full chunk list length: {len(list)}")
for chunk in list:
    print(chunk['token_size'])
    print(chunk['chunk'])
    print('---')

2347
SPLITTING TABLE
<p>As shown in the final row of Table 1, the composite z-scores (full battery) were −0.25 (SD 0.71) for sham stimulation, 0.13 (SD 0.82) for left anodal stimulation, and 0.11 (SD 0.81) for right anodal stimulation, which repeated measures ANOVA showed to be significantly different (F2,22 = 12.85, p = 0.0002). Post hoc, paired-sample t tests revealed that performances associated with both left anodal active stimulation (t11 = 5.4, p = 0.0002) and right anodal active stimulation (t11 = 3.57, p = 0.004) were better than performances associated with sham tDCS. For composite z-scores (full battery), Cohen’s ds effect sizes were 0.50 for left anodal stimulation and 0.47 for right anodal stimulation. There was no difference in overall WM performance between the two active stimulation conditions (t11 = 0.26, p = 0.796). To assess the role of session order on full-battery composite z-scores, we compared them across sessions. We found composite z-scores (full battery) of −0.