This notebook contains the Python code that converts all XML files into Markdown using Pandoc (through the `pypandoc` wrapper).
The input format we convert from is "jats" (JATS XML); the output format is "markdown".

In [39]:
import pypandoc
import os
import re

In [40]:
def join_lines_except_table(content):
    """Collapse the document onto one line while leaving table blocks intact.

    The content is cut at every "::::" fence marker. Each piece is stripped of
    surrounding whitespace and of all backslashes. A piece that mentions
    "table-wrap" keeps its internal newlines and gets the fence marker put
    back on both sides; any other piece has its newlines turned into spaces
    and runs of spaces squeezed down to one. The pieces are finally joined
    with a single space.
    """
    fence = "::::"
    pieces = []
    for chunk in content.split(fence):
        cleaned = chunk.strip().replace("\\", "")
        if "table-wrap" in cleaned:
            # Table block: keep its line structure and restore the fences
            # that split() consumed.
            pieces.append(fence + cleaned + fence)
            continue
        single_line = cleaned.replace("\n", " ")
        pieces.append(re.sub(' +', ' ', single_line))
    return " ".join(pieces)

In [41]:
def read_xml_directory(directory):
    """List the names of the regular, non-hidden files in a directory.

    Despite the name, no extension filter is applied: this notebook uses the
    function for both the XML input directories and the markdown output
    directories.

    Args:
        directory: Path of the directory to list.

    Returns:
        Alphabetically sorted list of file names (names only, not full paths).
    """
    filenames = []
    # os.listdir() order is arbitrary; sort so repeated runs are reproducible.
    for filename in sorted(os.listdir(directory)):
        # Skip dotfiles (e.g. .DS_Store) and sub-directories
        # (e.g. .ipynb_checkpoints): callers open every returned name as a
        # document and would fail on these entries.
        if filename.startswith("."):
            continue
        if not os.path.isfile(os.path.join(directory, filename)):
            continue
        filenames.append(filename)

    return filenames

In [42]:
def convert_xml_to_markdown(read_directory, write_directory):
    """Convert every JATS XML file in ``read_directory`` to a markdown file.

    Each input is converted with pandoc (input format "jats", output format
    "markdown"), prefixed with an "# Abstract" heading, flattened with
    ``join_lines_except_table`` and written to ``write_directory`` as
    ``<base name>.md``, where the base name is the part of the input file
    name before its first ".".

    Args:
        read_directory: Directory holding the XML inputs.
        write_directory: Directory for the markdown outputs; it is created
            if it does not exist yet.
    """
    # On a fresh run the output directory may not exist yet; without this the
    # first open(..., "w") below fails with FileNotFoundError.
    if write_directory:
        os.makedirs(write_directory, exist_ok=True)

    for filename in read_xml_directory(read_directory):
        source_path = os.path.join(read_directory, filename)
        doc = pypandoc.convert_file(source_path, "markdown", format="jats")
        # need to add abstract to the beginning of the file since it gets removed during conversion
        doc = "# Abstract\n\n" + doc
        # some pre-processing to reduce unnecessary newlines (tables are kept as-is)
        doc = join_lines_except_table(doc)
        name = filename.split(".")[0]
        output_file = name + ".md"
        # NOTE(review): written with the platform default encoding, which is
        # also how get_token_counts reads the file back.
        with open(os.path.join(write_directory, output_file), "w") as f:
            f.write(doc)

In [43]:
import tiktoken

def count_tokens(string, encoding):
    """Return how many tokens ``encoding`` produces for ``string``.

    ``encoding`` is any object whose ``encode(text)`` returns a sized
    sequence of tokens.
    """
    token_ids = encoding.encode(string)
    return len(token_ids)

def get_token_counts(directory):
    """Count the tokens of every file in a directory, keyed by PMC id.

    The key is derived from the file name: the part before the first "." is
    split on "C" and the second piece is used (e.g. "PMC123.md" -> "123").
    A file name without a "C" in that part raises IndexError.

    Args:
        directory: Directory whose files are tokenized.

    Returns:
        Dict mapping each id string to the number of "cl100k_base" tokens in
        the corresponding file.
    """
    # The tokenizer is identical for every file, so build it once instead of
    # once per loop iteration.
    encoding = tiktoken.get_encoding("cl100k_base")

    token_counts = {}
    for filename in read_xml_directory(directory):
        pmcid = filename.split(".")[0].split('C')[1]
        # Keep the file handle open only for the read itself.
        with open(os.path.join(directory, filename), "r") as f:
            text = f.read()
        token_counts[pmcid] = count_tokens(text, encoding)
    return token_counts

In [44]:
import json

def add_markdown_tokens_to_json(read_filename, output_filename, token_counts):
    """Annotate each JSON record with its markdown token count and save the result.

    The input file must hold a list of records, each with a 'pmcid' entry.
    The id is converted to a string, looked up in ``token_counts`` and stored
    on the record under 'tiktoken_without_attributes_markdown_token_num'.
    A record whose id is missing from ``token_counts`` raises KeyError before
    anything is written.
    """
    token_key = 'tiktoken_without_attributes_markdown_token_num'

    # Load every record from the source JSON file
    with open(read_filename, "r") as source:
        records = json.load(source)

    # Attach the token count to each record
    annotated = [
        {**record, token_key: token_counts[str(record['pmcid'])]}
        for record in records
    ]

    # Write the annotated records to the destination file
    with open(output_filename, "w") as sink:
        json.dump(annotated, sink)

In [45]:
# Annotated RCT dataset (markdown is generated from the XML without attributes)

# Dataset JSON to read, and the annotated copy to write
read_filename = "annotated_rct_dataset.json"
output_filename = "annotated_rct_dataset_with_markdown_tokens.json"

convert_xml_to_markdown(
    read_directory="no_attributes_xml_files",
    write_directory="no_attributes_markdown_files",
)
token_counts = get_token_counts(directory="no_attributes_markdown_files")
add_markdown_tokens_to_json(
    read_filename=read_filename,
    output_filename=output_filename,
    token_counts=token_counts,
)


In [46]:
# Show the token count of every converted file, keyed by the id parsed from its file name
print(token_counts)

{'5773985': 1917, '3510731': 943, '5781260': 2662, '4319071': 9628, '3278655': 2857, '5541727': 4032, '4987981': 5779, '4355974': 3820, '3648394': 4261, '2681019': 543, '4511433': 2969, '4033571': 4514, '4210722': 5695, '2952311': 2143, '4709985': 2363, '3309311': 1073, '5268424': 4148, '1863515': 3103, '3496170': 1731, '4574984': 5795, '5360580': 6033, '5777645': 3738, '5534041': 4357, '4928400': 1670, '2596788': 2520, '5515881': 2868, '3321528': 2880, '5971365': 2130, '5086025': 13068, '3003523': 7520, '4323894': 2328, '5771543': 7366, '3877023': 7100, '4879328': 5640, '1574360': 1376, '4188762': 1564, '5617873': 4396, '3136370': 2937, '4483334': 1068, '5777419': 3181, '3912320': 1015, '5711682': 2966, '4678179': 5204, '3195393': 5589, '5079604': 4923, '4215531': 4156, '4541185': 907, '5380326': 4260, '1216327': 3913, '4140238': 5052, '3169777': 3326, '4357072': 1385, '5380284': 1251, '3546023': 11524, '2974815': 4469, '3508963': 1552, '5726464': 3133, '5055753': 2114, '3504298': 333

In [47]:
# Number of documents whose token count exceeds each threshold
for threshold in (1700, 7500, 15000):
    print(sum(1 for value in token_counts.values() if value > threshold))

# Mean token count per document
total_sum = sum(token_counts.values())
print(total_sum/len(token_counts))


91
9
0
3580.641666666667


In [48]:
# Case study dataset (markdown is generated from the XML without attributes)

# Dataset JSON to read, and the annotated copy to write
read_filename = "meta_analysis_case_study.json"
output_filename = "meta_analysis_case_study_with_markdown_tokens.json"

convert_xml_to_markdown(
    read_directory="no_attributes_case_study_xml_files",
    write_directory="no_attributes_case_study_markdown_files",
)
token_counts = get_token_counts(directory="no_attributes_case_study_markdown_files")
add_markdown_tokens_to_json(
    read_filename=read_filename,
    output_filename=output_filename,
    token_counts=token_counts,
)


In [49]:
# Show the token count of every converted case-study file, keyed by the id parsed from its file name
print(token_counts)

{'7190303': 7623, '7727327': 5440, '7442954': 1036, '7262788': 7710}


In [50]:
# Number of documents whose token count exceeds each threshold
for threshold in (1700, 7500, 15000):
    print(sum(1 for value in token_counts.values() if value > threshold))

# Mean token count per document
total_sum = sum(token_counts.values())
print(total_sum/len(token_counts))

3
2
0
5452.25
