In [49]:
import pypandoc
import os

In [50]:
def read_xml_directory(directory):
    """ Read all the XML filenames in the directory and return a list of file names"""
    filenames = []
    for filename in os.listdir(directory):
        filenames.append(filename)

    return filenames

In [59]:
for filename in read_xml_directory("no_attributes_xml_files"):
    # file = open("no_attributes_xml_files/" + filename, encoding="utf-8")
    doc = pypandoc.convert_file("no_attributes_xml_files/" + filename, "markdown", format="jats")
    # need to add abstract to the beginning of the file since it gets removed during conversion
    doc = "Abstract\n\n" + doc
    name = filename.split(".")[0]
    output_file=name + ".md"
    with open("no_attributes_markdown_files/" + output_file, "w") as f:
        f.write(doc)

In [27]:
import tiktoken

def count_tokens(string, encoding):
    """Given a string, return the number of tokens in the text"""
    return len(encoding.encode(string))

token_counts = {}
filenames = read_xml_directory("no_attributes_markdown_files")
for filename in filenames:
    with open("no_attributes_markdown_files/" + filename, "r") as f:
        pmcid = filename.split(".")[0].split('C')[1]
        text = f.read()
        encoding = tiktoken.get_encoding("cl100k_base")
        tokens = count_tokens(text, encoding)
        token_counts[pmcid] = tokens

In [28]:
print(token_counts)

{'5773985': 2030, '3510731': 1007, '5781260': 2804, '4319071': 10551, '3278655': 2969, '5541727': 4179, '4987981': 5904, '4355974': 3957, '3648394': 4403, '2681019': 598, '4511433': 3074, '4033571': 4700, '4210722': 5983, '2952311': 2201, '4709985': 2502, '3309311': 1118, '5268424': 4337, '1863515': 3292, '3496170': 1851, '4574984': 6009, '5360580': 6134, '5777645': 3846, '5534041': 4501, '4928400': 1722, '2596788': 2613, '5515881': 2973, '3321528': 2944, '5971365': 2203, '5086025': 13283, '3003523': 7725, '4323894': 2453, '5771543': 7564, '3877023': 7286, '4879328': 6027, '1574360': 1469, '4188762': 1636, '5617873': 4548, '3136370': 3039, '4483334': 1148, '5777419': 3269, '3912320': 1077, '5711682': 3018, '4678179': 5434, '3195393': 5770, '5079604': 5087, '4215531': 4286, '4541185': 961, '5380326': 4392, '1216327': 4111, '4140238': 5190, '3169777': 3447, '4357072': 1453, '5380284': 1317, '3546023': 11669, '2974815': 4618, '3508963': 1600, '5726464': 3235, '5055753': 2207, '3504298': 3

In [29]:
count = sum(1 for value in token_counts.values() if value > 1700)
count


94

In [30]:
count = sum(1 for value in token_counts.values() if value > 7500)
count

10

In [31]:
count = sum(1 for value in token_counts.values() if value > 15000)
count

0

In [32]:
total_sum = sum(value for value in token_counts.values())
total_sum/len(token_counts)

3710.4166666666665

In [33]:
import os
import re
# preprocessing to remove lines that are not tables

def read_files(directory):
    """Read all the files in the directory and return a list of file contents"""
    file_contents = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), "r") as f:
            content = f.read()
            file_contents.append(content)
    return file_contents

def join_lines_except_between_string(contents, string):
    """Join lines in the contents except for lines between the specified string"""
    joined_contents = []
    for content in contents:
        text_and_tables = content.split(string)
        joined_lines = []
        for text_or_table in text_and_tables:
            text_or_table = text_or_table.strip().replace("\\", "")
            if "table-wrap" in text_or_table:
                new_line = string + text_or_table + string
            else:
                new_line = text_or_table.replace("\n", " ")
                new_line = re.sub(' +', ' ', new_line)
            joined_lines.append(new_line)
        joined_content = " ".join(joined_lines)
        print(joined_content)
        joined_contents.append(joined_content)
    return joined_contents

directory = "no_attributes_markdown_files"
contents = read_files(directory)
joined_contents = join_lines_except_between_string(contents, "::::")


# Abstract ## Background: Minor oral surgical procedures are the most commonly performed procedures by oral and maxillofacial surgeons. Performance of painless surgical procedure is highly appreciated by the patients and is possible through the use of local anesthesia, conscious sedation or general anesthesia. Postoperative pain can also be controlled by the use of opioids, as opioid receptors exist in the peripheral nervous system and offers the possibility of providing postoperative analgesia in the surgical patient. The present study compares the efficacy of 0.5% bupivacaine versus 0.5% bupivacaine with 0.3 mg buprenorphine in minor oral surgical procedures. ## Patients and Methods: The present study was conducted in 50 patients who required minor oral surgical procedures under local anesthesia. Two types of local anesthetic solutions were used- 0.5% bupivacaine with 1:200000 epinephrine in group I and a mixture of 39 ml of 0.5% bupivacaine with epinephrine 1:200000 and 1 ml of 300 

In [34]:
token_counts2 = []
for content in joined_contents:
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = count_tokens(content, encoding)
    token_counts2.append(tokens)


In [35]:
token_counts2

[1917,
 943,
 2662,
 9628,
 2857,
 4032,
 5779,
 3820,
 4261,
 543,
 2969,
 4514,
 5695,
 2143,
 2363,
 1073,
 4148,
 3103,
 1731,
 5795,
 6033,
 3738,
 4357,
 1670,
 2520,
 2868,
 2880,
 2130,
 13068,
 7520,
 2328,
 7366,
 7100,
 5640,
 1376,
 1564,
 4396,
 2937,
 1068,
 3181,
 1015,
 2966,
 5204,
 5589,
 4923,
 4156,
 907,
 4260,
 3913,
 5052,
 3326,
 1385,
 1251,
 11524,
 4469,
 1552,
 3133,
 2114,
 3331,
 2336,
 1980,
 1009,
 721,
 1642,
 3635,
 1567,
 2434,
 1653,
 5693,
 637,
 5770,
 3891,
 9179,
 2274,
 8556,
 4631,
 928,
 5529,
 2065,
 1562,
 6664,
 4155,
 4224,
 3918,
 1931,
 4091,
 966,
 1751,
 2976,
 5281,
 971,
 4323,
 2366,
 4050,
 1617,
 8432,
 725,
 4026,
 3542,
 3573,
 749,
 1505,
 3578,
 3265,
 3773,
 3858,
 4394,
 2573,
 5310,
 2832,
 2232,
 3973,
 8549,
 1429,
 1357,
 1703,
 5694,
 8141,
 2822,
 880]

In [36]:
count = sum(1 for value in token_counts2 if value > 1700)
count

91

In [37]:
sum(1 for value in token_counts2 if value > 2000)


85

In [38]:
total_sum = sum(value for value in token_counts2)
total_sum/len(token_counts2)

3580.641666666667

In [39]:
# import json
# # Specify the file path
# file_path = "annotated_rct_dataset.json"

# # Read the file and load the JSON data
# with open(file_path, "r") as f:
#     data = json.load(f)

# for example in data:
#     pmcid = example[pmcid]
#     example['tiktoken_without_attributes_markdown_token_num'] = token_counts[pmcid]

In [40]:
from numerizer import numerize

numerized_text_list = []
for content in joined_contents:
    try:
        numerized_text = numerize(content)
    except:
        print(f"error in numerizing {content}")
        numerized_text = content
    numerized_text_list.append(numerized_text)

error in numerizing # Abstract *Background*. In view of the adverse effects of using restraints, studies examining the use of restraint reduction programs (RRPs) are needed. *Objectives*. To investigate the effect of an RRP on the reduction of physical restraint rates in rehabilitation hospitals. *Methods*. A prospective quasi-experimental clinical trial was conducted. Demographic data, medical and health-related information on recruited patients from two rehabilitation hospitals, as well as facility data on restraint rates were collected. *Results*. The increase in the restraint rate in the control site was 4.3 times greater than that in the intervention site. Changes in the restraint mode, from continuous to intermittent, and the type of restraint used were found between the pre- and postintervention periods in both the control site and the intervention site. *Discussion*. Compared with that in the control site, the RRP in the intervention site helped arrest any increase in the restr

In [41]:
from numerizer import numerize

numerize('The present has fifty participants study compares the efficacy of 0.5% bbupivacaine versus 0.5% bupivacaine with 0.3 mg buprenorphine in minor')

'The present has 50 participants study compares the efficacy of 0.5% bbupivacaine versus 0.5% bupivacaine with 0.3 mg buprenorphine in minor'

In [47]:
# method for checking if sentence contains numbers
def contains_number(sentence):
    return any(char.isdigit() for char in sentence)

import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

final_contents = []
for content in numerized_text_list:
    final_content = ""
    special_string = "::::"
    text_or_table = content.split(special_string)
    for text in text_or_table:
        if "table-wrap" in text:
            new_line = special_string + text + special_string
            final_content += new_line
        else:
            # get list of setences
            sentences = sent_tokenize(text)
            new_lines = []
            for sent in sentences:
                if contains_number(sent):
                    new_lines.append(sent)
            joined_content = " ".join(new_lines)
            final_content += joined_content
    final_contents.append(final_content)

[nltk_data] Downloading package punkt to /Users/hyesunyun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
for content in final_contents:
    print(content)

The present study compares the efficacy of 0.5% bupivacaine versus 0.5% bupivacaine with 0.3 mg buprenorphine in minor oral surgical procedures. ## Patients and Methods: The present study was conducted in 50 patients who required minor oral surgical procedures under local anesthesia. 2 types of local anesthetic solutions were used  0.5% bupivacaine with 1:200000 epinephrine in group I and 1 mixture of 39 ml of 0.5% bupivacaine with epinephrine 1:200000 and 1 ml of 300 μg buprenorphine (3 μg/kg)in group II. ## Results: The mean duration of postoperative analgesia in bupivacaine group (508.92 ± 63.30 minutes) was quite less than the buprenorphine combination group (1840.84 ± 819.51 minutes). The mean dose of postoperative analgesic medication in bupivacaine group (1.64 ± 0.99 tablets) was higher than buprenorphine combination group (0.80 ± 1.08 tablets). There was no significant difference between the 2 groups regarding the onset of action of the anesthetic effect and duration of anesthe

In [44]:
token_counts3 = []
for content in final_contents:
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = count_tokens(content, encoding)
    token_counts3.append(tokens)

In [45]:
count = sum(1 for value in token_counts3 if value > 1700)
count

84

In [46]:
total_sum = sum(value for value in token_counts3)
total_sum/len(token_counts3)

3151.575