In [1]:
import json
from collections import Counter

In [2]:
# python script for analyzing the data
# dataset statistics

In [3]:
# Load the annotated dataset and count the distinct PMCIDs (one PMCID per RCT)
with open('annotated_rct_dataset.json', 'r', encoding='utf-8') as dataset_file:
    data = json.load(dataset_file)

# A set comprehension collapses PMCIDs that are repeated across records
pmcids = {record['pmcid'] for record in data}
print(f'Total number of PMCIDs in the dataset: {len(pmcids)}')

Total number of PMCIDs in the dataset: 120


In [4]:
# Every record in the loaded dataset is one instance
total_instances = len(data)
print(f'Total number of instances in the dataset: {total_instances}')

Total number of instances in the dataset: 699


In [5]:
# Distinct PMCIDs (i.e. RCTs) per split; the split label is matched case-insensitively
dev_pmcids = {record['pmcid'] for record in data if record['split'].lower() == 'dev'}
test_pmcids = {record['pmcid'] for record in data if record['split'].lower() == 'test'}
print(f'Total number of PMCIDs in the dev split: {len(dev_pmcids)}')
print(f'Total number of PMCIDs in the test split: {len(test_pmcids)}')

Total number of PMCIDs in the dev split: 10
Total number of PMCIDs in the test split: 110


In [6]:
# Partition the records into the dev and test splits in a single pass.
# The split label is matched case-insensitively; records carrying any other
# label end up in neither list.
dev_data, test_data = [], []
for record in data:
    split_name = record['split'].lower()
    if split_name == 'dev':
        dev_data.append(record)
    elif split_name == 'test':
        test_data.append(record)

print(f'Total number of instances in the dev split: {len(dev_data)}')
print(f'Total number of instances in the test split: {len(test_data)}')

Total number of instances in the dev split: 43
Total number of instances in the test split: 656


In [7]:
# Tally instances by outcome type (binary vs continuous) over the whole dataset
outcome_type_counts = Counter(record['outcome_type'] for record in data)
print('Total instances per class', dict(outcome_type_counts))

Total instances per class {'binary': 182, 'continuous': 517}


In [8]:
# Tally instances by outcome type (binary vs continuous) separately for each split
dev_outcome_type_counts = Counter(record['outcome_type'] for record in dev_data)
test_outcome_type_counts = Counter(record['outcome_type'] for record in test_data)
print('Total instances per class in dev split', dict(dev_outcome_type_counts))
print('Total instances per class in test split', dict(test_outcome_type_counts))

Total instances per class in dev split {'binary': 11, 'continuous': 32}
Total instances per class in test split {'continuous': 485, 'binary': 171}


In [9]:
# Count the instances flagged as having enough data to calculate point
# estimates and variances. The flag is compared against the exact string 'TRUE'.
count = sum(1 for record in data if record['is_data_complete'] == 'TRUE')

print(f'Number of instances with enough data to calculate point estimates and variances: {count}')
print(f'Percentage of instances with enough data to calculate point estimates and variances: {count/len(data)*100:.2f}%')

Number of instances with enough data to calculate point estimates and variances: 439
Percentage of instances with enough data to calculate point estimates and variances: 62.80%


In [10]:
# Count the DEV-split instances flagged as having enough data to calculate
# point estimates and variances. The flag is compared against the exact string 'TRUE'.
count = sum(1 for record in dev_data if record['is_data_complete'] == 'TRUE')

print(f'Number of instances in DEV with enough data to calculate point estimates and variances: {count}')
print(f'Percentage of instances in DEV with enough data to calculate point estimates and variances: {count/len(dev_data)*100:.2f}%')

Number of instances in DEV with enough data to calculate point estimates and variances: 27
Percentage of instances in DEV with enough data to calculate point estimates and variances: 62.79%


In [11]:
# Count the TEST-split instances flagged as having enough data to calculate
# point estimates and variances. The flag is compared against the exact string 'TRUE'.
count = sum(1 for record in test_data if record['is_data_complete'] == 'TRUE')

print(f'Number of instances in TEST with enough data to calculate point estimates and variances: {count}')
print(f'Percentage of instances in TEST with enough data to calculate point estimates and variances: {count/len(test_data)*100:.2f}%')

Number of instances in TEST with enough data to calculate point estimates and variances: 412
Percentage of instances in TEST with enough data to calculate point estimates and variances: 62.80%


In [12]:
def convert_to_bool(value):
    """Map a textual truth flag to a bool.

    The value is stringified and lower-cased before matching, so 'TRUE',
    'True' and the bool ``True`` all yield ``True`` (likewise for false).

    Returns:
        True or False for a recognised flag, None for anything else.
    """
    flag_lookup = {'true': True, 'false': False}
    return flag_lookup.get(str(value).lower())

In [13]:
# Distinct RCTs (by PMCID) having at least one record flagged as a table in graphic/figure format
graphic_table_pmcids = {
    record['pmcid']
    for record in data
    if convert_to_bool(record['is_table_in_graphic_format'])
}
print('Number of RCTs with tables that are in graphic/figure format: ', len(graphic_table_pmcids))

Number of RCTs with tables that are in graphic/figure format:  20


In [14]:
# Convert numbers with commas to numbers
def convert_to_int(value):
    """Convert a value to int, accepting comma thousands separators.

    A value whose string form contains no comma is passed straight to
    ``int``. Otherwise the commas are removed from the value (which must
    then be a string) before conversion.

    Returns:
        The integer value, e.g. '7,606' -> 7606.
    """
    if ',' not in str(value):
        return int(value)
    return int(value.replace(',', ''))

In [15]:
# Average token count per RCT for the XML version WITH attributes.
# Only the first record seen for each PMCID contributes its token count.
tokens_map = {}
for record in data:
    pmcid = record['pmcid']
    if pmcid in tokens_map:
        continue
    tokens_map[pmcid] = convert_to_int(record['tiktoken_with_attributes_xml_token_num'])

print(f'Average number of tokens for each RCT with xml attributes: {sum(tokens_map.values()) / len(tokens_map)}')

Average number of tokens for each RCT with xml attributes: 7606.283333333334


In [16]:
# Average token count per RCT for the XML version WITHOUT attributes.
# Only the first record seen for each PMCID contributes its token count.
tokens_map = {}
for record in data:
    pmcid = record['pmcid']
    if pmcid in tokens_map:
        continue
    tokens_map[pmcid] = convert_to_int(record['tiktoken_without_attributes_xml_token_num'])

print(f'Average number of tokens for each RCT WITHOUT xml attributes: {sum(tokens_map.values()) / len(tokens_map)}')

Average number of tokens for each RCT WITHOUT xml attributes: 4868.241666666667


In [17]:
# Average token count per DEV RCT for the XML version WITHOUT attributes.
# Only the first record seen for each PMCID contributes its token count.
dev_tokens_map = {}
for record in dev_data:
    pmcid = record['pmcid']
    if pmcid in dev_tokens_map:
        continue
    dev_tokens_map[pmcid] = convert_to_int(record['tiktoken_without_attributes_xml_token_num'])

average_length = sum(dev_tokens_map.values()) / len(dev_tokens_map)
print(f'Average number of tokens for each DEV RCT WITHOUT xml attributes: {average_length}')


Average number of tokens for each DEV RCT WITHOUT xml attributes: 4565.4


In [18]:
# Average token count per TEST RCT for the XML version WITHOUT attributes.
# Only the first record seen for each PMCID contributes its token count.
test_tokens_map = {}
for record in test_data:
    pmcid = record['pmcid']
    if pmcid in test_tokens_map:
        continue
    test_tokens_map[pmcid] = convert_to_int(record['tiktoken_without_attributes_xml_token_num'])

average_length = sum(test_tokens_map.values()) / len(test_tokens_map)
print(f'Average number of tokens for each TEST RCT WITHOUT xml attributes: {average_length}')

Average number of tokens for each TEST RCT WITHOUT xml attributes: 4895.772727272727


In [19]:
# Average token count per RCT for the Markdown version.
# Only the first record seen for each PMCID contributes its token count.
tokens_map = {}
for record in data:
    pmcid = record['pmcid']
    if pmcid in tokens_map:
        continue
    tokens_map[pmcid] = convert_to_int(record['tiktoken_without_attributes_markdown_token_num'])

print(f'Average number of tokens for each RCT (markdown version): {sum(tokens_map.values()) / len(tokens_map)}')

Average number of tokens for each RCT (markdown version): 3580.641666666667


In [20]:
# Average token count per DEV RCT for the Markdown version.
# Only the first record seen for each PMCID contributes its token count.
tokens_map = {}
for record in dev_data:
    pmcid = record['pmcid']
    if pmcid in tokens_map:
        continue
    tokens_map[pmcid] = convert_to_int(record['tiktoken_without_attributes_markdown_token_num'])

print(f'Average number of tokens for each RCT (markdown version) in DEV: {sum(tokens_map.values()) / len(tokens_map)}')

Average number of tokens for each RCT (markdown version) in DEV: 3331.4


In [21]:
# Average token count per TEST RCT for the Markdown version.
# Only the first record seen for each PMCID contributes its token count.
tokens_map = {}
for record in test_data:
    pmcid = record['pmcid']
    if pmcid in tokens_map:
        continue
    tokens_map[pmcid] = convert_to_int(record['tiktoken_without_attributes_markdown_token_num'])

print(f'Average number of tokens for each RCT (markdown version) in TEST: {sum(tokens_map.values()) / len(tokens_map)}')

Average number of tokens for each RCT (markdown version) in TEST: 3603.3


In [22]:
# Count the instances whose relevant numerical data was flagged as found in tables
table_data_count = sum(1 for record in data if convert_to_bool(record["is_relevant_data_in_table"]))
print('Number of instances where data is found in tables: ', table_data_count)

Number of instances where data is found in tables:  471
