In [68]:
import json
import os
from collections import Counter, defaultdict

In [None]:
# python script for analyzing the data
# dataset statistics

In [17]:
# total number of PMCIDs (meaning RCTs) in the dataset
# The encoding is given explicitly so that decoding the JSON file does not
# depend on the platform's default locale encoding.
with open('meta_analysis_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect the PMCIDs into a set so that each one is counted only once
pmcids = {record['pmcid'] for record in data}
print(f'Total number of PMCIDs in the dataset: {len(pmcids)}')

Total number of PMCIDs in the dataset: 69


In [18]:
# total number of instances in the dataset (one entry of `data` per instance)
num_instances = len(data)
print(f'Total number of instances in the dataset: {num_instances}')

Total number of instances in the dataset: 353


In [19]:
# total number of PMCIDs (meaning RCTs) for each split (dev and test)
# The split label is lower-cased before comparison, so 'Dev' / 'DEV' / 'dev' all match.
dev_pmcids = {
    rec['pmcid']
    for rec in data
    if rec['split'].lower() == 'dev'
}
test_pmcids = {
    rec['pmcid']
    for rec in data
    if rec['split'].lower() == 'test'
}
print(f'Total number of PMCIDs in the dev split: {len(dev_pmcids)}')
print(f'Total number of PMCIDs in the test split: {len(test_pmcids)}')

Total number of PMCIDs in the dev split: 10
Total number of PMCIDs in the test split: 59


In [21]:
# total number of instances for each split (dev and test)
# Records are kept in their original order; the split label is compared case-insensitively.
dev_data = [
    rec
    for rec in data
    if rec['split'].lower() == 'dev'
]
test_data = [
    rec
    for rec in data
    if rec['split'].lower() == 'test'
]
print(f'Total number of instances in the dev split: {len(dev_data)}')
print(f'Total number of instances in the test split: {len(test_data)}')

Total number of instances in the dev split: 39
Total number of instances in the test split: 314


In [32]:
# number of instances per class (binary vs continuous)
# Tally the 'outcome_type' field over every instance, then show it as a plain dict.
outcome_type_counts = Counter(rec['outcome_type'] for rec in data)
print('Total instances per class', dict(outcome_type_counts))

Total instances per class {'binary': 95, 'continuous': 258}


In [33]:
# number of instances per class (binary vs continuous) for each split
# Same tally as for the full dataset, restricted to the dev and test subsets in turn.
dev_outcome_type_counts = Counter(rec['outcome_type'] for rec in dev_data)
print('Total instances per class in dev split', dict(dev_outcome_type_counts))
test_outcome_type_counts = Counter(rec['outcome_type'] for rec in test_data)
print('Total instances per class in test split', dict(test_outcome_type_counts))

Total instances per class in dev split {'binary': 10, 'continuous': 29}
Total instances per class in test split {'continuous': 229, 'binary': 85}


In [37]:
# Scratch check: an empty string is falsy, so this expression evaluates to False.
# NOTE(review): looks like a leftover exploratory cell — confirm whether it is still needed.
bool("")

False

In [43]:
# number of instances with enough data to calculate point estimates and variances
# An instance counts when none of its required effect-size fields holds the
# 'cannot calculate' sentinel. Continuous outcomes need the SMD and its variance;
# every other outcome type needs the log odds ratio, the log risk ratio and both
# of their variances.
count = 0

for record in data:
    if record['outcome_type'] == 'continuous':
        required_fields = (
            'standardized_mean_difference',
            'smd_sampling_variance',
        )
    else:
        required_fields = (
            'log_odds_ratio',
            'lor_sampling_variance',
            'log_risk_ratio',
            'lrr_sampling_variance',
        )

    # all() stops at the first sentinel, checking the fields in the listed order
    if all(record[field] != 'cannot calculate' for field in required_fields):
        count += 1

print(f'Number of instances with enough data to calculate point estimates and variances: {count}')

Number of instances with enough data to calculate point estimates and variances: 201


In [80]:
def convert_to_bool(value):
    """Translate a textual truth value into a Python bool.

    The value is converted with ``str()`` and lower-cased before it is
    compared, so ``True``, ``'True'`` and ``'TRUE'`` all give ``True`` (and
    likewise for the false spellings).

    Returns:
        ``True`` or ``False`` for a recognised value, ``None`` for anything
        else (including ``None`` itself and the empty string).
    """
    recognised = {'true': True, 'false': False}
    return recognised.get(str(value).lower())

In [63]:
# number of RCTs with tables that are in graphic/figure format
# Collect the distinct PMCIDs that have at least one instance whose
# 'is_table_in_graphic_format' flag converts to True.
graphic_table_pmcids = {
    rec['pmcid']
    for rec in data
    if convert_to_bool(rec['is_table_in_graphic_format'])
}
print('Number of RCTs with tables that are in graphic/figure format: ', len(graphic_table_pmcids))

Number of RCTs with tables that are in graphic/figure format:  14


In [70]:
# Convert numbers that may contain thousands separators (commas) to int
def convert_to_int(value):
    """Return ``value`` as an int, dropping any commas first.

    When the string form of ``value`` contains a comma, every comma is removed
    from ``value`` before it is passed to ``int()``; otherwise ``value`` is
    passed to ``int()`` unchanged.
    """
    has_comma = ',' in str(value)
    return int(value.replace(',', '') if has_comma else value)

In [74]:
# average number of tokens for each RCT
# The token count is taken from the first instance seen for each PMCID;
# later instances with the same PMCID are skipped.
tokens_map = {}
for record in data:
    pmcid = record['pmcid']
    if pmcid in tokens_map:
        continue
    tokens_map[pmcid] = convert_to_int(record['openai_gpt_token_num_abstract_results'])

average_tokens = sum(tokens_map.values()) / len(tokens_map)
print(f'Average number of tokens for each RCT: {average_tokens}')

Average number of tokens for each RCT: 7823.144927536232


In [96]:
# number of instances where important numerical data was found in tables
# Count the instances whose 'is_relevant_data_in_table' flag converts to True.
num_with_table_data = sum(
    1 for rec in data if convert_to_bool(rec["is_relevant_data_in_table"])
)
print('Number of instances where data is found in tables: ', num_with_table_data)

Number of instances where data is found in tables:  230


In [97]:
# TODO: number of instances where important numerical data was found in the text
# TODO: number of instances where important numerical data was found in both tables and text

# TODO: any other statistics that would be useful