In [1]:
import numpy as np, os, sys, matplotlib.pyplot as plt, seaborn as sns, pandas, orjson
from tqdm import tqdm

In [2]:
def get_dict_of_arrays(exp_model_dict, read_directory='../log_calculations/unified_outputs/', fname_to_read='pppl.npy',convert_for_plot=True):
    """Load one ``.npy`` file per model and return the arrays keyed by model name.

    Each file is read from
    ``<read_directory>/<experiment>/<model>/<fname_to_read>``.

    Args:
        exp_model_dict: Mapping of experiment directory name to an iterable of
            model names.
        read_directory: Root directory that contains the experiment directories.
        fname_to_read: Name of the file loaded from every model directory.
        convert_for_plot: When true, each loaded array is passed through
            ``get_xy`` before being stored; otherwise the array is stored as
            loaded.

    Returns:
        dict mapping model name to the loaded (or converted) array. Entries are
        keyed by model name only, so a model listed under several experiments
        keeps the value from the last experiment iterated.

    Note:
        ``get_xy`` is not defined in the visible part of this notebook; it has
        to exist in the namespace whenever ``convert_for_plot`` is true.
    """
    # Lazily yields (model name, file path) pairs in experiment/model order.
    model_paths = (
        (model_name, os.path.join(read_directory, experiment, model_name, fname_to_read))
        for experiment, model_names in exp_model_dict.items()
        for model_name in model_names
    )

    loaded = {}
    for model_name, npy_path in model_paths:
        loaded[model_name] = get_xy(np.load(npy_path)) if convert_for_plot else np.load(npy_path)
    return loaded

In [3]:
def get_dict_of_df(df_dict, read_directory='../log_calculations/unified_outputs/', fname_to_read='filtered_df.csv'):
    """Read one CSV per model and remember the directory each was read from.

    Each file is read from
    ``<read_directory>/<experiment>/<model>/<fname_to_read>``.

    Args:
        df_dict: Mapping of experiment directory name to an iterable of model
            names.
        read_directory: Root directory that contains the experiment directories.
        fname_to_read: Name of the CSV file read from every model directory.

    Returns:
        Tuple ``(dfs, read_dirs)``: ``dfs`` maps model name to the DataFrame
        read for it, and ``read_dirs`` lists the model directories in the order
        they were visited. ``dfs`` is keyed by model name only, whereas
        ``read_dirs`` gets one entry per (experiment, model) pair.
    """
    frames = {}
    model_dirs = []
    for experiment, model_names in df_dict.items():
        for model_name in model_names:
            model_dir = os.path.join(read_directory, experiment, model_name)
            model_dirs.append(model_dir)
            frames[model_name] = pandas.read_csv(os.path.join(model_dir, fname_to_read))
    return frames, model_dirs

In [4]:
# Experiment directory -> model names whose outputs are loaded below.
# Alternative model list (commented out):
#   'every_5_5000' : ['allenai/scibert_scivocab_cased', 'bert-base-cased', 'roberta-base', 'xlm-roberta-base']
models = {'every_5_5000': ['bert-large-cased']}

# Per-model data frames, plus the directory each one was read from.
dfs, read_dirs = get_dict_of_df(df_dict=models)

# Arrays stored exactly as loaded (no get_xy conversion).
model_nps = get_dict_of_arrays(
    exp_model_dict=models,
    fname_to_read='filtered_log_probs.npy',
    convert_for_plot=False,
)

In [5]:
# Directory holding one sample file per bin; each file is named by the
# zero-padded (3 digit) bin id and holds one JSON record per line.
bin_samples_dir = '/projects/abeb4417/jsalt/lm_perplexity/sampling/bin_samples/'

# Every distinct (bin, corpusid) pair across all loaded model frames.
cids_to_check = pandas.concat([df[['bin', 'corpusid']] for _, df in dfs.items()], axis=0).drop_duplicates()
bins = cids_to_check['bin'].unique()

# corpusid -> external-id record taken from the bin sample files.
external_data = {}

# `bin_id` rather than `bin`, so the builtin `bin` is not shadowed for the rest
# of the notebook.
for bin_id in tqdm(bins):
    # Build the lookup set once per bin: membership tests become O(1) instead
    # of scanning a numpy array (and rebuilding a set) for every input line.
    wanted_cids = set(cids_to_check.loc[cids_to_check['bin'] == bin_id, 'corpusid'].tolist())
    seen_cids = set()
    with open(os.path.join(bin_samples_dir, f'{bin_id:03d}')) as input_sample:
        for line in input_sample:
            dat = orjson.loads(line)
            cid = dat['corpusid']
            if cid not in wanted_cids:
                continue
            # Keep the first record seen for a corpus id.
            if cid not in external_data:
                external_data[cid] = dat['openaccessinfo']['externalids']
            seen_cids.add(cid)
            # seen_cids only ever holds wanted ids, so equal sizes means every
            # wanted id was found and the rest of the file can be skipped.
            if len(seen_cids) == len(wanted_cids):
                break

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:33<00:00,  1.60s/it]


In [6]:
# Count the rows of cids_to_check that carry this corpus id. A notebook cell
# only displays its last expression, so this count is computed but not shown.
(cids_to_check['corpusid'] == 98785756).sum()
# External-id record collected for the same corpus id (the displayed value).
external_data[98785756]

{'MAG': '2129968670',
 'ACL': None,
 'DOI': '10.1002/PRAC.19231050116',
 'PubMedCentral': None,
 'ArXiv': None}

In [7]:
# Union of the key names used across every collected external-id record.
k = {id_source for id_record in external_data.values() for id_source in id_record}

k

{'ACL', 'ArXiv', 'DOI', 'MAG', 'PubMedCentral'}

In [8]:
# df['has_pubmed'] = df['corpusid'].apply(lambda cid: True if (cid in external_ids) and (('PubMed' in external_ids[cid]['externalIds']) or ('PubMedCentral' in external_ids[cid]['externalIds'])) else False)

# Add one boolean `has_<source>` column (lower-cased) per external-id source to
# every model frame: True when the record for the row's corpus id holds a
# non-None value for that source.
for _, df in dfs.items():
    for external_source in ['MAG', 'ACL', 'DOI', 'PubMedCentral', 'ArXiv']:
        df[f'has_{external_source}'.lower()] = df['corpusid'].apply(
            # A corpus id with no collected record (or a None record) counts as
            # "not available" instead of raising KeyError/TypeError.
            lambda cid: (external_data.get(cid) or {}).get(external_source) is not None
        )


In [9]:
#Need to query semantic scholar for pubmed
from tqdm import trange
import requests
def fetch_semantic_scholar_info(corpus_ids=None, paper_ids=None, fields="title,abstract,externalIds,corpusId", max_batch_size=500, output_file=None, verbose=False):
    """Look up papers through the Semantic Scholar Graph API batch endpoint.

    The ids are sent in consecutive batches of at most ``max_batch_size`` ids
    per POST request. The API key is read from the ``SPECTER_API_KEY``
    environment variable.

    Args:
        corpus_ids: Iterable of corpus ids; each is sent as ``CorpusId:<id>``.
            ``None`` is treated as empty.
        paper_ids: Iterable of already-formatted id strings, sent after the
            corpus ids. ``None`` is treated as empty.
        fields: Comma-separated field list passed as the ``fields`` query
            parameter.
        max_batch_size: Maximum number of ids per request.
        output_file: Optional path. When given, the raw response entries of
            all batches are written there as one JSON list.
        verbose: When true, print the ids of each batch before sending it.

    Returns:
        dict mapping the ``corpusId`` of each returned entry to that entry.
        Entries that are not dicts (e.g. ``None``) are skipped. A batch whose
        request fails, or whose body is not a JSON list, is skipped with a
        warning.
    """
    # Local imports: neither module is imported at the top of the notebook.
    import json
    import warnings

    processed_ids = ['CorpusId:{}'.format(cid) for cid in (corpus_ids if corpus_ids is not None else [])]
    processed_ids += list(paper_ids if paper_ids is not None else [])

    # The key does not change between batches, so look it up once.
    apikey = os.environ.get('SPECTER_API_KEY')

    returned_objects = {}
    raw_entries = []

    # Iterate over ALL ids (corpus ids and paper ids), not just the corpus ids,
    # so that trailing paper ids are sent as well.
    for start in trange(0, len(processed_ids), max_batch_size):
        to_send = processed_ids[start:start + max_batch_size]

        if verbose:
            print('Sending following IDS to Semantic Scholar (len {}): {}'.format(len(to_send), to_send))

        r = requests.post(
            'https://api.semanticscholar.org/graph/v1/paper/batch',
            params={'fields': fields},
            headers={"x-api-key": apikey},
            json={"ids": to_send}
        )

        payload = r.json() if r.ok else None
        if not isinstance(payload, list):
            # Keep going with the remaining batches, but do not hide the failure.
            warnings.warn(
                'Semantic Scholar batch request returned no result list '
                '(HTTP status {}); skipping {} ids'.format(r.status_code, len(to_send))
            )
            continue

        raw_entries.extend(payload)
        for resp in payload:
            if isinstance(resp, dict):
                returned_objects[resp['corpusId']] = resp

    if output_file:
        # Dump the entries of every batch, not only those of the last request.
        with open(output_file, 'w') as f:
            json.dump(raw_entries, f, indent=2)

    return returned_objects

In [10]:
import pickle

# Previously stored lookup, keyed by corpus id (presumably earlier Semantic
# Scholar responses - confirm). pickle.load can execute arbitrary code, so this
# file must come from a trusted source.
with open('/projects/abeb4417/jsalt/lm_perplexity/log_calculations/pickles/every_5_5000/external_ids.pickle', 'rb') as f:
    ss_queries = pickle.load(f)

# Corpus ids that are missing from the stored lookup still have to be requested.
need_to_fetch = [cid for cid in cids_to_check['corpusid'] if cid not in ss_queries]

ss_new_resps = fetch_semantic_scholar_info(need_to_fetch, fields='corpusId,externalIds')

# Stored and fresh responses combined; fresh entries win on duplicate keys.
ss_pubmed = ss_queries | ss_new_resps

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.45it/s]


In [11]:
# Flag the rows whose corpus id has a record in ss_pubmed that lists a
# 'PubMed' key under 'externalIds'.
for df in dfs.values():
    df['has_pubmed'] = df['corpusid'].apply(
        lambda cid: cid in ss_pubmed and 'PubMed' in ss_pubmed[cid]['externalIds']
    )

In [12]:
# Number of rows flagged as having a PubMed id. `df` here is whichever frame
# the preceding `for ... in dfs` loop left bound last.
df['has_pubmed'].eq(True).sum()

20747

In [13]:
# dfs is keyed by model name while read_dirs holds one entry per
# (experiment, model) pair. If their lengths differ, zip would silently pair
# frames with the wrong directories, so stop instead of writing misplaced files.
if len(dfs) != len(read_dirs):
    raise ValueError(
        'dfs has {} entries but read_dirs has {}; cannot pair frames with '
        'their directories'.format(len(dfs), len(read_dirs))
    )

# `out_dir` rather than `dir`, so the builtin `dir` is not shadowed.
for (model, df), out_dir in zip(dfs.items(), read_dirs):
    print(out_dir)
    print(model)
    # The file carries a .tsv extension, so write tab-separated values rather
    # than to_csv's default comma-separated output.
    df.to_csv(os.path.join(out_dir, 'filtered_df_with_source.tsv'), sep='\t')

../log_calculations/unified_outputs/every_5_5000/bert-large-cased
bert-large-cased
