# Language Properties over Time: Data Loading

In [7]:
%matplotlib widget
import ast
import numpy as np, os, sys, matplotlib.pyplot as plt, seaborn as sns, pandas, orjson, regex as re
from tqdm import tqdm, trange
from iso639 import languages
from langdetect import detect
from langdetect import DetectorFactory
from mosestokenizer import MosesTokenizer

tqdm.pandas()

## Data Loading

### Load PubMed Data

In [2]:
# Read the id file as int32 values and keep them in a set so that later
# membership checks are constant-time.
ids_in_pubmed = set(
    np.loadtxt(
        '/rc_scratch/abeb4417/jsalt/semantic_scholar/ids/2022-12-02/database/external_ids/PubMed/sorted_uniq',
        dtype='int32',
    )
)

### Load Abstracts

In [3]:
def load_abstracts(bin_num, sample_size=10000, bin_samples_dir='/projects/abeb4417/jsalt/lm_perplexity/sampling/bin_samples/'):
    """Load up to ``sample_size`` abstract records from one bin sample file.

    The file is ``bin_samples_dir`` joined with ``bin_num`` zero-padded to
    three digits, and is read as one JSON object per line.

    Args:
        bin_num: Index of the bin; also stored in every returned record.
        sample_size: Maximum number of records to return. Values <= 0 yield
            an empty list.
        bin_samples_dir: Directory that contains the per-bin sample files.

    Returns:
        A list of dicts with the keys ``'bin'``, ``'corpusid'`` and
        ``'abstract'`` (whitespace runs collapsed to a single space), merged
        with the entries of ``openaccessinfo.externalids`` of each record.
    """
    # `\s` already covers newlines and tabs. The previous character class
    # `[\n|\t|\s]` additionally matched a literal `|`, which silently removed
    # pipe characters from the abstracts.
    whitespace_pat = re.compile(r'\s+')

    data = []
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(os.path.join(bin_samples_dir, f'{bin_num:03d}'), encoding='utf-8') as input_sample:
        for line in input_sample:
            # Check the limit before parsing so that sample_size=0 returns
            # nothing instead of one record.
            if len(data) >= sample_size:
                break

            dat = orjson.loads(line)
            abstract = whitespace_pat.sub(' ', dat['abstract'])

            data.append({'bin' : bin_num, 'corpusid' : dat['corpusid'], 'abstract' : abstract} | dat['openaccessinfo']['externalids'])

    return data
        


In [4]:
# Collect the sampled records of bins 0..99 into one flat list and build a
# single DataFrame from it at the end.
bin_data = []
for bidx in trange(100):
    bin_data += load_abstracts(bidx)

df = pandas.DataFrame.from_records(data=bin_data)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:12<00:00,  1.39it/s]


In [5]:
def _has_pubmed_id(corpus_id):
    """Return True when ``corpus_id`` is contained in ``ids_in_pubmed``."""
    return corpus_id in ids_in_pubmed

df['has_pubmed'] = df['corpusid'].progress_apply(_has_pubmed_id)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:01<00:00, 848062.96it/s]


In [None]:
# Fraction of sampled records whose corpus id is in the PubMed id set.
df['has_pubmed'].eq(True).sum() / df.shape[0]

In [None]:
# Take an explicit copy: columns are assigned to pubmed_df further down, and
# assigning into a boolean-indexed selection of df triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is modified.
pubmed_df = df.loc[df['has_pubmed']].copy()

In [6]:
# Write the frame (abstracts, external ids and the has_pubmed flag) to disk.
df.to_csv(path_or_buf='lpt_df.csv')

### Add Language Data

In [None]:
def detect_with_error(abstract):
    """Detect the language of ``abstract``, returning NaN when detection fails.

    Args:
        abstract: Text passed to ``langdetect.detect``.

    Returns:
        Whatever ``detect`` returns for the text, or ``np.nan`` if ``detect``
        raises an ordinary exception.
    """
    try:
        return detect(abstract)
    except Exception:
        # Best effort: a failed detection becomes a missing value. Catching
        # Exception (not a bare except) keeps KeyboardInterrupt and SystemExit
        # working, so a long-running apply can still be interrupted.
        return np.nan

In [None]:
# langdetect is non-deterministic by default; fixing the seed makes the
# detected-language column reproducible across runs.
DetectorFactory.seed = 0

df['langdetect'] = df['abstract'].progress_apply(detect_with_error)
df.to_csv('lpt_df_ld.csv')

### Tokenize

In [8]:
# NOTE: assigning to `languages` rebinds the name imported from iso639 in the
# import cell; after this cell it refers to the list below.
languages = ['en']

# One Moses tokenizer per configured language code.
tokenizers = {}
for lang in languages:
    tokenizers[lang] = MosesTokenizer(lang)

# Only the English tokenizer is applied to the abstracts.
df['moses_tokens'] = df['abstract'].progress_apply(
    lambda abstract_text: tokenizers['en'](abstract_text)
)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000000/1000000 [06:39<00:00, 2504.32it/s]


In [None]:
# Write the frame, now including the moses_tokens column, to disk.
df.to_csv(path_or_buf='lpt_df_moses.csv')

## Other: Token Counts and SentencePiece Fertility


In [None]:
# NOTE(review): `moses_df` is not defined in the visible cells — presumably it
# is 'lpt_df_moses.csv' read back, with each token list stored as its string
# repr; confirm.
# ast.literal_eval only accepts Python literals, so unlike eval it cannot
# execute arbitrary expressions found in the file.
moses_df['moses_tokens_eval'] = moses_df['moses_tokens'].progress_apply(ast.literal_eval)

In [None]:
# Fertility: 1/n sum(bert_tokens / total_tokens)
def count_continuation(abstract, processor=None):
    """Count the subword pieces of ``abstract`` that do not start a new word.

    Args:
        abstract: Text to encode.
        processor: Object with an ``encode(text, out_type=str)`` method that
            returns the pieces as strings. Defaults to the module-level
            ``sp``, which must be defined before the call.

    Returns:
        Number of pieces that do not begin with the marker '▁'.
    """
    encoder = sp if processor is None else processor
    tokens = encoder.encode(abstract, out_type=str)
    # Test each piece's prefix instead of counting every '▁' character in the
    # joined text: a piece holding the marker more than once, or not at its
    # start, would otherwise be subtracted incorrectly.
    return sum(1 for piece in tokens if not piece.startswith('▁'))

# NOTE(review): `vocabulary_bins` and `spm` are not defined in the visible
# cells — presumably `spm` is the sentencepiece module; confirm.
for bidx in tqdm(vocabulary_bins, total=len(vocabulary_bins)):
    # count_continuation reads the module-level `sp`, so it must be rebound
    # here before the applies below run.
    sp = spm.SentencePieceProcessor(model_file=f'spms/{bidx}_abs.model')
    pubmed_df[f'num_spm_{bidx}_tokens'] = pubmed_df['abstract'].progress_apply(lambda x : len(sp.encode(x)))
    pubmed_df[f'num_spm_{bidx}_continuation_tokens'] = pubmed_df['abstract'].progress_apply(count_continuation)

# Count the tokens themselves. Counting ',' characters is wrong in both forms
# the column can take: on a list it counts only the tokens equal to ',', and
# on the string repr read back from CSV every comma token adds an extra comma.
pubmed_df['num_moses_tokens'] = pubmed_df['moses_tokens'].progress_apply(
    lambda toks: len(ast.literal_eval(toks) if isinstance(toks, str) else toks)
)