In [25]:
import json
import jsonlines
from collections import defaultdict
from tqdm import tqdm
import pickle

In [3]:
# Path to the raw metadata dump, opened below with jsonlines.
METADATA_FILEPATH = '../data/metadata.jsonl'

# Pull every record into memory; tqdm shows a running count while reading.
data = []
with jsonlines.open(METADATA_FILEPATH) as reader:
    data.extend(tqdm(reader))

1864576it [00:31, 59826.77it/s]


In [4]:
# Sanity check: how many records were loaded.
len(data)

1864576

In [9]:
# Inspect the first record to see which fields are available.
data[0]

{'id': '0704.0001',
 'submitter': 'Pavel Nadolsky',
 'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
 'comments': '37 pages, 15 figures; published version',
 'journal-ref': 'Phys.Rev.D76:013009,2007',
 'doi': '10.1103/PhysRevD.76.013009',
 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with C

In [10]:
def get_categories(obj):
    """Return the set of individual category tags attached to a metadata record.

    Args:
        obj: Mapping with a ``'categories'`` entry. The entry may be either a
            single whitespace-separated string (``'cs.SE cs.PL'``) or an
            iterable of such strings (``['cs.SE cs.PL']``).

    Returns:
        set[str]: Every distinct tag found; empty if there are none.

    Raises:
        KeyError: If ``obj`` has no ``'categories'`` entry.
    """
    raw = obj['categories']
    # A bare string would otherwise be iterated character by character,
    # yielding single letters instead of tags.
    if isinstance(raw, str):
        raw = [raw]
    cat = set()
    for cs in raw:
        # split() with no argument drops the empty strings that split(' ')
        # produces for leading, trailing, or repeated whitespace.
        cat.update(cs.split())
    return cat
    
# Union of every category tag that appears anywhere in the loaded records.
categories = set()
for record in data:
    categories.update(get_categories(record))

In [11]:
# Show all distinct category tags collected above.
categories

{'acc-phys',
 'adap-org',
 'alg-geom',
 'ao-sci',
 'astro-ph',
 'astro-ph.CO',
 'astro-ph.EP',
 'astro-ph.GA',
 'astro-ph.HE',
 'astro-ph.IM',
 'astro-ph.SR',
 'atom-ph',
 'bayes-an',
 'chao-dyn',
 'chem-ph',
 'cmp-lg',
 'comp-gas',
 'cond-mat',
 'cond-mat.dis-nn',
 'cond-mat.mes-hall',
 'cond-mat.mtrl-sci',
 'cond-mat.other',
 'cond-mat.quant-gas',
 'cond-mat.soft',
 'cond-mat.stat-mech',
 'cond-mat.str-el',
 'cond-mat.supr-con',
 'cs.AI',
 'cs.AR',
 'cs.CC',
 'cs.CE',
 'cs.CG',
 'cs.CL',
 'cs.CR',
 'cs.CV',
 'cs.CY',
 'cs.DB',
 'cs.DC',
 'cs.DL',
 'cs.DM',
 'cs.DS',
 'cs.ET',
 'cs.FL',
 'cs.GL',
 'cs.GR',
 'cs.GT',
 'cs.HC',
 'cs.IR',
 'cs.IT',
 'cs.LG',
 'cs.LO',
 'cs.MA',
 'cs.MM',
 'cs.MS',
 'cs.NA',
 'cs.NE',
 'cs.NI',
 'cs.OH',
 'cs.OS',
 'cs.PF',
 'cs.PL',
 'cs.RO',
 'cs.SC',
 'cs.SD',
 'cs.SE',
 'cs.SI',
 'cs.SY',
 'dg-ga',
 'econ.EM',
 'econ.GN',
 'econ.TH',
 'eess.AS',
 'eess.IV',
 'eess.SP',
 'eess.SY',
 'funct-an',
 'gr-qc',
 'hep-ex',
 'hep-lat',
 'hep-ph',
 'hep-th',
 'm

In [12]:
def process_str(s):
    """Lower-case ``s`` and replace each newline or tab with a single space.

    Other whitespace is left untouched, so runs of spaces are not collapsed
    and leading/trailing spaces are kept.
    """
    return s.lower().replace('\n', ' ').replace('\t', ' ')

In [16]:
# Keep only the records that carry at least one tag starting with 'cs.',
# storing a trimmed-down copy of each under its record id.
data_filtered = {}
for obj in tqdm(data):
    # get_categories returns a set, whose iteration order is arbitrary and can
    # differ between interpreter runs; sort so the saved lists are reproducible.
    cats = sorted(get_categories(obj))
    if not any(c.startswith('cs.') for c in cats):
        continue
    data_filtered[obj['id']] = {
        'id': obj['id'],
        # Title and abstract are lower-cased with newlines/tabs flattened.
        'title': process_str(obj['title']),
        'authors': obj['authors'],
        'categories': cats,
        'abstract': process_str(obj['abstract'])
    }

100%|██████████| 1864576/1864576 [00:03<00:00, 474763.95it/s]


In [17]:
# Number of records that survived the 'cs.' filter.
len(data_filtered)

274296

In [22]:
# Spot-check one kept record by id to confirm the stored fields.
data_filtered['0708.2255']

{'id': '0708.2255',
 'title': 'a language for generic programming in the large',
 'authors': 'Jeremy G. Siek and Andrew Lumsdaine',
 'categories': ['cs.SE', 'cs.PL'],
 'abstract': '  generic programming is an effective methodology for developing reusable software libraries. many programming languages provide generics and have features for describing interfaces, but none completely support the idioms used in generic programming. to address this need we developed the language g. the central feature of g is the concept, a mechanism for organizing constraints on generics that is inspired by the needs of modern c++ libraries. g provides modular type checking and separate compilation (even of generics). these characteristics support modular software development, especially the smooth integration of independently developed components. in this article we present the rationale for the design of g and demonstrate the expressiveness of g with two case studies: porting the standard template librar

In [26]:
# Destination for the filtered records, serialized with pickle.
METADATA_FILTERED_FILEPATH = '../data/metadata_filtered.pkl'

# Binary mode is required for pickle output.
with open(METADATA_FILTERED_FILEPATH, 'wb') as sink:
    pickle.dump(data_filtered, sink)