### Imports and configuration

In [1]:
# setup variables

import os
import json
import tqdm
from s2orc.config import CURRENT_VERSION

# jsonlines https://jsonlines.readthedocs.io/en/latest/#api
import jsonlines
import gzip
import numpy as np
import matplotlib.pyplot as plt
import hiplot # <3

# Root folder of the local data, given relative to the working directory.
# (Presumably the local S2ORC download -- confirm against your setup.)
LOCAL_S2ORC_DIR = 's2orc-data'

# Both data directories below share the <LOCAL_S2ORC_DIR>/<CURRENT_VERSION> prefix.
_s2orc_release_root = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION)

# File-name suffixes for the paper files and the citation-link files.
psychology_paper_suffix = 'psych.text.jsonl'
links_suffix = 'psych.text.link.jsonl'

# Directories holding the paper files and the citation-link files.
psychology_paper_dir = os.path.join(_s2orc_release_root, 'psychology')
links_dir = os.path.join(_s2orc_release_root, 'psych_links')

## let's do it! <3

In [2]:
import pprint

# Preview cell: pretty-print the citing paper's metadata, the citation context
# and the cited paper's metadata for every link in a small window of link files.
start = 0
span = 1  # number of link files to preview

# Build the printer once instead of once per link.
pp = pprint.PrettyPrinter(indent=1)

# Link files are ordered by the integer before the first '.' in their name.
# Entries whose leading field is not numeric (e.g. '.DS_Store',
# '.ipynb_checkpoints') are skipped; int() would otherwise raise ValueError.
links_files = sorted(
    (f for f in os.listdir(links_dir) if f.split('.')[0].isdecimal()),
    key=lambda f: int(f.split('.')[0]),
)[start:(start+span)]

for link_file in tqdm.tqdm(links_files):
    # Read the whole gzipped JSON-lines file, then release the handle before
    # printing.
    with gzip.open(os.path.join(links_dir, link_file), 'rb') as f_in:
        links = list(jsonlines.Reader(f_in))
    for link in links:
        pp.pprint(link['citing_paper']['metadata'])
        pp.pprint(link['citation_context'])
        pp.pprint(link['cited_paper']['metadata'])

100%|██████████| 1/1 [00:00<00:00,  5.94it/s]

{'abstract': None,
 'acl_id': None,
 'arxiv_id': None,
 'authors': [{'first': 'Anagha', 'last': 'Aery', 'middle': [], 'suffix': ''},
             {'first': 'Julie', 'last': 'Hodges', 'middle': [], 'suffix': ''},
             {'first': 'Jamin', 'last': 'Day', 'middle': ['J.'], 'suffix': ''}],
 'doi': '10.2991/uipsur-17.2018.50',
 'journal': 'Proceedings of the Universitas Indonesia International Psychology '
            'Symposium for Undergraduate Research (UIPSUR 2017)',
 'pmc_id': None,
 'pubmed_id': None,
 'title': 'The Effect of School-Based Stepping Stones Triple P on Child and '
          'Parent Outcomes',
 'venue': 'Proceedings of the Universitas Indonesia International Psychology '
          'Symposium for Undergraduate Research (UIPSUR 2017)',
 'year': '2018'}
{'cite_end': 345,
 'cite_start': 297,
 'cite_str': 'Roberts, Mazzucchelli, Studman, & Sanders, 2006;',
 'cited_paper_id': '2019870',
 'context_string': 'Roberts, Mazzucchelli, Studman, & Sanders, 2006;',
 'paper_id': '1




## Get corpus into memory

In [3]:
import pprint

# Load every citation link from the selected window of link files into memory,
# then shuffle them with a fixed seed so the order is reproducible.
start = 0
span = 1700  # upper bound only: the slice below clips to the files that exist

links = []

# Link files are ordered by the integer before the first '.' in their name.
# Entries whose leading field is not numeric (e.g. '.DS_Store',
# '.ipynb_checkpoints') are skipped; int() would otherwise raise ValueError.
links_files = sorted(
    (f for f in os.listdir(links_dir) if f.split('.')[0].isdecimal()),
    key=lambda f: int(f.split('.')[0]),
)[start:(start+span)]

for link_file in tqdm.tqdm(links_files):
    # Each file is gzipped JSON lines; every record is appended to `links`.
    with gzip.open(os.path.join(links_dir, link_file), 'rb') as f_in:
        batch_links = list(jsonlines.Reader(f_in))
        links.extend(batch_links)

# Seed NumPy's global RNG so the shuffle (and any split derived from it) is
# repeatable. The legacy global-seed API is kept on purpose: switching
# generators would change the permutation.
np.random.seed(2134234)
# Convert to an array (object dtype, assuming each record is a dict) and
# shuffle it in place.
links = np.array(links)
np.random.shuffle(links)

100%|██████████| 1500/1500 [00:38<00:00, 38.86it/s]


In [4]:

# Split `links` into train / validation / test: 50% / 20% / remainder.
# The first two sizes are floored, so the test set absorbs any rounding
# leftovers.
n_train_links = int(0.5 * len(links))
n_validation_links = int(0.2 * len(links))
n_test_links = len(links) - n_train_links - n_validation_links

# Index where the validation slice ends and the test slice begins.
validation_end = n_train_links + n_validation_links

train_links = links[:n_train_links]
validation_links = links[n_train_links:validation_end]
# Slice from the explicit offset rather than `links[-n_test_links:]`:
# if n_test_links were ever 0, `links[-0:]` would be the WHOLE array.
test_links = links[validation_end:]

# The three slices must partition the corpus exactly.
assert len(test_links) == n_test_links
assert len(train_links) + len(validation_links) + len(test_links) == len(links)

## Do stuff