### Imports and configuration

In [7]:
# setup variables

import os
import json
import tqdm
from s2orc.config import CURRENT_VERSION

# Root folder of the local S2ORC copy; the other paths in this notebook are joined onto it.
LOCAL_S2ORC_DIR = 's2orc-data'

# Path of 'manifest.json' inside the folder named after CURRENT_VERSION.
local_manifest_file = os.path.join(
    LOCAL_S2ORC_DIR,
    CURRENT_VERSION,
    'manifest.json',
)


### Filter psychology papers

In [8]:
# ---- CONFIG ----- #

# jsonlines https://jsonlines.readthedocs.io/en/latest/#api
import jsonlines
import gzip

# Input folder with the paper batches and output folder for the filtered
# psychology batches, both inside the CURRENT_VERSION folder.
paper_dir, psychology_paper_dir = (
    os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, subdir)
    for subdir in ('papers', 'psychology')
)
# Middle part of every filtered batch file name (batch number before, 'gz' after).
psychology_paper_suffix = 'psych.text.jsonl'


### Extract list of journals meeting our criteria

In [3]:
import re
start = 0
span = 100

# Matches "psycho" at the start of the journal name or right after whitespace.
# Kept as a plain string (not a compiled pattern) because the filtering cell
# further down passes it to re.search together with the re.I flag.
psych_journal_re = r'(\s|^)psycho'
psych_journals = set()

# Batch files start with a number followed by a dot. Ignore any other entry in
# the folder so the numeric sort key cannot raise ValueError.
batch_files = sorted(
    (f for f in os.listdir(paper_dir) if f.split('.')[0].isdecimal()),
    key=lambda f: int(f.split('.')[0]),
)[start:(start+span)]
for batch_file in tqdm.tqdm(batch_files):
    with gzip.open(os.path.join(paper_dir, batch_file), 'rb') as f_in:
        # Stream the records one by one instead of materialising the whole
        # batch in memory; each paper is only looked at once.
        for paper in jsonlines.Reader(f_in):
            journal = paper['metadata']['journal']
            if journal is not None and re.search(psych_journal_re, journal, re.I) is not None:
                psych_journals.add(journal)


  0%|          | 0/100 [00:02<?, ?it/s]


NameError: name 're' is not defined

### Save to disk a clean set of full-grobid-parse psychology papers

In [9]:
import collections
import re
start = 0
span = 1700

# output: batch number (as a string) -> ids of the papers kept from that batch
ids_in_batch = collections.defaultdict(set)

# Compile once; the same case-insensitive pattern is applied to every paper.
psych_journal_pattern = re.compile(psych_journal_re, re.I)

count = 0
os.makedirs(psychology_paper_dir, exist_ok=True)
# Batch files start with a number followed by a dot. Ignore any other entry in
# the folder so the numeric sort key cannot raise ValueError.
batch_files = sorted(
    (f for f in os.listdir(paper_dir) if f.split('.')[0].isdecimal()),
    key=lambda f: int(f.split('.')[0]),
)[start:(start+span)]
bar = tqdm.tqdm(batch_files)
for batch_file in bar:
    batch_number = batch_file.split('.')[0]
    filtered_papers = []
    with gzip.open(os.path.join(paper_dir, batch_file), 'rb') as f_in:
        # Stream the batch; only the papers that pass the filter are kept in memory.
        for paper in jsonlines.Reader(f_in):
            if paper.get('grobid_parse') is None:
                continue
            # A missing journal is treated as an empty name, which never matches.
            if psych_journal_pattern.search(paper['metadata']['journal'] or '') is not None:
                filtered_papers.append(paper)
                ids_in_batch[batch_number].add(paper['paper_id'])
    count += len(filtered_papers)
    bar.set_description(f'{count} papers found')
    # One output file per input batch, written even when nothing matched.
    out_filename = '.'.join([batch_number, psychology_paper_suffix, 'gz'])
    with gzip.open(os.path.join(psychology_paper_dir, out_filename), mode='w') as f_out:
        jsonlines.Writer(f_out).write_all(filtered_papers)
                

28295 papers found: 100%|██████████| 1700/1700 [2:40:16<00:00,  5.66s/it]  


### Find links

In [10]:
# File-name suffix and destination folder for the citation-link output files.
links_suffix = 'psych.text.link.jsonl'
links_dir = os.path.join(
    LOCAL_S2ORC_DIR,
    CURRENT_VERSION,
    'psych_links',
)

In [14]:
%load_ext autoreload


In [15]:

# token context 
import functools
from s2orc.get_citation_contexts import get_citation_contexts

start = 0
span = 1700
min_context_size=40

# Ids of every paper recorded in ids_in_batch, across all batches.
# set().union(*...) also works when ids_in_batch is empty (functools.reduce
# without an initial value raises TypeError there) and always builds a new set
# rather than returning one of the per-batch sets.
all_psych_ids = set().union(*ids_in_batch.values())

def make_links(papers_to_search, citing_paper, citation_contexts, relevant_paper_ids):
    """Build one link record per cited paper id.

    Args:
        papers_to_search: iterable of paper dicts, each with a 'paper_id' key.
        citing_paper: the citing paper; stored unchanged in every link.
        citation_contexts: iterable of context dicts, each with a
            'cited_paper_id' key.
        relevant_paper_ids: iterable of cited paper ids to build links for.

    Returns:
        A list of dicts with the keys 'citing_paper', 'citation_context' and
        'cited_paper', in the iteration order of ``relevant_paper_ids``. For
        each id the first matching paper and the first matching context are
        used. An id without a matching paper or without a matching context
        yields no link (it used to leak StopIteration out of the function).
    """
    # Index the first occurrence of every id once, instead of rescanning both
    # sequences for each requested id.
    first_paper_by_id = {}
    for paper in papers_to_search:
        first_paper_by_id.setdefault(paper['paper_id'], paper)
    first_context_by_id = {}
    for context in citation_contexts:
        first_context_by_id.setdefault(context['cited_paper_id'], context)

    new_links = []
    for cited_id in relevant_paper_ids:
        if cited_id not in first_paper_by_id or cited_id not in first_context_by_id:
            # A link needs both the cited paper and a citation context.
            continue
        new_links.append({
            'citing_paper': citing_paper,
            'citation_context': first_context_by_id[cited_id],
            'cited_paper': first_paper_by_id[cited_id],
        })
    return new_links

count = 0
os.makedirs(links_dir, exist_ok=True)


@functools.lru_cache(maxsize=16)
def _load_psych_batch(batch_id):
    """Return the list of papers stored in one filtered psychology batch file.

    Results are kept in a small LRU cache so that a batch cited by many papers
    is not decompressed and parsed again for each of them. The returned list
    is shared between calls and must not be mutated.
    """
    batch_path = os.path.join(
        psychology_paper_dir, '.'.join([batch_id, psychology_paper_suffix, 'gz'])
    )
    with gzip.open(batch_path, 'rb') as f_relevant:
        return list(jsonlines.Reader(f_relevant))


# Batch files start with a number followed by a dot. Ignore any other entry in
# the folder so the numeric sort key cannot raise ValueError.
batch_files = sorted(
    (f for f in os.listdir(psychology_paper_dir) if f.split('.')[0].isdecimal()),
    key=lambda f: int(f.split('.')[0]),
)[start:(start+span)]
bar = tqdm.tqdm(batch_files)
for batch_file in bar:
    links = []
    batch_number = batch_file.split('.')[0]
    # The whole batch is needed in memory: papers of this batch may cite each other.
    with gzip.open(os.path.join(psychology_paper_dir, batch_file), 'rb') as f_in:
        papers = list(jsonlines.Reader(f_in))

    for citing_paper in papers:
        citation_contexts = get_citation_contexts(citing_paper, toks_in_context=min_context_size)
        cited_paper_ids = set(
            item['cited_paper_id']
            for item in citation_contexts
            if item['cited_paper_id'] in all_psych_ids
        )
        if not cited_paper_ids:
            # Nothing to link; skip the scan over every batch.
            continue

        # find the batches in which these cited papers are stored
        for relevant_batch, batch_paper_ids in ids_in_batch.items():
            relevant_paper_ids = cited_paper_ids.intersection(batch_paper_ids)
            if not relevant_paper_ids:
                continue
            if relevant_batch == batch_number:
                # cited papers live in the batch that is already loaded
                papers_in_batch = papers
            else:
                papers_in_batch = _load_psych_batch(relevant_batch)
            links.extend(
                make_links(papers_in_batch, citing_paper, citation_contexts, relevant_paper_ids)
            )

    count += len(links)
    bar.set_description(f'{count} links found')
    # One output file per citing batch, written even when no link was found.
    out_filename = '.'.join([batch_number, links_suffix, 'gz'])
    with gzip.open(os.path.join(links_dir, out_filename), mode='w') as f_out:
        jsonlines.Writer(f_out).write_all(links)
        

8599 links found: 100%|██████████| 1700/1700 [19:53<00:00,  1.42it/s]


In [17]:
import pprint

# Read back the link file of batch 1000 and pretty-print one example link:
# citing paper metadata, the citation context, then cited paper metadata.
links_path = os.path.join(links_dir, '1000.psych.text.link.jsonl.gz')
with gzip.open(links_path, 'rb') as f_in:
    links = list(jsonlines.Reader(f_in))
pp = pprint.PrettyPrinter(indent=1)
i = 1
example_link = links[i]
pp.pprint(example_link['citing_paper']['metadata'])
pp.pprint(example_link['citation_context'])
pp.pprint(example_link['cited_paper']['metadata'])

{'abstract': 'Pain serves as a signal to elicit care from others. In turn, '
             'displaying pain might be attractive because of the benefits it '
             'might bring. Additionally, displaying pain is easy, because '
             'helpers distinguish poorly between genuine pain and faked pain. '
             'Hence, helpers face the problem of distinguishing true sufferers '
             'from free riders, while sufferers face the problem of '
             'communicating need convincingly. This article will propose '
             'solutions to these adaptive problems. Based on theoretical '
             'arguments and on empirical insights from lie detection research, '
             'it will be argued that the credibility of pain signals cannot be '
             'found in features of the signal itself, but in its context. '
             'Namely, pain is obviously credible when the context features '
             'unforgeable cues, such as an open wound or the enlarged ab