### Imports and configuration

In [1]:
# setup variables

import os
import json
import tqdm
from s2orc.config import CURRENT_VERSION

# Root directory of the local S2ORC copy (a relative path, so it is resolved
# against the notebook's working directory).
LOCAL_S2ORC_DIR = 's2orc-data'
# Path of 'manifest.json' inside the directory of the corpus version given by CURRENT_VERSION.
local_manifest_file = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'manifest.json')


### Process the corpus chunks and join with citations

In [2]:
# ---- CONFIG ----- #

# jsonlines https://jsonlines.readthedocs.io/en/latest/#api
import jsonlines
import gzip

# Directory the gzipped JSON-lines paper batches are read from.
paper_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'papers')
# Directory the extracted citation contexts are written to (one gzipped file per batch).
context_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'contexts')
context_file_suffix = 'contexts.jsonl' # output filename after the batch number; '.gz' is appended on write


In [3]:
##### extract the citation contexts of every paper batch and save them, one output file per batch
from s2orc.get_citation_contexts import get_citation_contexts

# window of batch files (in numeric batch order) handled by this run
start = 0
span = 100

os.makedirs(context_dir, exist_ok=True)
batch_files = sorted(os.listdir(paper_dir), key=lambda name: int(name.split('.')[0]))
batch_files = batch_files[start:(start+span)]
for batch_file in tqdm.tqdm(batch_files):
    batch_number = batch_file.split('.')[0]
    with gzip.open(os.path.join(paper_dir, batch_file), 'rb') as f_in:
        papers = list(jsonlines.Reader(f_in))

    batch_entries = []
    for paper in papers:
        citation_contexts = get_citation_contexts(paper, toks_in_context=20)
        # the paper id is stored once on the entry, so drop it from each context
        for citation_context in citation_contexts:
            del citation_context['paper_id']
        entry = {'paper_id': paper['paper_id']}
        # papers without citation contexts are still recorded, just without a 'contexts' key
        if len(citation_contexts) > 0:
            entry['contexts'] = citation_contexts
        batch_entries.append(entry)

    out_filename = f'{batch_number}.{context_file_suffix}.gz'
    with gzip.open(os.path.join(context_dir, out_filename), mode='w') as f_out:
        jsonlines.Writer(f_out).write_all(batch_entries)
#         with jsonlines.open(os.path.join(context_dir, out_filename), mode='w') as writer:
#             writer.write_all(contexts)



100%|██████████| 1/1 [00:08<00:00,  8.20s/it]


In [4]:
# load the saved citation contexts back into memory, keyed by citing paper id
start = 0
span = 100

contexts = {}  # paper id -> entry ({'paper_id': ..., optional 'contexts': [...]})

batch_files = sorted(os.listdir(context_dir), key=lambda name: int(name.split('.')[0]))
batch_files = batch_files[start:(start+span)]
for batch_file in tqdm.tqdm(batch_files):
    entries_by_id = {}
    with gzip.open(os.path.join(context_dir, batch_file)) as f_in:
        for entry in jsonlines.Reader(f_in):
            entries_by_id[entry['paper_id']] = entry
    # merge a batch only after it has been read completely
    contexts.update(entries_by_id)


100%|██████████| 1/1 [00:00<00:00,  2.97it/s]


### Find articles with a full GROBID parse and intersect them with the papers cited within the corpus contexts to get pairs `<citation_string, cited full-text paper>`

In [5]:
# paper_id -> paper record, for papers whose 'grobid_parse' is not None.
# Kept in its own cell so that re-running the loading cell below adds to it instead of resetting it.
full_text_papers = {}

In [6]:
##### collect the papers that come with a full-text (GROBID) parse

start = 0
span = 100

batch_files = sorted(os.listdir(paper_dir), key=lambda name: int(name.split('.')[0]))
batch_files = batch_files[start:(start+span)]
for batch_file in tqdm.tqdm(batch_files):
    batch_path = os.path.join(paper_dir, batch_file)
    with gzip.open(batch_path, 'rb') as f_in:
        papers = list(jsonlines.Reader(f_in))
    # keep only papers whose 'grobid_parse' field is present and not None
    full_text_papers.update(
        (paper['paper_id'], paper)
        for paper in papers
        if paper.get('grobid_parse') is not None
    )




100%|██████████| 1/1 [00:06<00:00,  6.52s/it]


In [7]:
len(full_text_papers)

2817

In [8]:
# pair each citation context with the cited paper whenever that paper has a full-text parse
links = []
bar = tqdm.tqdm(list(contexts.values()))
for citing_entry in bar:
    bar.set_description('links found: ' + str(len(links)))
    # entries of papers without citation contexts have no 'contexts' key
    links.extend(
        (citation_context, full_text_papers[citation_context['cited_paper_id']])
        for citation_context in citing_entry.get('contexts', [])
        if citation_context['cited_paper_id'] in full_text_papers
    )
            
# len(links)

links found: 22: 100%|██████████| 8259/8259 [00:14<00:00, 569.33it/s]


In [9]:
# print(full_text_papers.values())
# show the citing string and the title of the cited paper for a slice of the links
for context, paper in links[50:100]:
    print(context['context_string'])
    print(paper['metadata']['title'])
    print()
# print(links[0][0])
# print(links[0][1])
# 17913703

### observations

- embeddings will probably be more applicable to "fuzzy" ideas and papers?
- nominal phrases VS verbal descriptions
- broader context of the citing paper would be useful (at least for identifying keywords from abstract)
- unknown words must be taken into account (many neologisms, variable names and other very hermetic words in pure science papers).

### todo
- check out sciBERT
- check out what kind of embeddings would be useful
- decide what would be more useful: extracting precise information, or just fuzzy semantic detection (e.g. topic)?


In [10]:
from syntok.segmenter import split
from syntok.tokenizer import Tokenizer
import json
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, StackedEmbeddings, Sentence
import numpy as np
import tqdm

# Number of best-matching sentences written to the report for each link.
N_TOP_SENTENCES = 20


def cosine_similarity(e1, e2):
    """Return the cosine similarity of two embedding vectors.

    Returns None when the product of the norms is zero or NaN, because the
    cosine is undefined in that case and a NaN score would make the sorted
    ranking unreliable.
    """
    norm = np.sqrt(np.dot(e1, e1)) * np.sqrt(np.dot(e2, e2))
    if not norm > 0:
        return None
    return np.dot(e1, e2) / norm


def split_into_sentences(text, tokenizer):
    """Split `text` into sentences with syntok; each sentence is returned as its tokens joined by single spaces."""
    return [' '.join(str(token) for token in sent) for sent in split(tokenizer.tokenize(text))]


# Process the links in a fixed pseudo-random order so that the report is reproducible.
links_random_order = list(links)
np.random.seed(234243)
np.random.shuffle(links_random_order)

glove_embedding = DocumentPoolEmbeddings([WordEmbeddings('glove')])

# TODO: the flair variant is not working yet; it would mirror the GloVe loop below
# with its own similarity metric.
# flair_embedding = StackedEmbeddings([
#                                         FlairEmbeddings('news-forward'),
#                                         FlairEmbeddings('news-backward'),
#                                        ])

# One tokenizer is enough; it does not need to be rebuilt for every text chunk.
tokenizer = Tokenizer()

# Explicit UTF-8 so that non-ASCII characters in titles and sentences do not
# depend on the platform's default encoding.
with open('glove_examples.txt', 'w', encoding='utf-8') as f_out:
    for context, paper in tqdm.tqdm(links_random_order):
        # Embed the citing string (citation marker together with its surrounding context).
        citing_string = ''.join([context['pre_context'], context['context_string'], context['post_context']])
        citing_sentence = Sentence(citing_string, use_tokenizer=True)
        glove_embedding.embed(citing_sentence)
        glove_citation_embedding = citing_sentence.embedding.detach()

        # Score every sentence of the cited paper against the citing string.
        glove_similarities = []
        for text_chunks in paper['grobid_parse'].values():
            if text_chunks is None:
                continue
            for text_chunk in text_chunks:
                # Only dict chunks are expected to carry a 'text' field; anything else is skipped.
                if not isinstance(text_chunk, dict):
                    continue
                text = text_chunk.get('text')
                if not text:
                    # Missing or empty text: nothing to tokenize.
                    continue

                candidates = [Sentence(sent, use_tokenizer=True) for sent in split_into_sentences(text, tokenizer)]
                sentences = [sentence for sentence in candidates if len(sentence.tokens) > 0]
                if not sentences:
                    continue
                glove_embedding.embed(sentences)
                for sentence in sentences:
                    sim = cosine_similarity(sentence.embedding.detach(), glove_citation_embedding)
                    # Zero-norm embeddings (presumably sentences without any known GloVe token) are skipped.
                    if sim is not None:
                        glove_similarities.append((sim, sentence.to_original_text()))

        # Write the report once per link, after all parts of the paper have been scored.
        print('\n\n\n--- PAPER ---', '\n', paper['metadata']['title'], file=f_out, end='\n')
        print('\n\ncontext: ', citing_string, file=f_out, end='\n\n')
        print('\n\nsimilarities: ', file=f_out, end='\n\n')
        top_matches = sorted(glove_similarities, key=lambda pair: pair[0], reverse=True)[:N_TOP_SENTENCES]
        print(*top_matches, file=f_out, sep='\n\n')

100%|██████████| 22/22 [00:10<00:00,  2.02it/s]


### conclusions
- sometimes the context only says what an article "does" (not what its content is), e.g.: "A more detailed description of the study population that participated in the autonomic measurements has been reported elsewhere (Dietrich et al., 2006)"
- the major problem is the citation context itself; GloVe embeddings do seem to find some relevant sentences when the citing sentence describes the cited content accurately
- a distinction between "nominal" and "verbal" contexts would help a lot, since verbal contexts often need a large context, while nominal ones probably need only 3-4 words to compare with the text.
- sometimes, when the whole article seems to correspond to the citing sentence, the task of finding the relevant sentences is in fact very ambiguous. We should probably look at possible clusters in the score distribution.

### TODO:
- get flair embeddings to work (define their own metric of similarity)