In [66]:
### Imports and configuration

# setup variables

import os
import json
import tqdm
from s2orc.config import CURRENT_VERSION

# jsonlines https://jsonlines.readthedocs.io/en/latest/#api
import jsonlines
import gzip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hiplot

LOCAL_S2ORC_DIR = 's2orc-data'

psychology_paper_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'psychology')
psychology_paper_suffix = 'psych.text.jsonl'

links_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'psych_links')
links_suffix = 'psych.text.link.jsonl'


In [67]:
## Get corpus into memory

start = 0
span = 100 # all: 1700

links = []

links_files = sorted(os.listdir(links_dir), key=lambda f: int(f.split('.')[0]))[start:(start+span)]
for link_file in tqdm.tqdm(links_files):
    with gzip.open(os.path.join(links_dir, link_file), 'rb') as f_in:
        batch_links = list(jsonlines.Reader(f_in))
        for link in batch_links:
            if link['citing_paper']['grobid_parse'].get('body_text') is not None and link['cited_paper']['grobid_parse'].get('body_text') is not None:
                links.append(link)

np.random.seed(2134234)
links = np.array(links)
np.random.shuffle(links)

100%|██████████| 100/100 [00:02<00:00, 43.46it/s]


# work on the data

In [68]:
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, TransformerDocumentEmbeddings
from flair.data import Sentence

In [69]:
import regex
from syntok.segmenter import split
from syntok.tokenizer import Tokenizer

token_filter_re = r'^\p{L}(\p{L}|\.|[0-9])+$'
science_blacklist = {'et', 'al', 'al.'} # remove words specific for scientific papers without any importance for the task

def filter_tokens(tokens):
    # Filter out non-common-word/non-real-world-entity tokens
    return [
        token for token in tokens if 
            regex.match(token_filter_re, token.text) 
            and token.text not in science_blacklist 
    ]

author_re = r'(((de|De|van|Van|von|Von)\s+(\p{Ll}+\s+)?)?\p{Lu}(\p{Ll}|-)+)'

citation_re = (
               r'([(;,]|\s)'
            + author_re +
               r'(\s?((([&,]|and|\s)+\s?' + author_re + r')|(et al\.?)))?' # alternative second author
               r'('
                 r'([;,]|\s)+'
                 r'\s*[0-9]{4}\p{Ll}?\s*' # year
               r')+'
               r'([);,]|\s)'
              )

in_text_citation_re = (
    author_re + r'\s*\((\s*[0-9]{4}\p{Ll}?\s*,?)\)'
)
def filter_citations(text):
    n_subs_made = 1
    while n_subs_made > 0:
        text, n_subs_made = regex.subn(citation_re, ';', text)
        text, n_subs_made2 = regex.subn(citation_re, ';', text)
        n_subs_made += n_subs_made2
    return text

def process_section_to_chunks(text):
    chunksize = 1
    tokenized_sents = list(split(Tokenizer().tokenize(text)))
    sents = [' '.join(str(token) for token in sent) for sent in tokenized_sents]
    sents = [' '.join(sents[i:i+chunksize]) for i in range(len(sents)-chunksize)]
    sentences = []
    for sent in sents:
        sent = text_to_sentence(sent)
        if sent is not None:
            sentences.append(sent)
    return sentences

def text_to_sentence(text):
    text = filter_citations(text) 
    s = Sentence(text, use_tokenizer=True)
    s.tokens = filter_tokens(s.tokens)
    if len(s.tokens) > 0:
        return s
    else:
        return None

In [70]:
# extract the important information from the original S2ORC corpus format
def process_link(link, process_section):
    context = link['citation_context']
    citing_paper = link['citing_paper']
    cited_paper = link['cited_paper']
    
    parts = []
    for text_chunk in cited_paper['grobid_parse']['body_text']:
        text = text_chunk.get('text')
        if text is not None:
            chunk_parts = process_section(text)
            parts.extend(chunk_parts)
    citing_string = ''.join([context['pre_context'], context['context_string'], context['post_context']])
    return {
        'citing_str': context['context_string'],
        'citing_context': citing_string,
        'citing_context_part': text_to_sentence(citing_string),
        'cited_text_parts': parts,
    }


In [82]:
import spacy
spacy_stopwords = spacy.load('en_core_web_sm').Defaults.stop_words


def calc_embedding_scores(link, metrics, embedding_name, embedding):
    all_sim = {}
    
    s = link['citing_context_part']
    s.tokens = [token for token in s if token.text not in spacy_stopwords]

    embedding.embed(s)
    citation_embedding = s.embedding.detach().numpy()

    sentences = link['cited_text_parts']
    for sentence in sentences:
        embedding.embed(sentence)
        all_sim[sentence.to_plain_string()] = {}
        for name, metric in metrics.items():
            sim = 1 - metric(sentence.embedding.detach().numpy(), citation_embedding)
            full_name = '_'.join([embedding_name, name])
            all_sim[sentence.to_plain_string()][full_name] = sim
        sentence.clear_embeddings()
    s.clear_embeddings()
    
    return pd.DataFrame(all_sim).T



[W031] Model 'en_core_web_sm' (2.2.5) requires spaCy v2.2 and is incompatible with the current spaCy version (2.3.2). This may lead to unexpected results or runtime errors. To resolve this, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate



### Bag of words weighted by Inverse Term Frequency (with lemmatization)

In [72]:
from wordfreq import word_frequency
from collections import defaultdict
from nltk.stem import WordNetLemmatizer 
import skfuzzy as fuzz

def get_length_penalty(sent_len):
    # 0.5 value for len=2
    # quickly raising to 1 at ~10 words
    return fuzz.sigmf(sent_len, 2, 0.5)

def get_word_itf(string_tokens):
    min_val = 10e-7
    return [1./max(min_val, word_frequency(token, 'en')) for token in string_tokens]

def transform_tokens(string_tokens, lemmatizer):
    return [lemmatizer.lemmatize(token.lower()) for token in string_tokens]

def calc_bow_itf_score(link, name):
    """
    Score using term frequency
    """
    lemmatizer = WordNetLemmatizer()
    scores = []
    s = link['citing_context_part']
    s_tokens = [token.text for token in s.tokens]
    context_token_set = set(transform_tokens(s_tokens, lemmatizer=lemmatizer))
    sentences = link['cited_text_parts']
    for sentence in sentences:
        sent_tokens = [token.text for token in sentence.tokens]
        sent_token_set = set(transform_tokens(sent_tokens, lemmatizer))
        if len(sent_token_set) == 0:
            scores.append(0)
            continue
        matching_tokens = sent_token_set.intersection(context_token_set)
        all_tokens = sent_token_set.union(context_token_set)
        raw_score = np.sum(get_word_itf(matching_tokens)) / np.sum(get_word_itf(all_tokens))
        # Eliminate artifacts by penalizing extremely short matches
        raw_score *= get_length_penalty(len(sent_token_set))
        if np.isclose(raw_score, 0):
            score = 0.
        else:
            score = 1./-np.log(raw_score)
        scores.append(score)
    return pd.DataFrame({
        name: scores
    }, index=[sent.to_plain_string() for sent in sentences])


###  TS SS distance metric

In [73]:
from scipy.spatial import distance
from scipy.linalg import norm
from Vector_Similarity import *

def TS_SS(vec1, vec2) :
    val = Triangle(vec1, vec2) * Sector(vec1, vec2)
    return min(1, val/3)

def Triangle(vec1, vec2) :
    theta = math.radians(Theta(vec1,vec2))
    return (norm(vec1) * norm(vec2) * math.sin(theta)) / 2

def Theta(vec1, vec2) :
    return math.acos(1 - distance.cosine(vec1, vec2)) + math.radians(10)

def Magnitude_Difference(vec1, vec2) :
    return abs(norm(vec1) - norm(vec2))

def Sector(vec1, vec2) :
    ED = distance.euclidean(vec1, vec2)
    MD = Magnitude_Difference(vec1, vec2)
    theta = Theta(vec1, vec2)
    return math.pi * math.pow((ED+MD),2) * theta/360

### Prepare data

In [74]:
bert_embedding = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=False)
roberta_embedding = TransformerDocumentEmbeddings('roberta-base', fine_tune=False)
glove_embedding = DocumentPoolEmbeddings([WordEmbeddings('glove')])

In [75]:
n_train_links = int(0.5 * len(links))
n_validation_links = int(0.2 * len(links))
n_test_links = len(links) - n_train_links - n_validation_links
train_links = links[:15]
validation_links = links[15:45]
test_links = links[-n_test_links:]
#links = None

In [76]:
import copy
def prepare_link_and_val(link, val_links):
    result = {"original" : process_link(link, process_section_to_chunks)}
    val_links = val_links or []
    for i in range(len(val_links)):
        val_link = copy.deepcopy(val_links[i])
        val_link2 = copy.deepcopy(link)
        val_link2['citation_context'] = val_link['citation_context']
        val_link['citation_context'] = link['citation_context']
        result.update({
            "val_orig_context_"+str(i) : process_link(val_link, process_section_to_chunks),
            "val_orig_paper_"+str(i) : process_link(val_link2, process_section_to_chunks)
        })
    return result

### Score pipeline

In [83]:
from plot_text_sim import plot_text_sim 
from scipy.spatial import distance
import itertools
metrics = {"cos" : distance.cosine, "ts_ss" : TS_SS}

# do it in a function to prevent memory leaks
def calculate_similarities(train_links, validation_links):   
    results = []
    for link, val_links in tqdm.tqdm(list(itertools.zip_longest(train_links, validation_links))):
        preprocessed = prepare_link_and_val(link, val_links)
        bow_itf = {name: calc_bow_itf_score(link, name='bow_itf_'+name) for name, link in preprocessed.items()}
        
        bert = {name: calc_embedding_scores(link, metrics, embedding_name='bert_'+name, embedding = bert_embedding) for name, link in preprocessed.items()}
        
        data = {name : pd.merge(bert[name], bow_itf[name], left_index = True, right_index = True) for name, link in preprocessed.items()}
#         data = {name : bow_itf[name] for name, link in preprocessed.items()}
        results.append({
            'citing_str': preprocessed['original']['citing_str'],
            'citing_context': preprocessed['original']['citing_context'],
            'data': data,
        })

    return results, preprocessed


In [84]:

start = 12
span = 1
val_span = 0
val = np.split(validation_links[start:start+val_span*span], span)
train = train_links[start:start+span]
results, preprocessed = calculate_similarities(train, val)


The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.

  0%|          | 0/1 [00:17<?, ?it/s]


KeyboardInterrupt: 

In [None]:
processed_results = copy.deepcopy(results)

for result in processed_results:
    df = result['data']['original']
    df['bert_original_cos_ampl'] = df['bert_original_cos']**6
    df['comb_bow_itf_bert_cos'] = np.sqrt(df['bow_itf_original'] * df['bert_original_cos_ampl'])
    df['comb_bow_itf_bert_ts_ss'] = np.sqrt(df['bow_itf_original'] * df['bert_original_ts_ss'])
    # derive weighted moving average


In [None]:
processed_results = copy.deepcopy(results)

for result in processed_results:
    df = result['data']['original']
    df['comb_bow_itf_bert'] = np.sqrt(df['bow_itf_original'] * (1 - df['bert_original_cos'])**4)
    df['comb_bow_itf_glove'] = np.sqrt(df['bow_itf_original'] * (1 - df['glove_original_cos'])**4)
    df['comb_bow_itf_roberta'] = np.sqrt(df['bow_itf_original'] * (1 - df['roberta_original_cos'])**4)
    # derive weighted moving average
    for column in df.columns:
        df[f'{column}_rolling'] = df[column].rolling(4, center=True, win_type='gaussian').mean(std=2)

In [None]:
import pickle
with open("results_emb_bow.p", "wb") as file:
    pickle.dump(processed_results, file)

In [None]:
with open("preprocessed_.p", "wb") as file:
    pickle.dump(preprocessed, file)

In [None]:
def mean_results(results):
    return{
        name : {
            column : np.mean(df[column]) for column in df.columns if df[column].dtype == float
        } for name, df in results.items()
    }

In [None]:
mean_res = mean_results(results[0]["data"])

In [None]:
with open("results_mean.p", "wb") as file:
    pickle.dump(mean_res, file)

In [None]:
process_link(train_links[start], process_section_to_chunks)['citing_str'] 

In [None]:
import hiplot as hip
result_df = results[0]['data']['original'].assign(text_beginning = lambda df: df.index.str[:10])
exp = hip.Experiment.from_dataframe(result_df)
displayed_exp = exp.display()

### Slightly wider manual comparison

In [86]:
start = 2
span = 2 #10
test = test_links[start:start+span]

test_results, test_preprocessed = calculate_similarities(test, [])

processed_test_results = copy.deepcopy(test_results)

for result in processed_test_results:
    df = result['data']['original']
    df['bert_original_cos_ampl'] = df['bert_original_cos']**6
    df['comb_bow_itf_bert_cos'] = np.sqrt(df['bow_itf_original'] * df['bert_original_cos_ampl'])
    df['comb_bow_itf_bert_ts_ss'] = np.sqrt(df['bow_itf_original'] * df['bert_original_ts_ss'])


100%|██████████| 2/2 [01:25<00:00, 42.53s/it]


In [None]:
import pickle
with open("results_emb_bow_test.p", "wb") as file:
    pickle.dump(processed_test_results, file)

In [None]:
from scipy import stats
stats.spearmanr(df['bert_original_cos_ampl'],df['bert_original_ts_ss'] )

In [87]:

def display_best(df, column):
    print(f'\n-- {column.upper()} --')
    print(*[f'{i+1}: "{chunk}"' 
            for i, chunk in enumerate(df[column].sort_values(ascending=False).index.values.tolist()[:5])
           ],
          sep='\n'
         )

for result in processed_test_results:
    result['citing_str']
    result['citing_context']
    df = result['data']['original']
    display_best(df, 'bow_itf_original')
    display_best(df, 'bert_original_ts_ss')
    display_best(df, 'comb_bow_itf_bert_ts_ss')


'Gick & Holyoak, 1983;'

'that motivated our hypothesis is that researchers have demonstrated that learning can be improved by studying two items of the same category simultaneously, as opposed to one at a time (Catrambone & Holyoak, 1989; Gentner, Loewenstein, & Thompson, 2003; Gick & Holyoak, 1983; Hammer, Diesendruck, Weinshall, & The data from the studies presented in this article are publicly available from: https://osf.io/vh7pn.'


-- BOW_ITF_ORIGINAL --
1: "For example Gentner and Gentner in press have demonstrated that alternative analogies known by subjects prior to the experiment produce systematically varying patterns of difficulty among types of electricity problems"
2: "However the present study provides evidence against this hypothesis"
3: "More generally the function of an analogy is to derive new solution hypothesis or prediction this is done by finding an initial partial mapping between the two analogs and then extending the mapping by retrieving or creating additional knowledge about the analog that was initially less well understood"
4: "This pattern is in accord with our hypothesis regarding analog similarity while dissimilar analogs have greater potential to yield optimal schemas they are also more likely to fail to produce any useful schema"
5: "Experiment thus yielded no support for the hypothesis that augmenting the story analog with verbal principle would increase analogical transfer"

-- BERT

'Bardo and Bevins, 2000;'

'decision making task, similar to the conditioned place preference (CPP) paradigm widely applied to the study of addiction in animal models. CPP has been commonly used to measure the reward value of different drugs of abuse (for reviews, see Bardo and Bevins, 2000; Tzschentke, 2007) . Here, drug-free subjects (typically rodents), are first allowed to explore an apparatus consisting of at least two distinct interconnected chambers to measure initial preference (i.e., by comparing time spent in each context). In subsequent conditioning sessions,'


-- BOW_ITF_ORIGINAL --
1: "The purpose of this review is to provide an evaluation of conditioned place preference CPP as an experimental protocol for measuring drug reward in laboratory animals"
2: "Another limitation of CPP stems from the tendency of animals to prefer one of the two distinct contexts of the apparatus before conditioning occurs"
3: "Since it is not clear what class of behaviors are reinforced during CPP conditioning trials the term reward seems more appropriate to describe drug induced CPP"
4: "Despite this advantage it has been argued that CPP is not particularly sensitive to changes in drug dose"
5: "Since the early there has been some disagreement about whether drug CPP and self administration represent two alternative methods for measuring common reward process"

-- BERT_ORIGINAL_TS_SS --
1: "Intermixed with these context US pairings is similar exposure to the other context without the US.Following conditioning is choice test in which animals receive unrestricted 

In [88]:



for result in processed_test_results:
    result['citing_str']
    result['citing_context']
    df = result['data']['original']
    _ = plot_text_sim(df['bow_itf_original'], df.index, title='bow_itf')
    _ = plot_text_sim(df['bert_original_cos_ampl'], df.index, title='bert_cos')
    _ = plot_text_sim(df['bert_original_ts_ss'], df.index, title='bert_ts_ss')
    _ = plot_text_sim(df['comb_bow_itf_bert_cos'], df.index, title='comb_bow_itf_bert_cos')
    _ = plot_text_sim(df['comb_bow_itf_bert_ts_ss'], df.index, title='comb_bow_itf_bert_ts_ss')



'Gick & Holyoak, 1983;'

'that motivated our hypothesis is that researchers have demonstrated that learning can be improved by studying two items of the same category simultaneously, as opposed to one at a time (Catrambone & Holyoak, 1989; Gentner, Loewenstein, & Thompson, 2003; Gick & Holyoak, 1983; Hammer, Diesendruck, Weinshall, & The data from the studies presented in this article are publicly available from: https://osf.io/vh7pn.'

'Bardo and Bevins, 2000;'

'decision making task, similar to the conditioned place preference (CPP) paradigm widely applied to the study of addiction in animal models. CPP has been commonly used to measure the reward value of different drugs of abuse (for reviews, see Bardo and Bevins, 2000; Tzschentke, 2007) . Here, drug-free subjects (typically rodents), are first allowed to explore an apparatus consisting of at least two distinct interconnected chambers to measure initial preference (i.e., by comparing time spent in each context). In subsequent conditioning sessions,'

In [108]:
# links[109]
# [(i, doc['cited_paper']['metadata']['title']) for i, doc in enumerate(links)]
