In [1]:
### Imports and configuration

# setup variables

import os
import json
import tqdm
from s2orc.config import CURRENT_VERSION

# jsonlines https://jsonlines.readthedocs.io/en/latest/#api
import jsonlines
import gzip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hiplot

LOCAL_S2ORC_DIR = 's2orc-data'

psychology_paper_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'psychology')
psychology_paper_suffix = 'psych.text.jsonl'

links_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'psych_links')
links_suffix = 'psych.text.link.jsonl'


In [2]:
## Get corpus into memory

start = 0
span = 100 # all: 1700

links = []

links_files = sorted(os.listdir(links_dir), key=lambda f: int(f.split('.')[0]))[start:(start+span)]
for link_file in tqdm.tqdm(links_files):
    with gzip.open(os.path.join(links_dir, link_file), 'rb') as f_in:
        batch_links = list(jsonlines.Reader(f_in))
        for link in batch_links:
            if link['citing_paper']['grobid_parse'].get('body_text') is not None and link['cited_paper']['grobid_parse'].get('body_text') is not None:
                links.append(link)

np.random.seed(2134234)
links = np.array(links)
np.random.shuffle(links)

100%|██████████| 100/100 [00:03<00:00, 25.39it/s]


# work on the data

In [3]:
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, TransformerDocumentEmbeddings
from flair.data import Sentence

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
import regex
from syntok.segmenter import split
from syntok.tokenizer import Tokenizer

token_filter_re = r'^\p{L}(\p{L}|\.|[0-9])+$'
science_blacklist = {'et', 'al', 'al.'} # remove words specific for scientific papers without any importance for the task

def filter_tokens(tokens):
    # Filter out non-common-word/non-real-world-entity tokens
    return [
        token for token in tokens if 
            regex.match(token_filter_re, token.text) 
            and token.text not in science_blacklist
    ]

author_re = r'(((de|De|van|Van|von|Von)\s+(\p{Ll}+\s+)?)?\p{Lu}(\p{Ll}|-)+)'

citation_re = (
               r'([(;,]|\s)'
            + author_re +
               r'(\s?((([&,]|and|\s)+\s?' + author_re + r')|(et al\.?)))?' # alternative second author
               r'('
                 r'([;,]|\s)+'
                 r'\s*[0-9]{4}\p{Ll}?\s*' # year
               r')+'
               r'([);,]|\s)'
              )

in_text_citation_re = (
    author_re + r'\s*\((\s*[0-9]{4}\p{Ll}?\s*,?)\)'
)
def filter_citations(text):
    n_subs_made = 1
    while n_subs_made > 0:
        text, n_subs_made = regex.subn(citation_re, ';', text)
        text, n_subs_made2 = regex.subn(citation_re, ';', text)
        n_subs_made += n_subs_made2
    return text

def process_section_to_chunks(text):
    chunksize = 1
    tokenized_sents = list(split(Tokenizer().tokenize(text)))
    sents = [' '.join(str(token) for token in sent) for sent in tokenized_sents]
    sents = [' '.join(sents[i:i+chunksize]) for i in range(len(sents)-chunksize)]
    sentences = []
    for sent in sents:
        sent = text_to_sentence(sent)
        if sent is not None:
            sentences.append(sent)
    return sentences

def text_to_sentence(text):
    text = filter_citations(text) 
    s = Sentence(text, use_tokenizer=True)
    s.tokens = filter_tokens(s.tokens)
    if len(s.tokens) > 0:
        return s
    else:
        return None

In [5]:
# extract the important information from the original S2ORC corpus format
def process_link(link, process_section):
    context = link['citation_context']
    citing_paper = link['citing_paper']
    cited_paper = link['cited_paper']
    
    parts = []
    for text_chunk in cited_paper['grobid_parse']['body_text']:
        text = text_chunk.get('text')
        if text is not None:
            chunk_parts = process_section(text)
            parts.extend(chunk_parts)
    citing_string = ''.join([context['pre_context'], context['context_string'], context['post_context']])
    return {
        'citing_str': context['context_string'],
        'citing_context': citing_string,
        'citing_context_part': text_to_sentence(citing_string),
        'cited_text_parts': parts,
    }


In [6]:
def calc_embedding_scores(link, metrics, embedding_name, embedding):
    all_sim = {}
    
    s = link['citing_context_part']
    embedding.embed(s)
    citation_embedding = s.embedding.detach().numpy()

    sentences = link['cited_text_parts']
    for sentence in sentences:
        embedding.embed(sentence)
        all_sim[sentence.to_plain_string()] = {}
        for name, metric in metrics.items():
            sim = 1 - metric(sentence.embedding.detach().numpy(), citation_embedding)
            full_name = '_'.join([embedding_name, name])
            all_sim[sentence.to_plain_string()][full_name] = sim
        sentence.clear_embeddings()
    s.clear_embeddings()
    
    return pd.DataFrame(all_sim).T


### Bag of words (unweighted)

In [7]:
def calc_bow_occurrence_score(link, name):
    scores = []
    s = Sentence(link['citing_context'], use_tokenizer=True)
    context_token_set = set(token.text for token in s.tokens)
    sentences = link['cited_text_parts']
    for sentence in sentences:
        sent_token_set = set(token.text for token in sentence.tokens)
        score = len(sent_token_set.intersection(context_token_set)) / len(sent_token_set.union(context_token_set))            
        scores.append(score)
    return pd.DataFrame({
        name: scores
    }, index=[sent.to_plain_string() for sent in sentences])

### Bag of words weighted by Inverse Term Frequency (with lemmatization)

In [8]:
from wordfreq import word_frequency
from collections import defaultdict
from nltk.stem import WordNetLemmatizer 
import skfuzzy as fuzz

def get_length_penalty(sent_len):
    # 0.5 value for len=2
    # quickly raising to 1 at ~10 words
    return fuzz.sigmf(sent_len, 2, 0.5)

def get_word_itf(string_tokens):
    min_val = 10e-7
    return [1./max(min_val, word_frequency(token, 'en')) for token in string_tokens]

def transform_tokens(string_tokens, lemmatizer):
    return [lemmatizer.lemmatize(token.lower()) for token in string_tokens]

def calc_bow_itf_score(link, name):
    """
    Score using term frequency
    """
    lemmatizer = WordNetLemmatizer()
    scores = []
    s = link['citing_context_part']
    s_tokens = [token.text for token in s.tokens]
    context_token_set = set(transform_tokens(s_tokens, lemmatizer=lemmatizer))
    sentences = link['cited_text_parts']
    for sentence in sentences:
        sent_tokens = [token.text for token in sentence.tokens]
        sent_token_set = set(transform_tokens(sent_tokens, lemmatizer))
        if len(sent_token_set) == 0:
            scores.append(0)
            continue
        matching_tokens = sent_token_set.intersection(context_token_set)
        all_tokens = sent_token_set.union(context_token_set)
        raw_score = np.sum(get_word_itf(matching_tokens)) / np.sum(get_word_itf(all_tokens))
        # Eliminate artifacts by penalizing extremely short matches
        raw_score *= get_length_penalty(len(sent_token_set))
        if np.isclose(raw_score, 0):
            score = 0.
        else:
            score = 1./-np.log(raw_score)
        scores.append(score)
    return pd.DataFrame({
        name: scores
    }, index=[sent.to_plain_string() for sent in sentences])


###  TS SS distance metric

In [9]:
from scipy.spatial import distance
from scipy.linalg import norm

#the below functions are slightly modified versions 
#of the ones in https://github.com/taki0112/Vector_Similarity

def TS_SS(vec1, vec2) :
    val = Triangle(vec1, vec2) * Sector(vec1, vec2)
    return min(1, val/3)

def Triangle(vec1, vec2) :
    theta = math.radians(Theta(vec1,vec2))
    return (norm(vec1) * norm(vec2) * math.sin(theta)) / 2

def Theta(vec1, vec2) :
    return math.acos(1 - distance.cosine(vec1, vec2)) + math.radians(10)

def Magnitude_Difference(vec1, vec2) :
    return abs(norm(vec1) - norm(vec2))

def Sector(vec1, vec2) :
    ED = distance.euclidean(vec1, vec2)
    MD = Magnitude_Difference(vec1, vec2)
    theta = Theta(vec1, vec2)
    return math.pi * math.pow((ED+MD),2) * theta/360

### Prepare data

In [10]:
bert_embedding = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=False)
roberta_embedding = TransformerDocumentEmbeddings('roberta-base', fine_tune=False)
glove_embedding = DocumentPoolEmbeddings([WordEmbeddings('glove')])

In [11]:
n_train_links = int(0.5 * len(links))
n_validation_links = int(0.2 * len(links))
n_test_links = len(links) - n_train_links - n_validation_links
train_links = links[:15]
validation_links = links[15:45]
test_links = links[-n_test_links:]
#links = None

In [12]:
import copy
def prepare_link_and_val(link, val_links):
    result = {"original" : process_link(link, process_section_to_chunks)}
    for i in range(len(val_links)):
        val_link = copy.deepcopy(val_links[i])
        val_link2 = copy.deepcopy(link)
        val_link2['citation_context'] = val_link['citation_context']
        val_link['citation_context'] = link['citation_context']
        result.update({
            "val_orig_context_"+str(i) : process_link(val_link, process_section_to_chunks),
            "val_orig_paper_"+str(i) : process_link(val_link2, process_section_to_chunks)
        })
    return result

### Score pipeline

In [13]:
from plot_text_sim import plot_text_sim 
from scipy.spatial import distance
import itertools
metrics = {"cos" : distance.cosine, "ts_ss" : TS_SS}

# do it in a function to prevent memory leaks
def calculate_similarities(train_links, validation_links):
    results = []
    for link, val_links in tqdm.tqdm(list(itertools.zip_longest(train_links, validation_links))):
        
        preprocessed = prepare_link_and_val(link, val_links or [])
        bow_itf = {name: calc_bow_itf_score(link, name='bow_itf_'+name) for name, link in preprocessed.items()}
        
        bert = {name: calc_embedding_scores(link, metrics, embedding_name='bert_'+name, embedding = bert_embedding) for name, link in preprocessed.items()}
        
        glove = {name: calc_embedding_scores(link, metrics, embedding_name='glove_'+name, embedding = glove_embedding) for name, link in preprocessed.items()}
        roberta = {name: calc_embedding_scores(link, metrics, embedding_name='roberta_'+name, embedding = roberta_embedding) for name, link in preprocessed.items()}
        bow_occur = {name: calc_bow_occurrence_score(link, name='bow_occurrence_'+name) for name, link in preprocessed.items()}
        
        data_emb = {name : pd.merge(pd.merge(glove[name], bert[name], left_index = True, right_index = True), roberta[name], left_index = True, right_index = True) for name, link in preprocessed.items()}
        data_bow = {name : pd.merge(bow_occur[name], bow_itf[name], left_index = True, right_index = True) for name, link in preprocessed.items()}
        data = {name : pd.merge(data_emb[name], data_bow[name], left_index = True, right_index = True) for name, link in preprocessed.items()}
#         data = {name : bow_itf[name] for name, link in preprocessed.items()}
        results.append({
            'citing_str': preprocessed['original']['citing_str'],
            'citing_context': preprocessed['original']['citing_context'],
            'data': data,
        })

    return results, preprocessed


In [14]:

start = 12
span = 1
val_span = 2
val = np.split(validation_links[start:start+val_span*span], span)
train = train_links[start:start+span]
results, preprocessed = calculate_similarities(train, val)

  0%|          | 0/1 [00:00<?, ?it/s]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
processed_results = copy.deepcopy(results)

for result in processed_results:
    df = result['data']['original']
    df['bert_original_cos_ampl'] = df['bert_original_cos']**6
    df['comb_bow_itf_bert_cos'] = np.sqrt(df['bow_itf_original'] * df['bert_original_cos_ampl'])
    df['comb_bow_itf_bert_ts_ss'] = np.sqrt(df['bow_itf_original'] * df['bert_original_ts_ss'])
    # derive weighted moving average
    for column in df.columns:
        df[f'{column}_rolling'] = df[column].rolling(4, center=True, win_type='gaussian').mean(std=2)


In [None]:
processed_results = copy.deepcopy(results)

for result in processed_results:
    df = result['data']['original']
    df['comb_bow_itf_bert'] = np.sqrt(df['bow_itf_original'] * (1 - df['bert_original_cos'])**4)
    df['comb_bow_itf_glove'] = np.sqrt(df['bow_itf_original'] * (1 - df['glove_original_cos'])**4)
    df['comb_bow_itf_roberta'] = np.sqrt(df['bow_itf_original'] * (1 - df['roberta_original_cos'])**4)
    # derive weighted moving average
    for column in df.columns:
        df[f'{column}_rolling'] = df[column].rolling(4, center=True, win_type='gaussian').mean(std=2)

In [None]:
import pickle
with open("results_emb_bow.p", "wb") as file:
    pickle.dump(processed_results, file)

In [None]:
with open("preprocessed_.p", "wb") as file:
    pickle.dump(preprocessed, file)

In [None]:
def mean_results(results):
    return{
        name : {
            column : np.mean(df[column]) for column in df.columns if df[column].dtype == float
        } for name, df in results.items()
    }

In [None]:
mean_res = mean_results(results[0]["data"])

In [None]:
with open("results_mean.p", "wb") as file:
    pickle.dump(mean_res, file)

In [None]:
process_link(train_links[start], process_section_to_chunks)['citing_str'] 

In [None]:
import hiplot as hip
result_df = results[0]['data']['original'].assign(text_beginning = lambda df: df.index.str[:10])
exp = hip.Experiment.from_dataframe(result_df)
displayed_exp = exp.display()

In [None]:
_test_results, _test_preprocessed = calculate_similarities([test[2]], [])


### Slightly wider manual comparison

In [None]:
start = 0
span = 10
test = test_links[start:start+span]

test_results, test_preprocessed = calculate_similarities(test, [])

processed_test_results = copy.deepcopy(test_results)

for result in processed_test_results:
    df = result['data']['original']
    df['bert_original_cos_ampl'] = df['bert_original_cos']**6
    df['comb_bow_itf_bert_cos'] = np.sqrt(df['bow_itf_original'] * df['bert_original_cos_ampl'])
    df['comb_bow_itf_bert_ts_ss'] = np.sqrt(df['bow_itf_original'] * df['bert_original_ts_ss'])
    # derive weighted moving average
    for column in df.columns:
        df[f'{column}_rolling'] = df[column].rolling(4, center=True, win_type='gaussian').mean(std=2)


In [None]:
import pickle
with open("results_emb_bow_test.p", "wb") as file:
    pickle.dump(processed_test_results, file)

In [None]:
from scipy import stats
stats.spearmanr(df['bert_original_cos_ampl'],df['bert_original_ts_ss'] )

In [None]:

def display_best(df, column):
    print(f'\n-- {column.upper()} --')
    print(*[f'{i+1}: "{chunk}"' 
            for i, chunk in enumerate(df[column].sort_values(ascending=False).index.values.tolist()[:5])
           ],
          sep='\n'
         )
    
for result in processed_test_results:
    result['citing_str']
    result['citing_context']
    df = result['data']['original']
    display_best(df, 'bow_itf_original')
    display_best(df, 'bert_original_ts_ss')
    display_best(df, 'comb_bow_itf_bert_ts_ss')


In [None]:



for result in processed_test_results:
    result['citing_str']
    result['citing_context']
    df = result['data']['original']
    _ = plot_text_sim(df['bow_itf_original'], df.index, title='bow_itf')
    _ = plot_text_sim(df['bert_original_cos_ampl'], df.index, title='bert_cos')
    _ = plot_text_sim(df['bert_original_ts_ss'], df.index, title='bert_ts_ss')
    _ = plot_text_sim(df['comb_bow_itf_bert_cos'], df.index, title='comb_bow_itf_bert_cos')
    _ = plot_text_sim(df['comb_bow_itf_bert_ts_ss'], df.index, title='comb_bow_itf_bert_ts_ss')

