In [2]:
### Imports and configuration

# setup variables

import os
import json
import tqdm
from s2orc.config import CURRENT_VERSION

# jsonlines https://jsonlines.readthedocs.io/en/latest/#api
import jsonlines
import gzip
import numpy as np
import matplotlib.pyplot as plt
import hiplot # <3

LOCAL_S2ORC_DIR = 's2orc-data'

psychology_paper_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'psychology')
psychology_paper_suffix = 'psych.text.jsonl'

links_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'psych_links')
links_suffix = 'psych.text.link.jsonl'


In [95]:
## Get corpus into memory

start = 0
span = 100 # all: 1700

links = []

links_files = sorted(os.listdir(links_dir), key=lambda f: int(f.split('.')[0]))[start:(start+span)]
for link_file in tqdm.tqdm(links_files):
    with gzip.open(os.path.join(links_dir, link_file), 'rb') as f_in:
        batch_links = list(jsonlines.Reader(f_in))
        for link in batch_links:
            if link['citing_paper']['grobid_parse'].get('body_text') is not None and link['cited_paper']['grobid_parse'].get('body_text') is not None:
                links.append(link)

np.random.seed(2134234)
links = np.array(links)
np.random.shuffle(links)

100%|██████████| 100/100 [00:03<00:00, 31.01it/s]


# work on the data

In [5]:
from syntok.segmenter import split
from syntok.tokenizer import Tokenizer
import json
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, StackedEmbeddings, TransformerDocumentEmbeddings
from flair.data import Sentence
import pandas as pd
import numpy as np
import tqdm
import sys


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be

In [6]:
def process_section_to_chunks(text):
    chunksize = 1
    tokenized_sents = list(split(Tokenizer().tokenize(text)))
    sents = [' '.join(str(token) for token in sent) for sent in tokenized_sents]
    sents = [' '.join(sents[i:i+chunksize]) for i in range(len(sents)-chunksize)]
    sentences = [s for s in [Sentence(sent, use_tokenizer=True) for sent in sents]\
                 if len(s.tokens) > 0]
    return sentences

In [7]:
def partition_get_whole_section(text):
    return [text]

def process_link(link, process_section):
    context = link['citation_context']
    citing_paper = link['citing_paper']
    cited_paper = link['cited_paper']
    
    parts = []
    for text_chunk in cited_paper['grobid_parse']['body_text']:
        text = text_chunk.get('text')
        if text is not None:
            chunk_parts = process_section(text)
            parts.extend(chunk_parts)
    citing_string = ''.join([context['pre_context'], context['context_string'], context['post_context']])
    return {
        'citing_str': context['context_string'],
        'citing_context': citing_string,
        'cited_text_parts': parts,
    }

In [8]:
def calc_embedding_scores(link, metrics, embedding_name, embedding):
    all_sim = {}
    
    s = Sentence(link['citing_context'], use_tokenizer=True)
    embedding.embed(s)
    citation_embedding = s.embedding.detach().numpy()
    sentences = link['cited_text_parts']
    for sentence in sentences:
        sentence.clear_embeddings()
    embedding.embed(sentences)
    for sentence in sentences:
        all_sim[sentence.to_plain_string()] = {}
        for name, metric in metrics.items():
            sim = metric(sentence.embedding.detach().numpy(), citation_embedding)
            full_name = '_'.join([embedding_name, name])
            all_sim[sentence.to_plain_string()][full_name] = sim
    return pd.DataFrame(all_sim).T

In [9]:
from scipy.spatial import distance
from scipy.linalg import norm
from Vector_Similarity import *

def TS_SS(vec1, vec2) :
    return Triangle(vec1, vec2) * Sector(vec1, vec2)

def Triangle(vec1, vec2) :
    theta = math.radians(Theta(vec1,vec2))
    return (norm(vec1) * norm(vec2) * math.sin(theta)) / 2

def Theta(vec1, vec2):
    try:
        return math.acos(1-distance.cosine(vec1, vec2)) + math.radians(10)
    except:
        #print(vec1)
        #print(vec2)
        print(distance.cosine(vec1, vec2))
        

def Magnitude_Difference(vec1, vec2) :
    return abs(norm(vec1) - norm(vec2))

def Sector(vec1, vec2) :
    ED = distance.euclidean(vec1, vec2)
    MD = Magnitude_Difference(vec1, vec2)
    theta = Theta(vec1, vec2)
    return math.pi * math.pow((ED+MD),2) * theta/360

2.23606797749979
0.9999999999999998
0.0004639582566999478


### cool pipelines with cool plots and coolest hiplots

In [40]:
bert_embedding = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=False)
roberta_embedding = TransformerDocumentEmbeddings('roberta-base', fine_tune=False)
glove_embedding = DocumentPoolEmbeddings([WordEmbeddings('glove')])

In [103]:
import copy
def prepare_link_and_val(link, val_links):
    result = {"original" : process_link(link, process_section_to_chunks)}
    for i in range(len(val_links)):
        val_link = copy.deepcopy(val_links[i])
        val_link2 = copy.deepcopy(link)
        val_link2['citation_context'] = val_link['citation_context']
        val_link['citation_context'] = link['citation_context']
        result.update({
            "val_orig_context_"+str(i) : process_link(val_link, process_section_to_chunks),
            "val_orig_paper_"+str(i) : process_link(val_link2, process_section_to_chunks)
        })
    return result

In [97]:
#n_train_links = int(0.5 * len(links))
#n_validation_links = int(0.2 * len(links))
#n_test_links = len(links) - n_train_links - n_validation_links
train_links = links[:15]
validation_links = links[15:30]
#test_links = links[-n_test_links:]
#links = None

In [None]:
from plot_text_sim import plot_text_sim 
metrics = {"cos" : distance.cosine, "ts_ss" : TS_SS}
start = 7
span = 1
val_span = 2
results = []
val = np.split(validation_links[start:start+val_span], span)
for link, val_links in zip(train_links[start:start+span], val):
    preprocessed = prepare_link_and_val(link, val_links)
    glove = {name: calc_embedding_scores(link, metrics, embedding_name='glove_'+name, embedding = glove_embedding) for name, link in preprocessed.items()}
    bert = {name: calc_embedding_scores(link, metrics, embedding_name='bert_'+name, embedding = bert_embedding) for name, link in preprocessed.items()}
    roberta = {name: calc_embedding_scores(link, metrics, embedding_name='roberta_'+name, embedding = roberta_embedding) for name, link in preprocessed.items()}
    data = {name : pd.merge(pd.merge(glove[name], bert[name], left_index = True, right_index = True), roberta[name], left_index = True, right_index = True) for name, link in preprocessed.items()}
    results.append(data)
    for link in preprocessed.values():
        for sentence in link['cited_text_parts']:
            sentence.clear_embeddings()


invalid value encountered in float_scalars



In [67]:
preprocessed.keys()

dict_keys(['original', 'val_orig_context_0', 'val_orig_paper_0', 'val_orig_context_1', 'val_orig_paper_1'])

In [71]:
preprocessed['val_orig_paper_1']["citing_context"]

". Sleep deprivation has serious negative consequences for people's health and well-being: it causes hormonal and immunological disruptions that put people at risk for developing diabetes, obesity, cancer, cardiovascular disease, chronic infections, and neuropsychiatric diseases (Strine and Chapman, 2005; Irwin, 2015) . Sleep deprivation reduces health-related quality of life (Paiva et al., 2015) and contributes to increased mortality (Gallicchio and Kalesan, 2009 ). Sleep researchers have long focused on identifying medical causes for sleep insufficiency (e.g., sleep apnea, insomnia), but"

In [69]:
preprocessed['val_orig_context_0']["citing_context"]

". Sleep deprivation has serious negative consequences for people's health and well-being: it causes hormonal and immunological disruptions that put people at risk for developing diabetes, obesity, cancer, cardiovascular disease, chronic infections, and neuropsychiatric diseases (Strine and Chapman, 2005; Irwin, 2015) . Sleep deprivation reduces health-related quality of life (Paiva et al., 2015) and contributes to increased mortality (Gallicchio and Kalesan, 2009 ). Sleep researchers have long focused on identifying medical causes for sleep insufficiency (e.g., sleep apnea, insomnia), but"

In [54]:
with open("preprocessed.p", "wb") as file:
    pickle.dump(preprocessed, file)

In [55]:
import pickle
with open("first_results.p", "wb") as file:
    pickle.dump(results, file)

In [56]:
results[0]["original"]['text_beginning'] = results[0]["original"].index.str[:10]
_ = plot_text_sim(results[0]["original"]['bert_original_cos'], results[0]["original"].index)

In [15]:
results[0]["val1"]['text_beginning'] = results[0]["val1"].index.str[:10]
_ = plot_text_sim(results[0]["val1"]['bert_val1_cos'], results[0]["val1"].index)

In [58]:
results[0].keys()

dict_keys(['original', 'val_orig_context_0', 'val_orig_paper_0', 'val_orig_context_1', 'val_orig_paper_1'])

In [57]:
results[0]["val2"]['text_beginning'] = results[0]["val2"].index.str[:10]
_ = plot_text_sim(results[0]["val2"]['bert_val2_cos'], results[0]["val2"].index)

KeyError: 'val2'

In [27]:
def mean_results(results):
    return{
        name : {
            column : np.mean(df[column]) for column in df.columns if df[column].dtype == float
        } for name, df in results.items()
    }

In [28]:
mean_res = mean_results(results[0])
mean_res["original"], mean_res["val1"], mean_res["val2"]

({'glove_original_cos': 0.1011742378103322,
  'glove_original_ts_ss': 0.0018200093773764342,
  'bert_original_cos': 0.25996915698051454,
  'bert_original_ts_ss': 2.061615702028393,
  'roberta_original_cos': 0.004458772314005884,
  'roberta_original_ts_ss': 0.0010263028691913931},
 {'glove_val1_cos': 0.09375385038531313,
  'glove_val1_ts_ss': 0.0018688362978431972,
  'bert_val1_cos': 0.20651484835556363,
  'bert_val1_ts_ss': 1.3820442829151451,
  'roberta_val1_cos': 0.004113437787766026,
  'roberta_val1_ts_ss': 0.0015539529869920353},
 {'glove_val2_cos': 0.1011742378103322,
  'glove_val2_ts_ss': 0.0018200093773764342,
  'bert_val2_cos': 0.25996915698051454,
  'bert_val2_ts_ss': 2.061615702028393,
  'roberta_val2_cos': 0.004458772314005884,
  'roberta_val2_ts_ss': 0.0010263028691913931})

In [35]:
with open("first_results_mean.p", "wb") as file:
    pickle.dump(mean_res, file)

In [29]:
import hiplot as hip
exp = hip.Experiment.from_dataframe(results[0]["original"])
displayed_exp = exp.display()
displayed_exp

<IPython.core.display.Javascript object>

<hiplot.ipython.IPythonExperimentDisplayed at 0x1cb7afd6d8>

In [31]:
import hiplot as hip
exp = hip.Experiment.from_dataframe(results[0]["val2"])
displayed_exp = exp.display()
displayed_exp

<IPython.core.display.Javascript object>

<hiplot.ipython.IPythonExperimentDisplayed at 0x1cb7d169e8>

In [30]:
import hiplot as hip
exp = hip.Experiment.from_dataframe(results[0]["val1"])
displayed_exp = exp.display()
displayed_exp

<IPython.core.display.Javascript object>

<hiplot.ipython.IPythonExperimentDisplayed at 0x1cb7bd9e48>

In [14]:
import time
from IPython.display import clear_output

selected = displayed_exp.get_selected()
idx = np.array([int(data_point.uid) for data_point in selected])
clear_output(wait=True)
print(*list(zip(idx, df.iloc[idx, :].index.to_list())), sep='\n')

GetSelectedFailure: No data received from the front-end. Please make sure that:
    1. You don't call "get_selected" on the same cell
    2. The interface has loaded
    3. You are in a Jupyter notebook (Jupyter lab is *not* supported)

# TODO
 ## Dimensionality reduction of embeddings!!!!!!!!!!!!!!!!!1