In [1]:
### Imports and configuration

# setup variables

import os
import json
import tqdm
from s2orc.config import CURRENT_VERSION

# jsonlines https://jsonlines.readthedocs.io/en/latest/#api
import jsonlines
import gzip
import numpy as np
import matplotlib.pyplot as plt
import hiplot # <3

LOCAL_S2ORC_DIR = 's2orc-data'

psychology_paper_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'psychology')
psychology_paper_suffix = 'psych.text.jsonl'

links_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'psych_links')
links_suffix = 'psych.text.link.jsonl'


In [2]:
## Get corpus into memory

start = 0
span = 100 # all: 1700

links = []

links_files = sorted(os.listdir(links_dir), key=lambda f: int(f.split('.')[0]))[start:(start+span)]
for link_file in tqdm.tqdm(links_files):
    with gzip.open(os.path.join(links_dir, link_file), 'rb') as f_in:
        batch_links = list(jsonlines.Reader(f_in))
        for link in batch_links:
            if link['citing_paper']['grobid_parse'].get('body_text') is not None and link['cited_paper']['grobid_parse'].get('body_text') is not None:
                links.append(link)

np.random.seed(2134234)
links = np.array(links)
np.random.shuffle(links)

100%|██████████| 100/100 [00:04<00:00, 21.31it/s]


# work on the data

In [3]:
from syntok.segmenter import split
from syntok.tokenizer import Tokenizer
import json
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, StackedEmbeddings, TransformerDocumentEmbeddings
from flair.data import Sentence
import pandas as pd
import numpy as np
import tqdm
import sys

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
def process_section_to_chunks(text):
    chunksize = 3
    tokenized_sents = list(split(Tokenizer().tokenize(text)))
    sents = [' '.join(str(token) for token in sent) for sent in tokenized_sents]
    sents = [' '.join(sents[i:i+chunksize]) for i in range(len(sents)-chunksize)]
    sentences = [s for s in [Sentence(sent, use_tokenizer=True) for sent in sents]\
                 if len(s.tokens) > 0]
    return sentences

In [5]:
def partition_get_whole_section(text):
    return [text]

def process_link(link, process_section):
    context = link['citation_context']
    citing_paper = link['citing_paper']
    cited_paper = link['cited_paper']
    
    parts = []
    for text_chunk in cited_paper['grobid_parse']['body_text']:
        text = text_chunk.get('text')
        if text is not None:
            chunk_parts = process_section(text)
            parts.extend(chunk_parts)
    citing_string = ''.join([context['pre_context'], context['context_string'], context['post_context']])
    return {
        'citing_str': context['context_string'],
        'citing_context': citing_string,
        'cited_text_parts': parts,
    }

In [6]:
def calc_embedding_scores(link, metrics, embedding_name, embedding):
    all_sim = {}
    
    s = Sentence(link['citing_context'], use_tokenizer=True)
    embedding.embed(s)
    citation_embedding = s.embedding.detach().numpy()
    sentences = link['cited_text_parts']
    for sentence in sentences:
        sentence.clear_embeddings()
    embedding.embed(sentences)
    for sentence in sentences:
        all_sim[sentence.to_plain_string()] = {}
        for name, metric in metrics.items():
            sim = metric(sentence.embedding.detach().numpy(), citation_embedding)
            full_name = '_'.join([embedding_name, name])
            all_sim[sentence.to_plain_string()][full_name] = sim
    return pd.DataFrame(all_sim).T

In [7]:
from scipy.spatial import distance
from scipy.linalg import norm
from Vector_Similarity import *

def TS_SS(vec1, vec2) :
    return Triangle(vec1, vec2) * Sector(vec1, vec2)

def Triangle(vec1, vec2) :
    theta = math.radians(Theta(vec1,vec2))
    return (norm(vec1) * norm(vec2) * math.sin(theta)) / 2

def Theta(vec1, vec2):
    try:
        return math.acos(1-distance.cosine(vec1, vec2)) + math.radians(10)
    except:
        #print(vec1)
        #print(vec2)
        print(distance.cosine(vec1, vec2))
        

def Magnitude_Difference(vec1, vec2) :
    return abs(norm(vec1) - norm(vec2))

def Sector(vec1, vec2) :
    ED = distance.euclidean(vec1, vec2)
    MD = Magnitude_Difference(vec1, vec2)
    theta = Theta(vec1, vec2)
    return math.pi * math.pow((ED+MD),2) * theta/360

2.23606797749979
0.9999999999999998
0.0004639582566999478


### cool pipelines with cool plots and coolest hiplots

In [8]:
bert_embedding = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=False)
roberta_embedding = TransformerDocumentEmbeddings('roberta-base', fine_tune=False)
glove_embedding = DocumentPoolEmbeddings([WordEmbeddings('glove')])

In [9]:
import copy
def prepare_link_and_val(link, val_links):
    result = {"original" : process_link(link, process_section_to_chunks)}
    for i in range(len(val_links)):
        val_link = copy.deepcopy(val_links[i])
        val_link2 = copy.deepcopy(link)
        val_link2['citation_context'] = val_link['citation_context']
        val_link['citation_context'] = link['citation_context']
        result.update({
            "val_orig_context_"+str(i) : process_link(val_link, process_section_to_chunks),
            "val_orig_paper_"+str(i) : process_link(val_link2, process_section_to_chunks)
        })
    return result

In [10]:
#n_train_links = int(0.5 * len(links))
#n_validation_links = int(0.2 * len(links))
#n_test_links = len(links) - n_train_links - n_validation_links
train_links = links[:15]
validation_links = links[15:30]
#test_links = links[-n_test_links:]
#links = None

In [11]:
from plot_text_sim import plot_text_sim 
metrics = {"cos" : distance.cosine, "ts_ss" : TS_SS}
start = 7
span = 1
val_span = 2
results = []
val = np.split(validation_links[start:start+val_span], span)
for link, val_links in zip(train_links[start:start+span], val):
    preprocessed = prepare_link_and_val(link, val_links)
    glove = {name: calc_embedding_scores(link, metrics, embedding_name='glove_'+name, embedding = glove_embedding) for name, link in preprocessed.items()}
    bert = {name: calc_embedding_scores(link, metrics, embedding_name='bert_'+name, embedding = bert_embedding) for name, link in preprocessed.items()}
    roberta = {name: calc_embedding_scores(link, metrics, embedding_name='roberta_'+name, embedding = roberta_embedding) for name, link in preprocessed.items()}
    data = {name : pd.merge(pd.merge(glove[name], bert[name], left_index = True, right_index = True), roberta[name], left_index = True, right_index = True) for name, link in preprocessed.items()}
    results.append(data)
    for link in preprocessed.values():
        for sentence in link['cited_text_parts']:
            sentence.clear_embeddings()

In [12]:
with open("preprocessed_chunk3.p", "wb") as file:
    pickle.dump(preprocessed, file)

NameError: name 'pickle' is not defined

In [None]:
import pickle
with open("first_results_chunk3.p", "wb") as file:
    pickle.dump(results, file)

In [None]:
results[0]["original"]['text_beginning'] = results[0]["original"].index.str[:10]
_ = plot_text_sim(results[0]["original"]['bert_original_cos'], results[0]["original"].index)

In [None]:
results[0]["val1"]['text_beginning'] = results[0]["val1"].index.str[:10]
_ = plot_text_sim(results[0]["val1"]['bert_val1_cos'], results[0]["val1"].index)

In [None]:
results[0].keys()

In [None]:
results[0]["val2"]['text_beginning'] = results[0]["val2"].index.str[:10]
_ = plot_text_sim(results[0]["val2"]['bert_val2_cos'], results[0]["val2"].index)

In [None]:
def mean_results(results):
    return{
        name : {
            column : np.mean(df[column]) for column in df.columns if df[column].dtype == float
        } for name, df in results.items()
    }

In [None]:
mean_res = mean_results(results[0])
mean_res["original"], mean_res["val1"], mean_res["val2"]

In [None]:
with open("first_results_mean_chunk3.p", "wb") as file:
    pickle.dump(mean_res, file)

In [None]:
import hiplot as hip
exp = hip.Experiment.from_dataframe(results[0]["original"])
displayed_exp = exp.display()
displayed_exp

In [None]:
import hiplot as hip
exp = hip.Experiment.from_dataframe(results[0]["val2"])
displayed_exp = exp.display()
displayed_exp

In [None]:
import hiplot as hip
exp = hip.Experiment.from_dataframe(results[0]["val1"])
displayed_exp = exp.display()
displayed_exp

In [None]:
import time
from IPython.display import clear_output

selected = displayed_exp.get_selected()
idx = np.array([int(data_point.uid) for data_point in selected])
clear_output(wait=True)
print(*list(zip(idx, df.iloc[idx, :].index.to_list())), sep='\n')

# TODO
 ## Dimensionality reduction of embeddings!!!!!!!!!!!!!!!!!1