In [None]:
### Imports and configuration

# setup variables

import os
import json
import tqdm
from s2orc.config import CURRENT_VERSION

# jsonlines https://jsonlines.readthedocs.io/en/latest/#api
import jsonlines
import gzip
import numpy as np
import matplotlib.pyplot as plt
import hiplot # <3

LOCAL_S2ORC_DIR = 's2orc-data'

psychology_paper_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'psychology')
psychology_paper_suffix = 'psych.text.jsonl'

links_dir = os.path.join(LOCAL_S2ORC_DIR, CURRENT_VERSION, 'psych_links')
links_suffix = 'psych.text.link.jsonl'


In [None]:
## Get corpus into memory

start = 0
span = 100 # all: 1700

links = []

links_files = sorted(os.listdir(links_dir), key=lambda f: int(f.split('.')[0]))[start:(start+span)]
for link_file in tqdm.tqdm(links_files):
    with gzip.open(os.path.join(links_dir, link_file), 'rb') as f_in:
        batch_links = list(jsonlines.Reader(f_in))
        for link in batch_links:
            if link['citing_paper']['grobid_parse'].get('body_text') is not None and link['cited_paper']['grobid_parse'].get('body_text') is not None:
                links.append(link)

np.random.seed(2134234)
links = np.array(links)
np.random.shuffle(links)

In [None]:
n_train_links = int(0.5 * len(links))
n_validation_links = int(0.2 * len(links))
n_test_links = len(links) - n_train_links - n_validation_links
train_links = links[:15]
#validation_links = links[n_train_links:n_train_links + n_validation_links]
#test_links = links[-n_test_links:]
links = None

# work on the data

In [None]:
from syntok.segmenter import split
from syntok.tokenizer import Tokenizer
import json
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, StackedEmbeddings, TransformerDocumentEmbeddings
from flair.data import Sentence
import pandas as pd
import numpy as np
import tqdm
import sys


In [None]:
def process_section_to_chunks(text):
    chunksize = 1
    tokenized_sents = list(split(Tokenizer().tokenize(text)))
    sents = [' '.join(str(token) for token in sent) for sent in tokenized_sents]
    sents = [' '.join(sents[i:i+chunksize]) for i in range(len(sents)-chunksize)]
    sentences = [s for s in [Sentence(sent, use_tokenizer=True) for sent in sents]\
                 if len(s.tokens) > 0]
    return sentences
                

In [None]:
def partition_get_whole_section(text):
    return [text]

def process_link(link, process_section):
    context = link['citation_context']
    citing_paper = link['citing_paper']
    cited_paper = link['cited_paper']
    
    parts = []
    for text_chunk in cited_paper['grobid_parse']['body_text']:
        text = text_chunk.get('text')
        if text is not None:
            chunk_parts = process_section(text)
            parts.extend(chunk_parts)
    citing_string = ''.join([context['pre_context'], context['context_string'], context['post_context']])
    return {
        'citing_str': context['context_string'],
        'citing_context': citing_string,
        'cited_text_parts': parts,
    }

# process_link(train_links[0], process_section_to_chunks)




In [None]:
def calc_embedding_scores(link, metrics, embedding_name, embedding):
    all_sim = {}
    
    s = Sentence(link['citing_context'], use_tokenizer=True)
    embedding.embed(s)
    citation_embedding = s.embedding.detach().numpy()

    sentences = link['cited_text_parts']
    for sentence in sentences:
        embedding.embed(sentence)
    for sentence in sentences:
        all_sim[sentence.to_plain_string()] = {}
        for name, metric in metrics.items():
            sim = metric(sentence.embedding.detach().numpy(), citation_embedding)
            full_name = '_'.join([embedding_name, name])
            all_sim[sentence.to_plain_string()][full_name] = sim
    for sentence in sentences:
        sentence.clear_embeddings()
    
    return pd.DataFrame(all_sim).T


In [226]:
def calc_bow_occurrence_score(link, name):
    scores = []
    s = Sentence(link['citing_context'], use_tokenizer=True)
    context_token_set = set(token.text for token in s.tokens)
#     print (context_token_set)
    sentences = link['cited_text_parts']
#     print('SENTS')
    for sentence in sentences:
        sent_token_set = set(token.text for token in sentence.tokens)
#         print (sent_token_set)
        
        score = len(sent_token_set.intersection(context_token_set)) / len(context_token_set)            
        scores.append(score)
    return pd.DataFrame({
        name: scores
    }, index=[sent.to_plain_string() for sent in sentences])

In [255]:
from wordfreq import word_frequency
import regex
from collections import defaultdict

def get_word_itf(string_tokens, weights: dict):
    min_val = 10e-7
    return [weights[token] * 1./max(min_val, word_frequency(token, 'en')) for token in string_tokens]

science_blacklist = {'al', 'al.', 'et'}

token_re = r'^\p{L}(\p{L}|\.|[0-9])+$'
def process_tokens(string_tokens):
    return [token.lower() for token in string_tokens if regex.match(token_re, token) and token not in science_blacklist]

def calc_bow_itf_score(link, name):
    """
    Score using term frequency
    """
    scores = []
    s = Sentence(link['citing_context'], use_tokenizer=True)
    s_tokens = [token.text for token in s.tokens]
    processed_tokens = process_tokens(s_tokens)
    print(processed_tokens)
#     weights = defaultdict(float)
#     for i, token in enumerate(processed_tokens):
#         weight = i - len(processed_tokens)/2.          
#         weights[token] = max(weight, weights.get(token, 0))
    weights = defaultdict(lambda: 1)
    context_token_set = set(processed_tokens)
    total_denominator = np.sum(get_word_itf(context_token_set, weights=defaultdict(lambda: 1)))
    sentences = link['cited_text_parts']
    for sentence in sentences:
        sent_tokens = [token.text for token in sentence.tokens]
        sent_token_set = set(process_tokens(sent_tokens))
#         print (sent_token_set)
        matching = sent_token_set.intersection(context_token_set)
        token_sum = np.sum([get_word_itf(matching, weights=weights)]) * len(matching) / len(sent_token_set)
        score = token_sum/total_denominator #/  np.sum(list(weights.values()))
        scores.append(score)
    return pd.DataFrame({
        name: scores
    }, index=[sent.to_plain_string() for sent in sentences])

In [None]:
from scipy.spatial import distance
from scipy.linalg import norm
from Vector_Similarity import *

def TS_SS(vec1, vec2) :
    return Triangle(vec1, vec2) * Sector(vec1, vec2)

def Triangle(vec1, vec2) :
    theta = math.radians(Theta(vec1,vec2))
    return (norm(vec1) * norm(vec2) * math.sin(theta)) / 2

def Theta(vec1, vec2) :
    return math.acos(1 - distance.cosine(vec1, vec2)) + math.radians(10)

def Magnitude_Difference(vec1, vec2) :
    return abs(norm(vec1) - norm(vec2))

def Sector(vec1, vec2) :
    ED = distance.euclidean(vec1, vec2)
    MD = Magnitude_Difference(vec1, vec2)
    theta = Theta(vec1, vec2)
    return math.pi * math.pow((ED+MD),2) * theta/360

### cool pipelines with cool plots and coolest hiplots

In [None]:
bert_embedding = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=False)
roberta_embedding = TransformerDocumentEmbeddings('roberta-base', fine_tune=False)
glove_embedding = DocumentPoolEmbeddings([WordEmbeddings('glove')])

In [256]:
from plot_text_sim import plot_text_sim 
from scipy.spatial import distance
metrics = {"cos" : distance.cosine, "ts_ss" : TS_SS}

# do it in a function to prevent memory leaks
def calculate_similarities(links):
    preprocessed_links = []
    for link in links:
        preprocessed_link = process_link(link, process_section_to_chunks)
        preprocessed_links.append(preprocessed_link)
    results = []
    for link in preprocessed_links:
        
        glove_sim = calc_embedding_scores(link, metrics, embedding_name='glove', embedding = glove_embedding)
        bert_sim = calc_embedding_scores(link, metrics, embedding_name='bert', embedding = bert_embedding)
        roberta_sim = calc_embedding_scores(link, metrics, embedding_name='roberta', embedding = roberta_embedding)
        bow_occurrence_sim = calc_bow_occurrence_score(link, name='bow_occurrence_sim')
        bow_itf_sim = calc_bow_itf_score(link, name='bow_itf_sim')
        
        data = pd.merge(glove_sim, bert_sim, left_index = True, right_index = True)
        
        data = pd.merge(data, roberta_sim, left_index = True, right_index = True)
        data = pd.merge(data, bow_occurrence_sim, left_index = True, right_index = True)
        data = pd.merge(data, bow_itf_sim, left_index = True, right_index = True)
        
        results.append({
            'citing_str': link['citing_str'],
            'citing_context': link['citing_context'],
            'data': data,
        })
    
#     print(bow_itf_sim)
#     print(glove_sim)
#     print(set(bow_itf_sim.index).intersection(set(glove_sim.index)))
#     print(bow_itf_sim.index[0])
#     print(glove_sim.index[0])

    return results


start = 12
span = 1
results = calculate_similarities(train_links[start:start+span])


['performance', 'to', 'controls', 'this', 'has', 'been', 'accompanied', 'by', 'altered', 'electrophysiology', 'or', 'increased', 'activation', 'of', 'brain', 'regions', 'indicating', 'that', 'users', 'may', 'require', 'increased', 'neural', 'effort', 'in', 'order', 'to', 'maintain', 'adequate', 'performance', 'levels', 'battisti', 'hester', 'tapert', 'deficits', 'have', 'been', 'identified', 'in', 'small', 'number', 'of', 'studies', 'that', 'used', 'tasks', 'specifically', 'designed', 'to', 'measure', 'risky', 'or', 'impulsive', 'decision', 'making', 'such', 'as', 'the', 'matching', 'familiar', 'figures', 'task', 'mfft', 'kagan', 'or', 'the', 'iowa', 'gambling', 'task', 'igt']



invalid value encountered in double_scalars



In [258]:
process_link(train_links[start], process_section_to_chunks)['citing_str'] 

'Tapert et al. 2007'

In [None]:
result_df = results[0]
import hiplot as hip
result_df = result_df['data'].assign(text_beginning = lambda df: df.index.str[:10])
exp = hip.Experiment.from_dataframe(result_df)
displayed_exp = exp.display()


In [None]:
from IPython.display import clear_output

selected = displayed_exp.get_selected()
idx = np.array([int(data_point.uid) for data_point in selected])
clear_output(wait=True)
print(*list(zip(idx, df.iloc[idx, :].index.to_list())), sep='\n')


In [None]:
result_df = results[0]
result_df['citing_context']
_ = plot_text_sim(result_df['data']['bow_itf_sim'], result_df['data'].index)

In [257]:
result_df = results[0]
result_df['citing_context']
df = result_df['data']
_ = plot_text_sim(df['glove_cos']**8, df.index)
_ = plot_text_sim(df['bow_itf_sim'], df.index)
_ = plot_text_sim(df['bow_itf_sim'] * df['glove_cos']**8, df.index)
_ = plot_text_sim(df['bow_itf_sim'] * df['bert_cos']**8, df.index)
_ = plot_text_sim(df['bow_itf_sim'] * df['roberta_cos']**8, df.index)

'performance to controls, this has been accompanied by altered electrophysiology or increased activation of brain regions, indicating that users may require increased neural effort in order to maintain adequate performance levels (Battisti et al. 2010; Hester et al. 2009; Tapert et al. 2007 ).Cannabis-related deficits have been identified in a small number of studies that used tasks specifically designed to measure risky or impulsive decision making, such as the Matching Familiar Figures Task (MFFT) (Kagan 1966) or the Iowa Gambling Task (IGT;'

# TODO
 ## Dimensionality reduction of embeddings!!!!!!!!!!!!!!!!!1