In [39]:
import pandas as pd
from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import math
from rouge import Rouge
from tqdm.notebook import tqdm
import seaborn as sns
import numpy as np
import pickle
sns.set()

In [2]:
def show(x):
    print(x.shape)
    return x.head()

In [3]:
references = pd.read_csv("../data/references.csv").set_index("global_reference_id")
show(references)
references.head(10)

(28319, 2)


Unnamed: 0_level_0,title,abstract
global_reference_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Distributed Representations of Words and Phras...,The recently introduced continuous Skip-gram...
1,BERT: Pre-training of Deep Bidirectional Trans...,We introduce a new language representation m...
2,GloVe : Global Vectors for Word Representation,Recent methods for learning vector space repre...
3,Attention Is All You Need,The dominant sequence transduction models ar...
4,Adam: A Method for Stochastic Optimization,"We introduce Adam, an algorithm for first-or..."
5,Long Short-Term Memory,
6,Deep Contextualized Word Representations,We introduce a new type of deep contextualiz...
7,Efficient Estimation of Word Representations i...,We propose two novel model architectures for...
8,Neural Machine Translation By Jointly Learning...,Neural machine translation is a recently pro...
9,Deep Residual Learning for Image Recognition,Deeper neural networks are more difficult to...


In [4]:
papers = pd.read_csv("../data/papers.csv").set_index("paper_id")
show(papers)

(1211, 3)


Unnamed: 0_level_0,title,abstract,text
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,IDST at TREC 2019 Deep Learning Track: Deep Ca...,This paper describes our participation in the ...,"KEYWORDS cascade ranking, pre-trained language..."
3,BatchBALD: Efficient and Diverse Batch Acquisi...,"We develop BatchBALD, a tractable approximatio...",A key problem in deep learning is data efficie...
4,A Sentence Compression Based Framework to Quer...,We consider the problem of using sentence comp...,Proceedings of the 51st Annual Meeting of the ...
5,DR-BiLSTM: Dependent Reading Bidirectional LST...,We present a novel deep learning architecture ...,Natural Language Inference (NLI; a.k.a. Recogn...
6,Mental health consequences of infections by co...,"1The Department of Cerebrovascular Diseases, T...",Brain and Behavior. 2020;00:e01901. | 1 of 7 h...


In [5]:
citations = pd.read_csv("../data/citations.csv")
show(citations)

(62988, 6)


Unnamed: 0,paper_id,internal_reference_id,global_reference_id,context,start_offset,end_offset
0,1,0,8672,"JOURNAL OF COMBINATORIAL THEORY 9, 129--135 (1...",36,51
1,2,1,1,Our approach is mainly based on the BERT langu...,56,59
2,2,8,521,Different from many other ranking methods whic...,216,219
3,2,5,171,"Moreover, for full ranking subtask, we use a s...",78,81
4,2,6,27,The proposed model is based on the pointer-gen...,59,62


In [6]:
citations.groupby(["paper_id", "global_reference_id"]).size()

paper_id  global_reference_id
1         8672                   1
2         1                      2
          3                      1
          12                     1
          20                     1
                                ..
1364      21077                  1
          21363                  1
          21979                  1
          24086                  1
          25998                  1
Length: 36096, dtype: int64

# retrieving sentences

In [15]:
queries = citations[["paper_id", "global_reference_id"]]\
.merge(references.drop(columns = "title"), on = "global_reference_id")\
.merge(papers.drop(columns = ["abstract", "title"]), on = "paper_id")\
.rename(columns = {"abstract":"reference_abstract", "context":"hypothesis", "text":"paper_text"})\
.dropna()\
.reset_index(drop = True)
show(queries)

(21673, 4)


Unnamed: 0,paper_id,global_reference_id,reference_abstract,paper_text
0,2,1,We introduce a new language representation m...,"KEYWORDS cascade ranking, pre-trained language..."
1,2,1,We introduce a new language representation m...,"KEYWORDS cascade ranking, pre-trained language..."
2,2,521,"Recently, the pre-trained language model, BE...","KEYWORDS cascade ranking, pre-trained language..."
3,2,521,"Recently, the pre-trained language model, BE...","KEYWORDS cascade ranking, pre-trained language..."
4,2,521,"Recently, the pre-trained language model, BE...","KEYWORDS cascade ranking, pre-trained language..."


In [10]:
tokenizer = PunktSentenceTokenizer(train_text = papers.text.sum())
tokenizer

<nltk.tokenize.punkt.PunktSentenceTokenizer at 0x7f9c54a5b220>

In [11]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2', device = "cuda")

In [31]:
data = queries.paper_text#.head()
sentences = []
for d in tqdm(data):
    s = tokenizer.tokenize(d)
    e = model.encode(s)
    sentences.append(dict(sentences = np.array(s), embeddings = e))
example = sentences[0]
print(example["sentences"].shape, example["embeddings"].shape)

  0%|          | 0/21673 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [33]:
q = queries[["global_reference_id", "reference_abstract"]].drop_duplicates()
show(q)

(4328, 2)


Unnamed: 0,global_reference_id,reference_abstract
0,1,We introduce a new language representation m...
2,521,"Recently, the pre-trained language model, BE..."
5,27,Neural sequence-to-sequence models have prov...
8,2006,Sequence-to-Sequence (seq2seq) modeling has ...
9,3,The dominant sequence transduction models ar...


In [38]:
# query_embeddings = model.encode(q.reference_abstract.tolist(), show_progress_bar = True)
# print(query_embeddings.shape)

# with open("../emb/reference_abstracts.npy", "wb") as file:
#     pickle.dump(query_embeddings, file)

Batches:   0%|          | 0/136 [00:00<?, ?it/s]

(4328, 384)

In [44]:
with open("../emb/reference_abstracts.npy", "rb") as file:
    query_embeddings = pickle.load(file)
query_embeddings.shape

(4328, 384)

# rouge

In [None]:
rouge = Rouge()
scores = rouge.get_scores(hyps = queries.hypothesis, refs = queries.reference)
print(len(scores))
scores[:5]

In [None]:
rouge_df = pd.DataFrame.from_records(scores)
show(rouge_df)

In [None]:
rouge_1 = pd.DataFrame.from_records(rouge_df["rouge-1"])\
.rename(columns = {"f":"rouge_1_f", "r":"rouge_1_r", "p":"rouge_1_p"})
show(rouge_1)

In [None]:
rouge_2 = pd.DataFrame.from_records(rouge_df["rouge-2"])\
.rename(columns = {"f":"rouge_2_f", "r":"rouge_2_r", "p":"rouge_2_p"})
show(rouge_2)

In [None]:
rouge_l = pd.DataFrame.from_records(rouge_df["rouge-l"])\
.rename(columns = {"f":"rouge_l_f", "r":"rouge_l_r", "p":"rouge_l_p"})
show(rouge_l)

In [None]:
full_rouge = pd.concat((queries, rouge_1, rouge_2, rouge_l), axis = 1)
show(full_rouge)

In [None]:
f, ax = plt.subplots(3, 1, sharex = True, constrained_layout=True, figsize = (12, 6))
ax[0].hist(full_rouge.rouge_1_f, bins = 50)
ax[0].set_title("Rouge-1")
ax[1].hist(full_rouge.rouge_2_f, bins = 50, color = "green")
ax[1].set_title("Rouge-2")
ax[2].hist(full_rouge.rouge_l_f, bins = 50, color = "red")
ax[2].set_title("Rouge-L")

plt.show()

In [None]:
data = full_rouge.sort_values("rouge_1_f").reset_index()
f, ax = plt.subplots(3, 1, sharex = True, constrained_layout=True, figsize = (12, 6))
ax[0].plot(data.rouge_1_f)
ax[0].set_title("Rouge-1")
ax[1].plot(data.rouge_2_f, color = "green")
ax[1].set_title("Rouge-2")
ax[2].plot(data.rouge_l_f, color = "red")
ax[2].set_title("Rouge-L")

plt.show()

In [None]:
sns.pairplot(full_rouge, vars = ["rouge_1_f","rouge_2_f", "rouge_l_f"])
plt.show()

In [None]:
data = full_rouge.sort_values("rouge_1_f").reset_index()
f, ax = plt.subplots(3, 1, sharex = True, constrained_layout=True, figsize = (12, 6))
ax[0].plot(data.rouge_1_f)
ax[0].set_title("Rouge-1 F")
ax[1].plot(data.rouge_1_p)
ax[1].set_title("Rouge-1 P")
ax[2].plot(data.rouge_1_r)
ax[2].set_title("Rouge-1 R")

plt.show()

In [None]:
sns.pairplot(full_rouge, vars = ["rouge_1_f","rouge_1_p", "rouge_1_r"])
plt.show()