In [1]:
from tqdm.notebook import tqdm
import json
import pandas as pd
import os
from nltk.tokenize import sent_tokenize

In [2]:
def show(x, n = 5):
    print(x.shape)
    return x.head(n)

In [3]:
folder = "talksumm/data/json"
files = os.listdir(folder)
print(len(files))
files[:5]

1651


['Straight to the Tree: Constituency Parsing with Neural Syntactic Distance.pdf.json',
 'Exploring Optimism and Pessimism in Twitter Using Deep Learning.pdf.json',
 'Streaming Principal Component Analysis in Noisy Settings.pdf.json',
 'Oracle Complexity of Second-Order Methods for Finite-Sum Problems.pdf.json',
 'Doubly Greedy Primal-Dual Coordinate Descent for Sparse Empirical Risk Minimization.pdf.json']

In [4]:
data = []
for f in tqdm(files):
    with open(folder + "/" + f) as doc:
        data.append(json.load(doc))

print(data[0].keys())
print(data[0]["name"])
data[0]["metadata"].keys()

  0%|          | 0/1651 [00:00<?, ?it/s]

dict_keys(['name', 'metadata'])
Straight to the Tree: Constituency Parsing with Neural Syntactic Distance.pdf


dict_keys(['source', 'title', 'authors', 'emails', 'sections', 'references', 'referenceMentions', 'year', 'abstractText', 'creator'])

In [5]:
papers = pd.DataFrame.from_records([d["metadata"] for d in data])\
.reset_index()\
.rename(columns = {"index":"paper_id"})
show(papers)

(1651, 11)


Unnamed: 0,paper_id,source,title,authors,emails,sections,references,referenceMentions,year,abstractText,creator
0,0,META,Straight to the Tree: Constituency Parsing wit...,"[Yikang Shen, Zhouhan Lin, Athul Paul Jacob, A...","[kang.shen@umontreal.ca,, zhouhan.lin@umontrea...","[{'heading': None, 'text': 'Proceedings of the...",[{'title': 'Globally normalized transition-bas...,"[{'referenceID': 3, 'context': 'Parsing has be...",2018,"In this work, we propose a novel constituency ...",LaTeX with hyperref package
1,1,META,Exploring Optimism and Pessimism in Twitter Us...,"[Cornelia Caragea, Liviu P. Dinu, Bogdan Dumitru]","[ccaragea@ksu.edu,, ldinu@fmi.unibuc.ro,, bogd...","[{'heading': None, 'text': 'Proceedings of the...",[{'title': 'TensorFlow: Large-scale machine le...,"[{'referenceID': 22, 'context': 'Much has been...",2018,Identifying optimistic and pessimistic viewpoi...,LaTeX with hyperref package
2,2,META,Streaming Principal Component Analysis in Nois...,"[Teodor V. Marinov, Poorya Mianjy, Raman Arora]",[<tmarino2@cs.jhu.edu>.],"[{'heading': '1. Introduction', 'text': 'Princ...",[{'title': 'First efficient convergence for st...,"[{'referenceID': 2, 'context': 'from an unknow...",2018,We study streaming algorithms for principal co...,LaTeX with hyperref package
3,3,CRF,Oracle Complexity of Second-Order Methods for ...,"[Yossi Arjevani, Ohad Shamir]","[Arjevani<yossi.arjevani@weizmann.ac.il>,, <oh...","[{'heading': '1. Introduction', 'text': 'We co...",[{'title': 'A lower bound for the optimization...,"[{'referenceID': 8, 'context': 'To study the c...",2017,Finite-sum optimization problems are ubiquitou...,LaTeX with hyperref package
4,4,CRF,Doubly Greedy Primal-Dual Coordinate Descent f...,"[Qi Lei, Ian E.H. Yen, Chao-yuan Wu, Inderjit ...","[<leiqi@ices.utexas.edu>,, <eyan@cs.cmu.edu>.]","[{'heading': '1. Introduction', 'text': 'Regul...",[{'title': 'Large-scale machine learning with ...,"[{'referenceID': 2, 'context': 'Applications w...",2017,We consider the popular problem of sparse empi...,Preview


In [6]:
paper_sentences = papers[["paper_id", "title", "sections"]]\
.assign(full_text = lambda x: x.sections.map(lambda y: "\n".join([z["text"] for z in y])))\
.assign(sentences = lambda x: x.full_text.map(lambda y: list(enumerate(sent_tokenize(y)))))\
.explode("sentences")\
.drop(columns = ["sections", "full_text"])\
.assign(
    sentence = lambda x: x.sentences.map(lambda y: y[0]),
    text = lambda x: x.sentences.map(lambda y: y[1].strip())
)\
.drop(columns = "sentences")\
.reset_index(drop = True)\
# .rename(columns = {"index":"paper"})

show(paper_sentences)

(372550, 4)


Unnamed: 0,paper_id,title,sentence,text
0,0,Straight to the Tree: Constituency Parsing wit...,0,Proceedings of the 56th Annual Meeting of the ...
1,0,Straight to the Tree: Constituency Parsing wit...,1,Parsing has been useful for incorporating ling...
2,0,Straight to the Tree: Constituency Parsing wit...,2,Neural network-based approaches relying on den...
3,0,Straight to the Tree: Constituency Parsing wit...,3,"Generally speaking, either these approaches pr..."
4,0,Straight to the Tree: Constituency Parsing wit...,4,Corresponding authors: yikang.shen@umontreal.c...


In [7]:
summaries_folder = "LongSumm/extractive_summaries/talksumm_summaries/"
files = os.listdir(summaries_folder)
print(len(files))
files[:5]

1704


['Neural Audio Synthesis of Musical Notes  with WaveNet Autoencoders.txt',
 'Gradient Descent for Sparse Rank-One Matrix Completion for Crowd-Sourced Aggregation of Sparsely Interacting Workers.txt',
 'Analyzing the Robustness of Nearest Neighbors to Adversarial Examples.txt',
 'How Much Information Does a Human Translator Add to the Original?.txt',
 'An Efficient, Sparsity-Preserving, Online Algorithm for Low-Rank Approximation.txt']

In [8]:
data = []
for f in tqdm(files):
    with open(summaries_folder + "/" + f) as doc:
        data.append((f.replace(".txt", "").strip(), doc.readlines()))

print(data[0])

  0%|          | 0/1704 [00:00<?, ?it/s]

('Neural Audio Synthesis of Musical Notes  with WaveNet Autoencoders', ['6\t23\tFurther, we show that this model can learn a semantically meaningful hidden representation that can be used as a high-level control signal for manipulating tone, timbre, and dynamics during playback.\n', '7\t20\tExplicitly, our two contributions to advance the state of generative audio modeling are: • A WaveNet-style autoencoder that learns temporal hidden codes to effectively capture longer term structure without external conditioning.\n', '8\t112\t• NSynth: a large-scale dataset for exploring neural audio synthesis of musical notes.\n', '9\t43\tThe primary motivation for our novel autoencoder structure follows from the recent advances in autoregressive models like WaveNet (van den Oord et al., 2016a) and SampleRNN (Mehri et al., 2016).\n', '10\t36\tThey have proven to be effective at modeling short and medium scale (∼500ms) signals, but rely on external conditioning for longer-term dependencies.\n', '15\t

In [9]:
data = [[[t] + l.strip().split("\t") for l in d] for t, d in tqdm(data)]
data[0]

  0%|          | 0/1704 [00:00<?, ?it/s]

[['Neural Audio Synthesis of Musical Notes  with WaveNet Autoencoders',
  '6',
  '23',
  'Further, we show that this model can learn a semantically meaningful hidden representation that can be used as a high-level control signal for manipulating tone, timbre, and dynamics during playback.'],
 ['Neural Audio Synthesis of Musical Notes  with WaveNet Autoencoders',
  '7',
  '20',
  'Explicitly, our two contributions to advance the state of generative audio modeling are: • A WaveNet-style autoencoder that learns temporal hidden codes to effectively capture longer term structure without external conditioning.'],
 ['Neural Audio Synthesis of Musical Notes  with WaveNet Autoencoders',
  '8',
  '112',
  '• NSynth: a large-scale dataset for exploring neural audio synthesis of musical notes.'],
 ['Neural Audio Synthesis of Musical Notes  with WaveNet Autoencoders',
  '9',
  '43',
  'The primary motivation for our novel autoencoder structure follows from the recent advances in autoregressive mode

In [10]:
summaries = pd.DataFrame(sum(data, start = []), columns = ["title", "sentence_index", "sentence_score", "text"])
show(summaries)

(50242, 4)


Unnamed: 0,title,sentence_index,sentence_score,text
0,Neural Audio Synthesis of Musical Notes with ...,6,23,"Further, we show that this model can learn a s..."
1,Neural Audio Synthesis of Musical Notes with ...,7,20,"Explicitly, our two contributions to advance t..."
2,Neural Audio Synthesis of Musical Notes with ...,8,112,• NSynth: a large-scale dataset for exploring ...
3,Neural Audio Synthesis of Musical Notes with ...,9,43,The primary motivation for our novel autoencod...
4,Neural Audio Synthesis of Musical Notes with ...,10,36,They have proven to be effective at modeling s...


In [11]:
f"{372564 * 50242:,}"

'18,718,360,488'

In [12]:
summaries.groupby("title").count().shape[0]

1704

In [13]:
%%time

labels = paper_sentences\
.assign(clean_text = lambda x: x.text.str.replace("\W", "", regex = True).str.lower())\
.merge(
    summaries\
    .assign(clean_text = lambda x: x.text.str.replace("\W", "", regex = True).str.lower())\
    .drop(columns = "text"), 
    how = "left", 
    left_on = ["title", "clean_text"], 
    right_on = ["title", "clean_text"],
#     left_on = ["title", "text"], 
#     right_on = ["title", "text"],
)\
.groupby(["paper_id", "sentence"]).head(1)\
.assign(in_summary = lambda x: ~x.sentence_score.isna())\
.drop(columns = ["sentence_index", "title", "clean_text"])

show(labels, 20)

(372550, 5)
CPU times: user 3.7 s, sys: 68 ms, total: 3.76 s
Wall time: 3.76 s


Unnamed: 0,paper_id,sentence,text,sentence_score,in_summary
0,0,0,Proceedings of the 56th Annual Meeting of the ...,,False
1,0,1,Parsing has been useful for incorporating ling...,,False
2,0,2,Neural network-based approaches relying on den...,,False
3,0,3,"Generally speaking, either these approaches pr...",,False
4,0,4,Corresponding authors: yikang.shen@umontreal.c...,,False
5,0,5,"†Work done while at Microsoft Research, Montreal.",,False
6,0,6,the sequence of transitions in a transition-ba...,,False
7,0,7,Transition-based models decompose the structur...,,False
8,0,8,This enables fast greedy decoding but also lea...,17.0,True
9,0,9,Solutions to this problem usually complexify t...,16.0,True


# exploratory data analysis

In [14]:
labels.groupby(["paper_id", "sentence"]).text.count().reset_index().sort_values("text", ascending = False).head(20)

Unnamed: 0,paper_id,sentence,text
0,0,0,1
248374,1101,114,1
248372,1101,112,1
248371,1101,111,1
248370,1101,110,1
248369,1101,109,1
248368,1101,108,1
248367,1101,107,1
248366,1101,106,1
248365,1101,105,1


In [15]:
paper_sentences.query("paper_id == 223")[["sentence", "text"]].values

array([[0,
        'Deep linear networks (DLN) are neural networks that have multiple hidden layers but have no nonlinearities between layers.'],
       [1,
        'That is, for given data points {x(i)}Ni=1 the outputs of such networks are computed via a series\nŷ(i) = WLWL−1 · · ·W1x(i)\nof matrix multiplications.'],
       [2,
        'Given a target y(i) for the ith data point and a pairwise loss function `(ŷ(i),y(i)), forming the usual summation\nL(W1, .'],
       [3, '.'],
       [4, '.'],
       [5,
        ',WL) = 1\nN N∑ i=1 `(ŷ(i),y(i)) (1)\nthen yields the total loss.'],
       [6,
        'Such networks have few direct applications, but they frequently appear as a class of toy models used to understand the loss surfaces of deep neural networks (Saxe et al., 2014; Kawaguchi, 2016; Lu & Kawaguchi, 2017; Hardt & Ma, 2017).'],
       [7,
        'For example, numerical experiments indicate that DLNs exhibit some behavior that resembles the behavior of\n*Equal contribution 1D

In [16]:
labels.query("paper_id == 223").drop(columns = "paper_id").values.tolist()

[[0,
  'Deep linear networks (DLN) are neural networks that have multiple hidden layers but have no nonlinearities between layers.',
  nan,
  False],
 [1,
  'That is, for given data points {x(i)}Ni=1 the outputs of such networks are computed via a series\nŷ(i) = WLWL−1 · · ·W1x(i)\nof matrix multiplications.',
  nan,
  False],
 [2,
  'Given a target y(i) for the ith data point and a pairwise loss function `(ŷ(i),y(i)), forming the usual summation\nL(W1, .',
  nan,
  False],
 [3, '.', nan, False],
 [4, '.', nan, False],
 [5,
  ',WL) = 1\nN N∑ i=1 `(ŷ(i),y(i)) (1)\nthen yields the total loss.',
  nan,
  False],
 [6,
  'Such networks have few direct applications, but they frequently appear as a class of toy models used to understand the loss surfaces of deep neural networks (Saxe et al., 2014; Kawaguchi, 2016; Lu & Kawaguchi, 2017; Hardt & Ma, 2017).',
  nan,
  False],
 [7,
  'For example, numerical experiments indicate that DLNs exhibit some behavior that resembles the behavior of\n*E

In [17]:
labels.query("in_summary == True").shape
# (37749, 5) exact match
# (43787, 5) remove non-word characters and make it lowercase

(43773, 5)

In [18]:
labels.groupby("paper_id").in_summary.sum().sort_values(ascending = False).head(20)

paper_id
825     32
1496    32
1498    32
1278    31
745     31
1012    31
133     31
1554    31
434     31
691     31
346     31
1647    31
556     31
195     30
974     30
939     30
964     30
1239    30
293     30
290     30
Name: in_summary, dtype: int64

In [19]:
sum(labels.groupby("paper_id").in_summary.sum() > 0)

1606

# writing to disk

In [20]:
labels.to_pickle("labels.pkl")

In [21]:
papers.to_pickle("papers.pkl")

In [22]:
summaries.to_pickle("summaries.pkl")