In [1]:
import gensim
import arxiv
import pandas as pd

import itertools

import matplotlib.pyplot as plt
%matplotlib inline

# Load text summarization articles from arXiv

In [2]:
%%time

search_query = 'text summarization'

results = arxiv.query(search_query=search_query,
                          max_results=500)

print('Query returned {} results'.format(len(results)))

Query returned 500 results
CPU times: user 1.71 s, sys: 28.1 ms, total: 1.74 s
Wall time: 6.49 s


## Display some basic information

In [3]:
n_examples = 20

# For each of the first n_examples results, print the title framed by two
# rows of asterisks, then the authors, the date and the abstract, followed
# by one blank line. A single print call with sep='\n' emits one field per line.
for entry in results[:n_examples]:
    print(
        20 * '*',
        entry['title'],
        20 * '*',
        ', '.join(entry['authors']),
        entry['date'],
        entry['summary'],
        '',
        sep='\n',
    )

********************
Bengali text summarization by sentence extraction
********************
Kamal Sarkar
2012-01-11T04:56:59Z
Text summarization is a process to produce an abstract or a summary by
selecting significant portion of the information from one or more texts. In an
automatic text summarization process, a text is given to the computer and the
computer returns a shorter less redundant extract or abstract of the original
text(s). Many techniques have been developed for summarizing English text(s).
But, a very few attempts have been made for Bengali text summarization. This
paper presents a method for Bengali text summarization which extracts important
sentences from a Bengali document to produce a summary.

********************
A Semantic Relevance Based Neural Network for Text Summarization and
  Text Simplification
********************
Shuming Ma, Xu Sun
2017-10-06T09:06:33Z
Text summarization and text simplification are two major ways to simplify the
text for poor readers, in

In [4]:
# Collect the query results into a DataFrame for the analysis that follows.
articles_df = pd.DataFrame(results)

In [5]:
# Show which fields (column labels) are available for each article.
articles_df.columns

Index(['affiliation', 'arxiv_comment', 'arxiv_primary_category', 'arxiv_url',
       'author', 'author_detail', 'authors', 'doi', 'guidislink', 'id',
       'journal_reference', 'links', 'pdf_url', 'published',
       'published_parsed', 'summary', 'summary_detail', 'tags', 'title',
       'title_detail', 'updated', 'updated_parsed'],
      dtype='object')

In [6]:
# Preview the first rows of the table.
articles_df.head()

Unnamed: 0,affiliation,arxiv_comment,arxiv_primary_category,arxiv_url,author,author_detail,authors,doi,guidislink,id,...,pdf_url,published,published_parsed,summary,summary_detail,tags,title,title_detail,updated,updated_parsed
0,,,"{'term': 'cs.IR', 'scheme': 'http://arxiv.org/...",http://arxiv.org/abs/1201.2240v1,Kamal Sarkar,{'name': 'Kamal Sarkar'},[Kamal Sarkar],,True,http://arxiv.org/abs/1201.2240v1,...,http://arxiv.org/pdf/1201.2240v1,2012-01-11T04:56:59Z,"(2012, 1, 11, 4, 56, 59, 2, 11, 0)",Text summarization is a process to produce an ...,"{'language': None, 'value': 'Text summarizatio...","[{'label': None, 'term': 'cs.IR', 'scheme': 'h...",Bengali text summarization by sentence extraction,"{'language': None, 'value': 'Bengali text summ...",2012-01-11T04:56:59Z,"(2012, 1, 11, 4, 56, 59, 2, 11, 0)"
1,,,"{'term': 'cs.CL', 'scheme': 'http://arxiv.org/...",http://arxiv.org/abs/1710.02318v1,Xu Sun,{'name': 'Xu Sun'},"[Shuming Ma, Xu Sun]",,True,http://arxiv.org/abs/1710.02318v1,...,http://arxiv.org/pdf/1710.02318v1,2017-10-06T09:06:33Z,"(2017, 10, 6, 9, 6, 33, 4, 279, 0)",Text summarization and text simplification are...,"{'language': None, 'value': 'Text summarizatio...","[{'label': None, 'term': 'cs.CL', 'scheme': 'h...",A Semantic Relevance Based Neural Network for ...,"{'language': None, 'value': 'A Semantic Releva...",2017-10-06T09:06:33Z,"(2017, 10, 6, 9, 6, 33, 4, 279, 0)"
2,,"12 pages, 4 figures","{'term': 'cs.CL', 'scheme': 'http://arxiv.org/...",http://arxiv.org/abs/1704.03242v1,Korra Sathya Babu,{'name': 'Korra Sathya Babu'},"[Santosh Kumar Bharti, Korra Sathya Babu]",,True,http://arxiv.org/abs/1704.03242v1,...,http://arxiv.org/pdf/1704.03242v1,2017-04-11T11:20:19Z,"(2017, 4, 11, 11, 20, 19, 1, 101, 0)","In recent times, data is growing rapidly in ev...","{'language': None, 'value': 'In recent times, ...","[{'label': None, 'term': 'cs.CL', 'scheme': 'h...",Automatic Keyword Extraction for Text Summariz...,"{'language': None, 'value': 'Automatic Keyword...",2017-04-11T11:20:19Z,"(2017, 4, 11, 11, 20, 19, 1, 101, 0)"
3,,Pages: 07 Figures : 07,"{'term': 'cs.IR', 'scheme': 'http://arxiv.org/...",http://arxiv.org/abs/1305.2831v1,Urmila Shrawankar,{'name': 'Urmila Shrawankar'},"[Khushboo Thakkar, Urmila Shrawankar]",,True,http://arxiv.org/abs/1305.2831v1,...,http://arxiv.org/pdf/1305.2831v1,2013-05-10T08:06:15Z,"(2013, 5, 10, 8, 6, 15, 4, 130, 0)",Text Categorization is the task of automatical...,"{'language': None, 'value': 'Text Categorizati...","[{'label': None, 'term': 'cs.IR', 'scheme': 'h...",Test Model for Text Categorization and Text Su...,"{'language': None, 'value': 'Test Model for Te...",2013-05-10T08:06:15Z,"(2013, 5, 10, 8, 6, 15, 4, 130, 0)"
4,,,"{'term': 'cs.CL', 'scheme': 'http://arxiv.org/...",http://arxiv.org/abs/1605.02948v3,Nasser Ghadiri,{'name': 'Nasser Ghadiri'},"[Milad Moradi, Nasser Ghadiri]",,True,http://arxiv.org/abs/1605.02948v3,...,http://arxiv.org/pdf/1605.02948v3,2016-05-10T11:33:33Z,"(2016, 5, 10, 11, 33, 33, 1, 131, 0)",Automatic text summarization tools help users ...,"{'language': None, 'value': 'Automatic text su...","[{'label': None, 'term': 'cs.CL', 'scheme': 'h...",Different approaches for identifying important...,"{'language': None, 'value': 'Different approac...",2017-05-30T14:37:31Z,"(2017, 5, 30, 14, 37, 31, 1, 150, 0)"


## Extract keywords from summaries

In [7]:
%%time

articles_df['summary_keywords'] = articles_df['summary'].apply(gensim.summarization.keywords)

CPU times: user 10.7 s, sys: 19.5 ms, total: 10.8 s
Wall time: 10.8 s


In [8]:
# Turn each article's newline-separated keyword string into a list of keywords.
#
# The cell overwrites its own input column, so it has to be safe to run twice:
# values that are not strings (lists left by an earlier run of this cell, or
# missing values) are passed through unchanged instead of being turned into
# NaN, which is what `.str.split` does to non-string elements.
# Blank pieces are dropped so that an empty keyword string yields [] rather
# than [''].
articles_df['summary_keywords'] = articles_df['summary_keywords'].apply(
    lambda kw: [word for word in kw.split('\n') if word] if isinstance(kw, str) else kw
)

In [9]:
# Print the title (framed by two rows of asterisks) and the extracted keywords
# for the first n_examples articles, with a blank line after each.
#
# The two columns are iterated directly instead of via iterrows():
#   * no throwaway index variable is needed -- binding `__` would rebind
#     IPython's "second-to-last output" alias and stop IPython from updating
#     `_`, `__` and `___` for the rest of the session;
#   * no per-row Series is built just to read two fields from it.
title_keyword_pairs = zip(articles_df['title'], articles_df['summary_keywords'])

for title, keywords in itertools.islice(title_keyword_pairs, n_examples):
    print(20 * '*')
    print(title)
    print(20 * '*')
    print('keywords:', keywords)
    print()


********************
Bengali text summarization by sentence extraction
********************
keywords: ['texts', 'text summarization', 'significant', 'summarizing']

********************
A Semantic Relevance Based Neural Network for Text Summarization and
  Text Simplification
********************
keywords: ['text', 'texts', 'semantic', 'model', 'attention', 'summary', 'summaries', 'meaning', 'generation', 'generated', 'readers', 'non']

********************
Automatic Keyword Extraction for Text Summarization: A Survey
********************
keywords: ['recent', 'research', 'researchers', 'summarizer', 'summarize', 'summarization', 'different', 'matrices', 'data', 'social', 'banking', 'challenges']

********************
Test Model for Text Categorization and Text Summarization
********************
keywords: ['text', 'documents', 'document', 'manner', 'summarization', 'user']

********************
Different approaches for identifying important concepts in probabilistic
  biomedical text su