# Summarization

## Import packages

In [1]:
import sys
import warnings
warnings.filterwarnings('ignore')
sys.path.append('../data_helpers/')
sys.path.append('../statistics/')
sys.path.append('../preprocess')
sys.path.append('../cluster/')

from twitter_data_helper import TwitterDataHelper
from reddit_data_helper import RedditDataHelper
from data_aggregator import DataAggregator
from statistics_aggregator import StatisticsAggregator
from text_cleaner import TextCleaner
from lda_cluster import LDACluster
from data_enhancer import DataOrganizer, SummarizeNER

## Data comes from Reddit and Twitter
Crawl data from Twitter and Reddit daily

In [2]:
data_helper = DataAggregator()
df = data_helper.get_data(date_range=['2017-08-22'])
df['source'].value_counts()

twitter    137
Name: source, dtype: int64

## LDA Cluster
clusters to two classes. 

In [3]:
text_cleaner = TextCleaner(filter_sentiment_words=True)
texts = df['text']
docs = text_cleaner.clean(texts)

cluster = LDACluster(num_topics=2)
cluster.fit(docs)

print("-"*115)
df['cluster'] = cluster.labels
df['cluster'].value_counts()
cluster.model.print_topics()

* [TextCleaner] Initializing...
* [TextCleaner] Loading SpaCy "en_core_web_md" corpus...


137it [00:00, 1077.27it/s]

* [TextCleaner] Loading stopwords...
* [TextCleaner] Loading sentinent words...
--------------------------------------------------------------------------------------------------------------------
* [TextCleaner] Cleaning text...





* [LDA] Training model...
-------------------------------------------------------------------------------------------------------------------


[(0,
  '0.025*"neural" + 0.024*"machine" + 0.020*"learning" + 0.017*"translation" + 0.016*"learn" + 0.014*"machine_translation" + 0.014*"clac" + 0.013*"discourse" + 0.013*"language" + 0.009*"deep"'),
 (1,
  '0.022*"neural" + 0.016*"language" + 0.015*"deep" + 0.014*"natural" + 0.014*"learning" + 0.013*"discourse" + 0.012*"space" + 0.010*"text" + 0.010*"network" + 0.008*"clac"')]

In [4]:
import pandas as pd
cdf = [df.loc[df['cluster'] == r] for r in range(len(cluster.model.print_topics()))]
cdf[0] # Cluster 0

Unnamed: 0,source,created_at,author,text,url,raw_data,cluster
7,twitter,2017-08-22 20:41:38,arxiv_cscl,Neural machine translation for low-resource la...,http://arxiv.org/abs/1708.05729,Status(_json={'created_at': 'Tue Aug 22 20:41:...,0
10,twitter,2017-08-22 16:41:41,arxiv_cscl,Cross-Lingual Dependency Parsing for Closely R...,http://arxiv.org/abs/1708.05719,Status(_json={'created_at': 'Tue Aug 22 16:41:...,0
16,twitter,2017-08-22 14:43:28,arxiv_cscl,The CLaC Discourse Parser at CoNLL-2015 https:...,http://arxiv.org/abs/1708.05857,Status(_json={'created_at': 'Tue Aug 22 14:43:...,0
18,twitter,2017-08-22 12:41:55,arxiv_cscl,Learning Visual Reasoning Without Strong Prior...,http://arxiv.org/abs/1707.03017,Status(_json={'created_at': 'Tue Aug 22 12:41:...,0
20,twitter,2017-08-22 12:41:54,arxiv_cscl,Acquisition of Translation Lexicons for Histor...,http://arxiv.org/abs/1706.01570,Status(_json={'created_at': 'Tue Aug 22 12:41:...,0
21,twitter,2017-08-22 12:41:54,arxiv_cscl,Using Global Constraints and Reranking to Impr...,http://arxiv.org/abs/1704.07050,Status(_json={'created_at': 'Tue Aug 22 12:41:...,0
29,twitter,2017-08-22 12:41:52,arxiv_cscl,Learning to Paraphrase for Question Answering ...,http://arxiv.org/abs/1708.06022,Status(_json={'created_at': 'Tue Aug 22 12:41:...,0
33,twitter,2017-08-22 12:41:51,arxiv_cscl,Neural Machine Translation with Extended Conte...,http://arxiv.org/abs/1708.05943,Status(_json={'created_at': 'Tue Aug 22 12:41:...,0
34,twitter,2017-08-22 12:41:51,arxiv_cscl,The Helsinki Neural Machine Translation System...,http://arxiv.org/abs/1708.05942,Status(_json={'created_at': 'Tue Aug 22 12:41:...,0
36,twitter,2017-08-22 12:41:50,arxiv_cscl,The CLaC Discourse Parser at CoNLL-2015 https:...,http://arxiv.org/abs/1708.05857,Status(_json={'created_at': 'Tue Aug 22 12:41:...,0


In [5]:
cdf[1] # Cluster 1

Unnamed: 0,source,created_at,author,text,url,raw_data,cluster
0,twitter,2017-08-22 05:56:44,AndrewYNg,This is great. Glad to see researchers working...,https://twitter.com/i/web/status/8998729499135...,Status(_json={'created_at': 'Tue Aug 22 05:56:...,1
1,twitter,2017-08-22 23:41:13,arxiv_cscl,Story Generation from Sequence of Independent ...,http://arxiv.org/abs/1707.05501,Status(_json={'created_at': 'Tue Aug 22 23:41:...,1
2,twitter,2017-08-22 23:41:10,arxiv_cscl,A Batch Noise Contrastive Estimation Approach ...,http://arxiv.org/abs/1708.05997,Status(_json={'created_at': 'Tue Aug 22 23:41:...,1
3,twitter,2017-08-22 23:41:07,arxiv_cscl,The Natural Stories Corpus https://t.co/pa103d...,http://arxiv.org/abs/1708.05763,Status(_json={'created_at': 'Tue Aug 22 23:41:...,1
4,twitter,2017-08-22 22:41:05,arxiv_cscl,AudioPairBank: Towards A Large-Scale Tag-Pair-...,http://arxiv.org/abs/1607.03766,Status(_json={'created_at': 'Tue Aug 22 22:41:...,1
5,twitter,2017-08-22 22:41:01,arxiv_cscl,The Natural Stories Corpus https://t.co/pa103e...,http://arxiv.org/abs/1708.05763,Status(_json={'created_at': 'Tue Aug 22 22:41:...,1
6,twitter,2017-08-22 20:41:40,arxiv_cscl,Measuring the Effect of Discourse Relations on...,http://arxiv.org/abs/1708.05803,Status(_json={'created_at': 'Tue Aug 22 20:41:...,1
8,twitter,2017-08-22 19:41:27,arxiv_cscl,Portuguese Word Embeddings: Evaluating on Word...,http://arxiv.org/abs/1708.06025,Status(_json={'created_at': 'Tue Aug 22 19:41:...,1
9,twitter,2017-08-22 18:41:32,arxiv_cscl,CLaC @ QATS: Quality Assessment for Text Simpl...,http://arxiv.org/abs/1708.05797,Status(_json={'created_at': 'Tue Aug 22 18:41:...,1
11,twitter,2017-08-22 15:41:45,arxiv_cscl,Vector Space Model as Cognitive Space for Text...,http://arxiv.org/abs/1708.06068,Status(_json={'created_at': 'Tue Aug 22 15:41:...,1


## Statistics and Data Enhancement

### General Statistics
- (hotness) calculated through a ranking algorithm that takes in the up/down votes and date as inputs
- (sentiment) using NLTK's sentiment analysis modules
- (type) pre-defined rules to classify urls as a dataset, code, paper, tutorial, social-media, blog, "shortened-link", or news.

In [6]:
stats_helper = StatisticsAggregator(cdf[0])
stats = stats_helper.get_stats()

In [7]:
stats

Unnamed: 0,source,created_at,author,text,url,raw_data,hotness,sentiment_polarity,sentiment,type
0,twitter,2017-08-22 20:41:38,arxiv_cscl,Neural machine translation for low-resource la...,http://arxiv.org/abs/1708.05729,Status(_json={'created_at': 'Tue Aug 22 20:41:...,8209.033222,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper
1,twitter,2017-08-22 16:41:41,arxiv_cscl,Cross-Lingual Dependency Parsing for Closely R...,http://arxiv.org/abs/1708.05719,Status(_json={'created_at': 'Tue Aug 22 16:41:...,8208.713289,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper
2,twitter,2017-08-22 14:43:28,arxiv_cscl,The CLaC Discourse Parser at CoNLL-2015 https:...,http://arxiv.org/abs/1708.05857,Status(_json={'created_at': 'Tue Aug 22 14:43:...,8208.555667,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper
3,twitter,2017-08-22 12:41:55,arxiv_cscl,Learning Visual Reasoning Without Strong Prior...,http://arxiv.org/abs/1707.03017,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.69463,"{'neg': 0.0, 'neu': 0.602, 'pos': 0.398, 'comp...",pos,paper
4,twitter,2017-08-22 12:41:54,arxiv_cscl,Acquisition of Translation Lexicons for Histor...,http://arxiv.org/abs/1706.01570,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393578,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper
5,twitter,2017-08-22 12:41:54,arxiv_cscl,Using Global Constraints and Reranking to Impr...,http://arxiv.org/abs/1704.07050,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393578,"{'neg': 0.0, 'neu': 0.734, 'pos': 0.266, 'comp...",neu,paper
6,twitter,2017-08-22 12:41:52,arxiv_cscl,Learning to Paraphrase for Question Answering ...,http://arxiv.org/abs/1708.06022,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393533,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper
7,twitter,2017-08-22 12:41:51,arxiv_cscl,Neural Machine Translation with Extended Conte...,http://arxiv.org/abs/1708.05943,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393511,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper
8,twitter,2017-08-22 12:41:51,arxiv_cscl,The Helsinki Neural Machine Translation System...,http://arxiv.org/abs/1708.05942,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393511,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper
9,twitter,2017-08-22 12:41:50,arxiv_cscl,The CLaC Discourse Parser at CoNLL-2015 https:...,http://arxiv.org/abs/1708.05857,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393489,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper


In [8]:
stats['type'].value_counts()

paper                            30
twitter status                   14
unknown link                      7
subreddit: /r/MachineLearning     5
blog                              1
code                              1
Name: type, dtype: int64

In [9]:
stats['sentiment'].value_counts()

neu    53
pos     5
Name: sentiment, dtype: int64

### Data Organizer
- Takes in twitter and reddit 'text' as an input.
- Uses this text and performs a google query
- Classify the results with pre-defined rules.

In [15]:
do = DataOrganizer(stats)
extended = do.enhance()

*[Data Organizer] Downloading Google Search Results


In [16]:
for i, e in extended.iterrows():
    print('* Date: {}'.format(e['created_at']))
    print('* Text: {}'.format(e['text']))
    print('* Related Urls: ')
    for ty, r in e['types'].items():
        if len(r) != 0:
            print('* * Type: {}'.format(ty))
            for result in r:
                print('* * url: {} \n* * snippet: {}'.format(result[0], result[1]))
    print('-'*115)

* Date: 2017-08-22 20:41:38
* Text: Neural machine translation for low-resource languages https://t.co/dvHZJqVDIs
* Related Urls: 
* * Type: paper
* * url: http://arxiv.org/abs/1708.05729 
* * snippet: [1708.05729] Neural machine translation for low-resource ... | 7 days ago - We demonstrate that NMT can be used for low-resource languages as well, ... machine translation (SMT) and NMT to investigate the lower limits of the respective technologies. We find ...
* * url: https://arxiv.org/abs/1604.02201 
* * snippet: Transfer Learning for Low-Resource Neural Machine Translation | Apr 8, 2016 - Our key idea is to first train a high-resource language pair (the parent model), then transfer some of the ...
-------------------------------------------------------------------------------------------------------------------
* Date: 2017-08-22 16:41:41
* Text: Cross-Lingual Dependency Parsing for Closely Related Languages - Helsinki's Submission to VarDial 2017 https://t.co/2Vuj8b96k0
* Related Ur

### Name Entity Recognition and Wikipedia Summarization
- Takes text of twitter and reddit as inputs.
- Performs NERs on all texts
- Use these NERs to perform a wikipedia query.
- Summarize the wikipedia article.

In [17]:
se = SummarizeNER(extended)
tagged = se.get_summarized_data()



In [18]:
tagged

Unnamed: 0,source,created_at,author,text,url,raw_data,hotness,sentiment_polarity,sentiment,type,cleaned_text,google-search,types,NER,Wiki-NER-Sumarry
0,twitter,2017-08-22 20:41:38,arxiv_cscl,Neural machine translation for low-resource la...,http://arxiv.org/abs/1708.05729,Status(_json={'created_at': 'Tue Aug 22 20:41:...,8209.033222,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper,b'Neural machine translation for low-resource ...,"[(http://arxiv.org/abs/1708.05729, [1708.05729...","{'dataset': [], 'code': [], 'paper': [('http:/...",,No wikipedia page found
1,twitter,2017-08-22 16:41:41,arxiv_cscl,Cross-Lingual Dependency Parsing for Closely R...,http://arxiv.org/abs/1708.05719,Status(_json={'created_at': 'Tue Aug 22 16:41:...,8208.713289,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper,"b""Cross-Lingual Dependency Parsing for Closely...","[(https://arxiv.org/abs/1708.05719, Cross-Ling...","{'dataset': [], 'code': [], 'paper': [('https:...","(Helsinki, ORGANIZATION)",Helsinki (; Finnish pronunciation: [ˈhelsiŋki]...
2,twitter,2017-08-22 14:43:28,arxiv_cscl,The CLaC Discourse Parser at CoNLL-2015 https:...,http://arxiv.org/abs/1708.05857,Status(_json={'created_at': 'Tue Aug 22 14:43:...,8208.555667,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper,b'The CLaC Discourse Parser at CoNLL-2015 http...,"[(http://arxiv.org/abs/1708.05857, [1708.05857...","{'dataset': [], 'code': [], 'paper': [('http:/...",,No wikipedia page found
3,twitter,2017-08-22 12:41:55,arxiv_cscl,Learning Visual Reasoning Without Strong Prior...,http://arxiv.org/abs/1707.03017,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.69463,"{'neg': 0.0, 'neu': 0.602, 'pos': 0.398, 'comp...",pos,paper,b'Learning Visual Reasoning Without Strong Pri...,"[(https://arxiv.org/abs/1707.03017, Learning V...","{'dataset': [], 'code': [], 'paper': [('https:...",,No wikipedia page found
4,twitter,2017-08-22 12:41:54,arxiv_cscl,Acquisition of Translation Lexicons for Histor...,http://arxiv.org/abs/1706.01570,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393578,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper,b'Acquisition of Translation Lexicons for Hist...,"[(https://arxiv.org/abs/1706.01570, Acquisitio...","{'dataset': [], 'code': [], 'paper': [('https:...",,No wikipedia page found
5,twitter,2017-08-22 12:41:54,arxiv_cscl,Using Global Constraints and Reranking to Impr...,http://arxiv.org/abs/1704.07050,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393578,"{'neg': 0.0, 'neu': 0.734, 'pos': 0.266, 'comp...",neu,paper,b'Using Global Constraints and Reranking to Im...,"[(https://arxiv.org/pdf/1704.07050, Using Glob...","{'dataset': [], 'code': [], 'paper': [('https:...",,No wikipedia page found
6,twitter,2017-08-22 12:41:52,arxiv_cscl,Learning to Paraphrase for Question Answering ...,http://arxiv.org/abs/1708.06022,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393533,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper,b'Learning to Paraphrase for Question Answerin...,"[(http://arxiv.org/abs/1708.06022, [1708.06022...","{'dataset': [], 'code': [], 'paper': [('http:/...",,No wikipedia page found
7,twitter,2017-08-22 12:41:51,arxiv_cscl,Neural Machine Translation with Extended Conte...,http://arxiv.org/abs/1708.05943,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393511,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper,b'Neural Machine Translation with Extended Con...,"[(http://arxiv.org/abs/1708.05943, [1708.05943...","{'dataset': [], 'code': [], 'paper': [('http:/...",,No wikipedia page found
8,twitter,2017-08-22 12:41:51,arxiv_cscl,The Helsinki Neural Machine Translation System...,http://arxiv.org/abs/1708.05942,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393511,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper,b'The Helsinki Neural Machine Translation Syst...,"[(http://arxiv.org/abs/1708.05942, [1708.05942...","{'dataset': [], 'code': [('https://github.com/...",,No wikipedia page found
9,twitter,2017-08-22 12:41:50,arxiv_cscl,The CLaC Discourse Parser at CoNLL-2015 https:...,http://arxiv.org/abs/1708.05857,Status(_json={'created_at': 'Tue Aug 22 12:41:...,8208.393489,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu,paper,b'The CLaC Discourse Parser at CoNLL-2015 http...,"[(http://arxiv.org/abs/1708.05857, [1708.05857...","{'dataset': [], 'code': [], 'paper': [('http:/...",,No wikipedia page found


In [19]:
for i, t in tagged.iterrows():
    print('* [Date]: {}'.format(t['created_at']))
    print('* [Text]: {}'.format(t['text']))
    print('* [Link]: {}'.format(t['url']))
    print('* [NER detedted]: ', end='')
    if t['NER'] != "N/A":
        print('{}'.format(t['NER']))
        print('* [Wiki NER Summary]: {}'.format(t['Wiki-NER-Sumarry']))
    else:
        print('No NER detected')
    print('-'*115)

* [Date]: 2017-08-22 20:41:38
* [Text]: Neural machine translation for low-resource languages https://t.co/dvHZJqVDIs
* [Link]: http://arxiv.org/abs/1708.05729
* [NER detedted]: No NER detected
-------------------------------------------------------------------------------------------------------------------
* [Date]: 2017-08-22 16:41:41
* [Text]: Cross-Lingual Dependency Parsing for Closely Related Languages - Helsinki's Submission to VarDial 2017 https://t.co/2Vuj8b96k0
* [Link]: http://arxiv.org/abs/1708.05719
* [NER detedted]: ('Helsinki', 'ORGANIZATION')
* [Wiki NER Summary]: Helsinki (; Finnish pronunciation: [ˈhelsiŋki]; Swedish: Helsingfors; Swedish pronunciation: [hɛlsɪŋˈfɔrs]) is the capital and largest city of Finland. It is in the region of Uusimaa, in southern Finland, on the shore of the Gulf of Finland. Helsinki has a population of 629,512, an urban population of 1,231,595, and a metropolitan population of over 1.4 million, making it the most populous municipality and ur

In [20]:
from data_aggregator import DataAggregator

In [21]:
sys.path.append('../crawler')

In [22]:
from url_content_crawler import get_url_content