# Summarization

## Import packages

In [1]:
import sys
import warnings
warnings.filterwarnings('ignore')
sys.path.append('../data_helpers/')
sys.path.append('../statistics/')
sys.path.append('../preprocess')
sys.path.append('../cluster/')

from twitter_data_helper import TwitterDataHelper
from reddit_data_helper import RedditDataHelper
from data_aggregator import DataAggregator
from statistics_aggregator import StatisticsAggregator
from text_cleaner import TextCleaner
from lda_cluster import LDACluster
from data_enhancer import DataOrganizer, SummarizeNER

Using TensorFlow backend.


## Data comes from Reddit and Twitter
Data is crawled daily from Twitter and Reddit.

In [2]:
# Load the crawled posts for the day(s) of interest, then show how many rows
# came from each source.
crawl_dates = ['2017-08-22']
data_helper = DataAggregator()
df = data_helper.get_data(date_range=crawl_dates)
source_counts = df['source'].value_counts()
source_counts

twitter    282
reddit       1
Name: source, dtype: int64

## LDA Cluster
Cluster the cleaned posts into two topics using LDA.

In [3]:
# Clean the raw post text before topic modelling.
# NOTE(review): filter_sentiment_words=True presumably removes sentiment words
# (the cleaner logs that it loads a sentiment word list) — confirm in text_cleaner.
text_cleaner = TextCleaner(filter_sentiment_words=True)
texts = df['text']
docs = text_cleaner.clean(texts)

# Fit a two-topic LDA model on the cleaned documents.
cluster = LDACluster(num_topics=2)
cluster.fit(docs)

print("-"*115)
# Attach each post's topic label to the frame; later cells split on df['cluster'].
df['cluster'] = cluster.labels
# Only the final expression of a cell is rendered, so the cluster sizes have to
# be printed explicitly; as a bare expression here the result was discarded.
print(df['cluster'].value_counts())
# Last expression: the per-topic word weights, rendered as the cell output.
cluster.model.print_topics()

* [TextCleaner] Initializing...
* [TextCleaner] Loading SpaCy "en_core_web_md" corpus...


0it [00:00, ?it/s]

* [TextCleaner] Loading stopwords...
* [TextCleaner] Loading sentinent words...
--------------------------------------------------------------------------------------------------------------------
* [TextCleaner] Cleaning text...


283it [00:00, 1205.88it/s]


* [LDA] Training model...
-------------------------------------------------------------------------------------------------------------------


[(0,
  '0.011*"neural" + 0.010*"security" + 0.008*"machine" + 0.007*"learn" + 0.007*"language" + 0.007*"learning" + 0.006*"discourse" + 0.006*"amp" + 0.006*"clac" + 0.005*"network"'),
 (1,
  '0.008*"security" + 0.008*"neural" + 0.007*"author" + 0.007*"learn" + 0.007*"deep" + 0.007*"natural" + 0.007*"blog" + 0.006*"language" + 0.006*"discourse" + 0.005*"learning"')]

In [4]:
import pandas as pd
# Build one sub-frame per topic, positioned by topic id (cdf[k] holds cluster k).
# The topic count is read from the model instead of len(print_topics()):
# print_topics() returns at most 20 topics by default, so splitting on its
# length would silently drop every cluster past the 20th if num_topics grows.
# NOTE(review): assumes cluster.model is a gensim LDA model exposing
# `num_topics` (the print_topics output format suggests so) — confirm in lda_cluster.
cdf = [df.loc[df['cluster'] == r] for r in range(cluster.model.num_topics)]
cdf[0] # Cluster 0

Unnamed: 0,source,created_at,author,text,url,raw_data,cluster
3,twitter,2017-08-22 10:36:03,ncsc,"We have awarded a £500,000 grant to fund 'Deve...",https://www.ncsc.gov.uk/blog-post/ncsc-award-0...,"Status(user=User(following=True, profile_backg...",0
5,twitter,2017-08-22 08:09:03,ncsc,Threat Report: US Internal Revenue Service war...,https://www.ncsc.gov.uk/report/weekly-threat-r...,"Status(user=User(following=True, profile_backg...",0
7,twitter,2017-08-22 09:56:25,ConradLongmore,"Malware spam from ""Voicemail Service"" [pbx@loc...",http://blog.dynamoo.com/2017/08/malware-spam-f...,"Status(user=User(following=True, profile_backg...",0
8,twitter,2017-08-22 20:26:59,bry_campbell,Can I take tomorrow off now https://t.co/dw7hF...,https://twitter.com/kpoulsen/status/9000362154...,"Status(user=User(following=True, profile_backg...",0
9,twitter,2017-08-22 20:02:43,bry_campbell,Great. More Ukraine backdoors. https://t.co/vY...,https://issp.ua/issp_system_images/Crystal_Fin...,"Status(user=User(following=True, profile_backg...",0
10,twitter,2017-08-22 19:17:33,bry_campbell,So @cybereason specifically say the rise in ra...,https://itunes.apple.com/gb/podcast/malicious-...,"Status(user=User(following=True, profile_backg...",0
12,twitter,2017-08-22 19:33:02,symantec,Symantec’s @liam_omurchu discusses #Stuxnet so...,http://symc.ly/2g3wsTU,"Status(user=User(following=True, profile_backg...",0
13,twitter,2017-08-22 17:38:11,symantec,Congrats @professor__plum &amp; @threatintel! ...,https://twitter.com/alexz1elke/status/90004489...,"Status(user=User(following=True, profile_backg...",0
14,twitter,2017-08-22 17:19:03,symantec,Take a look at Symantec's interactive map of a...,https://twitter.com/i/web/status/9000446611123...,"Status(user=User(following=True, profile_backg...",0
16,twitter,2017-08-22 15:00:05,symantec,Attending the #Oktane17 Conference in Las Vega...,https://twitter.com/i/web/status/9000096873320...,"Status(user=User(following=True, profile_backg...",0


In [5]:
# Display the rows of df whose 'cluster' label is 1.
cdf[1] # Cluster 1

Unnamed: 0,source,created_at,author,text,url,raw_data,cluster
0,twitter,2017-08-22 15:18:02,ncsc,There's a lot of advice out there but how shou...,https://twitter.com/i/web/status/9000142059900...,"Status(user=User(following=True, profile_backg...",1
1,twitter,2017-08-22 14:17:02,ncsc,The UK Public Sector DNS service is now live. ...,https://twitter.com/i/web/status/8999988528912...,"Status(user=User(following=True, profile_backg...",1
2,twitter,2017-08-22 12:16:02,ncsc,Web Check: helping you to secure your public s...,https://www.ncsc.gov.uk/blog-post/web-check-he...,"Status(user=User(following=True, profile_backg...",1
4,twitter,2017-08-22 09:15:02,ncsc,Ransomware: what are the effects and what shou...,https://www.ncsc.gov.uk/guidance/protecting-yo...,"Status(user=User(following=True, profile_backg...",1
6,twitter,2017-08-22 11:49:01,ConradLongmore,Stoned Virus (1987) \nYour PC is now STONED!\n...,http://www.retromobe.com/2017/08/stoned-virus-...,"Status(user=User(following=True, profile_backg...",1
11,twitter,2017-08-22 21:22:02,symantec,"ICYMI: According to @Gartner_inc, #cybersecuri...",http://symc.ly/2g2Mxt5,"Status(user=User(following=True, profile_backg...",1
15,twitter,2017-08-22 16:00:06,symantec,Join today's free #webinar &amp; discover how ...,https://twitter.com/i/web/status/9000247894689...,"Status(user=User(following=True, profile_backg...",1
18,twitter,2017-08-22 19:37:23,syhw,A Brief Survey of Deep RL https://t.co/tDjGUaa...,https://arxiv.org/abs/1708.05866,"Status(user=User(following=True, profile_backg...",1
23,twitter,2017-08-22 21:05:38,Google,Sip this: a 2.4 GHz Wi-Fi wave's as tall as a ...,https://twitter.com/i/web/status/9001016800077...,"Status(user=User(following=True, profile_backg...",1
25,twitter,2017-08-22 16:19:23,Google,"Step into #DanceTonite, an ever-changing VR co...",https://twitter.com/i/web/status/9000296438849...,"Status(user=User(following=True, profile_backg...",1


## Statistics and Data Enhancement

### General Statistics
- (hotness) calculated through a ranking algorithm that takes in the up/down votes and date as inputs
- (sentiment) using NLTK's sentiment analysis modules
- (type) pre-defined rules to classify urls as a dataset, code, paper, tutorial, social-media, blog, "shortened-link", or news.

In [6]:
# Compute the per-post statistics for cluster 0 only (cdf[0]); cluster 1 is
# not analysed in the cells below.
stats_helper = StatisticsAggregator(cdf[0])
# The returned frame is what the following cells display and enhance.
stats = stats_helper.get_stats()

In [7]:
# Display the statistics frame produced by StatisticsAggregator.get_stats().
stats

Unnamed: 0,source,created_at,author,text,url,raw_data,hotness,sentiment_polarity,sentiment,type
0,twitter,2017-08-22 10:36:03,ncsc,"We have awarded a £500,000 grant to fund 'Deve...",https://www.ncsc.gov.uk/blog-post/ncsc-award-0...,"Status(user=User(following=True, profile_backg...",8209.003929,"{'neg': 0.0, 'pos': 0.458, 'compound': 0.765, ...",pos,blog
1,twitter,2017-08-22 08:09:03,ncsc,Threat Report: US Internal Revenue Service war...,https://www.ncsc.gov.uk/report/weekly-threat-r...,"Status(user=User(following=True, profile_backg...",8208.029778,"{'neg': 0.563, 'pos': 0.0, 'compound': -0.891,...",neg,unknown link
2,twitter,2017-08-22 09:56:25,ConradLongmore,"Malware spam from ""Voicemail Service"" [pbx@loc...",http://blog.dynamoo.com/2017/08/malware-spam-f...,"Status(user=User(following=True, profile_backg...",8208.172933,"{'neg': 0.263, 'pos': 0.0, 'compound': -0.3612...",neu,blog
3,twitter,2017-08-22 20:26:59,bry_campbell,Can I take tomorrow off now https://t.co/dw7hF...,https://twitter.com/kpoulsen/status/9000362154...,"Status(user=User(following=True, profile_backg...",8209.013689,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,twitter status
4,twitter,2017-08-22 20:02:43,bry_campbell,Great. More Ukraine backdoors. https://t.co/vY...,https://issp.ua/issp_system_images/Crystal_Fin...,"Status(user=User(following=True, profile_backg...",8208.981333,"{'neg': 0.0, 'pos': 0.577, 'compound': 0.6249,...",pos,paper
5,twitter,2017-08-22 19:17:33,bry_campbell,So @cybereason specifically say the rise in ra...,https://itunes.apple.com/gb/podcast/malicious-...,"Status(user=User(following=True, profile_backg...",8208.921111,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,unknown link
6,twitter,2017-08-22 19:33:02,symantec,Symantec’s @liam_omurchu discusses #Stuxnet so...,http://symc.ly/2g3wsTU,"Status(user=User(following=True, profile_backg...",8209.242786,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,unknown link
7,twitter,2017-08-22 17:38:11,symantec,Congrats @professor__plum &amp; @threatintel! ...,https://twitter.com/alexz1elke/status/90004489...,"Status(user=User(following=True, profile_backg...",8208.788622,"{'neg': 0.0, 'pos': 0.552, 'compound': 0.5707,...",pos,twitter status
8,twitter,2017-08-22 17:19:03,symantec,Take a look at Symantec's interactive map of a...,https://twitter.com/i/web/status/9000446611123...,"Status(user=User(following=True, profile_backg...",8208.763111,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,twitter status
9,twitter,2017-08-22 15:00:05,symantec,Attending the #Oktane17 Conference in Las Vega...,https://twitter.com/i/web/status/9000096873320...,"Status(user=User(following=True, profile_backg...",8208.878852,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,twitter status


In [10]:
# Number of posts per link type in the statistics frame.
type_counts = stats['type'].value_counts()
type_counts

paper                            38
twitter status                   33
unknown link                     31
shortened-link                   29
news                              5
blog                              3
subreddit: /r/MachineLearning     3
code                              1
Name: type, dtype: int64

In [11]:
# Number of posts per sentiment label in the statistics frame.
sentiment_counts = stats['sentiment'].value_counts()
sentiment_counts

neu    115
pos     16
neg     12
Name: sentiment, dtype: int64

### Data Organizer
- Takes the Twitter and Reddit `text` field as input.
- Runs a Google search query using this text.
- Classifies the search results with pre-defined rules.

In [15]:
# Enrich the statistics frame with search results for each post.
# The organizer logs that it downloads Google search results, so this cell
# needs network access.
do = DataOrganizer(stats)
# The enhanced frame is displayed and iterated over in the following cells.
extended = do.enhance()

*[Data Organizer] Downloading Google Search Results


In [16]:
# Display the enhanced frame returned by DataOrganizer.enhance().
extended

Unnamed: 0,source,created_at,author,text,url,raw_data,hotness,sentiment_polarity,sentiment,type,cleaned_text,google-search,types
0,twitter,2017-08-22 10:36:03,ncsc,"We have awarded a £500,000 grant to fund 'Deve...",https://www.ncsc.gov.uk/blog-post/ncsc-award-0...,"Status(user=User(following=True, profile_backg...",8209.003929,"{'neg': 0.0, 'pos': 0.458, 'compound': 0.765, ...",pos,blog,"b""We have awarded a \xc2\xa3500,000 grant to f...",[(https://dba.stackexchange.com/questions/305/...,"{'tutorial': [], 'social-media': [], 'shortene..."
1,twitter,2017-08-22 08:09:03,ncsc,Threat Report: US Internal Revenue Service war...,https://www.ncsc.gov.uk/report/weekly-threat-r...,"Status(user=User(following=True, profile_backg...",8208.029778,"{'neg': 0.563, 'pos': 0.0, 'compound': -0.891,...",neg,unknown link,b'Threat Report: US Internal Revenue Service w...,[(https://www.inc.com/joseph-steinberg/irs-war...,"{'tutorial': [], 'social-media': [], 'shortene..."
2,twitter,2017-08-22 09:56:25,ConradLongmore,"Malware spam from ""Voicemail Service"" [pbx@loc...",http://blog.dynamoo.com/2017/08/malware-spam-f...,"Status(user=User(following=True, profile_backg...",8208.172933,"{'neg': 0.263, 'pos': 0.0, 'compound': -0.3612...",neu,blog,"b'Malware spam from ""Voicemail Service"" [pbx@l...",[],"{'tutorial': [], 'social-media': [], 'shortene..."
3,twitter,2017-08-22 20:26:59,bry_campbell,Can I take tomorrow off now https://t.co/dw7hF...,https://twitter.com/kpoulsen/status/9000362154...,"Status(user=User(following=True, profile_backg...",8209.013689,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,twitter status,b'Can I take tomorrow off now https://t.co/dw7...,[(https://forum.wordreference.com/threads/take...,"{'tutorial': [], 'social-media': [], 'shortene..."
4,twitter,2017-08-22 20:02:43,bry_campbell,Great. More Ukraine backdoors. https://t.co/vY...,https://issp.ua/issp_system_images/Crystal_Fin...,"Status(user=User(following=True, profile_backg...",8208.981333,"{'neg': 0.0, 'pos': 0.577, 'compound': 0.6249,...",pos,paper,b'Great. More Ukraine backdoors. https://t.co/...,[(http://www.slate.com/blogs/future_tense/2017...,"{'tutorial': [], 'social-media': [], 'shortene..."
5,twitter,2017-08-22 19:17:33,bry_campbell,So @cybereason specifically say the rise in ra...,https://itunes.apple.com/gb/podcast/malicious-...,"Status(user=User(following=True, profile_backg...",8208.921111,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,unknown link,b'So @cybereason specifically say the rise in ...,[(/aclk?sa=l&ai=DChcSEwjg4suL8-7VAhXHXH4KHeqWD...,"{'tutorial': [], 'social-media': [], 'shortene..."
6,twitter,2017-08-22 19:33:02,symantec,Symantec’s @liam_omurchu discusses #Stuxnet so...,http://symc.ly/2g3wsTU,"Status(user=User(following=True, profile_backg...",8209.242786,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,unknown link,b'Symantec\xe2\x80\x99s @liam_omurchu discusse...,[],"{'tutorial': [], 'social-media': [], 'shortene..."
7,twitter,2017-08-22 17:38:11,symantec,Congrats @professor__plum &amp; @threatintel! ...,https://twitter.com/alexz1elke/status/90004489...,"Status(user=User(following=True, profile_backg...",8208.788622,"{'neg': 0.0, 'pos': 0.552, 'compound': 0.5707,...",pos,twitter status,b'Congrats @professor__plum &amp; @threatintel...,"[(https://twitter.com/threatintel?lang=en, Sec...","{'tutorial': [], 'social-media': [('https://tw..."
8,twitter,2017-08-22 17:19:03,symantec,Take a look at Symantec's interactive map of a...,https://twitter.com/i/web/status/9000446611123...,"Status(user=User(following=True, profile_backg...",8208.763111,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,twitter status,"b""Take a look at Symantec's interactive map of...",[(http://mapmaker.nationalgeographic.org/hBqEW...,"{'tutorial': [], 'social-media': [], 'shortene..."
9,twitter,2017-08-22 15:00:05,symantec,Attending the #Oktane17 Conference in Las Vega...,https://twitter.com/i/web/status/9000096873320...,"Status(user=User(following=True, profile_backg...",8208.878852,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,twitter status,b'Attending the #Oktane17 Conference in Las Ve...,"[(https://www.okta.com/oktane17/, Oktane17 - R...","{'tutorial': [], 'social-media': [], 'shortene..."


In [45]:
# Text report: for every post print its date and text, followed by the related
# search results grouped by link type. Link types with no results are skipped.
for row_idx, post in extended.iterrows():
    print('* Date: {}'.format(post['created_at']))
    print('* Text: {}'.format(post['text']))
    print('* Related Urls: ')
    for link_type, hits in post['types'].items():
        if len(hits) == 0:
            continue
        print('* * Type: {}'.format(link_type))
        for hit in hits:
            # Each hit is indexed as (url, snippet).
            print('* * url: {} \n* * snippet: {}'.format(hit[0], hit[1]))
    print('-'*115)

* Date: 2017-08-22 10:36:03
* Text: We have awarded a £500,000 grant to fund 'Developer Centred Security' research https://t.co/AI0Wzdcz7t #devops https://t.co/aPZKemUXjn
* Related Urls: 
-------------------------------------------------------------------------------------------------------------------
* Date: 2017-08-22 08:09:03
* Text: Threat Report: US Internal Revenue Service warns of fake tax software update scam https://t.co/FkHhcgtC6F https://t.co/nIqfrb7DCo
* Related Urls: 
* * Type: news
* * url: https://www.irs.gov/uac/newsroom/irs-warns-of-latest-scam-variation-involving-bogus-federal-student-tax 
* * snippet: IRS Warns of Latest Scam Variation Involving Bogus “Federal Student ... | Mar 1, 2017 - In this newest twist, they try to convince people to wire money ... for this fake “federal student tax”, the scammer threatens to report the student to the police. ... a tax company and sometimes even a state revenue department. ... nor will the agency call about taxes owed without 

### Named Entity Recognition and Wikipedia Summarization
- Takes the text of Twitter and Reddit posts as input.
- Performs named entity recognition (NER) on every text.
- Uses the detected entities to query Wikipedia.
- Summarizes the matching Wikipedia article.

In [46]:
# Tag each post in the enhanced frame and attach a summary for it.
# The summarizer logs that it downloads Wikipedia pages, so this cell needs
# network access.
se = SummarizeNER(extended)
# The resulting frame is displayed and iterated over in the following cells.
tagged = se.get_summarized_data()

Downloading wikipedia pages...

In [52]:
# Display the frame returned by SummarizeNER.get_summarized_data().
tagged

Unnamed: 0,source,created_at,author,text,url,raw_data,hotness,sentiment_polarity,sentiment,type,cleaned_text,google-search,types,NER,Wiki-NER-Sumarry
0,twitter,2017-08-22 10:36:03,ncsc,"We have awarded a £500,000 grant to fund 'Deve...",https://www.ncsc.gov.uk/blog-post/ncsc-award-0...,"Status(user=User(following=True, profile_backg...",8209.003929,"{'neg': 0.0, 'pos': 0.458, 'compound': 0.765, ...",pos,blog,"b""We have awarded a \xc2\xa3500,000 grant to f...",[(https://dba.stackexchange.com/questions/305/...,"{'tutorial': [], 'social-media': [], 'shortene...",,No wikipedia page found
1,twitter,2017-08-22 08:09:03,ncsc,Threat Report: US Internal Revenue Service war...,https://www.ncsc.gov.uk/report/weekly-threat-r...,"Status(user=User(following=True, profile_backg...",8208.029778,"{'neg': 0.563, 'pos': 0.0, 'compound': -0.891,...",neg,unknown link,b'Threat Report: US Internal Revenue Service w...,[(https://www.inc.com/joseph-steinberg/irs-war...,"{'tutorial': [], 'social-media': [], 'shortene...","(US Internal Revenue Service, ORGANIZATION)",The Internal Revenue Service (IRS) is the reve...
2,twitter,2017-08-22 09:56:25,ConradLongmore,"Malware spam from ""Voicemail Service"" [pbx@loc...",http://blog.dynamoo.com/2017/08/malware-spam-f...,"Status(user=User(following=True, profile_backg...",8208.172933,"{'neg': 0.263, 'pos': 0.0, 'compound': -0.3612...",neu,blog,"b'Malware spam from ""Voicemail Service"" [pbx@l...",[],"{'tutorial': [], 'social-media': [], 'shortene...","(Voicemail Service, ORGANIZATION)",Visual voicemail is random-access voicemail wi...
3,twitter,2017-08-22 20:26:59,bry_campbell,Can I take tomorrow off now https://t.co/dw7hF...,https://twitter.com/kpoulsen/status/9000362154...,"Status(user=User(following=True, profile_backg...",8209.013689,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,twitter status,b'Can I take tomorrow off now https://t.co/dw7...,[(https://forum.wordreference.com/threads/take...,"{'tutorial': [], 'social-media': [], 'shortene...",,No wikipedia page found
4,twitter,2017-08-22 20:02:43,bry_campbell,Great. More Ukraine backdoors. https://t.co/vY...,https://issp.ua/issp_system_images/Crystal_Fin...,"Status(user=User(following=True, profile_backg...",8208.981333,"{'neg': 0.0, 'pos': 0.577, 'compound': 0.6249,...",pos,paper,b'Great. More Ukraine backdoors. https://t.co/...,[(http://www.slate.com/blogs/future_tense/2017...,"{'tutorial': [], 'social-media': [], 'shortene...","(Ukraine, LOCATION)","Ukraine (; Ukrainian: Україна, tr. Ukrajina [u..."
5,twitter,2017-08-22 19:17:33,bry_campbell,So @cybereason specifically say the rise in ra...,https://itunes.apple.com/gb/podcast/malicious-...,"Status(user=User(following=True, profile_backg...",8208.921111,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,unknown link,b'So @cybereason specifically say the rise in ...,[(/aclk?sa=l&ai=DChcSEwjg4suL8-7VAhXHXH4KHeqWD...,"{'tutorial': [], 'social-media': [], 'shortene...",,No wikipedia page found
6,twitter,2017-08-22 19:33:02,symantec,Symantec’s @liam_omurchu discusses #Stuxnet so...,http://symc.ly/2g3wsTU,"Status(user=User(following=True, profile_backg...",8209.242786,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,unknown link,b'Symantec\xe2\x80\x99s @liam_omurchu discusse...,[],"{'tutorial': [], 'social-media': [], 'shortene...","(Symantec, ORGANIZATION)",Symantec Corporation (commonly known as Syman...
7,twitter,2017-08-22 17:38:11,symantec,Congrats @professor__plum &amp; @threatintel! ...,https://twitter.com/alexz1elke/status/90004489...,"Status(user=User(following=True, profile_backg...",8208.788622,"{'neg': 0.0, 'pos': 0.552, 'compound': 0.5707,...",pos,twitter status,b'Congrats @professor__plum &amp; @threatintel...,"[(https://twitter.com/threatintel?lang=en, Sec...","{'tutorial': [], 'social-media': [('https://tw...",,No wikipedia page found
8,twitter,2017-08-22 17:19:03,symantec,Take a look at Symantec's interactive map of a...,https://twitter.com/i/web/status/9000446611123...,"Status(user=User(following=True, profile_backg...",8208.763111,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,twitter status,"b""Take a look at Symantec's interactive map of...",[(http://mapmaker.nationalgeographic.org/hBqEW...,"{'tutorial': [], 'social-media': [], 'shortene...","(Symantec, ORGANIZATION)",Symantec Corporation (commonly known as Syman...
9,twitter,2017-08-22 15:00:05,symantec,Attending the #Oktane17 Conference in Las Vega...,https://twitter.com/i/web/status/9000096873320...,"Status(user=User(following=True, profile_backg...",8208.878852,"{'neg': 0.0, 'pos': 0.0, 'compound': 0.0, 'neu...",neu,twitter status,b'Attending the #Oktane17 Conference in Las Ve...,"[(https://www.okta.com/oktane17/, Oktane17 - R...","{'tutorial': [], 'social-media': [], 'shortene...","(Las Vegas Symantec, ORGANIZATION)",COMDEX (an abbreviation of Computer Dealers' E...


In [57]:
# Text report: for every post print its date, text and link, then either the
# detected named entity with its Wikipedia summary or a "no entity" notice.
for i, t in tagged.iterrows():
    print('* [Date]: {}'.format(t['created_at']))
    print('* [Text]: {}'.format(t['text']))
    print('* [Link]: {}'.format(t['url']))
    # end='' keeps the entity (or the notice) on the same line as the label.
    # The label previously read "NER detedted" (typo).
    print('* [NER detected]: ', end='')
    # Rows whose NER field is the string "N/A" are reported as having no entity.
    if t['NER'] != "N/A":
        print('{}'.format(t['NER']))
        # 'Wiki-NER-Sumarry' is the column name exactly as it appears in the
        # tagged frame, so the spelling must be kept as-is.
        print('* [Wiki NER Summary]: {}'.format(t['Wiki-NER-Sumarry']))
    else:
        print('No NER detected')
    print('-'*115)

* [Date]: 2017-08-22 10:36:03
* [Text]: We have awarded a £500,000 grant to fund 'Developer Centred Security' research https://t.co/AI0Wzdcz7t #devops https://t.co/aPZKemUXjn
* [Link]: https://www.ncsc.gov.uk/blog-post/ncsc-award-05m-grant-fund-developer-centred-security-research
* [NER detedted]: No NER detected
-------------------------------------------------------------------------------------------------------------------
* [Date]: 2017-08-22 08:09:03
* [Text]: Threat Report: US Internal Revenue Service warns of fake tax software update scam https://t.co/FkHhcgtC6F https://t.co/nIqfrb7DCo
* [Link]: https://www.ncsc.gov.uk/report/weekly-threat-report-18th-august-2017
* [NER detedted]: ('US Internal Revenue Service', 'ORGANIZATION')
* [Wiki NER Summary]: The Internal Revenue Service (IRS) is the revenue service of the United States federal government. The government agency is a bureau of the Department of the Treasury, and is under the immediate direction of the Commissioner of Inte

* [NER detedted]: ('Ali Ghodsi Bloomberg TV', 'ORGANIZATION')
* [Wiki NER Summary]: No wikipedia page found
-------------------------------------------------------------------------------------------------------------------
* [Date]: 2017-08-22 00:50:57
* [Text]: Aug-22,2017(JST). MalSpam attached rar-&gt;js. Infects #Locky #Ransomware. Encrypted file extensions #LUKITUS. POST In… https://t.co/KrQzn95aUu
* [Link]: https://twitter.com/i/web/status/899795994400800768
* [NER detedted]: No NER detected
-------------------------------------------------------------------------------------------------------------------
* [Date]: 2017-08-22 22:40:48
* [Text]: Watch out for "IMPORTANT.jar" 😂 files on @Dropbox &amp; why we add value vs. relying purely on byte patterns/heuristics… https://t.co/F4poJkAtNO
* [Link]: https://twitter.com/i/web/status/900125631857516545
* [NER detedted]: No NER detected
---------------------------------------------------------------------------------------------------

In [58]:
from data_aggregator import DataAggregator

In [59]:
sys.path.append('../crawler')

In [60]:
from url_content_crawler import get_url_content