# Data Preparation

## 1. Import packages and retrieve data

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.text import Text
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import pandas as pd
import numpy as np

import string, re
import pandas_profiling
import slack

from progressbar import Bar, BouncingBar, Counter, ETA, \
    AdaptiveETA, FileTransferSpeed, FormatLabel, Percentage, \
    ProgressBar, ReverseBar, RotatingMarker, \
    SimpleProgress, Timer, UnknownLength
# Default progress bar; the processing loops further down rebind `pbar`
# with explicit widgets and a maxval before using it.
pbar = ProgressBar()
# Restore `article_df` from IPython's %store database
# (presumably saved by an earlier notebook with `%store article_df` — confirm).
%store -r article_df

### 1.1 Download NLTK corpora for stemming, tokenization, lemmatization
For more information: https://www.nltk.org/book/ch02.html

In [2]:
# Fetch every NLTK resource this notebook relies on so that it runs on a
# fresh environment. Packages that are already present are left untouched.
#   punkt          - tokenizer models used by word_tokenize
#   stopwords      - word lists used by stopwords.words('english')
#   vader_lexicon  - lexicon used by SentimentIntensityAnalyzer
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab'
# for word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
# Used by the lemmatization / POS-tagging experiments in "Rough Notes".
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/bking/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/bking/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## 2. Get word count of articles

### 2.1 Tokenize and remove stopwords

In [3]:
# Preview the first rows of the frame restored by `%store -r article_df`.
article_df.head()

Unnamed: 0,id,text
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...
1,129341,UW Facts and Figures – University of Wisconsin...
2,100963,Gun Control Advocates Target Peaceful Switzerl...
3,12200,U.S. and Republic of Korea Conclude New Specia...
4,128496,Kremlin's persistent claim of “expected chemic...


In [4]:
# Filter text to remove punctuation and stopwords
stop_words = list(set(stopwords.words('english')))
# Testing membership against a list scans it for every token; the loop over
# all articles does this millions of times. Keep a set for lookups and leave
# `stop_words` itself as a list for any other code that uses it.
_stop_word_set = frozenset(stop_words)
# Anything that is neither a word character nor whitespace counts as punctuation.
_punctuation_re = re.compile(r'[^\w\s]')

def remove_stopwords(text):
    """Return the tokens of `text` with punctuation and English stopwords removed.

    Punctuation is deleted from the raw string *before* tokenizing, so tokens
    that contained punctuation are fused (e.g. "U.S." becomes "US").

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    list of str
        Tokens from `word_tokenize`, in original order and original case,
        excluding those found in the stopword list.

    NOTE(review): the comparison is case-sensitive. If the NLTK stopword list
    is all lower-case (it appears to be — confirm), capitalized stopwords such
    as "The" are kept. Left as-is so existing word counts do not change.
    """
    tokens = word_tokenize(_punctuation_re.sub('', text))
    return [w for w in tokens if w not in _stop_word_set]

In [5]:
# Enrich article_df with per-article stopword statistics:
#   word_count_no_stop_words - number of tokens returned by remove_stopwords()
#   filtered_text            - those tokens themselves
#   token_count              - number of tokens in the raw text
#   brevity_score            - word_count_no_stop_words / token_count
# slack.SlackNotification is called every 10,000 articles and once at the end.
word_counts_no_stop = []
filtered_texts = []
token_counts = []
brevity_scores = []
n_articles = len(article_df)
pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()], maxval=n_articles).start()
for i, text in enumerate(article_df.text):
    kept_tokens = remove_stopwords(text)
    token_count = len(word_tokenize(text))
    filtered_texts.append(kept_tokens)
    word_counts_no_stop.append(len(kept_tokens))
    token_counts.append(token_count)
    # An article with no tokens has no defined ratio. Record NaN rather than
    # raising ZeroDivisionError and discarding a multi-minute run.
    brevity_scores.append(len(kept_tokens) / token_count if token_count else np.nan)
    done = i + 1
    pbar.update(done)
    if done % 10000 == 0:
        slack.SlackNotification('BK_slackbot', '%s / %s have completed' % (done, n_articles))
pbar.finish()
article_df['word_count_no_stop_words'] = word_counts_no_stop
article_df['filtered_text'] = filtered_texts
article_df['token_count'] = token_counts
article_df['brevity_score'] = brevity_scores
slack.SlackNotification('BK_slackbot', 'All stopwords have been removed')

100%|############################################################|Time: 0:09:26


In [7]:
# Check the columns added above: word_count_no_stop_words, filtered_text,
# token_count and brevity_score.
article_df.head()

Unnamed: 0,id,text,word_count_no_stop_words,filtered_text,token_count,brevity_score
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...,189,"[Trump, Supporter, Kicked, Pregnant, Muslim, W...",330,0.572727
1,129341,UW Facts and Figures – University of Wisconsin...,40,"[UW, Facts, Figures, University, WisconsinMadi...",69,0.57971
2,100963,Gun Control Advocates Target Peaceful Switzerl...,909,"[Gun, Control, Advocates, Target, Peaceful, Sw...",1549,0.58683
3,12200,U.S. and Republic of Korea Conclude New Specia...,173,"[US, Republic, Korea, Conclude, New, Special, ...",284,0.609155
4,128496,Kremlin's persistent claim of “expected chemic...,351,"[Kremlins, persistent, claim, expected, chemic...",679,0.516937


## 3. Generating sentiment data

### 3.1 Using NLTK vader
http://www.nltk.org/howto/sentiment.html


    neg: Negative
    neu: Neutral
    pos: Positive
    compound: Compound (i.e. aggregated score)


In [8]:
# VADER analyzer instance reused for every article in the scoring loop.
sid = SentimentIntensityAnalyzer()

In [9]:
# One token list per article (stopwords and punctuation already removed).
articles = article_df['filtered_text'].tolist()

In [27]:
# Score every article with VADER and add the four polarity columns to article_df.
# pos  = positive sentiment score per article (stopwords removed)
# neg  = negative sentiment score per article (stopwords removed)
# neu  = neutral sentiment score per article (stopwords removed)
# comp = compound (aggregated) sentiment score per article (stopwords removed)
# NOTE(review): VADER is documented as using punctuation and negation words,
# both of which filtered_text has had stripped — consider scoring the raw
# `text` column instead; confirm before changing, since it alters all scores.
pos = []
neg = []
neu = []
comp = []
n_articles = len(articles)
pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()], maxval=n_articles).start()
# `done` is this loop's own 1-based counter. The progress bar previously read
# a stale `i` left over from an earlier cell, so it never tracked this loop.
for done, article in enumerate(articles, start=1):
    ss = sid.polarity_scores(' '.join(article))
    pos.append(ss['pos'])
    neg.append(ss['neg'])
    neu.append(ss['neu'])
    comp.append(ss['compound'])
    pbar.update(done)
    if done % 5000 == 0:
        slack.SlackNotification('datacup', '%s / %s articles have been analyzed for sentiment.' % (done, n_articles))
pbar.finish()
article_df['pos'] = pos
article_df['neg'] = neg
article_df['neu'] = neu
article_df['compound'] = comp
slack.SlackNotification('datacup', 'All sentiment has been analyzed using the NLTK vader sentiment analysis method')

100%|############################################################|Time: 0:07:14


In [31]:
# Check the sentiment columns added above: pos, neg, neu and compound.
article_df.head()

Unnamed: 0,id,text,word_count_no_stop_words,filtered_text,token_count,brevity_score,pos,neg,neu,compound
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...,189,"[Trump, Supporter, Kicked, Pregnant, Muslim, W...",330,0.572727,0.037,0.338,0.625,-0.9971
1,129341,UW Facts and Figures – University of Wisconsin...,40,"[UW, Facts, Figures, University, WisconsinMadi...",69,0.57971,0.0,0.0,1.0,0.0
2,100963,Gun Control Advocates Target Peaceful Switzerl...,909,"[Gun, Control, Advocates, Target, Peaceful, Sw...",1549,0.58683,0.123,0.147,0.73,-0.9666
3,12200,U.S. and Republic of Korea Conclude New Specia...,173,"[US, Republic, Korea, Conclude, New, Special, ...",284,0.609155,0.288,0.014,0.698,0.9944
4,128496,Kremlin's persistent claim of “expected chemic...,351,"[Kremlins, persistent, claim, expected, chemic...",679,0.516937,0.057,0.192,0.751,-0.9944


In [32]:
# article_df.to_csv(r'data/article_data.csv')

In [36]:
# Build a profiling report of the enriched frame.
# (`profile_report` is presumably registered on DataFrame by the
# pandas_profiling import at the top — confirm.)
profile = article_df.profile_report(style={'full_width':True})

In [37]:
# Save the report as HTML; the path is relative, so it lands in the kernel's
# current working directory.
profile.to_file(output_file="enriched_data_profile.html")

In [34]:
# Store article_df_enriched for loading in Model Development
# NOTE: this binds a second name to the same DataFrame object; it is not a
# copy, so later changes to article_df also show up in article_df_enriched.
article_df_enriched = article_df
%store article_df_enriched 

Stored 'article_df_enriched' (DataFrame)


## Rough Notes

In [34]:
# Lemmatization
# NOTE(review): `tokens` is not defined anywhere in the notebook as shown, so
# this scratch cell raises NameError under Restart & Run All. Define `tokens`
# (e.g. from one article's token list) or remove the cell.
wnl = nltk.WordNetLemmatizer()
# Unique lemmas across `tokens` (lemmatize() is called without a POS argument).
lemma = set([wnl.lemmatize(t) for t in tokens])

In [None]:
# First 15 distinct tokens in sorted order.
# NOTE(review): depends on `tokens`, which the notebook as shown never defines.
sorted(set(tokens))[:15]

In [42]:
# Part-of-speech tag each token.
# NOTE(review): depends on `tokens`, which the notebook as shown never defines.
word_tags = nltk.pos_tag(tokens)