# Data Preparation

## 1. Import packages and retrieve data

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.text import Text
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import pandas as pd
import numpy as np
import json
import string, re
import pandas_profiling
import local_modules.slack as slack

from progressbar import Bar, BouncingBar, Counter, ETA, \
    AdaptiveETA, FileTransferSpeed, FormatLabel, Percentage, \
    ProgressBar, ReverseBar, RotatingMarker, \
    SimpleProgress, Timer, UnknownLength
pbar = ProgressBar()
%store -r article_df article_df_enriched

### 1.1 Download NLTK corpora for stemming, tokenization, lemmatization
For more information: https://www.nltk.org/book/ch02.html

In [2]:
# Fetch every NLTK resource this notebook relies on; nltk.download leaves
# packages that are already up to date untouched.
#   stopwords     - stopwords.words('english') in the stopword filter
#   punkt         - word_tokenize / sent_tokenize
#   vader_lexicon - SentimentIntensityAnalyzer
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' - confirm against the installed version.
for resource in ('stopwords', 'punkt', 'vader_lexicon'):
    nltk.download(resource)
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/bking/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/bking/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## 2. Get word count of articles

### 2.1 Tokenize and remove stopwords

In [3]:
# Preview the first rows of the article data restored by `%store -r`.
article_df.head()

Unnamed: 0,id,text
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...
1,129341,UW Facts and Figures – University of Wisconsin...
2,100963,Gun Control Advocates Target Peaceful Switzerl...
3,12200,U.S. and Republic of Korea Conclude New Specia...
4,128496,Kremlin's persistent claim of “expected chemic...


In [4]:
# Filter text to remove punctuation and stopwords
# Held in a set so every membership test below is a hash lookup instead of a
# linear scan of a list (this runs once per token of every article).
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Strip punctuation from ``text``, tokenize it and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.

    Notes
    -----
    Every character that is neither a word character nor whitespace is
    deleted, not replaced by a space, so "U.S." becomes "US".
    Tokens are compared to ``stop_words`` verbatim (case-sensitive).
    NOTE(review): the NLTK list appears to be lowercase, so capitalised
    stopwords such as "The" would be kept - confirm that this is intended.
    """
    text = re.sub(r'[^\w\s]', '', text)
    return [w for w in word_tokenize(text) if w not in stop_words]

In [5]:
# Per-article statistics, collected in row order and attached to article_df below:
#   word_counts_no_stop - number of tokens left after stopword removal
#   filtered_texts      - token list with punctuation and stopwords removed
#   token_counts        - number of tokens in the raw text
#   brevity_scores      - word_count_no_stopwords / token_count
notify_every = 10000  # send a Slack progress message after this many articles
total_articles = len(article_df)

word_counts_no_stop = []
filtered_texts = []
token_counts = []
brevity_scores = []

pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()], maxval=total_articles).start()
for i, text in enumerate(article_df.text):
    filtered = remove_stopwords(text)
    token_count = len(nltk.word_tokenize(text))

    filtered_texts.append(filtered)
    word_counts_no_stop.append(len(filtered))
    token_counts.append(token_count)
    # An article without any token (e.g. empty text) would otherwise raise
    # ZeroDivisionError and abort the whole run; score it 0.0 instead.
    brevity_scores.append(len(filtered) / token_count if token_count else 0.0)

    pbar.update(i + 1)
    if (i + 1) % notify_every == 0:
        slack.SlackNotification('BK_slackbot', '%s / %s have completed' % (i + 1, total_articles))
pbar.finish()

article_df['word_count_no_stop_words'] = word_counts_no_stop
article_df['filtered_text'] = filtered_texts
article_df['token_count'] = token_counts
article_df['brevity_score'] = brevity_scores
slack.SlackNotification('BK_slackbot', 'All stopwords have been removed')

100%|############################################################|Time: 0:09:26


In [7]:
# Preview article_df with the word-count, token-count and brevity columns added above.
article_df.head()

Unnamed: 0,id,text,word_count_no_stop_words,filtered_text,token_count,brevity_score
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...,189,"[Trump, Supporter, Kicked, Pregnant, Muslim, W...",330,0.572727
1,129341,UW Facts and Figures – University of Wisconsin...,40,"[UW, Facts, Figures, University, WisconsinMadi...",69,0.57971
2,100963,Gun Control Advocates Target Peaceful Switzerl...,909,"[Gun, Control, Advocates, Target, Peaceful, Sw...",1549,0.58683
3,12200,U.S. and Republic of Korea Conclude New Specia...,173,"[US, Republic, Korea, Conclude, New, Special, ...",284,0.609155
4,128496,Kremlin's persistent claim of “expected chemic...,351,"[Kremlins, persistent, claim, expected, chemic...",679,0.516937


## 3. Generating sentiment data

### 3.1 Using NLTK vader
http://www.nltk.org/howto/sentiment.html


    neg: Negative
    neu: Neutral
    pos: Positive
    compound: Compound (i.e. aggregated score)


In [8]:
# Single VADER analyzer instance, reused for every article in the scoring loop.
sid = SentimentIntensityAnalyzer()

In [9]:
# One entry per article, in row order: the token list stored in 'filtered_text'.
articles = article_df['filtered_text'].tolist()

In [27]:
# Score every filtered article with VADER. One list per score, in article order:
#   pos  - positive sentiment for each article with stopwords removed
#   neg  - negative sentiment for each article with stopwords removed
#   neu  - neutral sentiment for each article with stopwords removed
#   comp - compound sentiment score for each article with stopwords removed
notify_every = 5000  # send a Slack progress message after this many articles
total_articles = len(articles)

pos = []
neg = []
neu = []
comp = []
pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()], maxval=total_articles).start()
# `done` is this loop's own 1-based counter; the progress bar must not depend
# on an index variable left behind by another cell.
for done, article_tokens in enumerate(articles, start=1):
    # VADER scores a string, so the token list is joined back into one.
    ss = sid.polarity_scores(' '.join(article_tokens))
    pos.append(ss['pos'])
    neg.append(ss['neg'])
    neu.append(ss['neu'])
    comp.append(ss['compound'])
    pbar.update(done)
    if done % notify_every == 0:
        slack.SlackNotification('datacup', '%s / %s articles have been analyzed for sentiment.' % (done, total_articles))
pbar.finish()

article_df['pos'] = pos
article_df['neg'] = neg
article_df['neu'] = neu
article_df['compound'] = comp
slack.SlackNotification('datacup', 'All sentiment has been analyzed using the NLTK vader sentiment analysis method')

100%|############################################################|Time: 0:07:14


In [44]:
# Preview article_df after sentiment scoring.
# NOTE(review): the captured output below lists only `id` and `text`, so it
# looks like it was produced in a session where the enrichment cells had not
# been run - confirm and re-run top to bottom.
article_df.head()

Unnamed: 0,id,text
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...
1,129341,UW Facts and Figures – University of Wisconsin...
2,100963,Gun Control Advocates Target Peaceful Switzerl...
3,12200,U.S. and Republic of Korea Conclude New Specia...
4,128496,Kremlin's persistent claim of “expected chemic...


In [46]:
# Write an HTML profile report of the enriched article data.
# NOTE(review): article_df_enriched is only assigned in the following cell (or
# restored by `%store -r` in the first cell), so this cell relies on state
# from an earlier run - confirm the intended cell order.
enriched_profile_path = "data_profiles/enriched_article_data_profile.html"
enriched_article_profile = article_df_enriched.profile_report(style={'full_width': True})
enriched_article_profile.to_file(output_file=enriched_profile_path)

In [34]:
# Store article_df_enriched for loading in Model Development
# Note: the assignment binds a second name to the same DataFrame object as
# article_df; no copy is made.
article_df_enriched = article_df
%store article_df_enriched 

Stored 'article_df_enriched' (DataFrame)


## 4. Training data preparation

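Here we summarize the article data for each claim to build the training data for model development.
For every claim, the summary statistics are the mean and the variance of each article-level feature (the sentiment scores and the brevity score) over the claim's related articles.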

In [4]:
# Load the labelled claims; each JSON record becomes one row of train_df.
# The encoding is passed explicitly so that reading the file does not depend
# on the platform's default text encoding.
with open("data/train.json", encoding="utf-8") as f:
    train_data = json.load(f)

train_df = pd.DataFrame.from_records(train_data)

In [5]:
# Preview the raw training claims as loaded from data/train.json.
train_df.head()

Unnamed: 0,claim,claimant,date,label,related_articles,id
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,"[122094, 122580, 130685, 134765]",0
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,2,"[106868, 127320, 128060]",1
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,1,"[132130, 132132, 149722]",4
3,In 1988 author Roald Dahl penned an open lette...,,2019-02-04,2,"[123254, 123418, 127464]",5
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6


In [6]:
# For every claim, look up its related articles and record the mean and the
# population variance (np.var) of each article-level feature.
notify_every = 5000  # send a Slack progress message after this many claims

# (column in article_df_enriched, suffix used in the train_df column names)
feature_columns = [
    ('pos', 'pos'),
    ('neg', 'neg'),
    ('neu', 'neu'),
    ('compound', 'comp'),
    ('brevity_score', 'brevity'),
]
source_columns = [source for source, _ in feature_columns]

# Index the features by article id once, so that each claim is a single label
# lookup instead of one boolean scan of the whole frame per related article.
# Should an id occur more than once, its first row is used.
article_features = (
    article_df_enriched
    .drop_duplicates(subset='id')
    .set_index('id')[source_columns]
)

claim_means = []
claim_vars = []
for done, article_ids in enumerate(train_df.related_articles, start=1):
    # One row per related article, in list order. Ids that are absent from
    # article_df_enriched become rows of 0.0, i.e. they still count towards
    # the statistics as zeros.
    # NOTE(review): those zeros pull the means towards 0 and inflate the
    # variances for claims with missing articles - confirm this is intended.
    values = np.asarray(
        article_features.reindex(article_ids, fill_value=0.0),
        dtype=float,
    )
    # A claim with no related articles yields NaN for every statistic.
    claim_means.append(np.mean(values, axis=0))
    claim_vars.append(np.var(values, axis=0))
    if done % notify_every == 0:
        slack.SlackNotification('BK_slackbot', '%s / %s claims data have been populated.' % (done, len(train_df)))

# Stack to (n_claims, n_features); the reshape keeps this valid for zero claims.
claim_means = np.array(claim_means).reshape(-1, len(feature_columns))
claim_vars = np.array(claim_vars).reshape(-1, len(feature_columns))

for position, (_, suffix) in enumerate(feature_columns):
    train_df['mean_' + suffix] = claim_means[:, position]

for position, (_, suffix) in enumerate(feature_columns):
    train_df['var_' + suffix] = claim_vars[:, position]

slack.SlackNotification('BK_slackbot', 'All claims data have been populated.')

In [7]:
# Preview train_df with the per-claim mean_* and var_* columns added above.
train_df.head()

Unnamed: 0,claim,claimant,date,label,related_articles,id,mean_pos,mean_neg,mean_neu,mean_comp,mean_brevity,var_pos,var_neg,var_neu,var_comp,var_brevity
0,A line from George Orwell's novel 1984 predict...,,2017-07-17,0,"[122094, 122580, 130685, 134765]",0,0.10575,0.0775,0.56675,0.3605,0.397071,0.004667,0.003895,0.112099,0.197894,0.053023
1,Maine legislature candidate Leslie Gibson insu...,,2018-03-17,2,"[106868, 127320, 128060]",1,0.037667,0.014667,0.281,0.311633,0.186249,0.002838,0.00043,0.157922,0.194231,0.069377
2,A 17-year-old girl named Alyssa Carson is bein...,,2018-07-18,1,"[132130, 132132, 149722]",4,0.148667,0.008667,0.509333,0.659933,0.358814,0.014507,4.8e-05,0.133561,0.217814,0.067672
3,In 1988 author Roald Dahl penned an open lette...,,2019-02-04,2,"[123254, 123418, 127464]",5,0.087333,0.117333,0.462,-0.666133,0.333458,0.00383,0.007124,0.107106,0.221867,0.055601
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6,0.017167,0.002167,0.147333,0.1541,0.095579,0.001473,2.3e-05,0.108536,0.118734,0.045677


In [12]:
# Write an HTML profile report of the training data.
training_profile_path = "data_profiles/training_data_profile.html"
profile_train_df = train_df.profile_report(style={'full_width': True})
profile_train_df.to_file(output_file=training_profile_path)

  return N_k * (np.log(N_k) - np.log(T_k))


## 5. Fill in missing claimants

In [8]:
# Replace empty claimant strings with the placeholder 'anon'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column is chained assignment, which pandas warns about and
# which leaves train_df unchanged once copy-on-write is enabled.
train_df['claimant'] = train_df['claimant'].replace('', 'anon')

## 6. Split data into labels and features

In [16]:
# Labels are the values we want to predict
labels = train_df['label']
# Features are every remaining column except the raw claim text, the related
# article ids, the claim id and the date. All five columns are dropped in one
# call, which also removes the dangling trailing line continuation.
features = train_df.drop(columns=['claim', 'label', 'related_articles', 'id', 'date'])

In [10]:
# Preview the feature columns that remain after dropping the label and raw fields.
features.head()

Unnamed: 0,claimant,mean_pos,mean_neg,mean_neu,mean_comp,mean_brevity,var_pos,var_neg,var_neu,var_comp,var_brevity
0,anon,0.10575,0.0775,0.56675,0.3605,0.397071,0.004667,0.003895,0.112099,0.197894,0.053023
1,anon,0.037667,0.014667,0.281,0.311633,0.186249,0.002838,0.00043,0.157922,0.194231,0.069377
2,anon,0.148667,0.008667,0.509333,0.659933,0.358814,0.014507,4.8e-05,0.133561,0.217814,0.067672
3,anon,0.087333,0.117333,0.462,-0.666133,0.333458,0.00383,0.007124,0.107106,0.221867,0.055601
4,Hillary Clinton,0.017167,0.002167,0.147333,0.1541,0.095579,0.001473,2.3e-05,0.108536,0.118734,0.045677


### 6.1 Standardization of data

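**TODO:** decide how to encode `claimant`. It is a categorical feature with many distinct values (high dimensionality once encoded), so it is dropped from the features for now.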

In [11]:
# Keep only the numeric summary statistics; the claimant column is left out.
features_no_claimant = features.drop(columns=['claimant'])

In [12]:
from sklearn import preprocessing

# Standardize every numeric feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics influence the scaling; consider fitting
# on the training rows only.
names = features_no_claimant.columns  # fit_transform returns a bare array
scaler = preprocessing.StandardScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(features_no_claimant),
    columns=names,
    index=features_no_claimant.index,  # keep row labels aligned with `labels`
)

## 7. Split data into training and testing sets

In [17]:
# Use scikit-learn to split the data into training and testing sets.
from sklearn.model_selection import train_test_split

# A quarter of the rows is held out for testing; the fixed random_state makes
# the split repeatable.
split_parts = train_test_split(scaled_features, labels, test_size=0.25, random_state=42)
train_features, test_features, train_labels, test_labels = split_parts

In [18]:
# Sanity check: features and labels must have matching row counts per split.
shape_report = (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
)
for caption, part in shape_report:
    print(caption, part.shape)

Training Features Shape: (11666, 10)
Training Labels Shape: (11666,)
Testing Features Shape: (3889, 10)
Testing Labels Shape: (3889,)


In [19]:
# Store data for loading in Model Development
# Each %store line persists one variable via IPython's storemagic so another
# notebook can restore it with `%store -r`.
%store train_features   
%store test_features
%store train_labels
%store test_labels

Stored 'train_features' (DataFrame)
Stored 'test_features' (DataFrame)
Stored 'train_labels' (Series)
Stored 'test_labels' (Series)


## Rough Notes

In [34]:
# Lemmatization: collect the distinct WordNet lemmas of the tokens.
# NOTE(review): `tokens` is not defined anywhere in the visible notebook, so
# this scratch cell only runs with state left over from an earlier session.
wnl = nltk.WordNetLemmatizer()
lemma = {wnl.lemmatize(token) for token in tokens}

In [None]:
# First 15 distinct tokens in sorted order.
# NOTE(review): `tokens` is not defined anywhere in the visible notebook.
sorted(set(tokens))[:15]

In [42]:
# Part-of-speech tag every token.
# NOTE(review): `tokens` is not defined anywhere in the visible notebook.
word_tags = nltk.pos_tag(tokens)