# Data Preparation

## 1. Import packages and retrieve data

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.text import Text
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import pandas as pd
import numpy as np
import json
import string, re
import pandas_profiling
import local_modules.slack as slack
import local_modules.DataPreparation as dp

from progressbar import Bar, BouncingBar, Counter, ETA, \
    AdaptiveETA, FileTransferSpeed, FormatLabel, Percentage, \
    ProgressBar, ReverseBar, RotatingMarker, \
    SimpleProgress, Timer, UnknownLength
# Default progress bar; the sentiment loop later in the notebook rebinds `pbar` to a configured one.
pbar = ProgressBar()
# Restore the DataFrames that were persisted with IPython's %store magic.
%store -r article_df article_df_enriched

## 2. Download NLTK corpora for stemming, tokenization, lemmatization
For more information: https://www.nltk.org/book/ch02.html

In [None]:
# Fetch the NLTK resources used by this notebook:
#   wordnet                    - nltk.WordNetLemmatizer (Rough Notes)
#   averaged_perceptron_tagger - nltk.pos_tag (Rough Notes)
#   vader_lexicon              - SentimentIntensityAnalyzer (section 3); without it the
#                                analyzer raises LookupError when it is constructed
# NOTE(review): the helpers in local_modules.DataPreparation may need further resources
# (e.g. 'punkt', 'stopwords') - confirm against that module.
for resource in ('wordnet', 'averaged_perceptron_tagger', 'vader_lexicon'):
    nltk.download(resource)

## 2. Get word count of articles

### 2.1 Tokenize, Stem, and Lemmatize 

In [2]:
# Preview the first rows of the restored article data (id and text columns).
article_df.head()

Unnamed: 0,id,text
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...
1,129341,UW Facts and Figures – University of Wisconsin...
2,100963,Gun Control Advocates Target Peaceful Switzerl...
3,12200,U.S. and Republic of Korea Conclude New Specia...
4,128496,Kremlin's persistent claim of “expected chemic...


In [None]:
# Try stop-word removal on the first article only; `ns` is not read by any other cell shown here.
ns = dp.remove_stopwords(article_df.iloc[0]['text'])

In [3]:
# Derive the length-based features from the raw 'text' column, one call per feature, in this order.
for feature_name, feature_fn in (
    ('word_count', dp.get_word_count),
    ('token_count', dp.get_token_count),
    ('brevity_score', dp.brevity_score),
):
    dp.create_append_feature(article_df, feature_name, feature_fn, 'text')

In [None]:
# Presumably applies dp.remove_stopwords to every 'text' value and stores the result as
# 'filtered_text' (helper lives in local_modules - confirm); the sentiment cells read that column.
dp.create_append_feature(article_df, 'filtered_text', dp.remove_stopwords, 'text')

In [4]:
# Check that word_count, token_count and brevity_score now appear alongside id and text.
article_df.head()

Unnamed: 0,id,text,word_count,token_count,brevity_score
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...,189,330,0.572727
1,129341,UW Facts and Figures – University of Wisconsin...,40,69,0.57971
2,100963,Gun Control Advocates Target Peaceful Switzerl...,909,1549,0.58683
3,12200,U.S. and Republic of Korea Conclude New Specia...,173,284,0.609155
4,128496,Kremlin's persistent claim of “expected chemic...,351,679,0.516937


## 3. Generating sentiment data

### 3.1 Using NLTK vader
http://www.nltk.org/howto/sentiment.html


    neg: Negative
    neu: Neutral
    pos: Positive
    compound: Compound (i.e. aggregated score)


In [None]:
# VADER analyzer; sid.polarity_scores(text) is called once per article in the scoring loop.
# Constructing it requires the NLTK 'vader_lexicon' resource to be installed.
sid = SentimentIntensityAnalyzer()

In [None]:
# Store the per-article positive sentiment in its own column. It must not be written to
# 'brevity_score': that column holds the brevity feature that the training-data summary
# in section 4 reads.
# NOTE(review): dp.get_positive_sentiment lives in local_modules.DataPreparation and is not
# visible here - confirm it exists and accepts the raw article text.
dp.create_append_feature(article_df, 'positive_sentiment', dp.get_positive_sentiment, 'text')

In [None]:
# Plain Python list of the 'filtered_text' values, in DataFrame row order, for the sentiment loop.
articles = article_df['filtered_text'].tolist()

In [None]:
# Score every stop-word-filtered article with VADER and append the scores to article_df.
# pos  = positive sentiment of each article
# neg  = negative sentiment of each article
# neu  = neutral sentiment of each article
# comp = compound (aggregated) sentiment score of each article
# All four lists follow the order of `articles`, i.e. the row order of article_df.
pos = []
neg = []
neu = []
comp = []
n_articles = len(article_df)
pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()], maxval=n_articles).start()
# j = number of articles processed so far (1-based); drives the progress bar and Slack updates.
for j, article in enumerate(articles, start=1):
    # NOTE(review): assumes each article is a sequence of tokens. If 'filtered_text' holds plain
    # strings, this join puts a space between every character - confirm against dp.remove_stopwords.
    article_sentence = ' '.join(article)
    ss = sid.polarity_scores(article_sentence)
    pos.append(ss['pos'])
    neg.append(ss['neg'])
    neu.append(ss['neu'])
    comp.append(ss['compound'])
    pbar.update(j)
    # Report progress to Slack every 5000 articles.
    if j % 5000 == 0:
        slack.SlackNotification('datacup', '%s / %s articles have been analyzed for sentiment.' % (j, n_articles))
pbar.finish()
article_df['pos'] = pos
article_df['neg'] = neg
article_df['neu'] = neu
article_df['compound'] = comp
slack.SlackNotification('datacup', 'All sentiment has been analyzed using the NLTK vader sentiment analysis method')

In [None]:
# Inspect the frame after the pos, neg, neu and compound sentiment columns were added.
article_df.head()

In [None]:
# Profile the enriched article data and write the report to a standalone HTML file.
# article_df is profiled directly: article_df_enriched is only rebound to it in the next cell,
# so at this point that name still holds whatever `%store -r` restored (or is undefined on a
# first run) and would not reflect the features computed above.
enriched_article_profile = article_df.profile_report(style={'full_width':True})
enriched_article_profile.to_file(output_file="data_profiles/enriched_article_data_profile.html")

In [None]:
# Store article_df_enriched for loading in Model Development
# Plain assignment: both names refer to the same DataFrame object (no copy is made).
article_df_enriched = article_df
# Persist it with IPython's %store so it can be restored elsewhere with `%store -r`.
%store article_df_enriched 

## 4. Training data preparation

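Here we summarize the article data for each claim to build the training data for model development.
For every claim, the summary statistics are the mean and variance of each article-level feature (VADER sentiment scores and brevity score) across the claim's related articles.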

In [None]:
# Load the training claims; each JSON record becomes one row of train_df.
# The encoding is pinned to UTF-8 (the JSON default) so that non-ASCII text such as curly
# quotes is decoded the same way regardless of the platform's locale encoding.
with open("data/train.json", encoding="utf-8") as f:
    train_data = json.load(f)

train_df = pd.DataFrame.from_records(train_data)

In [None]:
# Preview the raw training claims as loaded from data/train.json.
train_df.head()

In [None]:
# Summarize the article-level features for every claim.
# For each claim, take the mean and the population variance (ddof=0, matching np.var) of each
# feature over the claim's related articles that have a row in article_df_enriched.
# Each pair is (suffix used in the train_df column names, source column in article_df_enriched).
stat_columns = (
    ('pos', 'pos'),
    ('neg', 'neg'),
    ('neu', 'neu'),
    ('comp', 'compound'),
    ('brevity', 'brevity_score'),
)

# Index the article features by id once, so each lookup is a hash probe instead of a scan
# of the whole frame. If an id occurs more than once, its first row is used.
article_stats = (
    article_df_enriched
    .drop_duplicates(subset='id')
    .set_index('id')[[source for _, source in stat_columns]]
)

n_claims = len(train_df)
# Rows default to 0.0, which is what a claim with no usable article ends up with.
claim_means = np.zeros((n_claims, len(stat_columns)))
claim_vars = np.zeros((n_claims, len(stat_columns)))
for row, article_ids in enumerate(train_df['related_articles']):
    # Articles missing from article_df_enriched are skipped rather than counted as zeros,
    # so they do not drag the mean towards 0 or inflate the variance.
    known_ids = [article_id for article_id in article_ids if article_id in article_stats.index]
    if known_ids:
        values = article_stats.loc[known_ids].to_numpy(dtype=float)
        claim_means[row] = values.mean(axis=0)
        claim_vars[row] = values.var(axis=0)
    # Report progress to Slack every 5000 claims.
    if (row + 1) % 5000 == 0:
        slack.SlackNotification('BK_slackbot', '%s / %s claims data have been populated.' % (row + 1, n_claims))

# Append the summaries to train_df: all mean_* columns first, then all var_* columns.
for col, (suffix, _) in enumerate(stat_columns):
    train_df['mean_' + suffix] = claim_means[:, col]

for col, (suffix, _) in enumerate(stat_columns):
    train_df['var_' + suffix] = claim_vars[:, col]

slack.SlackNotification('BK_slackbot', 'All claims data have been populated.')

In [None]:
# Inspect the claims together with the newly added mean_* and var_* summary columns.
train_df.head()

In [None]:
# Profile training data
# Build the profiling report for train_df and write it to a standalone HTML file.
profile_train_df = train_df.profile_report(style={'full_width':True})
profile_train_df.to_file(output_file="data_profiles/training_data_profile.html")

## 5. Fill in missing claimant values

In [None]:
# Claims without a claimant carry an empty string; label them 'anon'.
# The result is assigned back to the column. Calling replace(..., inplace=True) on the
# selection train_df['claimant'] is chained assignment, which pandas warns about and which
# leaves train_df unchanged under Copy-on-Write.
train_df['claimant'] = train_df['claimant'].replace('', 'anon')

## 6. Split data into labels and features

In [None]:
# Labels are the values we want to predict
labels = train_df['label']
# Features are the remaining columns, minus the label itself and the claim text, article-id
# list, id and date columns. A single drop call replaces the chain of five; it raises the same
# KeyError if a column is missing and leaves no dangling line continuation.
features = train_df.drop(columns=['claim', 'label', 'related_articles', 'id', 'date'])

In [None]:
# Preview the feature columns that remain after the drops above.
features.head()

## 6. Standardization of data

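**TODO:** Decide how to encode `claimant`. Encoding it directly would add a very large number of dimensions, so for now the column is dropped before standardization.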

In [None]:
# Drop 'claimant' before scaling; presumably the remaining columns are all numeric - confirm.
features_no_claimant = features.drop('claimant', axis = 1)

In [None]:
from sklearn import preprocessing

# Standardize every feature column and keep the original column names.
# NOTE(review): the scaler is fit on all rows before the train/test split in the next section,
# so test-set statistics influence the scaling - consider fitting on the training split only.
scaler = preprocessing.StandardScaler()
names = features_no_claimant.columns
scaled_features = pd.DataFrame(
    scaler.fit_transform(features_no_claimant),
    columns=names,
)

# scaled_labels = scaler.fit_transform(labels.reshape(-1, 1))

## 7. Split data into training and testing sets

In [None]:
# Use scikit-learn to split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
train_features, test_features, train_labels, test_labels = train_test_split(
    scaled_features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Sanity-check the split by printing the shape of each of the four pieces.
for description, split in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(description, split.shape)

In [None]:
# Store data for loading in Model Development
# Each %store call persists one variable so another notebook can restore it with `%store -r`.
%store train_features   
%store test_features
%store train_labels
%store test_labels

## Rough Notes

In [None]:
# Lemmatization
# Tokenize one sample article so the scratch cells in this section have a `tokens` list to
# work on. word_tokenize needs NLTK's 'punkt' tokenizer models to be installed.
tokens = word_tokenize(article_df.iloc[0]['text'])
wnl = nltk.WordNetLemmatizer()
# Set of distinct lemmas found in the sample article.
lemma = {wnl.lemmatize(t) for t in tokens}

In [None]:
# First 15 distinct tokens in sorted order. Requires `tokens` to exist in the kernel namespace.
sorted(set(tokens))[:15]

In [None]:
# Part-of-speech tag every token (nltk.pos_tag uses the 'averaged_perceptron_tagger' resource).
# Requires `tokens` to exist in the kernel namespace.
word_tags = nltk.pos_tag(tokens)