# Data Preparation

## 1. Import packages and retrieve data

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.text import Text
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import pandas as pd
import numpy as np
import json
import string, re
import pandas_profiling
import local_modules.slack as slack
import local_modules.DataPreparation as dp

from progressbar import Bar, BouncingBar, Counter, ETA, \
    AdaptiveETA, FileTransferSpeed, FormatLabel, Percentage, \
    ProgressBar, ReverseBar, RotatingMarker, \
    SimpleProgress, Timer, UnknownLength
pbar = ProgressBar()
%store -r article_df article_df_enriched train_df

no stored variable article_df_enriched train_df


In [2]:
%load_ext ipycache

  from IPython.utils.traitlets import Unicode


In [3]:
%%cache article_df_enriched.pkl article_df_enriched
%store -r article_df_enriched

[Skipped the cell's code and loaded variables article_df_enriched from file '/home/bking/Projects/pipenvs/Fake_News_Data_Cup/article_df_enriched.pkl'.]


## 2. Download NLTK corpora for stemming, tokenization, lemmatization
For more information: https://www.nltk.org/book/ch02.html

In [None]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

## 2. Get word count of articles

### 2.1 Tokenize, Stem, and Lemmatize 

In [4]:
article_df_enriched.head()

Unnamed: 0,id,text,word_count,token_count,sentence_count,brevity_score,filtered_text,nltk_pos_neg_neu_compound,nltk_pos,nltk_neg,nltk_neu,nltk_comp
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...,225,330,14,0.681818,"[Trump, Supporter, “, Kicked, Pregnant, Muslim...","[0.027, 0.254, 0.719, -0.9973]",0.027,0.254,0.719,-0.9973
1,129341,UW Facts and Figures – University of Wisconsin...,50,69,2,0.724638,"[UW, Facts, Figures, –, University, Wisconsin–...","[0.0, 0.0, 1.0, 0.0]",0.0,0.0,1.0,0.0
2,100963,Gun Control Advocates Target Peaceful Switzerl...,1055,1549,53,0.681085,"[Gun, Control, Advocates, Target, Peaceful, Sw...","[0.092, 0.101, 0.807, -0.8059]",0.092,0.101,0.807,-0.8059
3,12200,U.S. and Republic of Korea Conclude New Specia...,202,284,8,0.711268,"[U.S., Republic, Korea, Conclude, New, Special...","[0.221, 0.01, 0.769, 0.9952]",0.221,0.01,0.769,0.9952
4,128496,Kremlin's persistent claim of “expected chemic...,437,679,18,0.643594,"[Kremlin, 's, persistent, claim, “, expected, ...","[0.043, 0.13, 0.827, -0.9954]",0.043,0.13,0.827,-0.9954


In [5]:
ns = dp.remove_stopwords(article_df.iloc(0)[0]['text'])

In [None]:
# Run functions and append to dataframe for word count, token count, and brevity score
dp.create_append_feature(article_df, 'text', 'claim_word_count', dp.get_word_count)
dp.create_append_feature(article_df, 'text', 'claim_token_count', dp.get_token_count)
dp.create_append_feature(article_df, 'text', 'claim_brevity_score', dp.get_brevity_score)
dp.create_append_feature(article_df, 'text', 'claim_filtered_text', dp.remove_stopwords)
dp.create_append_feature(article_df, 'text', 'claim_filtered_text', dp.get_sentiment_nltk_vader)

## 3. Generating sentiment data

### 3.1 Using NLTK vader
http://www.nltk.org/howto/sentiment.html


    neg: Negative
    neu: Neutral
    pos: Positive
    compound: Compound (i.e. aggregated score)


In [None]:
dp.create_append_feature(article_df, 'claim_sentiment_nltk', dp.get_sentiment_nltk_vader, 'text')

In [None]:
articles = article_df['filtered_text'].tolist()

In [11]:
article_df_enriched.head()

Unnamed: 0,id,text,word_count,token_count,sentence_count,brevity_score,filtered_text,nltk_pos_neg_neu_compound,nltk_pos,nltk_neg,nltk_neu,nltk_comp
0,106081,Trump Supporter “Kicked Pregnant Muslim Woman ...,225,330,14,0.681818,"[Trump, Supporter, “, Kicked, Pregnant, Muslim...","[0.027, 0.254, 0.719, -0.9973]",0.027,0.254,0.719,-0.9973
1,129341,UW Facts and Figures – University of Wisconsin...,50,69,2,0.724638,"[UW, Facts, Figures, –, University, Wisconsin–...","[0.0, 0.0, 1.0, 0.0]",0.0,0.0,1.0,0.0
2,100963,Gun Control Advocates Target Peaceful Switzerl...,1055,1549,53,0.681085,"[Gun, Control, Advocates, Target, Peaceful, Sw...","[0.092, 0.101, 0.807, -0.8059]",0.092,0.101,0.807,-0.8059
3,12200,U.S. and Republic of Korea Conclude New Specia...,202,284,8,0.711268,"[U.S., Republic, Korea, Conclude, New, Special...","[0.221, 0.01, 0.769, 0.9952]",0.221,0.01,0.769,0.9952
4,128496,Kremlin's persistent claim of “expected chemic...,437,679,18,0.643594,"[Kremlin, 's, persistent, claim, “, expected, ...","[0.043, 0.13, 0.827, -0.9954]",0.043,0.13,0.827,-0.9954


In [None]:
enriched_article_profile = article_df_enriched.profile_report(style={'full_width':True})
enriched_article_profile.to_file(output_file="data_profiles/enriched_article_data_profile.html")

In [None]:
# Store article_df_enriched for loading in Model Development
article_df_enriched = article_df
%store article_df_enriched 

## 4. Training data preparation

Here we will summarize the article data for each claim, building the training data for model development. 
Summary statistics include mean, variance.

In [5]:
with open("data/train.json") as f:
    train_data = json.load(f)

train_df = pd.DataFrame.from_records(train_data)

In [16]:
train_df.head()

Unnamed: 0,claim,claimant,date,label,related_articles,id,num_related_articles,claim_nltk_sentiment,claim_pos,claim_neg
0,A line from George Orwell's novel 1984 predict...,anon,2017-07-17,0,"[122094, 122580, 130685, 134765]",0,4,"[0.173, 0.0, 0.827, 0.3182]",0.173,0.0
1,Maine legislature candidate Leslie Gibson insu...,anon,2018-03-17,2,"[106868, 127320, 128060]",1,3,"[0.133, 0.176, 0.691, -0.2023]",0.133,0.176
2,A 17-year-old girl named Alyssa Carson is bein...,anon,2018-07-18,1,"[132130, 132132, 149722]",4,3,"[0.0, 0.0, 1.0, 0.0]",0.0,0.0
3,In 1988 author Roald Dahl penned an open lette...,anon,2019-02-04,2,"[123254, 123418, 127464]",5,3,"[0.0, 0.0, 1.0, 0.0]",0.0,0.0
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6,6,"[0.0, 0.344, 0.656, -0.9001]",0.0,0.344


In [7]:
# Add number of related articles to training data
train_df['num_related_articles']= train_df['related_articles'].apply(lambda x: len(x))

In [None]:
%%cache data/train_df.pkl train_df
%store -r train_df

In [None]:
# Profile training data
profile_train_df = train_df.profile_report(style={'full_width':True})
profile_train_df.to_file(output_file="data_profiles/training_data_profile.html")

## 5. Fill out empty data for claimants

In [8]:
train_df['claimant'].replace('', 'anon', inplace=True)

## 6. Create claim sentiment data

In [9]:
dp.create_append_feature(train_df, 'claim', 'claim_nltk_sentiment', dp.get_sentiment_nltk_vader)

Unnamed: 0,claim,claimant,date,label,related_articles,id,num_related_articles,claim_nltk_sentiment
0,A line from George Orwell's novel 1984 predict...,anon,2017-07-17,0,"[122094, 122580, 130685, 134765]",0,4,"[0.173, 0.0, 0.827, 0.3182]"
1,Maine legislature candidate Leslie Gibson insu...,anon,2018-03-17,2,"[106868, 127320, 128060]",1,3,"[0.133, 0.176, 0.691, -0.2023]"
2,A 17-year-old girl named Alyssa Carson is bein...,anon,2018-07-18,1,"[132130, 132132, 149722]",4,3,"[0.0, 0.0, 1.0, 0.0]"
3,In 1988 author Roald Dahl penned an open lette...,anon,2019-02-04,2,"[123254, 123418, 127464]",5,3,"[0.0, 0.0, 1.0, 0.0]"
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6,6,"[0.0, 0.344, 0.656, -0.9001]"
...,...,...,...,...,...,...,...,...
15550,"The omnibus spending bill has ""9,427 pork barr...",John McCain,2009-02-25,2,"[82947, 93503]",17137,2,"[0.0, 0.0, 1.0, 0.0]"
15551,Representative Maxine Waters said Muslims were...,anon,2017-06-06,0,"[103780, 104726, 126025]",17138,3,"[0.0, 0.258, 0.742, -0.765]"
15552,"""We were not, I repeat, were not told that wat...",Nancy Pelosi,2009-04-23,0,"[11331, 68915, 2186, 2185, 88418, 81950]",17139,6,"[0.0, 0.0, 1.0, 0.0]"
15553,"As of August 2017, members of the public could...",anon,2018-05-14,2,"[121353, 152864, 154411]",17140,3,"[0.134, 0.075, 0.791, 0.3612]"


In [3]:
%store -r train_df

In [4]:
# train_df['num_related_articles']= train_df['related_articles'].apply(lambda x: len(x))
train_df['claim_pos'] = train_df['claim_nltk_sentiment'].apply(lambda x: x[0])
train_df['claim_neg'] = train_df['claim_nltk_sentiment'].apply(lambda x: x[1])
train_df['claim_neu'] = train_df['claim_nltk_sentiment'].apply(lambda x: x[2])
train_df['claim_comp'] = train_df['claim_nltk_sentiment'].apply(lambda x: x[3])

In [5]:
train_df.head()

Unnamed: 0,claim,claimant,date,label,related_articles,id,num_related_articles,claim_nltk_sentiment,claim_pos,claim_neg,claim_neu,claim_comp
0,A line from George Orwell's novel 1984 predict...,anon,2017-07-17,0,"[122094, 122580, 130685, 134765]",0,4,"[0.173, 0.0, 0.827, 0.3182]",0.173,0.0,0.827,0.3182
1,Maine legislature candidate Leslie Gibson insu...,anon,2018-03-17,2,"[106868, 127320, 128060]",1,3,"[0.133, 0.176, 0.691, -0.2023]",0.133,0.176,0.691,-0.2023
2,A 17-year-old girl named Alyssa Carson is bein...,anon,2018-07-18,1,"[132130, 132132, 149722]",4,3,"[0.0, 0.0, 1.0, 0.0]",0.0,0.0,1.0,0.0
3,In 1988 author Roald Dahl penned an open lette...,anon,2019-02-04,2,"[123254, 123418, 127464]",5,3,"[0.0, 0.0, 1.0, 0.0]",0.0,0.0,1.0,0.0
4,"When it comes to fighting terrorism, ""Another ...",Hillary Clinton,2016-03-22,2,"[41099, 89899, 72543, 82644, 95344, 88361]",6,6,"[0.0, 0.344, 0.656, -0.9001]",0.0,0.344,0.656,-0.9001


## 6. Split data into labels and features

In [6]:
# Labels are the values we want to predict
labels = train_df['label']
features = train_df \
    .drop('claim_nltk_sentiment', axis = 1) \
    .drop('claim', axis = 1) \
    .drop('label', axis = 1) \
    .drop('related_articles', axis = 1) \
    .drop('id', axis = 1) \
    .drop('date', axis = 1) \

In [7]:
features.head()

Unnamed: 0,claimant,num_related_articles,claim_pos,claim_neg,claim_neu,claim_comp
0,anon,4,0.173,0.0,0.827,0.3182
1,anon,3,0.133,0.176,0.691,-0.2023
2,anon,3,0.0,0.0,1.0,0.0
3,anon,3,0.0,0.0,1.0,0.0
4,Hillary Clinton,6,0.0,0.344,0.656,-0.9001


In [6]:
features['labels'] = labels

In [7]:
export_csv = features.to_csv(r'data/export_train_dataframe.csv', sep=',', header=True, index=False)

## 6. Standardization of data

How to encode claimant? High dimensionality

In [22]:
# Categorical boolean mask
categorical_feature_mask = features.dtypes==object
# filter categorical columns using mask and turn it into a list
categorical_cols = features.columns[categorical_feature_mask].tolist()

In [25]:
# import labelencoder
from sklearn.preprocessing import LabelEncoder# instantiate labelencoder object
le = LabelEncoder()

In [29]:
# apply le on categorical feature columns
features[categorical_cols] = features[categorical_cols].apply(lambda col: le.fit_transform(col))
features[categorical_cols].head(10)

Unnamed: 0,claimant
0,3081
1,3081
2,3081
3,3081
4,1121
5,1650
6,1327
7,3081
8,2385
9,2001


In [45]:
# import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder# instantiate OneHotEncoder
ohe = OneHotEncoder(categories='auto', sparse=True ) 
# categorical_features = boolean mask for categorical columns
# sparse = False output an array not sparse matrix

In [10]:
# Alternative one hot encoding - using pandas.get_dummies
features_ohe_pandas = pd.get_dummies(features, prefix=['claimant'])

In [12]:
features_ohe_pandas['labels'] = labels

In [11]:
features_ohe_pandas.head()

Unnamed: 0,num_related_articles,claim_pos,claim_neg,claim_neu,claim_comp,labels,"claimant_""A Woman’s Right to Know Information Material”","claimant_""suburban mom"" for Scott Taylor",claimant_@LagBeachAntifa9,claimant_@Sowellnomics,...,claimant_religionmind.com,claimant_states-news.com,claimant_teaparty.org,claimant_therightwingportal.com,claimant_tmzbreaking,claimant_truthcommand.com,claimant_usaviralnews.info,claimant_whitehouse.gov,claimant_worldnewsdailyreport.com,claimant_Мikhail Aleksandrov
0,4,0.173,0.0,0.827,0.3182,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0.133,0.176,0.691,-0.2023,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0.0,0.0,1.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0.0,0.0,1.0,0.0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,0.0,0.344,0.656,-0.9001,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
export_csv = features_ohe_pandas.to_csv(r'data/export_train_dataframe.csv', sep='\t', header=True, index=False)

In [46]:
features_ohe = ohe.fit_transform(features) # It returns an numpy array

In [61]:
from scipy.sparse import csr_matrix
S = csr_matrix(features_ohe)
print(S.shape)

(15555, 6047)


In [52]:
import scipy.sparse as ssp

In [67]:
# The above manipulations one-hot encode claimant. Use the below if you do not want a sparse vector
features_no_claimant = features.drop('claimant', axis = 1)

In [68]:
stacked=ssp.hstack( [S,features_no_claimant] ).todense()

In [69]:
stacked.shape

(15555, 6052)

In [91]:
labels_array = np.array(labels).reshape(15555,1)

In [93]:
np.append(stacked, labels_array, axis=1)

matrix([[ 0.    ,  0.    ,  0.    , ...,  0.827 ,  0.3182,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.691 , -0.2023,  2.    ],
        [ 0.    ,  0.    ,  0.    , ...,  1.    ,  0.    ,  1.    ],
        ...,
        [ 0.    ,  0.    ,  0.    , ...,  1.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.791 ,  0.3612,  2.    ],
        [ 0.    ,  0.    ,  0.    , ...,  1.    ,  0.    ,  1.    ]])

In [94]:
np.savetxt("data/dataset.csv", stacked, delimiter=",")

In [70]:
from sklearn import preprocessing# Get column names first
names = stacked.columns# Create the Scaler object
scaler = preprocessing.StandardScaler()# Fit your data on the scaler object
scaled_features = scaler.fit_transform(stacked)
scaled_features = pd.DataFrame(scaled_features, columns=names)

# scaled_labels = scaler.fit_transform(labels.reshape(-1, 1))

AttributeError: 'matrix' object has no attribute 'columns'

## 7. Split data into training and testing sets

In [71]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split # Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(stacked, labels, test_size = 0.25, random_state = 42)

In [72]:
# Make sure splitting was done right
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (11666, 6052)
Training Labels Shape: (11666,)
Testing Features Shape: (3889, 6052)
Testing Labels Shape: (3889,)


In [73]:
# Store data for loading in Model Development
%store train_features   
%store test_features
%store train_labels
%store test_labels

Stored 'train_features' (matrix)
Stored 'test_features' (matrix)
Stored 'train_labels' (Series)
Stored 'test_labels' (Series)


## Rough Notes

In [None]:
# Lemmatization
wnl = nltk.WordNetLemmatizer()
lemma = set([wnl.lemmatize(t) for t in tokens])

In [None]:
sorted(set(tokens))[:15]

In [None]:
word_tags = nltk.pos_tag(tokens)