# Exploratory text analysis tutorial

This notebook contains code for the tutorial on "Exploratory text analysis for computational social science."
The following sections will be used throughout the tutorial.

1. [Word frequency](#Word-frequency)
2. [Topic modeling](#Topic-modeling)
3. [Word embeddings](#Word-embeddings)

# Word frequency

First step in exploration: which words occur more frequently in one data set versus another?

In [105]:
## small fake news dataset
import os
import re
import pandas as pd
FILE_TOPIC_MATCHER = re.compile('^[a-z]+(?=[0-9])')
FILE_ID_MATCHER = re.compile('(?<=[a-z])[0-9]+')
FILE_ENDING_MATCHER = re.compile('\.(fake|legit)\.txt')
def process_file(text_file):
    # get file topic/ID
    text_file_clean = FILE_ENDING_MATCHER.sub('', os.path.basename(text_file))
#     print(text_file_clean)
    article_topic = FILE_TOPIC_MATCHER.search(text_file_clean).group(0)
    article_id = int(FILE_ID_MATCHER.search(text_file_clean).group(0))
    text_file_lines = open(text_file, 'r').readlines()
    text_file_lines = list(map(lambda x: x.strip(), text_file_lines))
    article_title = text_file_lines[0]
    article_text = text_file_lines[-1]
    article_data = pd.Series([article_title, article_text, article_topic, article_id], 
                             index=['title', 'text', 'topic', 'id'])
    return article_data

def load_all_data(data_dir):
    data_files = list(map(lambda x: os.path.join(data_dir, x), os.listdir(data_dir)))
    data = pd.concat(list(map(lambda x: process_file(x), data_files)), axis=1).transpose()
    data.sort_values(['topic', 'id'], inplace=True, ascending=True)
    return data

fake_news_data_dir = 'data/fakeNewsDatasets/fakeNewsDataset/fake/'
real_news_data_dir = 'data/fakeNewsDatasets/fakeNewsDataset/legit/'
fake_news_data = load_all_data(fake_news_data_dir)
real_news_data = load_all_data(real_news_data_dir)
## save to combined files!!
fake_news_data.to_csv('data/fakeNewsDatasets/fake_news_small.tsv', sep='\t', index=False)
real_news_data.to_csv('data/fakeNewsDatasets/real_news_small.tsv', sep='\t', index=False)
display(fake_news_data.head())
display(real_news_data.head())
print(fake_news_data.shape[0])
# print(fake_news_data[5])
# print(real_news_data[5])

Unnamed: 0,title,text,topic,id
69,"Alex Jones Vindicated in ""Pizzagate"" Controversy","""Alex Jones, purveyor of the independent inves...",biz,1
4,THE BIG DATA CONSPIRACY,so that in the no so far future can institute ...,biz,2
202,California Surprisingly Lenient on Auto Emissi...,"Setting Up Face-Off With Trump ""California's c...",biz,3
138,Mexicans Are Chomping at the Bit to Stop NAFTA...,Mexico has been unfairly gaining from NAFTA as...,biz,4
181,Breaking News: Snapchat to purchase Twitter fo...,Yahoo and AOL could be extremely popular over ...,biz,5


Unnamed: 0,title,text,topic,id
0,Alex Jones Apologizes for Promoting 'Pizzagate...,Alex Jones a prominent conspiracy theorist an...,biz,1
11,Banks and Tech Firms Battle Over Something Aki...,The big banks and Silicon Valley are waging an...,biz,2
4,California Upholds Auto Emissions Standards,"Setting Up Face-Off With Trump ""California's ...",biz,3
171,Renegotiate Nafta? Mexicans Say Get On With It,For more than two decades free trade has been...,biz,4
168,Snapchat 'will be bigger than Twitter,"Yahoo and AOL with advertisers' ""Snapchat cou...",biz,5


240


In [28]:
# word frequency
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words
en_stops = get_stop_words('en')
tokenizer = WordPunctTokenizer()
cv = CountVectorizer(min_df=0.001, max_df=0.75, 
                     tokenizer=tokenizer.tokenize, stop_words=en_stops,
                     ngram_range=(1,1))
# get vocab for all data
combined_txt = fake_news_data.loc[:, 'text'].append(real_news_data.loc[:, 'text'])
combined_txt_dtm = cv.fit_transform(combined_txt)
sorted_vocab = list(sorted(cv.vocabulary_.keys(), key=cv.vocabulary_.get))
# get separate DTM for each news data
cv = CountVectorizer(min_df=0.001, max_df=0.75, tokenizer=tokenizer.tokenize, stop_words=en_stops, vocabulary=vocab)
fake_news_dtm = cv.fit_transform(fake_news_data.loc[:, 'text'].values)
real_news_dtm = cv.fit_transform(real_news_data.loc[:, 'text'].values)



In [32]:
## top words
import numpy as np
fake_news_dtm_top_words = pd.Series(np.array(fake_news_dtm.sum(axis=0))[0], index=sorted_vocab).sort_values(ascending=False)
real_news_dtm_top_words = pd.Series(np.array(real_news_dtm.sum(axis=0))[0], index=sorted_vocab).sort_values(ascending=False)
print(fake_news_dtm_top_words.head(20))
print(real_news_dtm_top_words.head(20))

!".          851
249          384
!            343
100th        278
'            182
20th         161
24           113
3bn           90
2010          89
allows        86
650m          76
biased        61
adult         56
2013          54
2018          53
2007          53
10th          49
119           48
admission     43
46            43
dtype: int64
249         451
'           344
!           339
100th       323
!".         314
3bn         148
20th         87
)-           64
19th         53
biased       52
2010         51
24           47
allows       47
75           45
athlete      41
2018         41
),           38
actual       38
bathroom     38
brash        37
dtype: int64


In [38]:
# per-topic
article_topics = fake_news_data.loc[:, 'topic'].unique()
en_stops = get_stop_words('en')
tokenizer = WordPunctTokenizer()
top_k = 20
for topic_i in article_topics:
    print(f'topic = {topic_i}')
    fake_news_data_i = fake_news_data[fake_news_data.loc[:, 'topic']==topic_i]
    real_news_data_i = real_news_data[real_news_data.loc[:, 'topic']==topic_i]
    # get vocab, compute counts, etc.
    cv = CountVectorizer(min_df=0.001, max_df=0.75, 
                         tokenizer=tokenizer.tokenize, stop_words=en_stops,
                         ngram_range=(1,1))
    combined_txt_i = fake_news_data_i.loc[:, 'text'].append(real_news_data_i.loc[:, 'text'])
    combined_txt_dtm_i = cv.fit_transform(combined_txt_i)
    sorted_vocab_i = list(sorted(cv.vocabulary_.keys(), key=cv.vocabulary_.get))
    # get separate DTM for each news data
    cv = CountVectorizer(min_df=0.001, max_df=0.75, 
                         tokenizer=tokenizer.tokenize, stop_words=en_stops,
                         ngram_range=(1,1), vocabulary=sorted_vocab_i)
    fake_news_dtm_i = cv.fit_transform(fake_news_data_i.loc[:, 'text'].values)
    real_news_dtm_i = cv.fit_transform(real_news_data_i.loc[:, 'text'].values)
    # get top counts
    fake_news_dtm_top_words_i = pd.Series(np.array(fake_news_dtm_i.sum(axis=0))[0], index=sorted_vocab_i).sort_values(ascending=False).head(top_k)
    real_news_dtm_top_words_i = pd.Series(np.array(real_news_dtm_i.sum(axis=0))[0], index=sorted_vocab_i).sort_values(ascending=False).head(top_k)
    print('top words for fake news articles')
    display(fake_news_dtm_top_words_i)
    print('top words for real news articles')
    display(real_news_dtm_top_words_i)

topic = biz
top words for fake news articles


,            106
'             41
"             34
s             33
-             28
will          28
uk            21
said          19
$             14
trump         13
eu            13
deal          13
."            13
company       11
many          10
companies     10
european      10
now            9
may            8
jobs           8
dtype: int64

top words for real news articles


'            73
s            65
-            58
"            50
said         44
$            26
will         18
us           16
1            16
)            15
company      15
:            13
last         13
firm         13
trump        13
uk           13
financial    13
european     13
eu           13
two          12
dtype: int64

topic = edu
top words for fake news articles


"            53
school       47
'            45
students     38
s            31
-            23
will         22
education    20
trump        15
president    12
new          12
children     11
student      10
."           10
said         10
parents      10
law          10
time         10
schools      10
first        10
dtype: int64

top words for real news articles


'             29
s             24
-             24
school        23
students      21
"             19
education     11
said           9
,"             9
student        8
year           8
percent        7
children       6
president      6
according      5
will           5
college        5
)              5
(              5
university     5
dtype: int64

topic = entmt
top words for fake news articles


,            161
"            106
s             64
-             31
will          29
."            25
t             24
one           17
time          16
show          16
new           16
also          14
said          14
fans          13
way           12
now           11
last          11
just          11
(             11
character     10
dtype: int64

top words for real news articles


"        151
s        102
-         89
said      42
,         30
."        29
also      23
t         21
will      20
one       16
film      16
year      16
first     15
told      14
new       14
--        14
news      13
show      13
john      11
years     11
dtype: int64

topic = polit
top words for fake news articles


trump         79
'             69
"             59
s             56
president     50
clinton       29
-             25
donald        22
said          20
house         16
white         16
washington    15
will          14
."            13
just          11
cnn           11
)             11
(             11
obama         11
us            11
dtype: int64

top words for real news articles


"            33
'            32
s            28
trump        25
-            20
said         18
,"           12
president    10
mr            9
clinton       7
campaign      6
t             5
will          5
time          5
:             5
first         5
u             4
america       4
press         4
order         4
dtype: int64

topic = sports
top words for fake news articles


,         148
"          55
s          51
-          41
will       26
game       24
team       23
."         21
said       18
two        16
one        14
years      13
year       13
last       12
time       11
new        10
brazil     10
world      10
sports      9
just        9
dtype: int64

top words for real news articles


-          124
s           83
"           71
will        25
year        23
said        21
world       17
game        16
sport       16
one         15
."          15
two         14
win         14
time        13
6           13
team        13
federer     12
old         12
sports      11
(           11
dtype: int64

topic = tech
top words for fake news articles


'           49
s           43
will        42
"           36
new         34
-           34
can         16
amazon      14
google      13
now         12
many        11
apple       10
t            9
world        9
devices      9
said         9
(            9
time         9
research     8
app          8
dtype: int64

top words for real news articles


-             29
'             27
s             21
"             15
will          14
said          14
new           12
,"             9
also           7
devices        7
can            6
google         6
year           6
like           6
t              6
announced      5
monday         5
see            5
game           4
technology     4
dtype: int64

In [61]:
def compute_frequency(text_data, tokenizer, stops, vocab):
    cv = CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stops,
                         ngram_range=(1,1), vocabulary=vocab)
    dtm = cv.fit_transform(text_data)
    word_frequency = np.array(dtm.sum(axis=0))[0]
    word_frequency = pd.Series(word_frequency, index=vocab)
    return word_frequency

In [62]:
fake_news_text = fake_news_data.loc[:, 'text'].values
real_news_text = real_news_data.loc[:, 'text'].values
fake_news_word_frequency = compute_frequency(fake_news_text, tokenizer, en_stops, sorted_vocab)
real_news_word_frequency = compute_frequency(real_news_text, tokenizer, en_stops, sorted_vocab)
# compute difference
fake_vs_real_news_word_frequency_diff = fake_news_word_frequency - real_news_word_frequency
fake_vs_real_news_word_frequency_diff.sort_values(inplace=True, ascending=False)
# show words with highest/lowest difference
top_k = 20
print('words that occurred in more fake news articles')
print(fake_vs_real_news_word_frequency_diff.head(top_k))
print('words that occurred in more real news articles')
print(fake_vs_real_news_word_frequency_diff.tail(top_k))

words that occurred in more fake news articles
,            537
will          74
trump         66
president     41
new           39
."            38
many          33
clinton       27
donald        25
time          24
now           24
school        24
can           23
stated        20
students      19
even          18
white         18
order         16
way           16
great         16
dtype: int64
words that occurred in more real news articles
)             -8
000           -9
–             -9
6            -10
report       -11
m            -11
4            -11
tuesday      -11
three        -11
1            -12
financial    -12
$            -16
--           -17
also         -19
:            -20
year         -27
s            -45
said         -58
'            -67
-           -162
dtype: int64




These differences suggest that fake news articles focused more on the actions of specific people (`trump`, `clinton`) and less on specific details (`tuesday`, `financial`).

However, these results could be due to longer articles that allowed e.g. real news writers to cover more details. How do we control for length?

Let's compute the normalized frequency for fake news and real news articles, to identify words that occurred more often than expected in one genre of article.

In [53]:
def compute_norm_frequency(text_data, tokenizer, stops, vocab):
    cv = CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stops,
                         ngram_range=(1,1), vocabulary=vocab)
    dtm = cv.fit_transform(text_data)
    # normalize by column
    word_norm_frequency = np.array(dtm.sum(axis=0) / dtm.sum(axis=0).sum())[0]
    # store in format that is easy to manipulate
    word_norm_frequency = pd.Series(word_norm_frequency, index=vocab)
    return word_norm_frequency

In [112]:
tokenizer = WordPunctTokenizer()
stops = get_stop_words('en')
fake_news_word_norm_frequency = compute_norm_frequency(fake_news_data.loc[:, 'text'].values, tokenizer, stops, sorted_vocab)
real_news_word_norm_frequency = compute_norm_frequency(real_news_data.loc[:, 'text'].values, tokenizer, stops, sorted_vocab)
## compute ratio: what words are used more often in fake news than real news?
def compute_text_word_ratio(text_data_1, text_data_2):
    text_word_ratio = text_data_1 / text_data_2
    # drop non-occurring words
    text_word_ratio = text_word_ratio[~np.isinf(text_word_ratio)]
    text_word_ratio = text_word_ratio[~np.isnan(text_word_ratio)]
    text_word_ratio = text_word_ratio[text_word_ratio != 0.]
    text_word_ratio.sort_values(inplace=True, ascending=False)
    return text_word_ratio
fake_vs_real_news_word_frequency_ratio = compute_text_word_ratio(fake_news_word_norm_frequency, real_news_word_norm_frequency)
# show words with highest/lowest ratio
top_k = 20
print('words that occurred in more fake news articles')
print(fake_real_news_word_frequency_ratio.head(top_k))
print('words that occurred in more real news articles')
print(fake_real_news_word_frequency_ratio.tail(top_k))

words that occurred in more fake news articles
hillary      11.724431
commented    10.886971
needs        10.049512
secret        9.212053
caused        8.374593
ai            8.374593
provided      7.537134
earth         6.699675
begin         6.699675
instead       6.699675
attempt       5.862215
release       5.862215
success       5.862215
stein         5.862215
lack          5.862215
charges       5.024756
tennis        5.024756
met           5.024756
phone         5.024756
groups        5.024756
dtype: float64
words that occurred in more real news articles
anniversary    0.167492
customer       0.167492
missing        0.167492
saw            0.167492
value          0.167492
jersey         0.167492
providers      0.167492
potentially    0.167492
growing        0.139577
indian         0.139577
story          0.139577
drawn          0.139577
january        0.139577
february       0.139577
vehicle        0.119637
brady          0.119637
40             0.119637
18             0.119637



OK! We see that fake news consistently focuses on `hillary` (e.g. her email case) s well as potential conspiracy theories (`secret`, `ai`). In contrast, real news focuses on concrete time details (`january`, `40`) and provides some words to "hedge" their claims (`potentially`, `story`).

What if we want to identify words that occur frequency in just a few documents? E.g. some fake news stories may disproportionately use rare but inflammatory words.

Let's try TF-IDF, which normalizes term frequency by the inverse document frequency:

$$\text{tf-idf(word)} = \frac{\text{freq(word)}}{\text{document-freq(word)}}$$

In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer
def compute_non_zero_mean(data):
    non_zero_data = data[data != 0.]
    non_zero_mean = non_zero_data.mean()
    return non_zero_mean
def compute_tfidf(text_data, tokenizer, stops, vocab):
    tfidf_vec = TfidfVectorizer(tokenizer=tokenizer.tokenize, stop_words=stops, vocabulary=vocab)
    text_tfidf_matrix = tfidf_vec.fit_transform(text_data).toarray()
#     print(text_tfidf_matrix.shape)
#     return text_tfidf_matrix
    # compute mean over non-zero TF-IDF values
#     text_tfidf_score = np.apply_along_axis(lambda x: x.mean(), 0, text_tfidf_matrix)
    text_tfidf_score = np.apply_along_axis(lambda x: compute_non_zero_mean(x), 0, text_tfidf_matrix)
#     text_tfidf_score = text_tfidf_matrix.max(axis=0)
    text_tfidf_score = pd.Series(text_tfidf_score, index=vocab)
    return text_tfidf_score

In [104]:
fake_news_tfidf = compute_tfidf(fake_news_text, tokenizer, en_stops, sorted_vocab)
real_news_tfidf = compute_tfidf(real_news_text, tokenizer, en_stops, sorted_vocab)
fake_vs_real_news_word_tfidf_ratio = fake_news_tfidf / real_news_tfidf
fake_vs_real_news_word_tfidf_ratio.dropna(inplace=True)
fake_vs_real_news_word_tfidf_ratio.sort_values(inplace=True, ascending=False)
top_k = 20
print('words with higher TF-IDF scores in fake news')
display(fake_vs_real_news_word_tfidf_ratio.head(top_k))
print('words with higher TF-IDF scores in real news')
display(fake_vs_real_news_word_tfidf_ratio.tail(top_k))
# raw TF-IDF scores
# fake_news_tfidf.sort_values(inplace=True, ascending=False)
# real_news_tfidf.sort_values(inplace=True, ascending=False)
# top_k = 20
# print('words with high TF-IDF scores in fake news')
# print(fake_news_tfidf.head(top_k))
# print('words with high TF-IDF scores in real news')
# print(real_news_tfidf.head(top_k))

words with higher TF-IDF scores in fake news


  non_zero_mean = non_zero_data.mean()


steel         4.280091
retailers     4.089067
tourists      3.949276
friendship    3.927483
morgan        3.752379
bruno         3.728585
gas           3.509625
saudi         3.392921
arnold        3.364493
privacy       3.356762
sacrifice     3.214699
emoji         3.112207
duncan        3.080392
michelle      3.053198
ebony         3.014936
qatar         2.980143
fees          2.926611
ai            2.926181
wawrinka      2.924895
kyrgios       2.892984
dtype: float64

words with higher TF-IDF scores in real news


virtual           0.331943
comfortable       0.330759
saran             0.312199
hacking           0.309510
junco             0.308395
farah             0.303634
iphones           0.300928
putin             0.297846
engines           0.292603
fisher            0.280006
alcohol           0.279075
absurdity         0.277041
suddenly          0.272967
pizzagate         0.270884
punk              0.265890
authentication    0.264018
factor            0.264018
graduates         0.254588
tempe             0.239919
investigators     0.221240
dtype: float64

This method succeeds in identifying fairly rare words that characterize real and fake news.

For fake news, we see that words with higher TF-IDF scores include those related to business transactions (`retailers`, `gas`) and Middle Eastern countries (`saudi`, `qatar`).

For real news, the words with higher TF-IDF scores include words that directly address conspiracies (`pizzagate`, `investigators`, `authentication`) and words that speculate on the veracity of claims (`absurdity`, `suddenly`).

### Exploration

Now it's time for you to explore the data a little more with word frequency modeling!

Some thoughts:

- The original data are organized by topic. What are the words that characterize real/fake news in each topic?
- Changing the vocabulary size could identify more rare words (e.g. lowering `min_df` threshold in `CountVectorizer`). What happens if you include more words in the vocabulary?
- Up until now we have focused more strongly on single words (unigrams). What if we include phrases (changing `ngram_range` in the `CountVectorizer`)? Will we see more examples of conspiracy theories being highlighted by the real news?

In [118]:
## example: test different n-gram range
## generate new vocabulary
tokenizer = WordPunctTokenizer()
en_stops = get_stop_words('en')
def compute_word_freq_custom(text_data, custom_cv, vocab):
    text_dtm = custom_cv.transform(text_data)
    word_norm_frequency = np.array(text_dtm.sum(axis=0) / text_dtm.sum(axis=0).sum())[0]
    word_norm_frequency = pd.Series(word_norm_frequency, index=vocab)
    return word_norm_frequency
# create custom vectorizer for bigrams
bigram_cv = CountVectorizer(min_df=0.001, max_df=0.75, 
                            tokenizer=tokenizer.tokenize, stop_words=en_stops,
                            ngram_range=(2,2))
# get vocab for all data
combined_txt = fake_news_data.loc[:, 'text'].append(real_news_data.loc[:, 'text'])
combined_txt_dtm = bigram_cv.fit_transform(combined_txt)
sorted_bigram_vocab = list(sorted(bigram_cv.vocabulary_.keys(), key=bigram_cv.vocabulary_.get))
## compute frequency ratio for bigrams
fake_news_bigram_frequency = compute_word_freq_custom(fake_news_data.loc[:, 'text'].values, bigram_cv, sorted_bigram_vocab)
fake_vs_real_news_bigram_word_frequency_ratio = compute_text_word_ratio(fake_news_bigram_frequency, real_news_bigram_frequency)
fake_vs_real_news_bigram_word_frequency_ratio.sort_values(inplace=True, ascending=False)
top_k = 20
print('top unigrams/bigrams that occur more often in fake news data')
display(fake_vs_real_news_bigram_word_frequency_ratio.head(top_k))
print('top unigrams/bigrams that occur more often in real news data')
display(fake_vs_real_news_bigram_word_frequency_ratio.tail(top_k))

top unigrams/bigrams that occur more often in fake news data




hillary clinton     10.837533
, "                  7.711322
however ,            7.502908
trump .              7.502908
, many               6.669251
, one                5.835595
game ,               5.001938
president donald     5.001938
couldn '             5.001938
first lady           5.001938
white house          4.724053
donald trump         4.335013
monday .             4.168282
night ,              4.168282
now ,                4.168282
trump tower          4.168282
" just               4.168282
. new                3.751454
, wanted             3.334626
supreme court        3.334626
dtype: float64

top unigrams/bigrams that occur more often in real news data


economy .        0.208414
performance -    0.208414
2 .              0.208414
well -           0.208414
. k              0.208414
. 4              0.208414
2016 .           0.208414
. still          0.208414
s really         0.208414
, adding         0.208414
science ,        0.208414
k .              0.208414
- year           0.189467
year -           0.185257
- old            0.175507
middle east      0.166731
indian wells     0.166731
. report         0.166731
world number     0.166731
1 .              0.083366
dtype: float64

 # Topic modeling
Another way to compare documents is to extract the latent topics that group words within each document, and compare those distributions.

We'll continue on the topic of fake news with another dataset that has examples of both fake and real news articles, at a much larger scale than the previous data.

In [219]:
## data = fake news challenge
import pandas as pd
fake_news_article_data = pd.read_csv('data/fake_news_challenge/Fake.csv', sep=',', index_col=False)
real_news_article_data = pd.read_csv('data/fake_news_challenge/True.csv', sep=',', index_col=False)
# get rid of duplicate articles
fake_news_article_data.drop_duplicates('text', inplace=True)
real_news_article_data.drop_duplicates('text', inplace=True)
display(fake_news_article_data.loc[:, 'text'].head(10).values)

array(['Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency w

Before we try topic modeling, we have to convert the text to a usable format (document-term matrix, like before).

In [220]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import WordPunctTokenizer
from stop_words import get_stop_words
## combine text data, keep track of fake/real news indices
combined_news_text = fake_news_article_data.loc[:, 'text'].append(real_news_article_data.loc[:, 'text'])
fake_news_text_indices = list(range(fake_news_article_data.shape[0]))
real_news_text_indices = list(range(fake_news_article_data.shape[0], combined_news_text.shape[0]))
## convert text to DTM
en_stops = get_stop_words('en')
tokenizer = WordPunctTokenizer()
cv = CountVectorizer(min_df=0.001, max_df=0.75, lowercase=True, 
                     ngram_range=(1,1), stop_words=en_stops, tokenizer=tokenizer.tokenize)
combined_news_text_dtm = cv.fit_transform(combined_news_text)
print(combined_news_text_dtm.shape)



(38647, 13508)


For our first method, let's try Latent Semantic Analysis, which is a form of dimensionality reduction.

In [221]:
## LSA
from sklearn.decomposition import TruncatedSVD
num_topics = 10
num_iter = 10
lsa_model = TruncatedSVD(n_components=num_topics, n_iter=num_iter, random_state=123)
combined_news_text_lsa_topics = lsa_model.fit_transform(combined_news_text_dtm)
print(combined_news_text_lsa_topics.shape)

(38647, 10)


The LSA process outputs continuous values [-inf, +inf] which we need to convert to probabilities [0,1]. We can use the softmax function along each dimension to convert the topic-document matrix to probabilities:

$$\text{softmax}(x_{i}) = \frac{e^{x_{i}}}{\sum_{j}^{K}e^{x_{j}}}$$

where $x$ is one of $K$ topic dimensions.

In [222]:
from sklearn.utils.extmath import softmax
from sklearn.preprocessing import StandardScaler
import numpy as np
# convert per-column scores to a normal distribution (0,1)
scaler = StandardScaler()
combined_news_text_lsa_topic_scores = scaler.fit_transform(combined_news_text_lsa_topics)
# soft-max per-column
combined_news_text_lsa_topic_probs = softmax(combined_news_text_lsa_topic_scores.T).T
# normalize per-row so that probabilities sum to 1
combined_news_text_lsa_topic_probs = combined_news_text_lsa_topic_probs / combined_news_text_lsa_topic_probs.sum(axis=1).reshape(-1,1)

What is the expected probability of a document being assigned to a topic?

In [227]:
combined_news_text_lsa_expected_topics = pd.Series(combined_news_text_lsa_topic_probs.mean(axis=0))
print(f'expected probability of topics = \n{combined_news_text_lsa_expected_topics}')

expected probability of topics = 
0    0.618483
1    0.000006
2    0.000006
3    0.000007
4    0.215442
5    0.000013
6    0.000009
7    0.156142
8    0.000013
9    0.009879
dtype: float64


It looks like the data is "dominated" by 3 topics with high probability.

To figure out what "topics" the model learned, let's look at the news articles with the highest probability for each topic.

We'll take the arg-max along each topic and print the text for the corresponding articles.
We'll look at the most likely topics (0, 4, 7) as a first pass.

In [258]:
def show_articles_with_highest_prob_per_topic(doc_topic_probs, doc_text, num_topics):
    topic_ids = list(range(num_topics))
    top_articles_per_topic = 10
    text_sample_len = 200
    for topic_id_i in topic_ids:
        print(f'processing topic {topic_id_i}')
        # get indices for articles with highest topic probability
        top_article_indices_i = np.argsort(doc_topic_probs[:, topic_id_i])[-top_articles_per_topic:]
        top_article_indices_i = list(reversed(top_article_indices_i))
        for index_j in top_article_indices_i:
            topic_prob_i_j = doc_topic_probs[index_j, topic_id_i]
            print(f'\tarticle {index_j} has P(topic)={topic_prob_i_j} with text = {doc_text.iloc[index_j][:text_sample_len]}')

In [260]:
show_articles_with_highest_prob_per_topic(combined_news_text_lsa_topic_probs, combined_news_text, num_topics)

processing topic 0
	article 9196 has P(topic)=0.9999988709477023 with text = With mainstream media and establishment politicians stacked against him from the moment he announced his run for the presidency, Donald J. Trump has been in an ongoing pitched battle to communicate hi
	article 17381 has P(topic)=0.9999964560569015 with text = Shawn Helton   21st Century WireGOP presidential frontrunner Donald Trump is a populist candidate among a bevy of warhawk rivals  yet many still wonder how the real estate mogul has marched virtually 
	article 12986 has P(topic)=0.9999855797336982 with text = This is a must read for anyone who s undecided or plans on voting for a third party candidate It covers all the bases and cements for you the duty as an American to do what s best for our nation. If y
	article 16773 has P(topic)=0.999961726727553 with text =  By ramping up US troop levels in Afghanistan, Trump is alienating many supporters. (Photo: DoD/USAF Tech Sgt Brigitte N Brantley. Source: Wikic

In [229]:
topic_ids = list(range(num_topics))
top_articles_per_topic = 10
for topic_id_i in topic_ids:
    print(f'processing topic {topic_id_i}')
    # get indices for articles with highest topic probability
    top_article_indices_i = np.argsort(combined_news_text_lsa_topic_probs[:, topic_id_i])[-top_articles_per_topic:]
    top_article_indices_i = list(reversed(top_article_indices_i))
    for index_j in top_article_indices_i:
        topic_prob_i_j = combined_news_text_lsa_topic_probs[index_j, topic_id_i]
        print(f'\tarticle {index_j} has P(topic)={topic_prob_i_j} with text = {combined_news_text.iloc[index_j][:200]}')

processing topic 0
	article 9196 has P(topic)=0.9999988709477023 with text = With mainstream media and establishment politicians stacked against him from the moment he announced his run for the presidency, Donald J. Trump has been in an ongoing pitched battle to communicate hi
	article 17381 has P(topic)=0.9999964560569015 with text = Shawn Helton   21st Century WireGOP presidential frontrunner Donald Trump is a populist candidate among a bevy of warhawk rivals  yet many still wonder how the real estate mogul has marched virtually 
	article 12986 has P(topic)=0.9999855797336982 with text = This is a must read for anyone who s undecided or plans on voting for a third party candidate It covers all the bases and cements for you the duty as an American to do what s best for our nation. If y
	article 16773 has P(topic)=0.999961726727553 with text =  By ramping up US troop levels in Afghanistan, Trump is alienating many supporters. (Photo: DoD/USAF Tech Sgt Brigitte N Brantley. Source: Wikic

Looking at the article text qualitatively, we observe the following:

- Topic 0 includes major election issues such as U.S. president Trump's campaign and action in office.
- Topic 4 includes more subjective claims (`anti-American`, `whine`) and more extreme issues (`conspiracy`, `chaos`, `violence`).
- Topic 7 includes discussion of the 2016 election, particularly related to Clinton (`email`, `classified`).

Which topics are more prevalent in fake news versus real news?

In [228]:
fake_news_text_lsa_topic_probs = combined_news_text_lsa_topic_probs[fake_news_text_indices, :]
real_news_text_lsa_topic_probs = combined_news_text_lsa_topic_probs[real_news_text_indices, :]
fake_news_text_lsa_expected_topics = pd.Series(fake_news_text_lsa_topic_probs.mean(axis=0))
real_news_text_lsa_expected_topics = pd.Series(real_news_text_lsa_topic_probs.mean(axis=0))
print(f'expected probability of topics for fake news = \n{fake_news_text_lsa_expected_topics}')
print(f'expected probability of topics for real news = \n{real_news_text_lsa_expected_topics}')

expected probability of topics for fake news = 
0    0.587222
1    0.000014
2    0.000014
3    0.000014
4    0.254715
5    0.000028
6    0.000019
7    0.150289
8    0.000029
9    0.007655
dtype: float64
expected probability of topics for real news = 
0    6.442312e-01
1    6.450358e-50
2    4.350468e-21
3    2.759873e-08
4    1.830952e-01
5    2.407049e-15
6    1.779703e-12
7    1.609624e-01
8    7.720062e-07
9    1.171039e-02
dtype: float64


It looks like real news discusses topic 0 (possible criticism of Trump?) slightly more than fake news, while fake news discusses discusses topic 4 (conspiracy theories?) slightly more than real news.

While this is a useful first pass on the data, it doesn't help us identify which words or phrases may differentiate fake news from real news. 

We'll move onto a more complicated method (Latent Dirichlet Allocation) that identifies latent topics from which words are "generated." 
This will help us pull out specific words that characterize the topics.

In [249]:
## LDA
# get text tokens first using the CountVectorizer from earlier
combined_news_text_dtm_tokens = cv.inverse_transform(combined_news_text_dtm)
from gensim.corpora import Dictionary
lda_dict = Dictionary(combined_news_text_dtm_tokens)
combined_news_text_corpus = list(map(lambda x: lda_dict.doc2bow(x), combined_news_text_dtm_tokens))
# train model
from gensim.models import LdaModel
num_topics = 10
iterations = 50
lda_model = LdaModel(corpus=combined_news_text_corpus, num_topics=10, iterations=iterations)

Like before, let's look at the distribution of topics over all documents and get a sense of the articles that correspond to each topic.

In [261]:
def compute_lda_topic_probs(text_doc, model):
    doc_topics = model.get_document_topics(text_doc, minimum_probability=0.)
    # convert to probability array
    doc_topic_ids, doc_topic_probs = zip(*doc_topics)
    return doc_topic_probs
combined_news_text_lda_topic_probs = np.array(list(map(lambda x: compute_lda_topic_probs(x, lda_model), combined_news_text_corpus)))
combined_news_text_lda_topic_expected_prob = combined_news_text_lda_topic_probs.mean(axis=0)
print(f'expected value of LDA topics =\n{combined_news_text_lda_topic_expected_prob}')

expected value of LDA topics =
[0.05210339 0.10082803 0.09245355 0.19189136 0.06793168 0.08321581
 0.07276974 0.06280325 0.14228573 0.13371876]


In contrast to the SVD analysis, we see a more even distribution of topics. Let's see which articles were more strongly associated with each topic.

In [262]:
show_articles_with_highest_prob_per_topic(combined_news_text_lda_topic_probs, combined_news_text, num_topics)

processing topic 0
	article 14595 has P(topic)=0.9808146953582764 with text = Because Obama s doing such a great job keeping Putin in check right?  President Obama mocked Republican candidates who suggested they ll be tough with Vladimir Putin but  can t handle a bunch of CNBC 
	article 11502 has P(topic)=0.9571210145950317 with text = Blow wrote in his article today for the New York Times that he has no desire to work with Trump voters. He clearly stated that there is no room for compromise with anyone who doesn t agree with him:
	article 13247 has P(topic)=0.9549550414085388 with text = Next stop after BREXIT is the US! Judge Jeanine nails it and says we need to know the facts to fight this elitist agenda. We re  tired of being lectured to by the fat cats in Washington . 
	article 13839 has P(topic)=0.935684859752655 with text = Yes, it s really hard to be a dictator in America Obama obviously doesn t like power to be given to anyone but him. 
	article 14139 has P(topic)=0.9249395728

Restricting ourselves to the top 5 most frequent topics in the data based on the probabilities above (topics 3, 8, 9, 1, 2), we see the following trends:

- Topic 1 includes U.S. election issues and general content concerning the president.
- Topic 2 includes disasters and violence, possibly fear-mongering.
- Topic 3 includes international politics.
- Topic 8 seems to include inflammatory and "alternative" news content (`hypocrites`, `trashing`).
- Topic 9 includes the politics around U.S. healthcare.

Let's also compare the distribution of topics in each text category.

In [263]:
fake_news_text_lda_topic_probs = combined_news_text_lda_topic_probs[fake_news_text_indices, :]
real_news_text_lda_topic_probs = combined_news_text_lda_topic_probs[real_news_text_indices, :]
fake_news_text_lda_expected_topics = pd.Series(fake_news_text_lda_topic_probs.mean(axis=0))
real_news_text_lda_expected_topics = pd.Series(real_news_text_lda_topic_probs.mean(axis=0))
print(f'expected probability of topics for fake news = \n{fake_news_text_lda_expected_topics}')
print(f'expected probability of topics for real news = \n{real_news_text_lda_expected_topics}')

expected probability of topics for fake news = 
0    0.082665
1    0.098468
2    0.070029
3    0.074751
4    0.029148
5    0.164867
6    0.067225
7    0.043522
8    0.287306
9    0.082018
dtype: float32
expected probability of topics for real news = 
0    0.026931
1    0.102773
2    0.110924
3    0.288375
4    0.099876
5    0.015963
6    0.077336
7    0.078685
8    0.022834
9    0.176304
dtype: float32


Real news articles tend to have more representation for topics 3 and 9, while fake news articles have more representation for topics 1, 2 and 8, which makes sense given the more violent and "alternative" content included in those topics.

Now that we've established the high-level differences in topics between fake news and real news, let's look at the individual words that make up the topics.

Specifically, we're going to compute the probability of observing a word given a topic, using the parameters learned by the LDA model.

In [277]:
def show_top_words_all_topics(model, model_dict, num_topics, words_per_topic):
    topic_ids = list(range(num_topics))
    for topic_i in topic_ids:
        topic_word_id_scores_i = model.get_topic_terms(topic_i, topn=words_per_topic)
        topic_word_ids_i, topic_word_scores_i = zip(*topic_word_id_scores_i)
        # convert word ID to words
        topic_words_i = list(map(model_dict.get, topic_word_ids_i))
        print(f'topic {topic_i} has top words: \n\t{", ".join(topic_words_i)}')

In [278]:
words_per_topic = 20
show_top_words_all_topics(lda_model, lda_dict, num_topics, words_per_topic)

topic 0 has top words: 
	emirates, .”, ,”, trump, republican, t, comments, donald, “, interview, ”, think, ’, runner, asked, president, presidential, going, flows, news
topic 1 has top words: 
	presidential, election, (, republican, democratic, reuters, party, candidate, hillary, campaign, clinton, ’, donald, “, nov, trump, vote, ,”, senator, washington
topic 2 has top words: 
	(, police, people, year, two, 000, one, killed, reuters, city, told, also, authorities, three, near, state, since, area, last, old
topic 3 has top words: 
	reuters, (, u, government, president, united, also, state, will, foreign, told, states, security, new, last, officials, minister, two, international, including
topic 4 has top words: 
	(, minister, reuters, prime, president, told, will, donald, trump, korea, reporters, military, news, britain, london, u, north, wednesday, british, theresa
topic 5 has top words: 
	people, speech, rally, crowd, :, anti, t, supporters, protesters, one, party, trump, right, front

Looking at the top words confirms what we saw before, that fake news articles tend to focus on election conflicts (topic 1), violence (topic 2), and possibly more simple or engaging words to correspond with more "opinion" pieces (topic 8).

What happens if we train separate topic models on real news and fake news? This could help highlight groups of words that are specific only to fake news or to real news, which may be "washed out" with the combined topic model.

In [281]:
num_topics = 10
iterations = 100
# train fake news model
def train_lda_model_from_corpus(text_corpus, num_topics, iterations):
    lda_model = LdaModel(text_corpus, num_topics=num_topics, iterations=iterations)
    return lda_model
# fake_news_text_dtm_tokens = list(map(lambda x: combined_news_text_dtm_tokens[x], fake_news_text_indices))
# real_news_text_dtm_tokens = list(map(lambda x: combined_news_text_dtm_tokens[x], real_news_text_indices))
fake_news_text_corpus = list(map(lambda x: combined_news_text_corpus[x], fake_news_text_indices))
real_news_text_corpus = list(map(lambda x: combined_news_text_corpus[x], real_news_text_indices))
## train models
fake_news_lda_model = train_lda_model_from_corpus(fake_news_text_corpus, num_topics, iterations)
real_news_lda_model = train_lda_model_from_corpus(real_news_text_corpus, num_topics, iterations)

What are the top words captured per-topic from each model?

In [282]:
words_per_topic = 20
print('real news: top words per topic')
show_top_words_all_topics(real_news_lda_model, lda_dict, num_topics, words_per_topic)
print('fake news: top words per topic')
show_top_words_all_topics(fake_news_lda_model, lda_dict, num_topics, words_per_topic)

real news: top words per topic
topic 0 has top words: 
	(, reuters, will, percent, year, $, billion, 1, also, government, next, new, vote, budget, house, president, million, bill, tax, irma
topic 1 has top words: 
	election, presidential, candidate, reuters, party, (, campaign, former, democratic, clinton, hillary, republican, voters, vote, candidates, national, nov, 8, race, new
topic 2 has top words: 
	(, reuters, u, president, told, russian, investigation, moscow, russia, state, news, reported, washington, government, last, statement, denied, charges, thursday, allegations
topic 3 has top words: 
	(, court, reuters, will, law, government, ruling, also, rights, president, year, former, one, case, u, supreme, years, new, last, justice
topic 4 has top words: 
	reuters, (, will, year, government, president, power, also, years, country, last, national, one, head, leftist, ($, u, change, afd, new
topic 5 has top words: 
	percent, 1, 000, (, since, people, 25, reuters, million, 5, governme

The real news topics include concrete details such as money (topic 1), immigration statistics (topic 5), and international diplomacy (topic 7).

The fake news topics include sub-discussions around Donald Trump (topic 2: Trump vs. Obama; topic 5: election results) and some topics related to social justice (topic 7: `black`, `white`; topic 8: `protesters`, `police`).

In [None]:
## stretch goal: visualizing topics??

### Exploration
Now it's time for you to keep exploring what topic models can tell us about real and fake news.

Some ideas:
- We used word frequency to represent words when training the topic models, but you can try other metrics such as TF-IDF, which we saw before can up-weight rarer words. What happens if you re-train the topic model using another form of word frequency?
- You can change the number of topics learned by the model to include more or less detail that may reveal different "levels" of granularity. You may want to try using "coherence" as a metric to determine the number of topics that maximizes the similarity among words within the same topic. What broad or fine-grained differences can you find that differentiate real and fake news? 
- One way of reducing "overlap" among words within topics is to **stem** each word and convert it to a base form that is shared among different versions of the word (e.g. `dog` and `dogs` stemmed to `dog`). What happens if you stem the text before training the topic model?
- Some topics may be closer together in "space" than others. For instance, topics that discuss different aspects of international relations. [This package](https://github.com/bmabey/pyLDAvis) visualizes the relationship between LDA topics by projecting the topics to a shared 2-dimensional space via PCA. Can you find topics that are unexpectedly close, and whether these topics indicate similarities or differences between real and fake news?

# Word embeddings

The previous analyses have shown that fake news tends to use consistently inflammatory and subjective vocabulary, and tends to cover issues that may incite controversy.

Let's drill down to the word level and look for connotations among words used in both fake and real news. This could reveal underlying biases that shape how certain words like `election` or `president` are perceived.

In [24]:
## data = fake news challenge
import pandas as pd
fake_news_article_data = pd.read_csv('data/fake_news_challenge/Fake.csv', sep=',', index_col=False)
real_news_article_data = pd.read_csv('data/fake_news_challenge/True.csv', sep=',', index_col=False)
display(fake_news_article_data.head())

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [37]:
# ## clean data
from nltk.tokenize import PunktSentenceTokenizer, WordPunctTokenizer
sent_tokenizer = PunktSentenceTokenizer()
word_tokenizer = WordPunctTokenizer()
def get_sentence_word_tokens(text, word_tokenizer, sent_tokenizer):
    text_sents = sent_tokenizer.tokenize(text)
    text_sent_tokens = list(map(word_tokenizer.tokenize, text_sents))
    return text_sent_tokens
fake_news_sentences = fake_news_article_data.loc[:, 'text'].apply(lambda x: get_sentence_word_tokens(x, word_tokenizer, sent_tokenizer))
real_news_sentences = real_news_article_data.loc[:, 'text'].apply(lambda x: get_sentence_word_tokens(x, word_tokenizer, sent_tokenizer))
# flatten for processing
from functools import reduce
def flatten_list_data(data):
    flat_data = []
    for x in data:
        flat_data.extend(x)
    return flat_data
fake_news_sentences = flatten_list_data(fake_news_sentences)
real_news_sentences = flatten_list_data(real_news_sentences)

In [124]:
## train word2vec embeddings
from gensim.models.word2vec import Word2Vec
def train_word2vec_model(text_sents, model_out_file):
    dim = 50
    alpha = 0.025
    window = 5
    min_count = 5
    model = Word2Vec(sentences=text_sents, size=dim, alpha=alpha, window=window, min_count=min_count)
#     model.build_vocab(text_sents)
    model.save(model_out_file)
fake_news_word2vec_model_out_file = 'data/fake_news_challenge/fake_news_word2vec_embed.model'
real_news_word2vec_model_out_file = 'data/fake_news_challenge/real_news_word2vec_embed.model'
train_word2vec_model(fake_news_sentences, fake_news_word2vec_model_out_file)
train_word2vec_model(real_news_sentences, real_news_word2vec_model_out_file)

In [130]:
## load from file
fake_news_word2vec_embed_model = Word2Vec.load(fake_news_word2vec_model_out_file)
real_news_word2vec_embed_model = Word2Vec.load(real_news_word2vec_model_out_file)

In [None]:
## train Glove embeddings
from glove import Glove, Corpus
def fit_glove_model(text_sents, model_out_file):
    dim = 50
    learning_rate = 0.05
    alpha = 0.025
    random_state = 123
    train_epochs = 100
    num_threads = 4
    window = 5
    glove_corpus = Corpus()
    glove_corpus.fit(text_sents, window=window)
    glove_embed_model = Glove(no_components=dim, learning_rate=learning_rate, 
                              alpha=alpha, random_state=random_state)
    # note: this takes ~ 5 minutes with 4 threads on a server
    glove_embed_model.fit(glove_corpus.matrix, epochs=train_epochs,
                          no_threads=num_threads, verbose=True)
    glove_embed_model.add_dictionary(glove_corpus.dictionary)
    glove_embed_model.save(model_out_file)
fake_news_glove_model_out_file = 'data/fake_news_challenge/fake_news_glove_embed.model'
real_news_glove_model_out_file = 'data/fake_news_challenge/real_news_glove_embed.model'
print('fitting Glove embeddings for fake news')
fit_glove_model(fake_news_sentences, fake_news_glove_model_out_file)
print('fitting Glove embeddings for real news')
fit_glove_model(real_news_sentences, real_news_glove_model_out_file)

In [95]:
## reload models after training
fake_news_glove_embed_model = Glove.load(fake_news_glove_model_out_file)
real_news_glove_embed_model = Glove.load(real_news_glove_model_out_file)

Let's start out by looking at the nearest neighbors for some test words. 

We'll get the test words by filtering from the most frequent words.

In [57]:
from collections import Counter
from stop_words import get_stop_words
import pandas as pd
pd.set_option('display.max_rows', 100)
news_word_counter = Counter()
for sent_i in fake_news_sentences:
    news_word_counter.update(sent_i)
for sent_i in real_news_sentences:
    news_word_counter.update(sent_i)
news_word_counts = pd.Series(dict(news_word_counter)).sort_values(inplace=False, ascending=False)
en_stops = set(get_stop_words('en')) & set(news_word_counts.index)
news_word_counts.drop(en_stops, inplace=True)
display(news_word_counts.head(100))

.                 899948
,                 872906
s                 231123
-                 191679
Trump             132996
said              132673
The               115553
’                  70098
:                  63415
)                  63128
I                  62061
(                  57607
U                  54124
“                  53931
t                  53069
S                  51453
will               48561
people             39640
President          36123
one                32627
also               30519
It                 29705
Reuters            29343
Clinton            28556
Donald             28066
Obama              28035
?                  27492
government         26786
can                26401
He                 26344
Republican         25450
House              25418
In                 25292
year               24425
,”                 24296
/                  23695
told               23330
United             22860
just               22835
We                 22082


In [None]:
test_words = ['Trump', 'President', 'election', 'Republicans', 'Democratic']

In [152]:
## test word2vec first
N_neighbors = 10
for test_word_i in test_words:
    print(f'testing word = {test_word_i}')
    print(f'\tfake news neighbors')
    print(fake_news_word2vec_embed_model.most_similar(test_word_i, topn=N_neighbors))
    print(f'\treal news neighbors')
    print(real_news_word2vec_embed_model.most_similar(test_word_i, topn=N_neighbors))

testing word = Trump
	fake news neighbors
[('Rubio', 0.7030426263809204), ('Obama', 0.6997063159942627), ('Cruz', 0.6733947992324829), ('he', 0.6496449112892151), ('him', 0.6216100454330444), ('trump', 0.6005026698112488), ('Hillary', 0.5969979166984558), ('Russia', 0.5960808992385864), ('Putin', 0.5954418778419495), ('He', 0.5948378443717957)]
	real news neighbors
[('Pence', 0.6959595084190369), ('he', 0.6848228573799133), ('Cruz', 0.6819908022880554), ('Abe', 0.6777504682540894), ('Macron', 0.672048807144165), ('him', 0.6628677845001221), ('Clinton', 0.6577510237693787), ('Rubio', 0.6568259596824646), ('Obama', 0.6540595293045044), ('Duterte', 0.650699257850647)]
testing word = President
	fake news neighbors
[('Barack', 0.8168875575065613), ('president', 0.8109959959983826), ('administration', 0.7899814248085022), ('Administration', 0.73664391040802), ('presidency', 0.7126862406730652), ('Donald', 0.641764223575592), ('Michelle', 0.60544753074646), ('regime', 0.5730191469192505), ('2

  print(fake_news_word2vec_embed_model.most_similar(test_word_i, topn=N_neighbors))
  print(real_news_word2vec_embed_model.most_similar(test_word_i, topn=N_neighbors))


In [153]:
## test Glove embeddings
N_neighbors = 10
for test_word_i in test_words:
    print(f'testing word = {test_word_i}')
    print(f'\tfake news neighbors')
    print(fake_news_glove_embed_model.most_similar(test_word_i, number=N_neighbors))
    print(f'\treal news neighbors')
    print(real_news_glove_embed_model.most_similar(test_word_i, number=N_neighbors))

testing word = Trump
	fake news neighbors
[('Donald', 0.8976928852032244), ('he', 0.750797318175638), ('President', 0.7224733577899606), ('elect', 0.6950681370557232), ('his', 0.6873461754719997), ('him', 0.6607629129400844), ('presidency', 0.6461921033349649), ('supporter', 0.6430482444622551), ('Q13FOXWATCH', 0.6391587798777106)]
	real news neighbors
[('Donald', 0.9018925471531778), ('Obama', 0.7329701551897114), ('Clinton', 0.7253358950095211), ('he', 0.7219566927581803), ('Putin', 0.7197551851813184), ('administration', 0.6996611063749594), ('He', 0.6981073253324681), ('president', 0.6959299640971673), ('elect', 0.6852015437442879)]
testing word = President
	fake news neighbors
[('Obama', 0.8745549150502164), ('president', 0.8300468244161047), ('Barack', 0.8096793836572981), ('Donald', 0.7728321776884208), ('ObamaE', 0.7725010583428265), ('administration', 0.7640559438864527), ('elect', 0.72963454464097), ('Trump', 0.7224733577899606), ('Putin', 0.6720365624445165)]
	real news neig

We see some aspects of potential bias with these test words.

For `word2vec`:
- `Trump` is associated with almost exclusively Republican politicians in fake news and with a mix of politicians in real news
- `President` is associated more with U.S. politics in fake news and more with international politicians in real news
- `Democratic` are associated more with U.S. politics in fake news and more with international politics in real news

For `Glove`:
- `Trump` is associated with himself (and news network? `Q13FOXWATCH`) in fake news and with other presidents in real news
- `President` is associated with Trump and Obama in fake news and more with international politicians in real news
- `Democratic` is associated with U.S. party politics in both fake and real news

This qualitative analysis helps us understand that some words may indeed have significant divergence in their connotations between the different data sets, while others are more stable.

Which words are the most different across the data?

We'll measure "difference" using the overlap in nearest neighbors (i.e. Jaccard similarity).

$$\text{diff(word1, word2)} = 1 - \frac{\text{neighbors(word1)} \: \cap \: \text{neighbors(word2)}}{\text{neighbors(word1)} \cup \text{neighbors(word2)}}$$

A difference of 100% means that the words have no neighbors in common, while a difference of 0% means that the words have identical neighbors.

In [138]:
def compute_neighbor_diff(neighbors_1, neighbors_2):
    neighbor_intersect = set(neighbors_1) & set(neighbors_2)
    neighbor_union = set(neighbors_1) | set(neighbors_2)
    neighbor_diff = 1 - len(neighbor_intersect) / len(neighbor_union)
    return neighbor_diff
def compute_neighbor_diff_model(word, model_1, model_2, N_neighbor, model_type='word2vec'):
    if(model_type == 'word2vec'):
        neighbors_1, neighbor_scores_1 = zip(*model_1.wv.most_similar(word, topn=N_neighbor))
        neighbors_2, neighbor_scores_2 = zip(*model_2.wv.most_similar(word, topn=N_neighbor))
    elif(model_type == 'glove'):
        neighbors_1, neighbor_scores_1 = zip(*model_1.most_similar(word, number=N_neighbor))
        neighbors_2, neighbor_scores_2 = zip(*model_2.most_similar(word, number=N_neighbor))
    neighbor_diff = compute_neighbor_diff(neighbors_1, neighbors_2)
    return neighbor_diff

In [140]:
# get shared vocabulary
shared_word2vec_vocab = list(set(fake_news_word2vec_embed_model.wv.vocab.keys()) & set(real_news_word2vec_embed_model.wv.vocab.keys()))
print(f'{len(shared_word2vec_vocab)} words in word2vec vocab')
# compute neighbor differences for all valid words
model_type = 'word2vec'
N_neighbor = 10
fake_vs_real_word2vec_neighbor_diffs = list(map(lambda x: compute_neighbor_diff_model(x, fake_news_word2vec_embed_model, real_news_word2vec_embed_model, N_neighbor, model_type=model_type), shared_word2vec_vocab))
# add vocabulary as index
fake_vs_real_word2vec_neighbor_diffs = pd.Series(fake_vs_real_word2vec_neighbor_diffs, index=shared_word2vec_vocab)
fake_vs_real_word2vec_neighbor_diffs.sort_values(inplace=True, ascending=False)

23922 words in word2vec vocab


In [141]:
top_k = 20
print('words with most neighbor difference')
print(fake_vs_real_word2vec_neighbor_diffs.head(top_k))
print('words with most neighbor similarity')
print(fake_vs_real_word2vec_neighbor_diffs.tail(top_k))

words with most neighbor difference
Charlie         1.0
placement       1.0
installation    1.0
stoned          1.0
convenience     1.0
Federalist      1.0
rewards         1.0
Pablo           1.0
princess        1.0
mortgage        1.0
fentanyl        1.0
rebuked         1.0
hog             1.0
cantons         1.0
installed       1.0
systemically    1.0
purchases       1.0
sideshow        1.0
Medium          1.0
theme           1.0
dtype: float64
words with most neighbor similarity
Friday       0.181818
Sunday       0.181818
February     0.181818
Their        0.181818
two          0.181818
November     0.181818
their        0.181818
Monday       0.181818
Thursday     0.181818
March        0.181818
October      0.181818
down         0.181818
Tuesday      0.181818
December     0.181818
July         0.181818
January      0.181818
Wednesday    0.181818
April        0.181818
cannot       0.181818
15           0.000000
dtype: float64


The words with the biggest neighbor differences don't seem to be super informative and may reflect topical differences (e.g. fake news tends to discuss `Charlie` more often and therefore has more consistent nearest neighbors).

What if we restrict to the top-1000 most frequent words?

In [150]:
# only keep the words that are in the word2vec vocab
word2vec_vocab_news_word_counts = news_word_counts.loc[(news_word_counts.index & set(shared_word2vec_vocab))].sort_values(inplace=False, ascending=False)
top_N_words = word2vec_vocab_news_word_counts.iloc[:1000].index.tolist()
top_N_fake_vs_real_word2vec_neighbor_diffs = fake_vs_real_word2vec_neighbor_diffs.loc[top_N_words].sort_values(inplace=False, ascending=False)
top_k = 50
print('frequent words with most neighbor difference')
print(top_N_fake_vs_real_word2vec_neighbor_diffs.head(top_k))

frequent words with most neighbor difference
left            1.0
process         1.0
=               1.0
via             1.0
talks           1.0
fear            1.0
!               1.0
O               1.0
J               1.0
News            1.0
Barack          1.0
face            1.0
influence       1.0
course          1.0
(@              1.0
like            1.0
image           1.0
United          1.0
co              1.0
host            1.0
twitter         1.0
comment         1.0
continued       1.0
hit             1.0
Minister        1.0
Black           1.0
independence    1.0
[               1.0
&               1.0
West            1.0
use             1.0
.-              1.0
reality         1.0
'               1.0
head            1.0
New             1.0
yet             1.0
Islamic         1.0
/               1.0
air             1.0
Images          1.0
(               1.0
com             1.0
Watch           1.0
corruption      1.0
*               1.0
Johnson         1.0
immediately    

OK! This leaves us with some interesting words to investigate:

- `left` (related to politics?)
- `Barack`
- `twitter`
- `Black`
- `Islamic`
- `corruption`

In [154]:
# print neighbors for all high-difference words
high_diff_words = ['left', 'Barack', 'twitter', 'Black', 'Islamic', 'corruption']
N_neighbors = 10
for word_i in high_diff_words:
    print(f'testing word = {word_i}')
    print(f'\tfake news neighbors')
    print(fake_news_word2vec_embed_model.most_similar(word_i, topn=N_neighbors))
    print(f'\treal news neighbors')
    print(real_news_word2vec_embed_model.most_similar(word_i, topn=N_neighbors))

testing word = left
	fake news neighbors
[('right', 0.7388289570808411), ('gone', 0.5697188973426819), ('fringe', 0.5553905367851257), ('conservative', 0.5482199192047119), ('liberal', 0.5463820695877075), ('destroyed', 0.5287902355194092), ('pushed', 0.5214146375656128), ('side', 0.5122994780540466), ('Right', 0.5121271014213562), ('shifted', 0.5109315514564514)]
	real news neighbors
[('lost', 0.7303896546363831), ('abandoned', 0.7045692205429077), ('entered', 0.685556173324585), ('kept', 0.6793034672737122), ('gained', 0.6761770844459534), ('regained', 0.667672336101532), ('stayed', 0.6657394766807556), ('stuck', 0.6575697660446167), ('started', 0.638910174369812), ('secured', 0.6378580927848816)]
testing word = Barack
	fake news neighbors
[('President', 0.816887617111206), ('Michelle', 0.7705560922622681), ('Administration', 0.7330065965652466), ('2016President', 0.7039303779602051), ('administration', 0.6872981786727905), ('Ronald', 0.6653918027877808), ('Jeb', 0.6605669260025024),

  print(fake_news_word2vec_embed_model.most_similar(word_i, topn=N_neighbors))
  print(real_news_word2vec_embed_model.most_similar(word_i, topn=N_neighbors))


This reveals some serious bias going on in the fake news articles.

- `left` is more associated with extreme political views in fake news, and more associated with the traditional verb sense in real news
- `Barack` is more associated with the Obama administration (and his "unusual" name `Hussein`) in fake news, and more associated with world leaders in real news
- `twitter` is more associated with "alternative" news sources in fake news, and more associated with social media in general in real news
- `Black` is more associated with the Black Lives Matter movement and other left-wing movements (`antifa`) in fake news, and more associated with a variety of organizations in real news
- `Islamic` is more associated with terrorist and perceived "radical" movements in fake news, and more associated with Middle Eastern politics in real news

In [None]:
## TODO: visualize?? https://stackoverflow.com/questions/43776572/visualise-word2vec-generated-from-gensim

### Exploration
Now it's time for you to try out some more tests with word embeddings!

- Increasing the **window size** when training embeddings can help the embeddings capture more global context (e.g. associating `tomato` with cooking details from the wider sentence context). How would this help capture divides between fake news and real news?
- One way to determine the **connotation** of a word in embedding space is to look at its proximity to positive and negative words: e.g. if `Barack` is consistently closer to words like `bad` and `terrible` than to `good` and `nice`. Can you come up with a way to test word connotations using this kind of approach, and determine whether some words have consistently better or worse connotations in fake news articles?
- Another useful aspect of word embeddings is their tendency to **cluster** words into general semantic fields, e.g. grouping all politician names near one another. Using the visualization technique from earlier, try to find words that (1) consistently fall into neat clusters and (2) sometimes appear outside of the expected clusters in the data. Which political and organizational words tend to be represented outside of their expected cluster, and why do you think that happens? 

### Old code

In [None]:
# ## data = cross-cultural data
# # collect/save
# import os
# import pandas as pd
# import re
# def load_deception_data_from_dir(data_dir):
#     sub_dirs = list(map(lambda x: os.path.join(data_dir, x), os.listdir(data_dir)))
#     sub_dirs = list(filter(lambda x: os.path.isdir(x), sub_dirs))
#     print(sub_dirs)
#     invalid_file_matcher = re.compile('^\._?')
#     sub_dir_data_files = [os.path.join(x, y) for x in sub_dirs for y in os.listdir(x) if invalid_file_matcher.search(y) is None]
#     combined_data = []
#     for data_file_i in sub_dir_data_files:
# #         print(f'processing data = {data_file_i}')
#         topic_i, label_i = os.path.basename(data_file_i).split('.')
# #         full_data_file_i = os.path.join(data_dir, data_file_i)
# #         print()
#         data_i = pd.read_csv(data_file_i, sep='\t', header=None, index_col=False, skip_blank_lines=True)
#         data_i.columns = ['id', 'text']
#         # keep valid text
#         data_i = data_i[data_i.loc[:, 'text'].apply(lambda x: type(x) is str)]
# #         print(data_i.head())
# #         for x in data_i.loc[:, 'id'].values:
# #             print(x)
# #             print(x.split('_')[1])
#         data_i = data_i.assign(**{
#             'id' : data_i.loc[:, 'id'].apply(lambda x: int(x.split('_')[1]))
#         })
#         data_i = data_i.assign(**{'topic' : topic_i, 'label' : label_i})
#         combined_data.append(data_i)
#     combined_data = pd.concat(combined_data, axis=0)
#     return combined_data
# # us_deception_data_dir = 'data/crossCulturalDeception.2014/EnglishUS/'
# # print([x for x in os.listdir(us_deception_data_dir) if os.path.isdir(os.path.join(us_deception_data_dir, x))])
# us_deception_data = load_deception_data_from_dir(us_deception_data_dir)
# display(us_deception_data.head())