In [None]:
"""
This notebook does some exploratory data analysis using spacy
"""

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')



In [2]:
import pandas as pd
import pickle

In [19]:
# Gets the data from csv files
df = pd.read_csv('archive/Fake.csv')
df2 = pd.read_csv('archive/True.csv')

In [20]:
# Creates a column true that is 0 if the news is fake and 1 if it is true
df['true'] = 0
df2['true'] = 1

In [21]:
# Combine the two data frames into one
df_all = pd.concat([df, df2])

In [27]:
# Clean up text by removing apostrophes, alphanumeric combinations, punctuation and setting
# everything to lowercase
import re
import string

apostrophe = lambda x: re.sub("\'", '', x)
apostrophe2 = lambda x: re.sub("’", '', x)
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

df_all['text'] = df_all['text'].map(apostrophe).map(apostrophe2).map(alphanumeric).map(punc_lower)

In [29]:
# Takes a sample of the data from the dataframe 
# Tried running the visualizations using the full dataset, but it took a really
# long time to run and the hmtl kept crashing and would never load 
df_sample = df_all.sample(n = 1000) 

In [30]:
df_sample.head()

Unnamed: 0,title,text,subject,date,true,spacy_doc
10492,Obama to visit Patagonian tourist city Bariloc...,buenos aires reuters president barack obam...,politicsNews,"March 5, 2016",1,"(buenos, aires, , reuters, , president, ba..."
17635,WOW! IS MEGHAN MCCAIN FINISHED With The Hatefu...,it s pretty common knowledge for anyone s who ...,left-news,"Nov 17, 2017",0,"(it, s, pretty, common, knowledge, for, anyone..."
12840,CHAMPION OF WOMEN? How Hillary Used Private In...,hillary won t be able to claim a vast right wi...,politics,"Oct 4, 2016",0,"(hillary, won, t, be, able, to, claim, a, vast..."
22974,Trump Asks Congress To Investigate Former Obam...,century wire says president donald j trump ...,Middle-east,"March 5, 2017",0,"( , century, wire, says, president, donald, j..."
829,Less Than Half Of Trump Voters Believe Donald...,according to the results from a recent survey ...,News,"July 19, 2017",0,"(according, to, the, results, from, a, recent,..."


In [30]:
# Process documents using spacy
df_sample['spacy_doc'] = list(nlp.pipe(df_sample['text']))

In [29]:
from collections import Counter

In [32]:
# Separates the sample data set into real news and fake news
real_news = df_sample[df_sample['true']==1]
fake_news = df_sample[df_sample['true']==0]

In [33]:
# find the adjectives and nouns for real news and fake news
real_adj = [token.text.lower() for doc in real_news.spacy_doc for token in doc if token.pos_=='ADJ']
fake_adj = [token.text.lower() for doc in fake_news.spacy_doc for token in doc if token.pos_=='ADJ']

real_noun = [token.text.lower() for doc in real_news.spacy_doc for token in doc if token.pos_=='NOUN']
fake_noun = [token.text.lower() for doc in fake_news.spacy_doc for token in doc if token.pos_=='NOUN']

In [None]:
"""
The next few cells look at the top 10 most common adjectives and nouns for the fake news and real
news.  As you can see, there are a lot of similarities between the two, although there are also 
some interesting differences.  For example, t is a common 'noun' for fake news.  The t probably 
comes from contractions, so it seems that fake news is more likely to have contractions
"""

In [34]:
Counter(real_adj).most_common(10)

[('other', 281),
 ('more', 273),
 ('last', 266),
 ('former', 204),
 ('republican', 196),
 ('presidential', 173),
 ('new', 168),
 ('political', 167),
 ('democratic', 150),
 ('first', 146)]

In [35]:
Counter(fake_adj).most_common(10)

[('other', 321),
 ('more', 308),
 ('many', 240),
 ('new', 194),
 ('black', 194),
 ('american', 187),
 ('political', 186),
 ('last', 183),
 ('former', 169),
 ('first', 150)]

In [36]:
Counter(fake_noun).most_common(10)

[('trump', 839),
 ('people', 681),
 ('time', 305),
 ('t', 305),
 ('campaign', 305),
 ('state', 266),
 ('year', 239),
 ('media', 239),
 ('president', 231),
 ('image', 228)]

In [37]:
Counter(real_noun).most_common(10)

[('government', 397),
 ('trump', 370),
 ('people', 328),
 ('state', 327),
 ('election', 285),
 ('year', 280),
 ('percent', 277),
 ('tax', 222),
 ('campaign', 221),
 ('week', 191)]

In [None]:
"""
Next few cells generates a scattertext plot which is saved in an html file. 
"""

In [31]:
import scattertext as st

In [84]:
import spacy
nlp = spacy.load('en_core_web_sm')



In [32]:
df_sample['Category'] = df_sample['true']

In [33]:
df_sample['Category'] = df_sample['Category'].replace([1,0], ['true', 'fake'])

In [34]:
corpus = st.CorpusFromPandas(df_sample, 
                            category_col='Category', 
                             text_col='text',
                            nlp=nlp).build()

In [37]:
html = st.produce_scattertext_explorer(
        corpus,
        category='true',
        category_name='Real News',
        not_category_name='Fake News',
        minimum_term_frequency=20,
        pmi_threshold_coefficient=5,
        width_in_pixels=1000,
        metadata=df_sample['title']
        )

In [36]:
open('scattertext_demo.html', 'wb').write(html.encode('utf-8'));

In [None]:
"""
Next few cells were an attempt to make a scattertext figure
using scraped data.  For some reason, I could not get this to
work, but would like to in the future
"""

In [3]:
df_comb = pd.read_pickle('df_comb.pickle')

In [4]:
df_comb['Category'] = df_comb['true']
df_comb['Category'] = df_comb['Category'].replace([1,0], ['true', 'fake'])

In [7]:
corpus = st.CorpusFromPandas(df_comb, 
                            category_col='Category', 
                             text_col='text',
                            nlp=nlp).build()

In [11]:
html = st.produce_scattertext_explorer(
        corpus,
        category='true',
        category_name='Real News',
        not_category_name='Fake News',
        minimum_term_frequency=10,
        pmi_threshold_coefficient=5,
        width_in_pixels=1000,
        metadata=df_comb.index)

AssertionError: 

In [16]:
open('scattertext_demo.html', 'wb').write(html.encode('utf-8'));