In [2]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
import nltk
stop_words = nltk.corpus.stopwords.words('english')
from nltk.stem import PorterStemmer
my_stem = PorterStemmer()
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
dictionary = set(w.lower() for w in nltk.corpus.words.words())
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score

In [3]:
politifact = pd.read_csv("raw-data/FakeNewsNet-master/Data/Cleaned_DSPP/politifact.csv")

### remove articles with no text
politifact = politifact[~politifact.text.isna()]

In [4]:
the_df = pd.DataFrame()
## clean/prepare text
for rev, outcome in zip(politifact.text.tolist(), politifact.fake.tolist()):
    
    # only keep words (remove other characters)
    tmp_read = re.sub('[^a-zA-Z]+', ' ', rev).lower()

    #Tokenization and remove stop words
    tmp_read = [word for word in tmp_read.split() if word not in stop_words]

    #dictionary words
    dict_read = [word for word in tmp_read if word in dictionary]
    
    # stemming
    tmp_read_stm = [my_stem.stem(word) for word in tmp_read]
    dict_read_stm = [my_stem.stem(word) for word in dict_read]

    # lemminization
    tmp_read_lem = [lemmatizer.lemmatize(word) for word in tmp_read]
    dict_read_lem = [lemmatizer.lemmatize(word) for word in dict_read]

    
    # rejoin reviews
    tmp_read = ' '.join(tmp_read)
    tmp_read_stm = ' '.join(tmp_read_stm)
    tmp_read_lem = ' '.join(tmp_read_lem)
    
    dict_read = ' '.join(dict_read)
    dict_read_stm = ' '.join(dict_read_stm)
    dict_read_lem = ' '.join(dict_read_lem)


    # add to new df
    tmp = pd.DataFrame([rev], columns=['original text'])
    tmp['body'] = tmp_read
    tmp['body_stem'] = tmp_read_stm
    tmp['body_lem'] = tmp_read_lem
    tmp['body_dict'] = dict_read
    tmp['body_dict_stem'] = dict_read_stm
    tmp['fake'] = outcome

    the_df = the_df.append(tmp, ignore_index=True)

In [5]:
the_df.head()

Unnamed: 0,original text,body,body_stem,body_lem,body_dict,body_dict_stem,fake
0,335 SHARES SHARE THIS STORY\n\nRepublican atta...,shares share story republican attacks transgen...,share share stori republican attack transgend ...,share share story republican attack transgende...,share story republican religious fight keep ge...,share stori republican religi fight keep gende...,1
1,BREAKING!\n\nLiberal rag Huffington Post is re...,breaking liberal rag huffington post really ru...,break liber rag huffington post realli run sto...,breaking liberal rag huffington post really ru...,breaking liberal rag post really running story...,break liber rag post realli run stori washingt...,1
2,Three women who all went missing in the mid-19...,three women went missing mid turned least part...,three women went miss mid turn least part stee...,three woman went missing mid turned least part...,three went missing mid turned least steel indu...,three went miss mid turn least steel industri ...,1
3,"On Monday, Bumble Bee Foods and 2 employees we...",monday bumble bee foods employees charged los ...,monday bumbl bee food employe charg lo angel p...,monday bumble bee food employee charged los an...,monday bumble bee safety death worker industri...,monday bumbl bee safeti death worker industri ...,1
4,"Republican Rep. Trey Gowdy, who sits on the Ho...",republican rep trey gowdy sits house judiciary...,republican rep trey gowdi sit hous judiciari c...,republican rep trey gowdy sits house judiciary...,republican rep trey house judiciary committee ...,republican rep trey hous judiciari committe fr...,1


In [6]:
def article_length(the_df):
    '''count number of words in each speech'''
    l = pd.Series.tolist(the_df[['original text']])
    ct = 0
    count_per_speech = []
    for line in l:
        for i in line:
            i = i.split()
            ct += 1
            count_per_speech.append(len(i))
    the_df['total_words'] = count_per_speech
    return(the_df)

In [7]:
def summary_stats(the_df):
    '''sent_length = num of sentences in speech
       num_word_unique = num of unique words in speech'''
    from nltk.tokenize import sent_tokenize, word_tokenize
    
    sent_len = []
    unique_len = []
    body_basic = []
    dict_len = []
    dict_uniq_len = []
    punctuations = '''!()-[]{}';:'"\,<>./?@#$%^&*_~``'''
    ct = 0
    for i in the_df['original text']:
        sents = sent_tokenize(i)
        tmp = word_tokenize(i)
        tmp_words = [i.lower() for i in tmp if i not in punctuations]
        #dictionary words
        dict_read = [word for word in tmp_words if word in dictionary]
        dict_unique = len(set(dict_read))
        dict_len.append(len(dict_read))
        dict_uniq_len.append(dict_unique)
        
        tmp_unique = len(set(tmp))
        tmp_words = ' '.join(tmp_words)
        sent_len.append(len(sents))
        unique_len.append(tmp_unique)
        body_basic.append(tmp_words)

    the_df['body_basic'] = body_basic
    the_df['sent_length'] = sent_len
    the_df['num_word_unique'] = unique_len
    the_df['num_dict_word_unique'] = dict_uniq_len
    the_df['total_dict_words'] = dict_len
    return(the_df)

In [8]:
the_df = article_length(the_df)
the_df = summary_stats(the_df)

In [9]:
the_df.head()

Unnamed: 0,original text,body,body_stem,body_lem,body_dict,body_dict_stem,fake,total_words,body_basic,sent_length,num_word_unique,num_dict_word_unique,total_dict_words
0,335 SHARES SHARE THIS STORY\n\nRepublican atta...,shares share story republican attacks transgen...,share share stori republican attack transgend ...,share share story republican attack transgende...,share story republican religious fight keep ge...,share stori republican religi fight keep gende...,1,287,335 shares share this story republican attacks...,8,182,132,245
1,BREAKING!\n\nLiberal rag Huffington Post is re...,breaking liberal rag huffington post really ru...,break liber rag huffington post realli run sto...,breaking liberal rag huffington post really ru...,breaking liberal rag post really running story...,break liber rag post realli run stori washingt...,1,367,breaking liberal rag huffington post is really...,16,231,157,302
2,Three women who all went missing in the mid-19...,three women went missing mid turned least part...,three women went miss mid turn least part stee...,three woman went missing mid turned least part...,three went missing mid turned least steel indu...,three went miss mid turn least steel industri ...,1,337,three women who all went missing in the mid-19...,12,219,161,292
3,"On Monday, Bumble Bee Foods and 2 employees we...",monday bumble bee foods employees charged los ...,monday bumbl bee food employe charg lo angel p...,monday bumble bee food employee charged los an...,monday bumble bee safety death worker industri...,monday bumbl bee safeti death worker industri ...,1,258,on monday bumble bee foods and 2 employees wer...,11,167,105,195
4,"Republican Rep. Trey Gowdy, who sits on the Ho...",republican rep trey gowdy sits house judiciary...,republican rep trey gowdi sit hous judiciari c...,republican rep trey gowdy sits house judiciary...,republican rep trey house judiciary committee ...,republican rep trey hous judiciari committe fr...,1,563,republican rep. trey gowdy who sits on the hou...,20,294,204,454


In [9]:
politifact.head()

Unnamed: 0.1,Unnamed: 0,source,date,title,text,fake,date_time
0,PolitiFact_Fake_1-Webpage.json,http://www.occupydemocrats.com,2016-01-12,Virginia Republican Wants Schools To Check Chi...,335 SHARES SHARE THIS STORY\n\nRepublican atta...,1,2016-01-12 15:02:28
1,PolitiFact_Fake_10-Webpage.json,http://usasnich.com,2016-12-11,BREAKING: PUTIN INTERFERENCE COULD GIVE COURTS...,BREAKING!\n\nLiberal rag Huffington Post is re...,1,2016-12-11 13:03:24
2,PolitiFact_Fake_100-Webpage.json,http://freedomcrossroads.us,2017-06-20,BREAKING: Barrels Removed From Clinton Propert...,Three women who all went missing in the mid-19...,1,2017-06-20 16:34:03
3,PolitiFact_Fake_101-Webpage.json,,,,"On Monday, Bumble Bee Foods and 2 employees we...",1,
4,PolitiFact_Fake_102-Webpage.json,,,,"Republican Rep. Trey Gowdy, who sits on the Ho...",1,


In [10]:
# rejoin and sort by date
output = the_df.merge(politifact[["text","date"]], left_on='original text',right_on="text", how='left').sort_values("date").reset_index()
# output.to_csv("./output-data/output.csv")

In [11]:
grouped_df_avg = the_df.groupby(['fake']).mean()
grouped_df_avg

Unnamed: 0_level_0,total_words,sent_length,num_word_unique,num_dict_word_unique,total_dict_words
fake,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,642.266667,27.625,321.358333,222.283333,541.991667
1,394.872881,17.567797,220.322034,159.008475,341.550847


In [12]:
# prop_uniq_dict - of dictionary words in an article, what proportion of the words are unique?
grouped_df_avg['prop_uniq_dict'] = grouped_df_avg['num_dict_word_unique'] / grouped_df_avg['total_dict_words']
# prop_uniq - of total words in an article, what proportion of the words are unique?
grouped_df_avg['prop_uniq'] = grouped_df_avg['num_word_unique'] / grouped_df_avg['total_words']
# prop_dict_words - of total words in an article, what proportion of the words are in a dictionary?
grouped_df_avg['prop_dict_words'] = grouped_df_avg['total_dict_words'] / grouped_df_avg['total_words']

In [13]:
grouped_df_avg

Unnamed: 0_level_0,total_words,sent_length,num_word_unique,num_dict_word_unique,total_dict_words,prop_uniq_dict,prop_uniq,prop_dict_words
fake,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,642.266667,27.625,321.358333,222.283333,541.991667,0.410123,0.50035,0.843873
1,394.872881,17.567797,220.322034,159.008475,341.550847,0.465548,0.557957,0.864964


In [55]:
## null hypothesis: number of unique works of fake and real articles 
## are the same

### https://machinelearningmastery.com/how-to-code-the-students-t-test-from-scratch-in-python/
def independent_ttest(data1, data2, alpha= 0.05):
    from math import sqrt
    from numpy.random import seed
    from numpy.random import randn
    from numpy import mean
    from scipy.stats import sem
    from scipy.stats import t
    # calculate means
    mean1, mean2 = mean(data1), mean(data2)
    # calculate standard errors
    se1, se2 = sem(data1), sem(data2)
    # standard error on the difference between the samples
    sed = sqrt(se1**2.0 + se2**2.0)
    # calculate the t statistic
    t_stat = (mean1 - mean2) / sed
    # degrees of freedom
    df = len(data1) + len(data2) - 2
    # calculate the critical value
    cv = t.ppf(1.0 - alpha, df)
    # calculate the p-value
    p = (1.0 - t.cdf(abs(t_stat), df)) * 2.0
    # return everything
    return t_stat, df, cv, p

**statistically significant difference in the number of unique words in fake vs real news articles.** We can reject the null hypothesis that the means of unique words in these articles are equal.

In [63]:
data1 = the_df[the_df.fake==0].num_word_unique.tolist()
data2 = the_df[the_df.fake==1].num_word_unique.tolist()

t_stat, df, cv, p = independent_ttest(data1, data2)
p

0.0009826839274891253

**statistically significant difference in the length of sentences in fake vs real news articles.**

In [65]:
data1 = the_df[the_df.fake==0].sent_length.tolist()
data2 = the_df[the_df.fake==1].sent_length.tolist()

t_stat, df, cv, p = independent_ttest(data1, data2)
p

0.004532788546822708

**statistically significant difference in the article lengths in fake vs real news articles.**

In [101]:
data1 = the_df[the_df.fake==0].article_length.tolist()
data2 = the_df[the_df.fake==1].article_length.tolist()

t_stat, df, cv, p = independent_ttest(data1, data2)
p

0.0019798857071273712

**standardizing -- proportion of the article**

In [102]:
# prop_uniq_dict - of dictionary words in an article, what proportion of the words are unique?
the_df['prop_uniq_dict'] = the_df['num_dict_word_unique'] / the_df['total_dict_words']
# prop_uniq - of total words in an article, what proportion of the words are unique?
the_df['prop_uniq'] = the_df['num_word_unique'] / the_df['total_words']
# prop_dict_words - of total words in an article, what proportion of the words are in a dictionary?
the_df['prop_dict_words'] = the_df['total_dict_words'] / the_df['total_words']

**statistically significant difference in the proportion of unique dictionary words in fake vs real news articles.**

In [103]:
data1 = the_df[the_df.fake==0].prop_uniq_dict.tolist()
data2 = the_df[the_df.fake==1].prop_uniq_dict.tolist()

t_stat, df, cv, p = independent_ttest(data1, data2)
p

0.000233015652925328

**statistically significant difference in the proportion of unique words in fake vs real news articles.**

In [104]:
data1 = the_df[the_df.fake==0].prop_uniq.tolist()
data2 = the_df[the_df.fake==1].prop_uniq.tolist()

t_stat, df, cv, p = independent_ttest(data1, data2)
p

0.00028340517072944493

**no statistically significant difference in the proportion of total dictionary words in fake vs real news articles.**

In [105]:
data1 = the_df[the_df.fake==0].prop_dict_words.tolist()
data2 = the_df[the_df.fake==1].prop_dict_words.tolist()

t_stat, df, cv, p = independent_ttest(data1, data2)
p

0.07454492366895393

In [107]:
the_df.head()

Unnamed: 0,original text,body,body_stem,body_lem,body_dict,body_dict_stem,fake,article_length,body_basic,sent_length,num_word_unique,total_words,num_dict_word_unique,total_dict_words,prop_uniq_dict,prop_uniq,prop_dict_words
0,335 SHARES SHARE THIS STORY\n\nRepublican atta...,shares share story republican attacks transgen...,share share stori republican attack transgend ...,share share story republican attack transgende...,share story republican religious fight keep ge...,share stori republican religi fight keep gende...,1,287,335 shares share this story republican attacks...,8,182,287,132,245,0.538776,0.634146,0.853659
1,BREAKING!\n\nLiberal rag Huffington Post is re...,breaking liberal rag huffington post really ru...,break liber rag huffington post realli run sto...,breaking liberal rag huffington post really ru...,breaking liberal rag post really running story...,break liber rag post realli run stori washingt...,1,367,breaking liberal rag huffington post is really...,16,231,367,157,302,0.519868,0.629428,0.822888
2,Three women who all went missing in the mid-19...,three women went missing mid turned least part...,three women went miss mid turn least part stee...,three woman went missing mid turned least part...,three went missing mid turned least steel indu...,three went miss mid turn least steel industri ...,1,337,three women who all went missing in the mid-19...,12,219,337,161,292,0.55137,0.649852,0.866469
3,"On Monday, Bumble Bee Foods and 2 employees we...",monday bumble bee foods employees charged los ...,monday bumbl bee food employe charg lo angel p...,monday bumble bee food employee charged los an...,monday bumble bee safety death worker industri...,monday bumbl bee safeti death worker industri ...,1,258,on monday bumble bee foods and 2 employees wer...,11,167,258,105,195,0.538462,0.647287,0.755814
4,"Republican Rep. Trey Gowdy, who sits on the Ho...",republican rep trey gowdy sits house judiciary...,republican rep trey gowdi sit hous judiciari c...,republican rep trey gowdy sits house judiciary...,republican rep trey house judiciary committee ...,republican rep trey hous judiciari committe fr...,1,563,republican rep. trey gowdy who sits on the hou...,20,294,563,204,454,0.449339,0.522202,0.806394


#### Part of Speech analysis

In [21]:
import gensim

In [63]:
the_adj = ['JJ', 'JJR', 'JJS']
the_verb = ['VB', 'VBD', 'VGB', 'VBN', 'VBP', 'VBZ']
adj_set_real = []
adj_set_fake = []

for rev, outcome in zip(politifact.text.tolist(), politifact.fake.tolist()):
    
    # only keep words (remove other characters)
    tmp_read = re.sub('[^a-zA-Z]+', ' ', rev).lower()

    #Tokenization and remove stop words
    tmp_read = [word for word in tmp_read.split() if word not in stop_words]

    pos = nltk.pos_tag(tmp_read)
    for w in pos:
        if nltk.pos_tag(w)[0][1] in the_adj and outcome==1:
            adj_set_fake.add(w[0])
        elif nltk.pos_tag(w)[0][1]  in the_adj and outcome==0:
            adj_set_real.add(w[0])


In [None]:
# more to explore here!