In [1]:
#from wordcloud import STOPWORDS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
## read in data from csv
realOrFake = pd.read_csv('fake_or_real_news.csv')
realOrFake.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
## dropped unnamed column- it is unique identifier/non-predictive
realOrFake.drop(realOrFake.columns[realOrFake.columns.str.contains('unnamed',case = False)], axis=1, inplace=True)
realOrFake.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
#convert label to int
d = {'FAKE': 1, 'REAL': 0}
# value of 1 indicates Partner = Yes
realOrFake['Fake'] = realOrFake['label'].map(d)
realOrFake.head()

Unnamed: 0,title,text,label,Fake
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,1
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,1
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,0


In [5]:
realOrFake.drop(['label'], axis=1, inplace=True)
realOrFake.head()

Unnamed: 0,title,text,Fake
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0


In [6]:
realOrFake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 3 columns):
title    6335 non-null object
text     6335 non-null object
Fake     6335 non-null int64
dtypes: int64(1), object(2)
memory usage: 148.6+ KB


In [7]:
realOrFake.iloc[0]

title                         You Can Smell Hillary’s Fear
text     Daniel Greenfield, a Shillman Journalism Fello...
Fake                                                     1
Name: 0, dtype: object

In [8]:
fake = realOrFake[(realOrFake['Fake']==1)]
fake.head()

Unnamed: 0,title,text,Fake
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
5,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",1
6,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",1


In [9]:
real = realOrFake[(realOrFake['Fake']==0)]
real.head()

Unnamed: 0,title,text,Fake
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0
7,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,0
8,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,0
9,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,0


In [10]:
fake.count()

title    3164
text     3164
Fake     3164
dtype: int64

In [11]:
real.count()

title    3171
text     3171
Fake     3171
dtype: int64

Note: fake and real samples are ~equal number of real and fake news articles so no need to balance the classes

### Train Test Split

In [12]:
y = realOrFake['Fake']

In [13]:
X = realOrFake['text']

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=53)

### Model 1: Count vectorizer- Text only

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

#ignore english stop words
cv = CountVectorizer(stop_words='english')

count_train = cv.fit_transform(X_train)

In [16]:
cv.vocabulary_

{'report': 42470,
 'copyright': 12105,
 'violation': 54177,
 'think': 50628,
 'doom': 15924,
 'sayers': 44520,
 'trump': 51896,
 'office': 35783,
 'notice': 35256,
 'glp': 21881,
 'republican': 42534,
 'bush': 8399,
 'left': 29531,
 'doomsaying': 15927,
 'increases': 25686,
 'sure': 49203,
 'effect': 16814,
 'opposite': 36087,
 'gets': 21568,
 'increase': 25684,
 'political': 38823,
 'spectrum': 47506,
 'page': 36831,
 'election': 16972,
 '232': 762,
 'photos': 38140,
 '43': 1127,
 'numbers': 35403,
 '131': 229,
 'quotes': 40794,
 'candidates': 8779,
 'center': 9445,
 'email': 17139,
 'wonder': 55615,
 'mind': 32729,
 'today': 51050,
 'notable': 35236,
 'people': 37695,
 'don': 15867,
 'miss': 32983,
 'unbelievable': 52435,
 'roundup': 43680,
 'best': 6494,
 'talked': 49772,
 'day': 13519,
 'course': 12424,
 'schindler': 44691,
 'list': 30107,
 'saddest': 44042,
 'movies': 33689,
 'time': 50906,
 'going': 21981,
 'brag': 7659,
 'chance': 9616,
 'liam': 29814,
 'neeson': 34495,
 'humili

In [17]:
cv.inverse_transform(count_train)

[array([], 
       dtype='<U91'),
 array(['page', 'spectrum', 'political', 'increase', 'gets', 'opposite',
        'effect', 'sure', 'increases', 'doomsaying', 'left', 'bush',
        'republican', 'glp', 'notice', 'office', 'trump', 'sayers', 'doom',
        'think', 'violation', 'copyright', 'report'], 
       dtype='<U91'),
 array(['center', 'candidates', 'quotes', '131', 'numbers', '43', 'photos',
        '232', 'election'], 
       dtype='<U91'),
 array(['chandler', 'kyle', 'love', 'stuff', 'good', 'garcia', 'lorena',
        'soon', 'coming', 'knowing', 'laugh', 'food', 'joke', 'little',
        'fun', 'solitary', 'sending', 'whisper', 'like', 'microwave',
        'humility', 'neeson', 'liam', 'chance', 'brag', 'going', 'time',
        'movies', 'saddest', 'list', 'schindler', 'course', 'day', 'talked',
        'best', 'roundup', 'unbelievable', 'miss', 'don', 'people',
        'notable', 'today', 'mind', 'wonder', 'email', 'quotes'], 
       dtype='<U91'),
 array(['head', 'rogue

In [18]:
#transform test to match train
count_test = cv.transform(X_test)

#### Naive Bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB

nb1 = MultinomialNB()

nb1.fit(count_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
print ("Classification rate for NB- Model 1 (Train):", round(nb1.score(count_train, y_train),4)*100,"%")
print ("Classification rate for NB- Model 1 (Test):", round(nb1.score(count_test, y_test),4)*100,"%")

Classification rate for NB- Model 1 (Train): 94.67 %
Classification rate for NB- Model 1 (Test): 89.34 %


### Model 2: TFID Vectorizer- Text only

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(stop_words='english')

tfid_train = tv.fit_transform(X_train)

In [22]:
tfid_train.shape

(4244, 56922)

In [23]:
tfid_test = cv.transform(X_test)

#### Naive Bayes

In [24]:
from sklearn.naive_bayes import MultinomialNB

nb2 = MultinomialNB()

nb2.fit(tfid_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
print ("Classification rate for NB- Model 2 (Train):", round(nb2.score(tfid_train, y_train),4)*100,"%")
print ("Classification rate for NB- Model 2 (Test):", round(nb2.score(tfid_test, y_test),4)*100,"%")

Classification rate for NB- Model 2 (Train): 91.49 %
Classification rate for NB- Model 2 (Test): 79.29 %


Without making any adjustments to the Vectorization methods, NB for count vectorizer performs better then for TFID vectorizer.

### Model 3: Count vectorizer- Title only

In [26]:
X = realOrFake['title']
y = realOrFake['Fake']

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=53)

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')

count_train2 = cv.fit_transform(X_train)

In [29]:
cv.vocabulary_

{'reader': 5763,
 'refers': 5842,
 'englishman': 2475,
 'pat': 5161,
 'condell': 1560,
 'brexit': 1011,
 'trump': 7360,
 'election': 2394,
 'america': 397,
 'moment': 4616,
 'truth': 7373,
 'think': 7163,
 'doom': 2237,
 'sayers': 6236,
 'office': 4947,
 'bernie': 801,
 'california': 1128,
 'endgame': 2453,
 'said': 6196,
 'liam': 4157,
 'neeson': 4785,
 'lorena': 4251,
 'garcia': 3035,
 'kyle': 4027,
 'chandler': 1285,
 'say': 6235,
 'wells': 7777,
 'fargo': 2706,
 'rotting': 6130,
 'gop': 3136,
 'mount': 4664,
 'party': 5149,
 'challenge': 1278,
 'shiite': 6429,
 'militia': 4539,
 'says': 6238,
 'close': 1437,
 'tal': 7046,
 'afar': 283,
 'turkey': 7388,
 'warned': 7714,
 'limits': 4195,
 'blood': 892,
 'money': 4623,
 'killer': 3976,
 'cops': 1680,
 'privatization': 5517,
 'funding': 3008,
 'racist': 5695,
 'logic': 4234,
 'police': 5357,
 'charlie': 1308,
 'hebdo': 3330,
 'attack': 590,
 'terrorists': 7129,
 'killed': 3975,
 'raids': 5707,
 'food': 2902,
 'mixology': 4588,
 'eaten'

In [30]:
cv.inverse_transform(count_train2)

[array(['truth', 'moment', 'america', 'election', 'trump', 'brexit',
        'condell', 'pat', 'englishman', 'refers', 'reader'], 
       dtype='<U18'), array(['office', 'sayers', 'doom', 'think', 'trump'], 
       dtype='<U18'), array(['endgame', 'california', 'bernie'], 
       dtype='<U18'), array(['say', 'chandler', 'kyle', 'garcia', 'lorena', 'neeson', 'liam',
        'said'], 
       dtype='<U18'), array(['rotting', 'fargo', 'wells'], 
       dtype='<U18'), array(['challenge', 'party', 'mount', 'gop', 'trump'], 
       dtype='<U18'), array(['limits', 'warned', 'turkey', 'afar', 'tal', 'close', 'says',
        'militia', 'shiite'], 
       dtype='<U18'), array(['police', 'logic', 'racist', 'funding', 'privatization', 'cops',
        'killer', 'money', 'blood', 'america'], 
       dtype='<U18'), array(['raids', 'killed', 'terrorists', 'attack', 'hebdo', 'charlie'], 
       dtype='<U18'), array(['health', 'boost', 'foods', 'eaten', 'mixology', 'food'], 
       dtype='<U18'), array([

In [31]:
count_test2 = cv.transform(X_test)

In [32]:
from sklearn.naive_bayes import MultinomialNB

nb3 = MultinomialNB()

nb3.fit(count_train2, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
print ("Classification rate for NB- Model 3 (Train):", round(nb3.score(count_train2, y_train),4)*100,"%")
print ("Classification rate for NB- Model 3 (Test):", round(nb3.score(count_test2, y_test),4)*100,"%")

Classification rate for NB- Model 3 (Train): 94.34 %
Classification rate for NB- Model 3 (Test): 79.72 %


Applying count vectorization on the title only performed much worse than on the text only.

### Model 4: TFID Vectorizer- Title only

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(stop_words='english')

tfid_train2 = tv.fit_transform(X_train)

In [35]:
tfid_test2 = cv.transform(X_test)

In [36]:
from sklearn.naive_bayes import MultinomialNB

nb4 = MultinomialNB()

nb4.fit(tfid_train2, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [37]:
print ("Classification rate for NB- Model 2 (Train):", round(nb4.score(tfid_train2, y_train),4)*100,"%")
print ("Classification rate for NB- Model 2 (Test):", round(nb4.score(tfid_test2, y_test),4)*100,"%")

Classification rate for NB- Model 2 (Train): 95.15 %
Classification rate for NB- Model 2 (Test): 79.96 %


### Model 5: Count vectorizer- Title and text

In [38]:
X = realOrFake.loc[:, :'text']

In [39]:
print(X.shape)
print(y.shape)

(6335, 2)
(6335,)


In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=53)

In [41]:
print(X_train.shape)
print(y_train.shape)

(4244, 2)
(4244,)


In [42]:
X_train.head()

Unnamed: 0,title,text
2576,A Reader Refers Us To Englishman Pat Condell O...,
1539,Do you think there will be as many doom sayers...,Report Copyright Violation Do you think there ...
5163,Bernie's California endgame,"The election in 232 photos, 43 numbers and 131..."
2615,"They Said What?!: Find Out What Liam Neeson, L...",Email Ever wonder what’s on the mind of today’...
4270,Wells Fargo is Rotting from the Top Down,Wells Fargo is Rotting from the Top Down Wells...


In [43]:
## addressing the two y columns 
import scipy.sparse as sp

title_vectorizer = CountVectorizer(stop_words='english')
title_vectors = title_vectorizer.fit_transform(X_train['title'])
title_test = title_vectorizer.transform(X_test['title'])

text_vectorizer = CountVectorizer(stop_words='english')
text_vectors = text_vectorizer.fit_transform(X_train['text'])
text_test = text_vectorizer.transform(X_test['text'])

combined_train = sp.hstack([title_vectors, text_vectors], format='csr')
combined_test = sp.hstack([title_test, text_test], format='csr')

In [44]:
print(combined_train.shape)
print(combined_test.shape)
print(y_train.shape)
print(y_test.shape)

(4244, 64881)
(2091, 64881)
(4244,)
(2091,)


In [45]:
from sklearn.naive_bayes import MultinomialNB

nb5 = MultinomialNB()

nb5.fit(combined_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [46]:
## Best performance on training set so far
print ("Classification rate for NB- Model 5 (Train):", round(nb5.score(combined_train, y_train),4)*100,"%")
print ("Classification rate for NB- Model 5 (Test):", round(nb5.score(combined_test, y_test),4)*100,"%")

Classification rate for NB- Model 5 (Train): 95.26 %
Classification rate for NB- Model 5 (Test): 89.72 %


#### Bernoulli Naive Bayes

In [47]:
from sklearn.naive_bayes import BernoulliNB

nb5b = BernoulliNB()

nb5b.fit(combined_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [48]:
## Did not perform better
print ("Classification rate for NB- Model 5 w Bernoulli (Train):", round(nb5b.score(combined_train, y_train),4)*100,"%")
print ("Classification rate for NB- Model 5 w Bernoulli (Test):", round(nb5b.score(combined_test, y_test),4)*100,"%")

Classification rate for NB- Model 5 w Bernoulli (Train): 88.83 %
Classification rate for NB- Model 5 w Bernoulli (Test): 83.36 %


### Model 6: TFID vectorizer- Title and text

In [49]:
## addressing the two y columns 
import scipy.sparse as sp

title_tf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
title_tf_vectors = title_tf_vectorizer.fit_transform(X_train['title'])
title_tf_test = title_tf_vectorizer.transform(X_test['title'])

text_tf_vectorizer = TfidfVectorizer(stop_words='english')
text_tf_vectors = text_tf_vectorizer.fit_transform(X_train['text'])
text_tf_test = text_tf_vectorizer.transform(X_test['text'])

combined_tf_train = sp.hstack([title_tf_vectors, text_tf_vectors], format='csr')
combined_tf_test = sp.hstack([title_tf_test, text_tf_test], format='csr')

In [50]:
print(combined_tf_train.shape)
print(combined_tf_test.shape)
print(y_train.shape)
print(y_test.shape)

(4244, 64881)
(2091, 64881)
(4244,)
(2091,)


In [51]:
from sklearn.naive_bayes import MultinomialNB

nb6 = MultinomialNB()

nb6.fit(combined_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [52]:
## tied for best performance on test set
print ("Classification rate for NB- Model 6 (Train):", round(nb6.score(combined_tf_train, y_train),4)*100,"%")
print ("Classification rate for NB- Model 6  (Test):", round(nb6.score(combined_tf_test, y_test),4)*100,"%")

Classification rate for NB- Model 6 (Train): 97.38 %
Classification rate for NB- Model 6  (Test): 89.57 %


### Model 7 and 8: Model 5 with new alpha

In [53]:
## addressing the two y columns 
import scipy.sparse as sp

title_vectorizer = CountVectorizer(stop_words='english')
title_vectors = title_vectorizer.fit_transform(X_train['title'])
title_test = title_vectorizer.transform(X_test['title'])

text_vectorizer = CountVectorizer(stop_words='english')
text_vectors = text_vectorizer.fit_transform(X_train['text'])
text_test = text_vectorizer.transform(X_test['text'])

combined_train = sp.hstack([title_vectors, text_vectors], format='csr')
combined_test = sp.hstack([title_test, text_test], format='csr')

In [54]:
from sklearn.naive_bayes import MultinomialNB

nb7 = MultinomialNB(alpha=0.50)

nb7.fit(combined_train, y_train)

MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)

In [55]:
print ("Classification rate for NB- Model 7 (Train):", round(nb7.score(combined_train, y_train),4)*100,"%")
print ("Classification rate for NB- Model 7 (Test):", round(nb7.score(combined_test, y_test),4)*100,"%")

Classification rate for NB- Model 7 (Train): 96.02 %
Classification rate for NB- Model 7 (Test): 90.1 %


In [56]:
from sklearn.naive_bayes import MultinomialNB

nb8 = MultinomialNB(alpha=0.10)

nb8.fit(combined_train, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [57]:
## best performance so far!
print ("Classification rate for NB- Model 8 (Train):", round(nb8.score(combined_train, y_train),4)*100,"%")
print ("Classification rate for NB- Model 8 (Test):", round(nb8.score(combined_test, y_test),4)*100,"%")

Classification rate for NB- Model 8 (Train): 97.27 %
Classification rate for NB- Model 8 (Test): 90.2 %


### Model 9 and 10: Model 6 with tweaks to TFID Vectorizer

In [58]:
## using max_df to remove very common words
import scipy.sparse as sp

title_tf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8)
title_tf_vectors = title_tf_vectorizer.fit_transform(X_train['title'])
title_tf_test = title_tf_vectorizer.transform(X_test['title'])

text_tf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8)
text_tf_vectors = text_tf_vectorizer.fit_transform(X_train['text'])
text_tf_test = text_tf_vectorizer.transform(X_test['text'])

combined_tf_train = sp.hstack([title_tf_vectors, text_tf_vectors], format='csr')
combined_tf_test = sp.hstack([title_tf_test, text_tf_test], format='csr')

In [59]:
from sklearn.naive_bayes import MultinomialNB

nb9 = MultinomialNB()

nb9.fit(combined_tf_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [60]:
## this hurts performance a lot
print ("Classification rate for NB- Model 9 (Train):", round(nb9.score(combined_tf_train, y_train),4)*100,"%")
print ("Classification rate for NB- Model 9 (Test):", round(nb9.score(combined_tf_test, y_test),4)*100,"%")

Classification rate for NB- Model 9 (Train): 94.89 %
Classification rate for NB- Model 9 (Test): 88.19 %


In [61]:
## using min_df to remove uncommon words
import scipy.sparse as sp

title_tf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.1)
title_tf_vectors = title_tf_vectorizer.fit_transform(X_train['title'])
title_tf_test = title_tf_vectorizer.transform(X_test['title'])

text_tf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.1)
text_tf_vectors = text_tf_vectorizer.fit_transform(X_train['text'])
text_tf_test = text_tf_vectorizer.transform(X_test['text'])

combined_tf_train = sp.hstack([title_tf_vectors, text_tf_vectors], format='csr')
combined_tf_test = sp.hstack([title_tf_test, text_tf_test], format='csr')

In [62]:
from sklearn.naive_bayes import MultinomialNB

nb10 = MultinomialNB()

nb10.fit(combined_tf_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [63]:
## this hurts performance a lot
print ("Classification rate for NB- Model 10 (Train):", round(nb10.score(combined_tf_train, y_train),4)*100,"%")
print ("Classification rate for NB- Model 10 (Test):", round(nb10.score(combined_tf_test, y_test),4)*100,"%")

Classification rate for NB- Model 10 (Train): 84.52 %
Classification rate for NB- Model 10 (Test): 82.54 %


### Best outcome: Model 8- CountVectorizer with alpha of .01