In [28]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

In [29]:
filepath = 'C:\\Users\\Dinesh\\Desktop\\IR\\fake_or_real_news1.csv'
fake_real_data = pd.read_csv(filepath)

In [30]:
fake_real_data.drop(['Unnamed: 0'],axis=1).head()

Unnamed: 0,title,text,label,clean_title,clean_text
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,smell hillary s fear,daniel greenfield shillman journalism fellow ...
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,watch exact moment paul ryan committed politic...,google pinterest digg linkedin reddit stumbleu...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,kerry go paris gesture sympathy,u.s. secretary state john f. kerry said monday...
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,bernie supporters twitter erupt anger dnc we...,kaydee king kaydeeking november 2016 lesson...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,battle new york primary matters,primary day new york front runners hillary cli...


In [31]:
X_body_text = fake_real_data['clean_text'].values
X_title_text = fake_real_data['clean_title'].values
y = fake_real_data['label'].values

In [32]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_df= 0.99, min_df= 0.01)

In [33]:
X_body_tfidf = tfidf.fit_transform(X_body_text.astype('U'))
X_title_tfidf = tfidf.fit_transform (X_title_text.astype('U'))

In [34]:
indices = fake_real_data.index.values

In [35]:
X_body_tfidf_train, X_body_tfidf_test,y_body_train, y_body_test,indices_body_train, indices_body_test = train_test_split(X_body_tfidf, y, indices, test_size = 0.2, random_state=42)

In [36]:
fake_real_data.loc[indices_body_train].groupby('label').agg('count')

Unnamed: 0_level_0,Unnamed: 0,title,text,clean_title,clean_text
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2536,2536,2536,2534,2508
1,2532,2532,2532,2532,2532


In [37]:
fake_real_data.loc[indices_body_test].groupby('label').agg('count')

Unnamed: 0_level_0,Unnamed: 0,title,text,clean_title,clean_text
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,628,628,628,628,620
1,639,639,639,639,639


In [38]:
nb_body = MultinomialNB()

In [39]:
nb_body.fit(X_body_tfidf_train, y_body_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [40]:
y_body_train_pred = nb_body.predict(X_body_tfidf_train)

In [41]:
print('Naive Bayes In Sample F1 and Accuracy Scores:')
print('F1 score {:.4}%'.format(f1_score(y_body_train, y_body_train_pred, average='macro')*100 ))
print ('Accuracy score {:.4}%'.format(accuracy_score(y_body_train, y_body_train_pred)*100))

Naive Bayes In Sample F1 and Accuracy Scores:
F1 score 90.21%
Accuracy score 90.21%


In [42]:
np.where(y_body_train != y_body_train_pred)

(array([  18,   20,   29,   58,   63,   82,   94,   98,  100,  101,  105,
         132,  133,  141,  145,  147,  163,  164,  179,  180,  188,  198,
         212,  220,  221,  230,  233,  240,  250,  268,  270,  274,  275,
         322,  353,  354,  355,  366,  367,  372,  382,  391,  392,  394,
         400,  404,  435,  443,  454,  463,  481,  482,  488,  490,  494,
         499,  508,  519,  520,  535,  550,  556,  566,  574,  581,  614,
         624,  637,  650,  654,  660,  662,  664,  709,  722,  754,  763,
         765,  772,  781,  791,  804,  829,  836,  848,  852,  860,  861,
         869,  892,  901,  907,  916,  919,  923,  924,  932,  952,  968,
         980,  996, 1001, 1006, 1009, 1042, 1060, 1067, 1083, 1086, 1089,
        1101, 1105, 1111, 1122, 1126, 1135, 1139, 1146, 1158, 1166, 1191,
        1192, 1197, 1236, 1241, 1249, 1250, 1251, 1255, 1259, 1265, 1269,
        1271, 1273, 1300, 1305, 1306, 1309, 1315, 1319, 1321, 1348, 1354,
        1375, 1377, 1383, 1390, 1394, 

In [43]:
fake_real_data.drop(['Unnamed: 0'],axis=1).loc[[18,20,29,58,63,82,94,98,100,101,105]]

Unnamed: 0,title,text,label,clean_title,clean_text
18,What's in that Iran bill that Obama doesn't like?,"Washington (CNN) For months, the White House a...",1,what s iran bill obama like,washington cnn months white house congress ...
20,The slippery slope to Trump’s proposed ban on ...,"With little fanfare this fall, the New York de...",1,slippery slope trump s proposed ban muslims,little fanfare fall new york developer planne...
29,"Syrian War Report – November 1, 2016: Syrian M...","Syrian War Report – October 31, 2016: Al-Nusra...",0,syrian war report november 2016 syrian milita...,syrian war report october 31 2016 al nusra l...
58,Police Turn In Badges Rather Than Incite Viole...,It should be evident if you’re following news ...,0,police turn badges rather incite violence stan...,evident you re following news concerning stand...
63,Comment on Quid Pro Quo? Wikileaks Email Revea...,New Wikileaks email dumps have revealed massiv...,0,comment quid pro quo wikileaks email reveals ...,new wikileaks email dumps revealed massive cor...
82,"To get around Congress, Obama turns to city halls",WASHINGTON — President Obama has quietly racke...,1,get around congress obama turns city halls,washington president obama quietly racked seri...
94,French police hunt two brothers accused of Cha...,A third suspect has turned himself in. Prime M...,1,french police hunt two brothers accused charli...,third suspect turned in. prime minister valls ...
98,"Fireworks erupt between Trump and Bush, Rubio ...",Sparks flew at the toughest and liveliest GOP ...,1,fireworks erupt trump bush rubio cruz gop debate,sparks flew toughest liveliest gop primary deb...
100,How Ted Cruz Became Ted Cruz,"In 2008, in the high-profile Supreme Court gun...",1,ted cruz became ted cruz,2008 high profile supreme court gun rights ca...
101,Newly Approved GM Potatoes Have Potential to S...,"Late last week, the US Department of Agricultu...",0,newly approved gm potatoes potential silence h...,late last week us department agriculture usd...


In [44]:
y_body_pred = nb_body.predict(X_body_tfidf_test)

In [45]:
print('Naive Bayes F1 and Accuracy Scores:')
print('F1 score {:.4}%'.format(f1_score(y_body_test, y_body_pred, average='macro')*100 ))
print ('Accuracy score {:.4}%'.format(accuracy_score(y_body_test, y_body_pred)*100))

Naive Bayes F1 and Accuracy Scores:
F1 score 89.74%
Accuracy score 89.74%


In [46]:
np.where(y_body_test != y_body_pred)

(array([  10,   16,   33,   58,   68,   71,   80,   87,  106,  128,  130,
         139,  145,  153,  154,  191,  202,  214,  224,  228,  229,  242,
         259,  262,  265,  274,  275,  276,  290,  322,  343,  347,  349,
         353,  361,  382,  384,  395,  397,  402,  406,  409,  422,  436,
         484,  497,  498,  499,  514,  517,  518,  532,  536,  539,  554,
         556,  557,  575,  578,  604,  610,  612,  616,  624,  634,  658,
         662,  674,  691,  704,  721,  728,  735,  745,  752,  767,  769,
         777,  788,  809,  822,  831,  845,  848,  851,  876,  895,  905,
         919,  928,  936,  945,  946,  951,  964,  974,  976,  988,  993,
         998, 1009, 1026, 1029, 1031, 1038, 1046, 1067, 1071, 1078, 1091,
        1093, 1103, 1110, 1114, 1118, 1126, 1130, 1143, 1156, 1164, 1166,
        1171, 1192, 1198, 1201, 1208, 1226, 1244, 1250, 1264]),)

In [47]:
spam = fake_real_data[fake_real_data['label']==1]

In [48]:
spam_words = nltk.word_tokenize(" ".join(spam['clean_text'].values.tolist()))

In [49]:
spam_counter = Counter(spam_words)
print(spam_counter.most_common(50))

[('.', 122301), ('s', 34236), ('said', 17202), ('trump', 15704), ('clinton', 10658), ('would', 7858), ('one', 6680), ('people', 6309), ('state', 6261), ('president', 6141), ('it', 6102), ('obama', 5972), ('new', 5808), ('campaign', 5581), ('t', 5513), ('also', 5030), ('republican', 4996), ('party', 4464), ('could', 4055), ('like', 3929), ('time', 3922), ('states', 3917), ('sanders', 3824), ('two', 3793), ('that', 3765), ('even', 3555), ('house', 3554), ('we', 3527), ('percent', 3468), ('first', 3420), ('political', 3359), ('many', 3273), ('u.s.', 3236), ('voters', 3199), ('republicans', 3195), ('year', 3149), ('he', 3122), ('presidential', 3108), ('the', 3074), ('democratic', 3066), ('cruz', 3060), ('last', 2968), ('going', 2951), ('told', 2868), ('say', 2829), ('white', 2821), ('get', 2818), ('years', 2778), ('news', 2732), ('american', 2707)]


In [50]:
spam_wordcloud = WordCloud(width=1200, height=1000, random_state = 42).generate(" ".join(spam_words))
spam_bigrams = nltk.bigrams(spam_words)
spam_counter = Counter(spam_bigrams)
print(spam_counter.most_common(10))

[(('said', '.'), 4625), (('it', 's'), 2913), (('trump', 's'), 2730), (('.', 'trump'), 2552), (('that', 's'), 2111), (('.', 'it'), 2094), (('he', 's'), 2043), (('donald', 'trump'), 2037), (('.', 'we'), 1846), (('hillary', 'clinton'), 1844)]


In [51]:
ham = fake_real_data[fake_real_data['label']==0]
ham.dropna()
ham["clean_text"]= ham["clean_text"].astype(str)

In [52]:
ham_words = nltk.word_tokenize(" ".join(ham['clean_text'].values.tolist()))

In [53]:
ham_counter = Counter(ham_words)
print(ham_counter.most_common(50))

[('.', 83934), ('s', 19541), ('clinton', 6862), ('trump', 6731), ('t', 5507), ('people', 5437), ('one', 5192), ('us', 5171), ('would', 4923), ('it', 4796), ('hillary', 4597), ('said', 4001), ('new', 3523), ('like', 3320), ('also', 3195), ('world', 3183), ('election', 3081), ('time', 3030), ('state', 2934), ('even', 2926), ('government', 2821), ('president', 2699), ('many', 2594), ('2016', 2562), ('war', 2527), ('american', 2524), ('could', 2505), ('states', 2318), ('obama', 2262), ('russia', 2223), ('years', 2215), ('first', 2198), ('that', 2188), ('campaign', 2135), ('media', 2127), ('get', 2076), ('two', 2075), ('u.s.', 2034), ('know', 1972), ('may', 1965), ('the', 1962), ('well', 1922), ('fbi', 1915), ('donald', 1886), ('year', 1885), ('political', 1856), ('way', 1854), ('country', 1845), ('america', 1839), ('right', 1798)]


In [54]:
ham_wordcloud = WordCloud(width=1200, height=1000, random_state = 42).generate(" ".join(ham_words))
ham_bigrams = nltk.bigrams(ham_words)
ham_counter = Counter(ham_bigrams)
print(ham_counter.most_common(10))

[(('hillary', 'clinton'), 2481), (('it', 's'), 2478), (('donald', 'trump'), 1717), (('don', 't'), 1507), (('united', 'states'), 1438), (('.', 'it'), 1148), (('it', '.'), 1094), (('clinton', 's'), 1066), (('that', 's'), 1001), (('trump', 's'), 845)]
