In [1]:
import pandas as pd

In [2]:
# Load the fake-news training split; the path is relative to the notebook's working directory.
data=pd.read_csv('fake-news/train.csv')


In [3]:
# Preview the first rows of the raw training frame.
data.head()

# Only the `title` column is used as the model input; `label` is the prediction target.

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# Count the missing values in every column of the raw training frame.

data.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [5]:
# (rows, columns) of the raw training frame.
data.shape

(20800, 5)

In [6]:
# Keep only the model input (`title`) and the target (`label`).
# Selecting the wanted columns, instead of dropping the unwanted ones, keeps this
# cell re-runnable: a second drop of 'text'/'author'/'id' on the already reduced
# frame raises KeyError. `.copy()` detaches the result from the raw frame.

data = data[['title', 'label']].copy()

In [7]:
# The `title` column itself still contains missing values.

data.isna().sum()

title    558
label      0
dtype: int64

In [8]:
# `title` is the only feature we use, so a row without a title carries no
# usable input and is removed.

data = data.dropna(axis=0, how='any')

In [9]:
# Confirm that no missing values remain after the drop.
data.isna().sum()

title    0
label    0
dtype: int64

In [10]:
# (rows, columns) after removing the rows with a missing title.
data.shape

(20242, 2)

In [11]:
# Preview the cleaned two-column frame (title, label).
data.head()

Unnamed: 0,title,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [12]:
# Feature column (headline text) and target column (label), both still Series here.
X, y = data['title'], data['label']

In [13]:
# X is one-dimensional at this point: one title per remaining row.
X.shape

(20242,)

In [14]:
#importing all necessary modules that we will be using to build our LSTM neural network

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [15]:
# Dropping the NaN rows left gaps in the index; rebuild a contiguous 0..n-1 index.
# The old labels are kept as an `index` column here (this also turns X into a
# DataFrame); the next cell removes that column.

X = X.reset_index(drop=False)

In [16]:
# Discard the old row labels that reset_index() stored in the `index` column.
X = X.drop(columns=['index'])

In [17]:
# The last labels should now run up to n-1 without gaps.
X.tail()

Unnamed: 0,title
20237,Rapper T.I.: Trump a ’Poster Child For White S...
20238,"N.F.L. Playoffs: Schedule, Matchups and Odds -..."
20239,Macy’s Is Said to Receive Takeover Approach by...
20240,"NATO, Russia To Hold Parallel Exercises In Bal..."
20241,What Keeps the F-35 Alive


In [18]:
# Re-index the labels the same way as X so both stay aligned row for row;
# the old labels land in an `index` column that the next cell removes.
y = y.reset_index(drop=False)

In [19]:
# Discard the old row labels that reset_index() stored in the `index` column.
y = y.drop(columns=['index'])

In [20]:
# The label frame should end at the same index value as X.
y.tail()

Unnamed: 0,label
20237,0
20238,0
20239,0
20240,1
20241,1


In [21]:
# nltk supplies the English stop-word list and the Porter stemmer used to normalise each title.

# re (regular expressions) is used to replace every non-letter character with a space before tokenising.
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [22]:

ps = PorterStemmer()

# Build the stop-word lookup once. The original evaluated
# stopwords.words('english') again for every single token and searched the
# returned list linearly; a set built up front gives the same membership answers.
english_stopwords = set(stopwords.words('english'))

# Clean every title: replace anything that is not a letter with a space,
# lowercase, split on whitespace, drop stop words and stem what remains.
# Lemmatisation could be used in place of stemming.
# `corpus` ends up holding one cleaned, space-joined string per row of X.
corpus = []
for title in X['title']:
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

In [23]:
# Spot-check one cleaned title.
corpus[30]

'chuck todd buzzfe eic publish fake news breitbart'

In [24]:
# Vocabulary size: upper bound for the word indices produced by one_hot()
# and the number of rows in the Embedding layer.
voc_size=5000

In [25]:
# Turn each cleaned title into a list of integer word indices below voc_size.
# Despite its name, one_hot() yields indices, not one-hot vectors.
# NOTE(review): per the Keras docs the indices come from hashing each word, so
# distinct words can share an index and the mapping may differ between Python
# sessions - confirm before persisting the model.

onehot_repr = [one_hot(sentence, voc_size) for sentence in corpus]

In [26]:
# Number of word indices in the first encoded title.
len(onehot_repr[0])

10

In [27]:
# Another title encodes to a different length, which is why padding is needed next.
len(onehot_repr[700])

5

In [28]:
# Every encoded title is brought to the same length so that the whole corpus
# forms one rectangular integer array.
sent_length = 25

# Shorter titles are filled with zeros at the front ('pre').
# NOTE(review): titles with more than sent_length tokens are presumably cut by
# pad_sequences' default truncation - confirm which side is dropped.
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='pre')
print(embedded_docs)

[[   0    0    0 ... 2122  902 4257]
 [   0    0    0 ... 2334 2638 4244]
 [   0    0    0 ... 4148 1787 1255]
 ...
 [   0    0    0 ... 3183 1348 1173]
 [   0    0    0 ... 1840 4544 3153]
 [   0    0    0 ...  970 2433 1016]]


In [29]:
#Creating model

from tensorflow.keras.layers import Dropout

# Width of the vector learned for each word index.
embedding_vector_features = 40

# Embedding -> Dropout -> LSTM(200) -> Dropout -> one sigmoid unit for the binary label.
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    Dropout(0.3),
    LSTM(200),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [30]:
import numpy as np

# Plain NumPy copies of the padded sequences and of the label frame for Keras.
X_final, y_final = np.array(embedded_docs), np.array(y)

In [31]:
# Features and labels must have the same number of rows.
X_final.shape,y_final.shape

((20242, 25), (20242, 1))

In [32]:
# Hold out 10% of the labelled rows; the fixed random_state keeps the split reproducible.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.10,
    random_state=42,
)

In [33]:
# Train for 20 epochs in mini-batches of 64; the 10% hold-out from the split
# above is passed as validation data.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=20,
    validation_data=(X_test, y_test),
)

Train on 18217 samples, validate on 2025 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x21f66bad6d8>

In [35]:
# Load the test split that the trained model will be applied to;
# the path is relative to the notebook's working directory.

test=pd.read_csv('fake-news/test.csv')

In [36]:
# Preview the first rows of the raw test frame.
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [37]:
# Count the missing values in every column of the test frame.

test.isna().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [38]:
# Keep only `title`, mirroring the training data.
# Selecting the column, instead of dropping the others, keeps this cell
# re-runnable: a second drop of 'text'/'id'/'author' raises KeyError.
# `.copy()` gives an independent frame that is safe to modify later.

test = test[['title']].copy()

In [39]:
# Preview the reduced single-column test frame.
test.head()

Unnamed: 0,title
0,"Specter of Trump Loosens Tongues, if Not Purse..."
1,Russian warships ready to strike terrorists ne...
2,#NoDAPL: Native American Leaders Vow to Stay A...
3,"Tim Tebow Will Attempt Another Comeback, This ..."
4,Keiser Report: Meme Wars (E995)


In [40]:
# Missing titles still present in the test frame.
test.isna().sum()

title    122
dtype: int64

In [42]:
# Rows cannot be dropped here because every test row needs a prediction, so
# missing titles are replaced with a placeholder string.
# NOTE(review): the training data dropped such rows instead; the placeholder
# text is fed to the model like any real title.
test = test.fillna('fake fake fake')

In [43]:
# (rows, columns) of the test frame after filling the missing titles.
test.shape

(5200, 1)

In [45]:
# Clean the test titles with exactly the same steps as the training titles:
# keep letters only, lowercase, split, drop stop words, stem (reusing `ps`).

# Build the stop-word lookup once. The original evaluated
# stopwords.words('english') again for every single token and searched the
# returned list linearly; a set built up front gives the same membership answers.
english_stopwords_test = set(stopwords.words('english'))

corpus_test = []
for title in test['title']:
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords_test]
    corpus_test.append(' '.join(stems))

In [46]:
# Encode the cleaned test titles as lists of integer word indices, using the
# same one_hot() call and voc_size as for the training corpus.
onehot_repr_test = [one_hot(sentence, voc_size) for sentence in corpus_test]

In [47]:
# Pad the test sequences to the same fixed length the model was built with.
sent_length = 25

embedded_docs_test = pad_sequences(onehot_repr_test, maxlen=sent_length, padding='pre')
print(embedded_docs_test)

[[   0    0    0 ... 3183 1348 1173]
 [   0    0    0 ... 3796 1492  928]
 [   0    0    0 ... 4355 4262 1000]
 ...
 [   0    0    0 ... 3183 1348 1173]
 [   0    0    0 ... 2879 1064  375]
 [   0    0    0 ... 3183 1348 1173]]


In [48]:
# NumPy copy of the padded test sequences, used as input for prediction.
# NOTE(review): this rebinds `X_test`, replacing the hold-out features created by
# train_test_split; re-running the training cell after this one would pair this
# array with the old `y_test` as validation data.
X_test=np.array(embedded_docs_test)

In [49]:
# Predict a class (0 or 1) for every test title.
# `Sequential.predict_classes` is deprecated and has been removed from newer
# tf.keras releases. Thresholding the sigmoid output at 0.5 is what it did for
# a single-unit output layer, and it works on old and new versions alike.
# The result keeps the same (n_samples, 1) integer shape.

check = (model.predict(X_test) > 0.5).astype('int32')

In [50]:
# Predicted class per test row, one row per title.
check

array([[0],
       [1],
       [0],
       ...,
       [0],
       [1],
       [0]])

In [60]:
# Print the first ten predicted classes on a single line.
for row in range(10):
    print(check[row, 0], end=' ')

0 1 0 0 1 1 0 1 1 1 

In [68]:
# Among the first ten test rows, show the titles whose predicted class is 0.
for row in range(10):
    if check[row, 0] != 0:
        continue
    print(test.loc[row])

title    Specter of Trump Loosens Tongues, if Not Purse...
Name: 0, dtype: object
title    #NoDAPL: Native American Leaders Vow to Stay A...
Name: 2, dtype: object
title    Tim Tebow Will Attempt Another Comeback, This ...
Name: 3, dtype: object
title    Pelosi Calls for FBI Investigation to Find Out...
Name: 6, dtype: object


In [54]:
# Sanity check: there must be exactly one prediction per test row.
print(check.shape, test.shape, sep='\n')

(5200, 1)
(5200, 1)


# References Used:

## Dataset
* https://www.kaggle.com/c/fake-news

## Research Papers
* Long, Y. (2017). Fake news detection through multi-perspective speaker profiles. Association for Computational Linguistics.