In [24]:
#importing libraries
import os
import re

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

In [25]:
#reading the data
#the CSV location defaults to the original hard-coded path but can be
#overridden with the FAKE_NEWS_CSV environment variable, so the notebook
#can run on a machine where the file lives somewhere else
data_path=os.environ.get('FAKE_NEWS_CSV','/home/user/Downloads/Fake news.csv')
df=pd.read_csv(data_path)

In [26]:
#preview the first rows of the freshly loaded dataframe
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [27]:
#(number of rows, number of columns) of the loaded dataframe
df.shape

(20800, 5)

In [28]:
#column dtypes and non-null counts, to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [29]:
#number of missing values in each column
df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [30]:
#use the id column as the row index (the column itself is dropped from the data)
df=df.set_index(keys='id',drop=True)

In [31]:
#preview again to confirm that id is now the index
df.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [32]:
#replace every NaN with an empty string so the text columns can be concatenated
df=df.fillna(value='')

In [33]:
#English stopword list from NLTK's stopwords corpus; used below to filter tokens
Stopwords=stopwords.words('english')

In [34]:
#Porter stemmer instance used below to reduce each token to its stem
ps=PorterStemmer()

In [35]:
#new feature: title and author joined by a single space
df['total']=df['title'].str.cat(df['author'],sep=' ')

In [36]:
#build the cleaned corpus from the combined title/author text:
#  1. replace every character that is not a letter with a blank space
#  2. convert to lower case and split into tokens
#  3. drop the English stopwords and stem the remaining tokens
#  4. join the stems back into one string per article and collect them
#maxlen tracks the largest token count seen; it is used later as the padding length
non_letters=re.compile('[^a-zA-Z]')  #the old 'A-z' range also matched [ \ ] ^ _ ` and left them in the text
stopword_set=set(Stopwords)  #set membership instead of scanning the list for every token
corpus=[]
maxlen=0
for raw_text in df['total']:  #iterate the values; does not rely on the id labels being 0..n-1
    tokens=non_letters.sub(' ',raw_text).lower().split()
    stems=[ps.stem(token) for token in tokens if token not in stopword_set]
    maxlen=max(maxlen,len(stems))
    corpus.append(' '.join(stems))

In [37]:
#largest number of tokens found in any cleaned title/author string
maxlen

51

In [38]:
#convert every cleaned sentence into a list of integer word indices (hashing space of size 5000)
onehot_rep=[one_hot(sentence,n=5000) for sentence in corpus]

In [39]:
#left-pad every index sequence so that all of them have length maxlen
news=pad_sequences(sequences=onehot_rep,maxlen=maxlen,padding='pre')

In [40]:
#inspect the padded integer matrix (one row per article)
news

array([[   0,    0,    0, ..., 3097, 1746, 2091],
       [   0,    0,    0, ...,  578, 3010, 1908],
       [   0,    0,    0, ..., 3079, 4934, 3692],
       ...,
       [   0,    0,    0, ..., 1013, 1583, 2321],
       [   0,    0,    0, ..., 4760, 4466, 2415],
       [   0,    0,    0, ..., 1471, 1771,  982]], dtype=int32)

In [41]:
#features are the padded index sequences, labels come from the label column
x,y=news,df['label']
x.shape,y.shape

((20800, 51), (20800,))

In [42]:
#hold out 20% of the rows for testing; random_state fixes the shuffle
split=train_test_split(x,y,test_size=0.2,random_state=10)
x_train,x_test,y_train,y_test=split
tuple(part.shape for part in split)

((16640, 51), (4160, 51), (16640,), (4160,))

In [43]:
#embedding -> LSTM -> dense classifier, with dropout after each stage;
#the final sigmoid unit outputs a single value per article
model=Sequential([
    Embedding(5000,64,input_length=maxlen),
    Dropout(.3),
    LSTM(64),
    Dropout(.3),
    Dense(64,activation='relu'),
    Dropout(.3),
    Dense(1,activation='sigmoid'),
])

In [44]:
#print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 51, 64)            320000    
_________________________________________________________________
dropout_3 (Dropout)          (None, 51, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

In [45]:
#binary cross-entropy loss with the adam optimizer; accuracy is reported as the metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [46]:
#train on the training split for 10 epochs in batches of 200 samples
model.fit(x=x_train,y=y_train,epochs=10,batch_size=200)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f227ab4c580>

In [47]:
#loss and accuracy on the held-out test split
model.evaluate(x=x_test,y=y_test)



[0.05165329948067665, 0.9882211685180664]