## Fake News Classifier Using RNN and LSTM



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [2]:
df=pd.read_csv('WELFake_Dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [4]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [5]:
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [6]:
df.isnull().sum()

title    558
text      39
label      0
dtype: int64

In [7]:
###Drop Nan Values
df=df.dropna()


In [8]:
df.shape

(71537, 3)

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.shape

(63121, 3)

In [11]:
df.reset_index(drop=True, inplace=True)

In [12]:
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [13]:
df['label'].value_counts()

label
0    34791
1    28330
Name: count, dtype: int64

In [14]:
## Get the Independent Features

X=df.drop('label',axis=1)


In [15]:
## Get the Dependent features
y= df['label']

In [16]:
X.shape

(63121, 2)

In [17]:
y.shape

(63121,)

### Tokenize Representation

In [18]:
news=X.copy()

In [19]:
news['title'][1]

'UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MOST CHARLOTTE RIOTERS WERE “PEACEFUL” PROTESTERS…In Her Home State Of North Carolina [VIDEO]'

In [20]:
news

Unnamed: 0,title,text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ..."
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will..."
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...
...,...,...
63116,WIKILEAKS EMAIL SHOWS CLINTON FOUNDATION FUNDS...,An email released by WikiLeaks on Sunday appea...
63117,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...
63118,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n..."
63119,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...


In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krushilramani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
### Dataset Preprocessing
from nltk.stem import PorterStemmer
ps=PorterStemmer()
corpus=[]
for i in range(0, len(news)):
    review= re.sub('[^a-zA-Z]', ' ', news['title'][i])
    review= review.lower()
    review= review.split()
    review= [ps.stem(word)  for word in review if not word in stopwords.words('english')]
    review= ' '.join(review)
    corpus.append(review)


In [None]:
# corpus_t=[]

# for i in range(0, len(news)):
#     review = re.sub('[^a-zA-Z]', ' ', news['text'][i])
#     review = review.lower()
#     review = review.split()
#     review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
#     review = ' '.join(review)
#     corpus_t.append(review)

In [None]:
# corpus_t[0]

'comment expect barack obama member fyf fukyoflag blacklivesmatt movement call lynch hang white peopl cop encourag other radio show tuesday night turn tide kill white peopl cop send messag kill black peopl america one f yoflag organ call sunshin radio blog show host texa call sunshin f ing opinion radio show snapshot fyf lolatwhitefear twitter page p show urg support call fyf tonight continu dismantl illus white snapshot twitter radio call invit fyf radio show air p eastern standard time show caller clearli call lynch kill white peopl minut clip radio show heard provid breitbart texa someon would like refer hannib alreadi receiv death threat result interrupt fyf confer call unidentifi black man said mother f ker start f ing like us bunch ni er takin one us roll said caus alreadi roll gang anyway six seven black mother f cker see white person lynch ass let turn tabl conspir cop start lose peopl state emerg specul one two thing would happen big ass r war ni er go start backin alreadi get

In [25]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
vocab_size = len(tokenizer.word_index) + 1

In [26]:
tokenize_title = tokenizer.texts_to_sequences(corpus)

In [27]:
tokenize_title[0]

[76, 1275, 322, 1257, 613, 214, 192, 17, 2080, 12159, 207, 4]

In [28]:
vocab_size

19509

### Tokenize Representation with padding

In [29]:
max_len=max(len(word) for word in tokenize_title)

In [30]:

embedded_title=pad_sequences(tokenize_title,padding='post',maxlen=max_len)


In [31]:
embedded_title[0]

array([   76,  1275,   322,  1257,   613,   214,   192,    17,  2080,
       12159,   207,     4,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0], dtype=int32)

## model creation

In [55]:
X_final=np.array(embedded_title)
y_final=np.array(y)

In [56]:
X_final.shape,y_final.shape

((63121, 47), (63121,))

In [57]:

X_train,X_test,y_train,y_test = train_test_split(X_final,y_final,test_size=0.33, random_state=42)

In [None]:
embedding_vector_features=40
model = Sequential()
model.add(Embedding(vocab_size,embedding_vector_features,input_length=max_len))
model.add(SimpleRNN(15))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [91]:
model.summary()

### Model Training

In [None]:
### Finally Training
model.fit(X_train,y_train, validation_data=(X_test,y_test),epochs=10, batch_size=100)
# model.fit(X_train,y_train, validation_split=0.3, epochs=10, batch_size=100)

## RNN with Dropout layer


In [133]:
embedding_vector_features=50
d_model = Sequential()
d_model.add(Embedding(vocab_size,embedding_vector_features,input_length=max_len))
d_model.add(SimpleRNN(100))
d_model.add(Dropout(0.3))
d_model.add(Dense(1,activation='sigmoid'))
d_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [134]:
d_model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10, batch_size=100)

Epoch 1/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - accuracy: 0.8004 - loss: 0.4167 - val_accuracy: 0.8938 - val_loss: 0.2725
Epoch 2/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.9251 - loss: 0.2005 - val_accuracy: 0.8922 - val_loss: 0.2870
Epoch 3/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.9480 - loss: 0.1433 - val_accuracy: 0.8886 - val_loss: 0.3114
Epoch 4/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.9656 - loss: 0.0975 - val_accuracy: 0.8911 - val_loss: 0.3058
Epoch 5/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 26ms/step - accuracy: 0.9737 - loss: 0.0752 - val_accuracy: 0.8811 - val_loss: 0.4347
Epoch 6/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.9811 - loss: 0.0572 - val_accuracy: 0.8007 - val_loss: 0.5149
Epoch 7/10
[1m4

<keras.src.callbacks.history.History at 0x3bcd9cf10>

## Using LSTM

In [None]:
embedding_vector_features=40
l_model = Sequential()
l_model.add(Embedding(vocab_size,embedding_vector_features,input_length=max_len))
l_model.add(LSTM(80))
l_model.add(Dropout(0.2))
l_model.add(Dense(1,activation='sigmoid'))
l_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [126]:
l_model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10, batch_size=100)

Epoch 1/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 42ms/step - accuracy: 0.6055 - loss: 0.6437 - val_accuracy: 0.7373 - val_loss: 0.5340
Epoch 2/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 43ms/step - accuracy: 0.7537 - loss: 0.5199 - val_accuracy: 0.7712 - val_loss: 0.5082
Epoch 3/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 43ms/step - accuracy: 0.7758 - loss: 0.5030 - val_accuracy: 0.7855 - val_loss: 0.5027
Epoch 4/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 44ms/step - accuracy: 0.8032 - loss: 0.4721 - val_accuracy: 0.7770 - val_loss: 0.5053
Epoch 5/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 42ms/step - accuracy: 0.8113 - loss: 0.4617 - val_accuracy: 0.7904 - val_loss: 0.5009
Epoch 6/10
[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 42ms/step - accuracy: 0.8174 - loss: 0.4587 - val_accuracy: 0.7980 - val_loss: 0.4949
Epoch 7/10
[1m4

<keras.src.callbacks.history.History at 0x3bb0a2910>

In [132]:
l_model.summary()

### Performance Metrics And Accuracy

### accuracy of simple rnn

In [110]:
y_pred=model.predict(X_test)

[1m651/651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [111]:
y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve

In [112]:
confusion_matrix(y_test,y_pred)

array([[10145,  1267],
       [ 1544,  7874]])

In [113]:

accuracy_score(y_test,y_pred)

0.8650504080652904

In [114]:

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88     11412
           1       0.86      0.84      0.85      9418

    accuracy                           0.87     20830
   macro avg       0.86      0.86      0.86     20830
weighted avg       0.86      0.87      0.86     20830



### accuracy of model with Dropout

In [115]:
dy_pred=d_model.predict(X_test)

[1m651/651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [116]:
dy_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve

In [117]:
confusion_matrix(y_test,dy_pred)

array([[10145,  1267],
       [ 1544,  7874]])

In [118]:
accuracy_score(y_test,dy_pred)

0.8650504080652904

In [119]:
print(classification_report(y_test,dy_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88     11412
           1       0.86      0.84      0.85      9418

    accuracy                           0.87     20830
   macro avg       0.86      0.86      0.86     20830
weighted avg       0.86      0.87      0.86     20830



### LSTM Accuracy

In [127]:
ly_pred=l_model.predict(X_test)

[1m651/651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step


In [128]:
ly_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve

In [129]:
confusion_matrix(y_test,ly_pred)

array([[10145,  1267],
       [ 1544,  7874]])

In [130]:
accuracy_score(y_test,ly_pred)

0.8650504080652904

In [131]:
print(classification_report(y_test,ly_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88     11412
           1       0.86      0.84      0.85      9418

    accuracy                           0.87     20830
   macro avg       0.86      0.86      0.86     20830
weighted avg       0.86      0.87      0.86     20830

