# Fake News Classifier using NLP and LSTM RNN

Dataset: https://www.kaggle.com/c/fake-news/overview/description

In [18]:
# Importing neccessary libraries

import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense




In [2]:
# Loading the dataset

df= pd.read_csv(r'C:\Users\user\Desktop\Springboard\Curriculum Projects\Fake News Classifier using NLP and LSTM RNN\train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
# Dropping the null values

df= df.dropna()
X= df.drop('label', axis=1)
y= df['label']

In [4]:
X.shape

(18285, 4)

In [5]:
y.shape

(18285,)

In [6]:
# Initializing vocabulary size
voc_size= 5000

# One Hot Representation
text= X.copy()
text.reset_index(inplace= True)

In [7]:
# Data Preprocessing

nltk.download('stopwords')
ps= PorterStemmer()
corpus=[]

for i in range(0, len(text)):
    review= re.sub('[^A-Za-z]', ' ', text['title'][i])
    review= review.lower()
    review= review.split()
    
    review= [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
corpus[1]

'flynn hillari clinton big woman campu breitbart'

In [9]:
# Converting text from the corpus to a numerical representation  

onehot_rep= [one_hot(words, voc_size) for words in corpus]
onehot_rep[1]

[4744, 3475, 1005, 3033, 4622, 2270, 2765]

In [10]:
# Embedding Representation

sent_len= 20

# Converting the sentences length of equal length
embedded_docs= pad_sequences(onehot_rep, padding= 'pre', maxlen=sent_len)
print(embedded_docs)

[[   0    0    0 ... 4734 4641 1982]
 [   0    0    0 ... 4622 2270 2765]
 [   0    0    0 ... 1343  151  556]
 ...
 [   0    0    0 ... 3140 1422 4219]
 [   0    0    0 ... 2055 4577  723]
 [   0    0    0 ...  538 1604 3680]]


In [11]:
# Model Creation

embedding_vec_feat= 40
model= Sequential()
model.add(Embedding(voc_size, embedding_vec_feat, input_length= sent_len))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
# Converting the embedded layer to array for modeling

X_final= np.array(embedded_docs)
y_final= np.array(y)

In [13]:
X_final.shape, y_final.shape

((18285, 20), (18285,))

In [14]:
# Train test split
X_train, X_test, y_train, y_test= train_test_split(X_final, y_final, test_size=0.33, random_state=42)

# fitting model to the data
model.fit(X_train, y_train, validation_data= (X_test, y_test), epochs=10, batch_size=64)

Train on 12250 samples, validate on 6035 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1fabe36a2b0>

In [16]:
# Model Evaluation

y_pred= model.predict_classes(X_test)
confusion_matrix(y_test, y_pred)

array([[3101,  318],
       [ 227, 2389]], dtype=int64)

In [17]:
accuracy_score(y_test, y_pred)

0.9096934548467275

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      3419
           1       0.88      0.91      0.90      2616

    accuracy                           0.91      6035
   macro avg       0.91      0.91      0.91      6035
weighted avg       0.91      0.91      0.91      6035



In [21]:
y_pred

array([[1],
       [0],
       [0],
       ...,
       [0],
       [1],
       [1]])

In [26]:
y_act= df['label']
df_final= pd.DataFrame(data= y_pred, columns=["Pred"])
df_final.head()

Unnamed: 0,Pred
0,1
1,0
2,0
3,1
4,1


In [27]:
df_save= df_final.to_csv("output.csv")