Application of LSTM and GRU Recurrent Neural Networks to Sentiment Classification of Mother's Day Tweets

In [1]:
# importing necessary libraries 
import pandas as pd
import tensorflow as tf
import os
import re
import numpy as np
from string import punctuation
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [53]:
# importing neural network libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Embedding, GRU, LSTM, RNN, SpatialDropout1D,Conv1D,GlobalAveragePooling1D

In [3]:
# Directory holding the dataset CSV files; change this one constant to run
# the notebook against a different location.
DATA_DIR = 'd:/datasets/mothers day/dataset'

# Read the raw frames and work on copies, so `train` / `test` stay untouched
# (the raw `test` frame is reused later to build the submission files).
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
train_data = train.copy()
test_data = test.copy()

In [4]:
# Report the (rows, columns) shape of the training copy, then preview its first rows.
print(train_data.shape)
train_data.head()

(3235, 6)


Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class
0,1.245025e+18,Happy #MothersDay to all you amazing mothers o...,en,0,BeenXXPired,0
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be t...,en,1,FestiveFeeling,0
2,1.246087e+18,Happy mothers day To all This doing a mothers ...,en,0,KrisAllenSak,-1
3,1.244803e+18,Happy mothers day to this beautiful woman...ro...,en,0,Queenuchee,0
4,1.244876e+18,Remembering the 3 most amazing ladies who made...,en,0,brittan17446794,-1


In [5]:
# Report the (rows, columns) shape of the test copy, then preview its first rows.
print(test_data.shape)
test_data.head()

(1387, 5)


Unnamed: 0,id,original_text,lang,retweet_count,original_author
0,1.246628e+18,"3. Yeah, I once cooked potatoes when I was 3 y...",en,0,LToddWood
1,1.245898e+18,"Happy Mother's Day to all the mums, step-mums,...",en,0,iiarushii
2,1.244717e+18,"I love the people from the UK, however, when I...",en,0,andreaanderegg
3,1.24573e+18,Happy 81st Birthday Happy Mother’s Day to my m...,en,1,TheBookTweeters
4,1.244636e+18,Happy Mothers day to all those wonderful mothe...,en,0,andreaanderegg


In [6]:
# Keep only the tweet text and its label by dropping the metadata columns.
# (This removes columns; no rows with missing values are dropped here.)
train_data = train_data.drop(['id', 'lang', 'retweet_count', 'original_author'], axis=1)
train_data.head()

Unnamed: 0,original_text,sentiment_class
0,Happy #MothersDay to all you amazing mothers o...,0
1,Happy Mothers Day Mum - I'm sorry I can't be t...,0
2,Happy mothers day To all This doing a mothers ...,-1
3,Happy mothers day to this beautiful woman...ro...,0
4,Remembering the 3 most amazing ladies who made...,-1


In [7]:
# Fit the encoder on the three raw sentiment labels and replace the label
# column with the encoded class ids (le.classes_ keeps the original values
# for decoding the predictions later).
le = LabelEncoder().fit([-1, 0, 1])
train_data['sentiment_class'] = le.transform(train_data['sentiment_class'])

In [8]:
# Character length of every tweet. `str(text)` guards against non-string
# cells. Built directly as a list instead of abusing a comprehension for its
# `append` side effect (which also created a throwaway list of None).
length = [len(str(text)) for text in train_data['original_text']]
train_data['length'] = length
train_data.head()

Unnamed: 0,original_text,sentiment_class,length
0,Happy #MothersDay to all you amazing mothers o...,1,252
1,Happy Mothers Day Mum - I'm sorry I can't be t...,1,301
2,Happy mothers day To all This doing a mothers ...,0,298
3,Happy mothers day to this beautiful woman...ro...,1,155
4,Remembering the 3 most amazing ladies who made...,0,254


In [9]:
# Shortest, longest and rounded mean tweet length, in characters.
text_lengths = train_data['length']
min(text_lengths), max(text_lengths), round(sum(text_lengths) / len(text_lengths))

(73, 728, 227)

We can keep about 250 as the maximum number of features for training the neural network.

**The minimum length is only 73 characters, so it looks like there are some outliers.**

In [36]:
# Count the tweets whose text is longer than 350 characters.
len(train_data[train_data['length'] > 350])

5

**The cell above counts 5 tweets longer than 350 characters; the unusually short tweets are inspected below. Outliers can be removed, but it is good practice to check them before removing them.**

In [11]:
# List the tweets shorter than 130 characters so they can be inspected before removal.
train_data['original_text'][train_data['length'] < 130]

1189    Happy Mother's Day! A bit of time with one of ...
1233    Happy Mother's Day to all the amazing Mums out...
1301    @BofaFerjani لكل ولد و بنت أمهم متوفية كل سنة ...
1328    لكل ولد و بنت أمهم متوفية كل سنة و أمي هيا أمك...
1406    Happy Mothers day to my super Mum. Ochie dike ...
1826    Happy #MothersDay to all the amazing women who...
2207    #القصة_مو_مزحة #خليك_بالبيت #عبير_شمس_الدين #س...
2554    First Mothers Day not spent with my mum but we...
2647    الشي الوحيد اللي بيخليني اعرس عشان عقب كم سنه ...
Name: original_text, dtype: object

*These are mostly very short or non-English texts. They can be removed, since they are likely to guide the neural network in the wrong direction.*

In [12]:
# Remove the short outliers: every row whose text is under 130 characters.
too_short = train_data.index[train_data['length'] < 130]
train_data = train_data.drop(too_short, axis = 0)

In [None]:
# Remove the long outliers: every row whose text is over 350 characters.
too_long = train_data.index[train_data['length'] > 350]
train_data = train_data.drop(too_long, axis = 0)

In [13]:
# Re-check the shortest, longest and rounded mean length after dropping outliers.
text_lengths = train_data['length']
min(text_lengths), max(text_lengths), round(sum(text_lengths) / len(text_lengths))

(130, 728, 228)

In [14]:
# Matches the rounded mean tweet length (in characters) reported by the cell above.
# NOTE(review): this value is reused below both as the Tokenizer vocabulary
# size (`num_words`) and as the padded sequence length (`maxlen`), although it
# measures characters rather than words - confirm this is intended.
max_features = 228

In [15]:
# Show the shortest tweets that remain (under 141 characters).
train_data['original_text'][train_data['length'] < 141]

1995    To all the wonderful women out there, you’re a...
2038    For all mothers in the world #happy _ mothers_...
Name: original_text, dtype: object

Preprocessing the Text before feeding it into the neural networks

In [16]:
import nltk
import inflect
import contractions
from textblob import TextBlob
from bs4 import BeautifulSoup
import re, string, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# Fetch every NLTK resource the helpers below rely on: 'punkt' for
# word_tokenize, 'stopwords' for remove_stopwords and 'wordnet' for
# lemmatize_verbs. Downloading only 'punkt' makes the pipeline fail with a
# LookupError on a machine where the other two corpora are not installed.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

def denoise_text(text):
    """Strip HTML markup from `text` and expand its contractions.

    The text is parsed with BeautifulSoup's "html.parser" and reduced to its
    text content, then passed through `contractions.fix`
    (for example, didn't -> did not).
    """
    plain = BeautifulSoup(text, "html.parser").get_text()
    return contractions.fix(plain)

def remove_non_ascii(words):
    """Return `words` with every non-ASCII character removed from each token.

    Each token is NFKD-normalised first, so accented letters decompose and
    keep their ASCII base letter; characters with no ASCII form are dropped.
    Tokens themselves are never removed, so a fully non-ASCII token becomes ''.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]
def to_lowercase(words):
    """Return a new list holding the lower-cased form of every token in `words`."""
    return [token.lower() for token in words]

def remove_stopwords(input_text):
    """Drop English stopwords and single-character tokens from a token list.

    Args:
        input_text: iterable of string tokens.

    Returns:
        List of the tokens that are longer than one character and are either
        absent from NLTK's English stopword list or on the negation whitelist.
    """
    # A set gives constant-time membership tests; the stopword list was
    # previously scanned linearly for every token.
    stopword_set = set(stopwords.words('english'))
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = {"n't", "not", "no"}
    return [
        word for word in input_text
        if len(word) > 1 and (word in whitelist or word not in stopword_set)
    ]

def replace_numbers(words):
    """Replace every all-digit token in `words` with its textual representation.

    Tokens for which `str.isdigit()` is false are passed through unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def lemmatize_verbs(words):
    """Return the WordNet lemma of every token in `words`, treating each as a verb."""
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(token, pos='v') for token in words]
def remove_mention(words):
    """Strip @mentions from each token and drop tokens that end up empty.

    Every '@' together with the run of word characters that follows it is
    deleted from the token; a token consisting only of such a mention (or an
    already empty token) is left out of the result.
    """
    stripped = (re.sub(r'@[\w]*', '', token) for token in words)
    return [token for token in stripped if token != '']

def normalize_text(words):
    """Run the token-level cleaning steps over `words` in a fixed order.

    Order: non-ASCII removal, lower-casing, number spelling, mention removal,
    stopword removal, verb lemmatisation. Each step receives the previous
    step's output.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        replace_numbers,
        remove_mention,
        remove_stopwords,
        lemmatize_verbs,
    )
    for step in pipeline:
        words = step(words)
    return words

def tokenize(text):
    """Split `text` into a list of word tokens with NLTK's word tokenizer."""
    tokens = word_tokenize(text)
    return tokens


def text_prepare(text):
    """Clean one raw tweet and return it as a single space-joined string.

    Steps: strip HTML and expand contractions, delete @mentions, tokenize,
    then run the token-level normalisation pipeline.
    """
    text = denoise_text(text)
    # Mentions have to be removed before tokenizing: word_tokenize splits
    # '@user' into '@' and 'user', so the per-token remove_mention step can
    # only drop the bare '@' and would leave the user name in the text.
    text = re.sub(r'@\w*', '', text)
    return ' '.join(normalize_text(tokenize(text)))



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Translate every non-English tweet into English.
def detect(x):
    """Return `x` translated to English, or `x` itself when it is already English.

    Without the final `return x` the function fell through for English input,
    so `.apply(detect)` replaced every English tweet with None.

    NOTE(review): `detect_language` / `translate` are deprecated in recent
    TextBlob releases - confirm they exist in the installed version.
    """
    blob = TextBlob(x)
    if blob.detect_language() != 'en':
        return str(blob.translate(to='en'))
    return x

train_data['original_text'] = train_data['original_text'].apply(detect)

In [None]:
%%time
#to correct the spellings
train_data['original_text'] = train_data['original_text'].apply(lambda x:str(TextBlob(x).correct()))

In [18]:
%%time
#normalization
train_data['original_text'] = [text_prepare(x) for x in train_data['original_text']]

Wall time: 11.8 s


In [None]:
# Hyper-parameter constants.
# NOTE(review): none of these names is referenced by the other cells shown
# in this notebook - confirm they are still needed.
NB_WORDS = 7  # Parameter indicating the number of words we'll put in the dictionary
VAL_SIZE = 1000  # Size of the validation set
NB_START_EPOCHS = 10  # Number of epochs we usually start to train with
BATCH_SIZE = 512  # Size of the batches used in the mini-batch gradient descent
MAX_LEN = 24  # Maximum number of words in a sequence
GLOVE_DIM = 100 

In [19]:
# Build the vocabulary from the cleaned training tweets and encode each
# tweet as a sequence of integer word ids. Punctuation does not need to be
# removed separately: the Tokenizer strips the characters listed in `filters`.
# `num_words` caps the vocabulary used when encoding.
cleaned_tweets = train_data['original_text']
tokenizer = Tokenizer(
    num_words=max_features,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
)
tokenizer.fit_on_texts(cleaned_tweets)
X = tokenizer.texts_to_sequences(cleaned_tweets)

In [20]:
# Pad every sequence at the front to a common length of `max_features` ids
# so the encoded tweets form one rectangular array.
X = pad_sequences(X, maxlen=max_features, padding='pre')

In [21]:
# Sanity check: the label column should have one entry per remaining tweet.
u = train_data['sentiment_class']
u.shape

(3226,)

In [23]:
# Pull the encoded labels out as an array and confirm that the features and
# the labels have the same number of rows.
feature_shape = X.shape
print(feature_shape)
y = train_data['sentiment_class'].values
label_shape = y.shape
print(label_shape)

(3226, 228)
(3226,)


In [27]:
# Spot-check a single encoded label.
y[6]

2

In [28]:
# Hold out 20% of the labelled data for validation; the fixed random_state
# keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

We got our training data preprocessed and ready for training the neural network. 

We have to create a neural network now

In [29]:
# LSTM classifier: embedding -> LSTM -> dropout -> dense -> dropout -> softmax.
lstm_model = Sequential(
    [
        Embedding(input_dim=max_features, output_dim=120, name='1st_layer'),
        LSTM(units=120, dropout=0.2, recurrent_dropout=0.2, name='2nd_layer'),
        Dropout(rate=0.5, name='3rd_layer'),
        Dense(units=120, activation='relu', name='4th_layer'),
        Dropout(rate=0.5, name='5th_layer'),
        # One output unit per distinct label value found in y.
        Dense(units=len(set(y)), activation='softmax', name='output_layer'),
    ],
    name='lstm_nn_model',
)
# The labels are integer class ids, hence the sparse categorical loss.
lstm_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [30]:
# Train the LSTM model for 3 epochs on the training split (no validation
# data is passed here) and keep the object returned by fit().
lstm_model_fit = lstm_model.fit(X_train, y_train, epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


Constructing GRU Neural Network

In [37]:
# GRU neural Network
gru_model = Sequential(name = 'gru_nn_model')
gru_model.add(layer = Embedding(input_dim = max_features, output_dim = 120, name = '1st_layer'))
gru_model.add(layer = GRU(units = 120, dropout = 0.2, 
                          recurrent_dropout = 0.2, recurrent_activation = 'relu', 
                          activation = 'relu', name = '2nd_layer'))
gru_model.add(layer = Dropout(rate = 0.4, name = '3rd_layer'))
gru_model.add(layer = Dense(units = 120, activation = 'relu', name = '4th_layer'))
gru_model.add(layer = Dropout(rate = 0.4, name = '5th_layer'))
gru_model.add(layer = Dense(units = len(set(y)), activation = 'softmax', name = 'output_layer'))
# compiling the model
gru_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [38]:
# Print the layer-by-layer architecture and parameter counts of the GRU model.
gru_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1st_layer (Embedding)        (None, None, 120)         27360     
_________________________________________________________________
2nd_layer (GRU)              (None, 120)               86760     
_________________________________________________________________
3rd_layer (Dropout)          (None, 120)               0         
_________________________________________________________________
4th_layer (Dense)            (None, 120)               14520     
_________________________________________________________________
5th_layer (Dropout)          (None, 120)               0         
_________________________________________________________________
output_layer (Dense)         (None, 3)                 363       
Total params: 129,003
Trainable params: 129,003
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Train the GRU model for 3 epochs on the training split (no validation
# data is passed here) and keep the object returned by fit().
gru_model_fit = gru_model.fit(X_train, y_train, epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [40]:
# Score the GRU model on the held-out split; with the compile settings used
# for this model the result holds the loss followed by the accuracy.
result = gru_model.evaluate(X_test, y_test)
result



[1.0452867805404191, 0.498452]

Now preparing the test dataset

In [41]:
# Start again from a fresh copy of the raw test frame and confirm that the
# copy has the same shape as the original.
print(test.shape)
test_data = test.copy()
print(test_data.shape)

(1387, 5)
(1387, 5)


**Dropping the unused columns**

In [43]:
# Keep only the tweet text by dropping the metadata columns. The result is
# rebound to `test_data` rather than mutated in place.
test_data = test_data.drop(['id', 'lang', 'retweet_count', 'original_author'], axis=1)
test_data.head(3)

Unnamed: 0,original_text
0,"3. Yeah, I once cooked potatoes when I was 3 y..."
1,"Happy Mother's Day to all the mums, step-mums,..."
2,"I love the people from the UK, however, when I..."


In [44]:
# Clean the test tweets with the same text_prepare pipeline that was applied
# to the training text before encoding them. The tokenizer was fitted on
# cleaned text, so encoding raw tweets would give the models inputs that do
# not match what they were trained on.
test_clean = test_data['original_text'].apply(text_prepare)
test_text = tokenizer.texts_to_sequences(texts = test_clean)

In [45]:
# Pad the encoded test tweets at the front to the same length as the training data.
test_text = pad_sequences(test_text, maxlen=max_features, padding='pre')

Prediction:

In [46]:
# Sequential.predict_classes is deprecated and missing from newer TensorFlow
# releases. Taking the argmax over the softmax outputs yields the same class
# ids and works on every version.
lstm_prediction = np.argmax(lstm_model.predict(test_text), axis=-1)

In [47]:
# Show the original label values known to the encoder, in encoded-id order.
le.classes_

array([-1,  0,  1])

In [48]:
# Map the predicted class ids back to the original sentiment labels.
pred1 = le.inverse_transform(lstm_prediction)

The LSTM predictions are more accurate.

In [49]:
# Attach the LSTM predictions to the raw test frame and write the
# (id, sentiment_class) pairs as the submission file.
# NOTE(review): the file is named 'bilstm.csv' although the model built above
# is a plain (unidirectional) LSTM - confirm the intended file name.
test['sentiment_class'] = pred1
submission = test[['id','sentiment_class']] 
submission.to_csv('bilstm.csv', index=False)

In [50]:
# Sequential.predict_classes is deprecated and missing from newer TensorFlow
# releases. Taking the argmax over the softmax outputs yields the same class
# ids and works on every version.
gru_pred = np.argmax(gru_model.predict(test_text), axis=-1)
# Map the predicted class ids back to the original sentiment labels.
pred2 = le.inverse_transform(gru_pred)

In [51]:
# Overwrite the prediction column on the raw test frame with the GRU output
# and write the (id, sentiment_class) pairs as the GRU submission file.
test['sentiment_class'] = pred2
submission = test[['id','sentiment_class']] 
submission.to_csv('gru.csv', index=False)

In [62]:
# 1-D convolutional classifier:
# embedding -> Conv1D -> global max pooling -> two dense/dropout stages -> softmax.
model = Sequential()
model.add(layers.Embedding(input_dim = max_features, output_dim = 120))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.5))
# Size the output layer from the labels, as the LSTM and GRU models do,
# instead of hard-coding the number of classes.
model.add(layers.Dense(len(set(y)), activation='softmax'))
# The labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 120)         27360     
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 128)         76928     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 128)               16512     
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
__________

In [63]:
# Train the convolutional model for 10 epochs with batches of 16, reporting
# the metrics on the held-out split after every epoch.
model.fit(X_train,y_train,epochs=10,verbose=1,batch_size=16,
          validation_data=(X_test, y_test))

Train on 2580 samples, validate on 646 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a72a962d68>