<a href="https://colab.research.google.com/github/hishamp3/MasterThesis-Lies-DeceptiveText/blob/main/LSTM_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U "tensorflow-text"

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.layers import Embedding

In [None]:
import pandas as pd
# Load only the two columns used below: the raw review text and its label.
df = pd.read_csv(
    "./sample_data/fake reviews dataset.csv",
    usecols=["text_", "label"],
)

In [None]:
import re
import string
def clean_text(text):
    """Normalise a raw review string before tokenisation.

    Steps, applied in order:
      1. lower-case the text;
      2. drop http:// and https:// links;
      3. drop every ASCII punctuation character (``string.punctuation``);
      4. drop any remaining character that is not a space, a word
         character or a dot (this is what removes newlines and tabs);
      5. drop every token that contains a digit.

    Whitespace is not collapsed, so removed tokens may leave runs of spaces.

    Args:
        text: the raw review text (a ``str``).

    Returns:
        The cleaned, lower-cased string.
    """
    # to lower case
    text = text.lower()
    # remove links; raw string avoids invalid-escape warnings, and the
    # optional "s" also catches plain http:// URLs
    text = re.sub(r'https?://\S+', '', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # remove anything that is not a space, word character or dot
    # (newlines, tabs and other leftover symbols)
    text = re.sub(r'[^ \w\.]', '', text)
    # remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [None]:
# Store the cleaned version of every raw review in a new 'text' column.
df['text'] = df['text_'].apply(clean_text)

In [None]:
# Binary target: 1 when the label is 'CG', 0 for every other label.
df['fake'] = [1 if review_label == 'CG' else 0 for review_label in df['label']]

In [None]:
from sklearn.model_selection import train_test_split
# Stratify on the target so both splits keep the same class balance, and fix
# the seed so the split (and every metric computed from it) is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['fake'],
    stratify=df['fake'],
    random_state=42,
)
X_train.head(4)

19951    weighted and thick you dont have to worry abou...
25894    a perfect read for vday samantha young gifts h...
31779    this book is good for a beginner but it is not...
7132     i needed to have a better set of grips on my t...
Name: text, dtype: object

In [None]:
# Re-attach the labels to the training texts, aligned on the shared index.
train_dataset = X_train.to_frame().join(y_train, how='left')

In [None]:
# train_dataset = pd.concat([X_train,y_train])

In [None]:
# Defining pre-processing parameters
max_len = 10
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # out of vocabulary token
vocab_size = 500

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Word-level tokenizer, fitted on the training texts only.
tokenizer = Tokenizer(
    oov_token=oov_tok,
    num_words=vocab_size,
    char_level=False,
)
tokenizer.fit_on_texts(X_train)

In [None]:
# Report how many distinct entries the fitted tokenizer holds in its index.
total_words = len(tokenizer.word_index)
word_index = tokenizer.word_index
print(total_words)

48289


In [None]:
# Encode the training texts as integer sequences, then pad/truncate each one
# to exactly max_len tokens.
training_sequences = tokenizer.texts_to_sequences(X_train)
training_padded = pad_sequences(
    training_sequences,
    truncating=trunc_type,
    padding=padding_type,
    maxlen=max_len,
)

In [None]:
# Encode the test texts with the tokenizer fitted on the training texts and
# pad/truncate them with the same settings as the training sequences.
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(
    testing_sequences,
    truncating=trunc_type,
    padding=padding_type,
    maxlen=max_len,
)

In [None]:
# Sanity check: both padded arrays should have max_len columns.
for shape_label, padded_array in (
    ('Shape of training tensor: ', training_padded),
    ('Shape of testing tensor: ', testing_padded),
):
    print(shape_label, padded_array.shape)

Shape of training tensor:  (30324, 10)
Shape of testing tensor:  (10108, 10)


In [None]:
# Modeling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D, Bidirectional

In [None]:
# Model hyperparameters.
embedding_dim = 16  # size of each learned word vector
vocab_size = 500    # embedding input size; keep equal to the tokenizer's num_words
n_dense = 24
drop_value = 0.6

In [None]:
# Define parameter
n_lstm = 128
drop_lstm = 0.6
# Define LSTM Model
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model1.add(SpatialDropout1D(drop_lstm))
model1.add(LSTM(n_lstm, return_sequences=False))
model1.add(Dropout(drop_lstm))
model1.add(Dense(1, activation='ReLU'))

In [None]:
# Print the layer-by-layer architecture and parameter counts of model1.
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 16)            8000      
                                                                 
 spatial_dropout1d (Spatial  (None, 10, 16)            0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 128)               74240     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 82369 (321.75 KB)
Trainable params: 82369 (321.75 KB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [None]:
from tensorflow.keras.optimizers import Adam
# Adam optimizer with an explicit learning rate of 0.001.
adamOpti = Adam(learning_rate=0.001)

In [None]:
# Binary classification setup: cross-entropy loss, accuracy as the metric.
model1.compile(
    optimizer=adamOpti,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
num_epochs = 5
# Stop training once val_loss has failed to improve for 2 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=2)
# validation_split holds out part of the training data so that val_loss
# exists for the callback to monitor; the callback must also be passed to
# fit() explicitly, otherwise it is never invoked.
history = model1.fit(training_padded,
                     y_train,
                     batch_size=8,
                     epochs=num_epochs,
                     validation_split=0.2,
                     callbacks=[early_stop])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the trained model on the held-out test split.
loss, accuracy = model1.evaluate(
    x=testing_padded,
    y=y_test,
    batch_size=8,
)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 0.49087655544281006
Accuracy: 0.812129020690918
