In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import the necessary libraries
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential 
from keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense

import warnings
warnings.filterwarnings("ignore")
import os
os.environ["KMP_SETTINGS"] = "false"

from sklearn import preprocessing
import time

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Load the SMS dataset, give the two useful columns readable names and
# discard the three unnamed columns.
df = (
    pd.read_csv(
        'https://raw.githubusercontent.com/khwanck/DeepMyeSequence/main/dataset/spam.csv',
        encoding='ISO-8859-1',
    )
    .rename(columns={'v1': 'Classification', 'v2': 'SMS'})
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)
df.head()

In [None]:
# Preprocessing: turn each SMS into a fixed-length integer sequence and
# turn the class labels into integers.

# Tunable sizes used by this cell and by the cells below.
maxlen = 100               # length every sequence is padded/truncated to
training_samples = 3000
validation_samples = 2000
testing_samples = 572
max_words = 10000          # vocabulary cap passed to the Tokenizer

# Integer-encode the categorical labels.
label_encoder = preprocessing.LabelEncoder()
classification = label_encoder.fit_transform(df['Classification'])
labels = np.asarray(classification)

# Build the vocabulary from the SMS texts, then convert them to sequences.
sms_texts = df['SMS']
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(sms_texts)
word_index = tokenizer.word_index
print('Found ', len(word_index), ' unique tokens.')

sequences = tokenizer.texts_to_sequences(sms_texts)
data = pad_sequences(sequences, maxlen=maxlen)

print('Shape of the tensor containing the SMSes:', data.shape)
print('Shape of the tensor containing the classifcation labels:', labels.shape)

# splitting the data into train, validation and test sets
# A seeded generator makes the shuffle (and therefore every split and score
# computed from it) reproducible on a fresh "Restart & Run All".
rng = np.random.default_rng(42)
indices = rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

# Consecutive, non-overlapping boundaries:
#   [0, train_end)        -> training
#   [train_end, val_end)  -> validation
#   [val_end, test_end)   -> test
train_end = training_samples
val_end = train_end + validation_samples
test_end = val_end + testing_samples

x_train, y_train = data[:train_end], labels[:train_end]
x_val, y_val = data[train_end:val_end], labels[train_end:val_end]
# The test slice must start where the validation slice ends. Starting it at
# `validation_samples` would take rows that already belong to the training
# slice and make the test scores meaningless.
x_test, y_test = data[val_end:test_end], labels[val_end:test_end]

In [None]:
# importing the GloVe word embeddings
import urllib.request as urllib2

# Maps each GloVe token (str) to its float32 coefficient vector.
embeddings_index = {}
# `with` guarantees the HTTP response is closed even if parsing fails.
with urllib2.urlopen("https://bads-dl.s3.ap-southeast-1.amazonaws.com/dataset/glove.6B.100d.txt") as f:
    for line in f:
        # The response is iterated as bytes: first field is the token, the
        # remaining fields are its coefficients.
        values = line.split()
        # Decode the token so that the keys are `str`, like the keys of the
        # tokenizer's `word_index`; with bytes keys every later
        # `embeddings_index.get(word)` lookup would miss.
        word = values[0].decode('utf-8')
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found', len(embeddings_index), 'word index')

# Build the (max_words, embedding_dim) weight matrix that the Embedding
# layers are initialised with. Row i holds the pre-trained vector of the
# word whose tokenizer index is i; rows without a pre-trained vector stay zero.
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for token, token_id in word_index.items():
    # Indices at or beyond max_words have no row in the matrix.
    if token_id >= max_words:
        continue
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[token_id] = vector

In [None]:
# SimpleRNN model: pre-trained embedding -> SimpleRNN(32) -> sigmoid output
embedding_layer = Embedding(max_words, embedding_dim, input_length=maxlen)
model = Sequential([
    embedding_layer,
    # Only the final state is passed on (return_sequences left at its default
    # of False) so the network emits ONE prediction per SMS, matching the one
    # label per SMS and the LSTM/GRU models it is compared against.
    # With return_sequences=True the Dense layer would produce a prediction
    # for every timestep instead.
    SimpleRNN(32),
    Dense(1, activation='sigmoid'),
])
# Initialise the embedding with the pre-built matrix and keep it fixed
# during training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
model.summary()

In [None]:
print("Dimension before changed:") 
print(x_train.shape,':',y_train.shape)
print(x_val.shape,':',y_val.shape)
print(x_test.shape,':',y_test.shape)

# Use explicit target shapes instead of appending a new axis: appending
# grows the rank every time the cell is re-run, whereas reshape gives the
# same result no matter how often it is executed.
#   * inputs stay 2-D (samples, maxlen), the integer-sequence layout the
#     Embedding layer is declared with (input_length=maxlen);
#   * labels become a (samples, 1) column to line up with the Dense(1) output.
x_train = x_train.reshape(-1, maxlen)
y_train = y_train.reshape(-1, 1)
x_val = x_val.reshape(-1, maxlen)
y_val = y_val.reshape(-1, 1)
x_test = x_test.reshape(-1, maxlen)
y_test = y_test.reshape(-1, 1)

print("\nDimension after changed:") 
print(x_train.shape,':',y_train.shape)
print(x_val.shape,':',y_val.shape)
print(x_test.shape,':',y_test.shape)

In [None]:
# Compile and train the SimpleRNN model, timing the fit() call.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
tik = time.time()
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)
tok = time.time()
# start timestamp, end timestamp, elapsed seconds
print(tik, tok, tok - tik)

In [None]:
# evaluating the performance of the model
# NOTE: must run right after the SimpleRNN training cell; `model`, `tik`
# and `tok` are rebound by the later LSTM/GRU cells.
values_1 = model.evaluate(x_test, y_test)
time_1 = tok - tik  # training duration in seconds
# Keep the result as the cell's last expression so the notebook displays it.
values_1

In [None]:
# Collect the per-epoch metrics recorded while training the SimpleRNN model.
rnn_metrics = history.history
df_1 = pd.DataFrame({
    'Training Accuracy': rnn_metrics['acc'],
    'Validation Accuracy': rnn_metrics['val_acc'],
    'Training Loss': rnn_metrics['loss'],
    'Validation Loss': rnn_metrics['val_loss'],
})
df_1['Epochs'] = range(1, len(df_1) + 1)  # 1-based epoch number

# Training vs. validation accuracy per epoch.
accuracy_columns = ['Training Accuracy', 'Validation Accuracy']
fig = px.line(
    df_1,
    x='Epochs',
    y=accuracy_columns,
    title='Training and Validation Accuracy for the SimpleRNN model',
)
fig.show()

In [None]:
# Training vs. validation loss per epoch for the SimpleRNN model.
loss_columns = ['Training Loss', 'Validation Loss']
fig = px.line(
    df_1,
    x='Epochs',
    y=loss_columns,
    title='Training and Validation Loss for the SimpleRNN model',
)
fig.show()

In [None]:
# LSTM model: pre-trained embedding -> LSTM(32) -> sigmoid output
embedding_layer = Embedding(max_words, embedding_dim, input_length=maxlen)
model = Sequential([
    embedding_layer,
    LSTM(32),
    Dense(1, activation='sigmoid'),
])
# Initialise the embedding with the pre-built matrix and keep it fixed
# during training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
model.summary()

In [None]:
# Compile and train the LSTM model, timing the fit() call.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
tik = time.time()
lstm = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)
tok = time.time()
# start timestamp, end timestamp, elapsed seconds
print(tik, tok, tok - tik)

In [None]:
# evaluating the performance of the model
# NOTE: must run right after the LSTM training cell; `model`, `tik` and
# `tok` are rebound by the later GRU cells.
values_2 = model.evaluate(x_test, y_test)
time_2 = tok - tik  # training duration in seconds
# Keep the result as the cell's last expression so the notebook displays it.
values_2

In [None]:
# Collect the per-epoch metrics recorded while training the LSTM model.
lstm_metrics = lstm.history
df_2 = pd.DataFrame({
    'Training Accuracy': lstm_metrics['acc'],
    'Validation Accuracy': lstm_metrics['val_acc'],
    'Training Loss': lstm_metrics['loss'],
    'Validation Loss': lstm_metrics['val_loss'],
})
df_2['Epochs'] = range(1, len(df_2) + 1)  # 1-based epoch number

# Training vs. validation accuracy per epoch.
accuracy_columns = ['Training Accuracy', 'Validation Accuracy']
fig = px.line(
    df_2,
    x='Epochs',
    y=accuracy_columns,
    title='Training and Validation Accuracy for the LSTM model',
)
fig.show()

In [None]:
# Training vs. validation loss per epoch for the LSTM model.
loss_columns = ['Training Loss', 'Validation Loss']
fig = px.line(
    df_2,
    x='Epochs',
    y=loss_columns,
    title='Training and Validation Loss for the LSTM model',
)
fig.show()

In [None]:
# GRU model: pre-trained embedding -> GRU(32) -> sigmoid output
embedding_layer = Embedding(max_words, embedding_dim, input_length=maxlen)
model = Sequential([
    embedding_layer,
    GRU(32),
    Dense(1, activation='sigmoid'),
])
# Initialise the embedding with the pre-built matrix and keep it fixed
# during training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
model.summary()

In [None]:
# Compile and train the GRU model, timing the fit() call.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
tik = time.time()
gru = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)
tok = time.time()
# start timestamp, end timestamp, elapsed seconds
print(tik, tok, tok - tik)

In [None]:
# evaluating the performance of the model
# NOTE: must run right after the GRU training cell, while `model`, `tik`
# and `tok` still refer to the GRU run.
values_3 = model.evaluate(x_test, y_test)
time_3 = tok - tik  # training duration in seconds
# Keep the result as the cell's last expression so the notebook displays it.
values_3

In [None]:
# Collect the per-epoch metrics recorded while training the GRU model.
gru_metrics = gru.history
df_3 = pd.DataFrame({
    'Training Accuracy': gru_metrics['acc'],
    'Validation Accuracy': gru_metrics['val_acc'],
    'Training Loss': gru_metrics['loss'],
    'Validation Loss': gru_metrics['val_loss'],
})
df_3['Epochs'] = range(1, len(df_3) + 1)  # 1-based epoch number

# Training vs. validation accuracy per epoch.
accuracy_columns = ['Training Accuracy', 'Validation Accuracy']
fig = px.line(
    df_3,
    x='Epochs',
    y=accuracy_columns,
    title='Training and Validation Accuracy for the GRU model',
)
fig.show()

In [None]:
# Training vs. validation loss per epoch for the GRU model.
loss_columns = ['Training Loss', 'Validation Loss']
fig = px.line(
    df_3,
    x='Epochs',
    y=loss_columns,
    title='Training and Validation Loss for the GRU model',
)
fig.show()

In [None]:
# comparing the evaluation performance of all the models
model_names = ['SimpleRNN', 'LSTM', 'GRU']
evaluations = [values_1, values_2, values_3]

fig = make_subplots(rows=1, cols=2, subplot_titles=('Evaluation Loss',  'Evaluation Accuracy'))

# Each evaluate() result is indexed as [0] -> loss, [1] -> accuracy
# (the models are compiled with metrics=['acc']). One bar trace per metric,
# placed in the subplot column with the matching title.
for column, (metric_name, metric_position) in enumerate([('Loss', 0), ('Accuracy', 1)], start=1):
    fig.add_trace(
        go.Bar(name=metric_name,
               x=model_names,
               y=[result[metric_position] for result in evaluations]),
        row=1,
        col=column)

# Title spelling corrected ("Evaulation" -> "Evaluation").
fig.update_layout(title_text='Evaluation Results')
fig.show()

In [None]:
# comparing the training time for each of the models 
fig = go.Figure()
# The bars show elapsed training time (tok - tik from time.time(), i.e.
# seconds), so the trace is named accordingly rather than 'Loss'.
fig.add_trace(go.Bar(name='Training time',
                     x=['SimpleRNN', 'LSTM', 'GRU'],
                     y=[time_1, time_2, time_3]))
fig.update_layout(title_text='Training time of each model',
                  xaxis_title='Model',
                  yaxis_title='Training time (seconds)')
fig.show()

As we can see from our analysis, LSTM and GRU have similar performance; however, GRU takes a little more time to train than LSTM. GRU has a lower evaluation loss than LSTM. The SimpleRNN model takes the least amount of time to train but also has the lowest accuracy.