<a href="https://www.kaggle.com/jashtailor/spam-sms-classification-using-simplernn-lstm-gru?scriptVersionId=84209054" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sms-spam-collection-dataset/spam.csv
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.50d.txt
/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt


In [2]:
# import the necessary libraries
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential 
from keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense

import warnings
warnings.filterwarnings("ignore")
import os
os.environ["KMP_SETTINGS"] = "false"

from sklearn import preprocessing
import time

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
# import the dataset and cleaning it 
df = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding = 'ISO-8859-1')
df.rename(columns={'v1': 'Classification', 'v2': 'SMS'}, inplace=True)
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.head()

Unnamed: 0,Classification,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# preprocessing the data by tokenizing and padding the textual data and encoding the categorical classification labels
maxlen = 100
training_samples = 3000
validation_samples = 2000
testing_samples = 572
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['SMS'])
sequences = tokenizer.texts_to_sequences(df['SMS'])
word_index = tokenizer.word_index
print('Found ', len(word_index), ' unique tokens.')

label_encoder = preprocessing.LabelEncoder()
classification = label_encoder.fit_transform(df['Classification'])

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(classification)
print('Shape of the tensor containing the SMSes:', data.shape)
print('Shape of the tensor containing the classifcation labels:', labels.shape)

# splitting the data into train, validation and test sets
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
x_test = data[validation_samples: validation_samples + testing_samples]
y_test = labels[validation_samples: validation_samples + testing_samples]

Found  8920  unique tokens.
Shape of the tensor containing the SMSes: (5572, 100)
Shape of the tensor containing the classifcation labels: (5572,)


In [5]:
# importing the GloVe word embeddings 
glove_dir = '/kaggle/input/glove-global-vectors-for-word-representation/'

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found', len(embeddings_index), 'word index')

# creating a embedding matrix of size (max_words, embedding_dim) which can be loaded in the embedding layer 
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

Found 400000 word index


In [6]:
# SimpleRNN model
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(SimpleRNN(32, return_sequences=True))
model.add(Dense(1, activation='sigmoid'))
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          1000000   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 100, 32)           4256      
_________________________________________________________________
dense (Dense)                (None, 100, 1)            33        
Total params: 1,004,289
Trainable params: 4,289
Non-trainable params: 1,000,000
_________________________________________________________________


2022-01-02 08:27:11.552511: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [7]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['acc'])
tik = time.time()
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))
tok = time.time()
print(tik, tok, tok-tik)

Epoch 1/10


2022-01-02 08:27:11.882539: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1641112031.8244216 1641112073.286083 41.46166133880615


In [8]:
# evaluating the performance of the model
values_1 = model.evaluate(x_test,y_test)
values_1
time_1 = tok-tik 



In [9]:
df_1 = pd.DataFrame()
df_1['Training Accuracy'] = history.history['acc']
df_1['Validation Accuracy'] = history.history['val_acc']
df_1['Training Loss'] = history.history['loss']
df_1['Validation Loss'] = history.history['val_loss']
df_1['Epochs'] = range(1, len(df_1['Training Accuracy']) + 1)

# comparing the training and validation accuracy 
fig = px.line(df_1, x='Epochs', y=['Training Accuracy', 'Validation Accuracy'], title='Training and Validation Accuracy for the SimpleRNN model')
fig.show()

In [10]:
# comparing the training and validation loss
fig = px.line(df_1, x='Epochs', y=['Training Loss', 'Validation Loss'], title='Training and Validation Loss for the SimpleRNN model')
fig.show()

In [11]:
# LSTM model
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                17024     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 1,017,057
Trainable params: 17,057
Non-trainable params: 1,000,000
_________________________________________________________________


In [12]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['acc'])
tik = time.time()
lstm = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))
tok = time.time()
print(tik, tok, tok-tik)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1641112076.6837559 1641112161.6604123 84.97665643692017


In [13]:
# evaluating the performance of the model
values_2 = model.evaluate(x_test,y_test)
values_2
time_2 = tok-tik



In [14]:
df_2 = pd.DataFrame()
df_2['Training Accuracy'] = lstm.history['acc']
df_2['Validation Accuracy'] = lstm.history['val_acc']
df_2['Training Loss'] = lstm.history['loss']
df_2['Validation Loss'] = lstm.history['val_loss']
df_2['Epochs'] = range(1, len(df_2['Training Accuracy']) + 1)

# comparing the training and validation accuracy 
fig = px.line(df_2, x='Epochs', y=['Training Accuracy', 'Validation Accuracy'], title='Training and Validation Accuracy for the LSTM model')
fig.show()

In [15]:
# comparing the training and validation loss
fig = px.line(df_2, x='Epochs', y=['Training Loss', 'Validation Loss'], title='Training and Validation Loss for the LSTM model')
fig.show()

In [16]:
# GRU model
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(GRU(32))
model.add(Dense(1, activation='sigmoid'))
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
gru (GRU)                    (None, 32)                12864     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 1,012,897
Trainable params: 12,897
Non-trainable params: 1,000,000
_________________________________________________________________


In [17]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['acc'])
tik = time.time()
gru = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))
tok = time.time()
print(tik, tok, tok-tik)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1641112167.238597 1641112271.150785 103.9121880531311


In [18]:
# evaluating the performance of the model
values_3 = model.evaluate(x_test,y_test)
values_3
time_3 = tok-tik



In [19]:
df_3 = pd.DataFrame()
df_3['Training Accuracy'] = gru.history['acc']
df_3['Validation Accuracy'] = gru.history['val_acc']
df_3['Training Loss'] = gru.history['loss']
df_3['Validation Loss'] = gru.history['val_loss']
df_3['Epochs'] = range(1, len(df_3['Training Accuracy']) + 1)

# comparing the training and validation accuracy 
fig = px.line(df_3, x='Epochs', y=['Training Accuracy', 'Validation Accuracy'], title='Training and Validation Accuracy for the GRU model')
fig.show()

In [20]:
# comparing the training and validation loss
fig = px.line(df_3, x='Epochs', y=['Training Loss', 'Validation Loss'], title='Training and Validation Loss for the GRU model')
fig.show()

In [21]:
# comparing the evaluation performance of all the models
fig = make_subplots(rows=1, cols=2, subplot_titles=('Evaluation Loss',  'Evaluation Accuracy'))

fig.add_trace(
    go.Bar(name='Loss', 
           x=['SimpleRNN', 'LSTM', 'GRU'], 
           y=[values_1[0], values_2[0], values_3[0]]),
           row=1, 
           col=1)

fig.add_trace(
    go.Bar(name='Accuracy', 
           x=['SimpleRNN', 'LSTM', 'GRU'], 
           y=[values_1[1], values_2[1], values_3[1]]),
           row=1, 
           col=2)

fig.update_layout(title_text='Evaulation Results')
fig.show()

In [22]:
# comparing the training time for each of the models 
fig = go.Figure()
fig.add_trace(go.Bar(name='Loss', 
                     x=['SimpleRNN', 'LSTM', 'GRU'], 
                     y=[time_1, time_2, time_3]))
fig.update_layout(title_text='Training time of each model')
fig.show()

As we can from our analysis, LSTM and GRU have similar performance however, GRU takes a little more time to train as compared to LSTM. GRU have a lower evaluation loss as compared to LSTM. SimpleRNN model takes the least amount of time but also have the lowest accuracy.