In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPool1D
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [21]:
df = pd.read_csv("/kaggle/input/spam-text-message-classification/SPAM text message 20170820 - Data.csv")
df.head(3)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


# Data Preprocessing

## Label Mapping

In [22]:
df['label'] = df['Category'].map({
    'ham': 0,
    'spam': 1
})
df.head(3)

Unnamed: 0,Category,Message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1


## Data Splitting

In [23]:
X_train, X_val, y_train, y_val = train_test_split(df[['Message']], df['label'], test_size=0.2, random_state=12)

## Tokenize
use keras tokenizer to tokenize the message

In [24]:
MAX_VOCAB_SIZE = 2000

# keras tokenizer
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE) 

# train tokenizer
tokenizer.fit_on_texts(df['Message'])

# embed token into vector
sequences_train = tokenizer.texts_to_sequences(X_train['Message'])
sequences_test = tokenizer.texts_to_sequences(X_val['Message'])

In [25]:
word2idx = tokenizer.word_index

total_words = len(word2idx)
print(f' total unique words {total_words}')

 total unique words 9004


## Padding
ensure that vector sequences have the same length

In [26]:
# ensure that input sequences have the same length by padding them
data_train = pad_sequences(sequences_train)
print(data_train.shape)

# get sequence length
length = data_train.shape[1] # 175

# pad data test
data_test = pad_sequences(sequences_test, maxlen=length)

(4457, 175)


# Modeling

## RNN

In [27]:
D = 50
num_categories = 2

i = Input(shape=(length,))
x = Embedding(total_words + 1, D)(i)
x = SimpleRNN(32, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dense(num_categories)(x)

model1 = Model(i, x)

In [28]:
model1.compile(
  loss=SparseCategoricalCrossentropy(from_logits=True),
  optimizer='adam',
  metrics=['accuracy']
)

In [29]:
%%time

# training model
history = model1.fit(
  data_train,
  y_train,
  epochs=10,
  validation_data=(data_test, y_val)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 7min 28s, sys: 51.8 s, total: 8min 20s
Wall time: 4min 22s


## LSTM

In [30]:
D = 50
num_categories = 2

i = Input(shape=(length,))
x = Embedding(total_words + 1, D)(i)
x = LSTM(32, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dense(num_categories)(x)

model2 = Model(i, x)

**Code Explanation**
1. `D = 50`: This line initializes a variable D with the value 50. It's likely representing the dimensionality of the word embeddings.
2. `i = Input(shape=(length,))`: This line creates an input layer for the neural network. The input shape is specified as (length,), indicating that the model expects input sequences of a certain length.
3. `x = Embedding(total_words + 1, D)(i)`: This line adds an embedding layer to the model. The embedding layer is used to convert integer-encoded words into dense vectors of fixed size (D). The total_words + 1 is the input dimension, representing the vocabulary size.
4. `x = LSTM(32, return_sequences=True)(x)`: This line adds a Long Short-Term Memory (LSTM) layer with 32 units to the model. LSTMs are a type of recurrent neural network (RNN) designed to capture dependencies in sequential data. The return_sequences=True option indicates that the LSTM layer should return the full sequence of outputs for each input sequence.
5. `x = GlobalMaxPool1D()(x)`: This line adds a global max pooling layer to the model. Global max pooling reduces the dimensionality of the output from the LSTM layer by taking the maximum value over the sequence.
6. `x = Dense(total_categories)(x)`: This line adds a dense layer to the model with total_categories units. This layer is the output layer, and it produces the final predictions. The number of units in this layer corresponds to the number of categories or classes in the classification task.
7. `model = Model(i, x)`: This line creates a Keras Model, specifying the input (i) and output (x). The resulting model can be compiled and trained on a dataset for a specific NLP task, such as text classification.

In [31]:
model2.compile(
  loss=SparseCategoricalCrossentropy(from_logits=True),
  optimizer='adam',
  metrics=['accuracy']
)

In [32]:
%%time

# training model
history = model2.fit(
  data_train,
  y_train,
  epochs=10,
  validation_data=(data_test, y_val)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 42.3 s, sys: 1.85 s, total: 44.1 s
Wall time: 39.2 s


## GRU

In [33]:
D = 50
num_categories = 2

i = Input(shape=(length,))
x = Embedding(total_words + 1, D)(i)
x = GRU(32, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dense(num_categories)(x)

model3 = Model(i, x)

In [34]:
model3.compile(
  loss=SparseCategoricalCrossentropy(from_logits=True),
  optimizer='adam',
  metrics=['accuracy']
)

In [35]:
%%time

# training model
history = model3.fit(
  data_train,
  y_train,
  epochs=10,
  validation_data=(data_test, y_val)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 40.8 s, sys: 1.88 s, total: 42.7 s
Wall time: 37.3 s


## Combination

In [36]:
D = 50
num_categories = 2

i = Input(shape=(length,))
x = Embedding(total_words + 1, D)(i)
x = LSTM(16, return_sequences=True)(x)
x = GRU(32, return_sequences=True)(x)
x = GRU(64, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dense(num_categories)(x)

model4 = Model(i, x)

In [37]:
model4.compile(
  loss=SparseCategoricalCrossentropy(from_logits=True),
  optimizer='adam',
  metrics=['accuracy']
)

In [38]:
%%time

# training model
history = model4.fit(
  data_train,
  y_train,
  epochs=10,
  validation_data=(data_test, y_val)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 1min, sys: 2.4 s, total: 1min 3s
Wall time: 54.8 s


# Evaluation

In [39]:
preds = model1.predict(data_test)
# get the index of maximum prediction values (0/1)
preds = np.argmax(preds, axis=1)
print(preds)

print('confusion matrix')
print(confusion_matrix(y_val, preds))
print()
print('classification report')
print(classification_report(y_val, preds))

[0 0 0 ... 0 1 0]
confusion matrix
[[949   8]
 [ 12 146]]

classification report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       957
           1       0.95      0.92      0.94       158

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [40]:
preds = model2.predict(data_test)
# get the index of maximum prediction values (0/1)
preds = np.argmax(preds, axis=1)
print(preds)

print('confusion matrix')
print(confusion_matrix(y_val, preds))
print()
print('classification report')
print(classification_report(y_val, preds))

[0 0 0 ... 0 1 0]
confusion matrix
[[954   3]
 [ 16 142]]

classification report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       957
           1       0.98      0.90      0.94       158

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [41]:
preds = model3.predict(data_test)
# get the index of maximum prediction values (0/1)
preds = np.argmax(preds, axis=1)
print(preds)

print('confusion matrix')
print(confusion_matrix(y_val, preds))
print()
print('classification report')
print(classification_report(y_val, preds))

[0 0 0 ... 0 1 0]
confusion matrix
[[954   3]
 [ 16 142]]

classification report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       957
           1       0.98      0.90      0.94       158

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [42]:
preds = model4.predict(data_test)
# get the index of maximum prediction values (0/1)
preds = np.argmax(preds, axis=1)
print(preds)

print('confusion matrix')
print(confusion_matrix(y_val, preds))
print()
print('classification report')
print(classification_report(y_val, preds))

[0 1 0 ... 0 1 0]
confusion matrix
[[946  11]
 [ 11 147]]

classification report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       957
           1       0.93      0.93      0.93       158

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

