Data used for this notebook: [SMS Spam Collection Dataset](https://www.kaggle.com/uciml/sms-spam-collection-dataset)

In [1]:
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Dropout, Flatten, Embedding
from tensorflow.keras import Sequential
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv('/content/spam.csv', encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


We will use only label and text data

In [3]:
data = data[['v1', 'v2']]

Clean the text data from links, emails, symbols..

In [4]:
def get_clean_text(x):
    x = re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x) 
    #regex to remove to emails(above)
    x = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x)
    #regex to remove URLs
    x = re.sub('RT', "", x)
    #substitute the 'RT' retweet tags with empty spaces
    x = re.sub('[^A-Z a-z]+', '', x)
    return x

data['v2'] = data['v2'].apply(lambda x: get_clean_text(x))
data.head()

Unnamed: 0,v1,v2
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in a wkly comp to win FA Cup final...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...


Manually label encode y values, and split the data into x,y train and test values. Check the shapes

In [5]:
X = data.v2
y = data.v1.map({'ham':0, 'spam':1})

train_size = int(len(data)*0.8)

X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

#Print the shapes
print("X_train shape: ",X_train.shape)
print("X_test shape: ",X_test.shape)
print("y_train shape: ",y_train.shape)
print("y_test shape: ",y_test.shape)

X_train shape:  (4457,)
X_test shape:  (1115,)
y_train shape:  (4457,)
y_test shape:  (1115,)


Decide on how big should the max_length be, based on average sentence length and other parameters

In [6]:
print("Max sentence length:", X.map(len).max())
print("Min sentence length:", X.map(len).min())
print("Average sentence length:", X.map(len).mean())

chars = sorted(list(set(X)))
print('Total words:', len(chars))

Max sentence length: 888
Min sentence length: 0
Average sentence length: 73.23869346733669
Total words: 5119


Apply tokenizer only on X_train, we cannot leak X_test words to the tokenizer.
Create paddings for train and test data

In [7]:
max_length = 80
embedding_dim = 32
batch_size = 32

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen = max_length, padding='pre', truncating='pre')

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen = max_length, padding='pre', truncating='pre')

In [8]:
train_data = tf.data.Dataset.from_tensor_slices((train_padded, y_train))
test_data = tf.data.Dataset.from_tensor_slices((test_padded, y_test))

In [9]:
model = Sequential([
                    Embedding(len(word_index)+1, embedding_dim, input_length=max_length),
                    LSTM(50),
                    Dense(32, activation='relu'),
                    Dense(1)
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 80, 32)            243104    
_________________________________________________________________
lstm (LSTM)                  (None, 50)                16600     
_________________________________________________________________
dense (Dense)                (None, 32)                1632      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 261,369
Trainable params: 261,369
Non-trainable params: 0
_________________________________________________________________


In [10]:
class CustomFit(tf.keras.Model):
  def __init__(self, model):
    super(CustomFit, self).__init__()
    self.model = model

  def compile(self, optimizer, loss):
    super(CustomFit, self).compile()
    self.optimizer=optimizer
    self.loss = loss

  def train_step(self, data):
    x, y = data
    
    with tf.GradientTape() as tape:
      y_pred = self.model(x, training=True)
      loss = self.loss(y, y_pred)

    training_vars = self.trainable_variables
    gradients = tape.gradient(loss, training_vars)

    self.optimizer.apply_gradients(zip(gradients, training_vars))
    acc_metric.update_state(y, y_pred)

    return {"loss": loss, "accuracy": acc_metric.result()}
  
  def test_step(self, data):
    x, y = data
    y_pred = self.model(x, training=False)
    loss = self.loss(y, y_pred)
    acc_metric.update_state(y, y_pred)

    return{"loss":loss, "accuracy":acc_metric.result()}

acc_metric = tf.keras.metrics.BinaryAccuracy(name='accuracy')

training = CustomFit(model)
training.compile(optimizer=tf.keras.optimizers.Adam(),
                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))
training.fit(train_padded, y_train, batch_size=batch_size, epochs=20)
training.evaluate(test_padded, y_test, batch_size=batch_size)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[]