# Author__ Hussam Qassim__

# Spam Detection using BLSTM neural network

# Setup

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Import the main necessary libraries
import os
import warnings
from distutils.version import LooseVersion
from datetime import datetime
import numpy as np
import tensorflow as tf

# To make this notebook's output stable across runs
def rset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# Check TensorFlow Version. Please use TensorFlow version 1.0 or newer
assert LooseVersion(tf.__version__) >= LooseVersion('1.0')
print('TensorFlow Version: {}'.format(tf.__version__))

TensorFlow Version: 1.3.0


# Load data

In [2]:
'''
The SMS Spam Collection v.1 (hereafter the corpus) is a set of SMS tagged messages that have been collected for
SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged acording being 
ham (legitimate) or spam. 
'''
# Load the data
with open('dataset/SMSSpamCollection', 'r') as f:
    data = f.read()
    
print('Done..')

Done..


# Preprocessing


In [3]:
# Remove punctuation and lowercase the dataset
from string import punctuation
all_text = ''.join([c for c in data if c not in punctuation])
all_text = all_text.lower()

# Split label and text of each line
messages = all_text.split('\n')
messages = [x.split('\t') for x in messages if len(x)>=1]
[labels, texts] = np.array([list(x) for x in zip(*messages)])
print('Done..')

Done..


In [4]:
# Print samples of label and text
print("Example: ")
print("Label: {},\tText: {}".format(labels[0],texts[0]))
print("Label: {},\tText: {}".format(labels[1],texts[1]))
print("Label: {},\tText: {}".format(labels[2],texts[2]))

Example: 
Label: ham,	Text: go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
Label: ham,	Text: ok lar joking wif u oni
Label: spam,	Text: free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s


In [5]:
# Our labels are "spam" or "ham". To use these labels in our network, we need to convert them to 0 and 1
labels = np.array([1 if each == 'spam' else 0 for each in labels])
print('labels: ', labels)

labels:  [0 0 1 ..., 0 0 0]


In [6]:
# Build our vocabulary words from the review texts 
all_text = ' '.join(texts)
words = all_text.split()
print("The vocabulary words samples:\n", words[:20]) 

The vocabulary words samples:
 ['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']


# Encoding the words

In [7]:
# Encode the words with integers and build a dictionary that maps words to integers
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse = True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab)}
print('Vocabulary to integer:\n', vocab_to_int)

Vocabulary to integer:


In [8]:
# Convert each word in the review texts to corresponding integers number
text_ints = []
for each in texts:
    text_ints.append([vocab_to_int[word] for word in each.split()])
print('text_ints:\n',text_ints[3] )

text_ints:
 [5, 231, 140, 23, 355, 2911, 5, 160, 143, 59, 140]


In [9]:
# Get the lenght of the review texts 
from collections import Counter
text_lens = Counter([len(x) for x in text_ints])
print("Zero-length text: {}".format(text_lens[0]))
print("Maximum text length: {}".format(max(text_lens)))

Zero-length text: 2
Maximum text length: 171


In [10]:
# Get the index of nonzero in the review texts
non_zero_idx = [ii for ii, texts in enumerate(text_ints) if len(texts) != 0]
print('The lenght of nonzero reviews: {}'.format(len(non_zero_idx)))
print('The lenght of all the reviews: {}'.format(len(texts)))

The lenght of nonzero reviews: 5572
The lenght of all the reviews: 5574


In [11]:
# Fillter out that review with 0 length
text_ints = [text_ints[ii] for ii in non_zero_idx]
labels = np.array([labels[ii] for ii in non_zero_idx])
print('Done..')

Done..


In [12]:
'''
Create an array features that contains the data we'll pass to the network. Each row should be 170 elements long.
For text shorter than 170 words, left pad with 0s. For text longer than 170, use on the first 170 words as 
the feature vector.
'''
seq_len = 170
features = np.zeros((len(text_ints), seq_len), dtype=int)
for i, row in enumerate(text_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

print('The features array: {}'.format(features[0]))

The features array: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
   45  440 4392  795  712  678   63    8 1250   89  120  354 1251  152 2907
 1252   67   56 4393  136]


# Build the Training, Validation and Test datasets

In [13]:
# Divide the dataset into 80% Training, 10% Validation, and 10% Testing  
split_idx = int(len(features)*0.8)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(4457, 170) 
Validation set: 	(557, 170) 
Test set: 		(558, 170)


# Build the Neural Network

In [15]:
# Define the Neural Network parameters
lstm_size = 64
batch_size = 250
n_layers = 2
drop_out = 0.5
learning_rate = 0.001
epochs = 10

print('Done..')

Done..


In [16]:
# Create TF Placeholders for the Neural Network
n_words = len(vocab_to_int)
print(n_words)

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')   
print('Done..')

9661
Done..


# Construction phase_ Build the TF graph 

### Embedding

In [16]:
'''
There are about 1000 words in our vocabulary. It is massively inefficient to one-hot encode. 
Instead of one-hot encoding, we can have an embedding layer and use that layer as a lookup table.
'''
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

with graph.as_default():
# random_uniform()function creates a node in the graph that will generate a tensor containing random values
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1)) 
    embed = tf.nn.embedding_lookup(embedding, inputs_)

print('Done..')

Done..


### Build NN Cell and Initialize

In [17]:
'''
Bidirectional LSTMs train two instead of one LSTMs on the input sequence. The first on the input sequence 
as-is and the second on a reversed copy of the input sequence. This can provide additional context to the 
network and result in faster and even fuller learning on the problem
'''
# Build one BLSTM and Fully_connected layer with Softmax classifier
with graph.as_default():
    
# Using He initialization can significantly reduce the vanishing/exploding gradients problems   
    he_init = tf.contrib.layers.variance_scaling_initializer()

# Create the graph of 2 hidden BLSTM layer and one output layer
    with tf.name_scope("BLSTM"):
        def lstm_cell():
            '''
This LSTM variant with extra connections called peephole connections: the previous long-term state is added as
an input to the controllers of the forget gate and the input gate, and the current long-term tate is added as
input to the controller of the output gate
'''
            cell = tf.contrib.rnn.LSTMCell(num_units=lstm_size, initializer=he_init, use_peepholes=True)
            '''
            Applying dropout between the LSTM layers to prevent overfitting the training set.The following 
code applies dropout to the inputs of each layer in the LSTM, dropping each input with a 50% probability
'''
            cell_drop = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob)
            return cell_drop
    
        cell_fw = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(n_layers)])
        cell_bw = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(n_layers)])
   
        outputs, final_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw, cell_bw=cell_bw, 
                                                          inputs=embed, dtype=tf.float32, 
                                                          scope="BiLSTM")
        outputs = tf.concat(axis = 2, values = outputs)
        last_output = outputs[:,-1,:]
    
        logits = tf.contrib.layers.fully_connected(last_output, 2, activation_fn=None, scope="logits")
        
# Create the cost function     
    with tf.name_scope("loss"): 

# Computes the cross entropy, it is equivalent to applying the softmax activation function and then
# computing the cross entropy, but it is more efficient, and it properly takes care of corner cases like logits 
# equal to 0

        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_, logits=logits)
# Computes the mean of elements across dimensions of a tensor
        loss = tf.reduce_mean(xentropy, name="loss")
                
# Craete the optimizer 
    with tf.name_scope("train"): 
# Applying clip the gradients technique to lessen the exploding gradient problem in the LSTM
        threshold = 1.0
# Using Adam as optimizer because it is combines the ideas of Momentum optimization and RMSProp
# Adam is an adaptive learning rate algorithm, it requires less tuning of the learning rate hyperparameter η. 
# We can often use the default value η = 0.001
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars = optimizer.compute_gradients(loss)
        capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                                     for grad, var in grads_and_vars]
        training_op = optimizer.apply_gradients(capped_gvs)
        
# Evaluate the NN     
    with tf.name_scope("eval"):    
        correct = tf.nn.in_top_k(logits, labels_, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy" )

print('Done..')

Done..


### Baching the dataset

In [18]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]
        
print('Done..')

Done..


# Execution phase_Execute the TF graph 

### Training the model

In [19]:
# initialize a name and file directory for TensorBoard
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

# Create a Saver node
with graph.as_default():
    saver = tf.train.Saver()
    
# Create early stopping      
early_stopping = 0 

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer()) # initialize all variables
    
# Creates a node in the graph that will evaluate the reduce_mean value and write it to a TensorBoard 
# compatible binary log string called a summary 
    loss_summary = tf.summary.scalar("accuracy", accuracy)

# Creates a FileWriter that you will use to write summaries to logfiles in the log directory
    file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

    print('training ..')
    
    for epoch in range(epochs):
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            x = x.reshape((-1, seq_len))
# Update the execution phase to evaluate the loss_summary node regularly during training(every 10 mini-batches)
            if ii % 10 == 0:
                summary_str = loss_summary.eval(feed_dict={inputs_: x, labels_: y, keep_prob: drop_out})
                step = epochs * batch_size + ii
                file_writer.add_summary(summary_str, step)
            sess.run(training_op, feed_dict={inputs_: x, labels_: y, keep_prob: drop_out})
        acc_train = accuracy.eval(feed_dict={inputs_: x, labels_: y, keep_prob: drop_out})
        for x, y in get_batches(val_x, val_y, batch_size):
            x = x.reshape((-1, seq_len))
            acc_val = accuracy.eval(feed_dict={inputs_: x, labels_: y, keep_prob: 1})
        print(epoch, "Train accuracy:", acc_train, "Validation accuracy:", acc_val)
        if acc_val >= early_stopping:
            # Save the best trained model
            saver.save(sess, "checkpoints/sentiment.ckpt")
            early_stopping = acc_val
                        
file_writer.close()

training ..
0 Train accuracy: 0.828 Validation accuracy: 0.864
1 Train accuracy: 0.884 Validation accuracy: 0.904
2 Train accuracy: 0.96 Validation accuracy: 0.96
3 Train accuracy: 0.968 Validation accuracy: 0.976
4 Train accuracy: 0.984 Validation accuracy: 0.98
5 Train accuracy: 0.988 Validation accuracy: 0.984
6 Train accuracy: 0.992 Validation accuracy: 0.988
7 Train accuracy: 0.992 Validation accuracy: 0.988
8 Train accuracy: 0.996 Validation accuracy: 0.988
9 Train accuracy: 0.984 Validation accuracy: 0.984


### Testing the model

In [20]:
with tf.Session(graph=graph) as sess:

    print('Loading the saved checkpoint..')
# Load the saved model 
    saver.restore(sess, "checkpoints/sentiment.ckpt")
    print('Testing..')
# Baching the test dataset
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        x = x.reshape((-1, seq_len))
        acc_test = accuracy.eval(feed_dict={inputs_: x, labels_: y, keep_prob: 1})
    print("Test accuracy:", acc_test)

Loading the saved checkpoint..
INFO:tensorflow:Restoring parameters from checkpoints/sentiment.ckpt
Testing..
Test accuracy: 0.984
