# Spam Detection Using LSTMs

Here we use a simple LSTM model from Keras to predict whether a given text is spam or not. 

## Importing Modules

In [53]:
import pandas as pd 
import numpy as np 
import seaborn as sns 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

import matplotlib.pyplot as plt 
%matplotlib inline 

## Importing the Dataset

In [9]:
df = pd.read_csv('smsspam.txt',sep='\t',names=['Status','Message'])

In [10]:
df.columns.values

array(['Status', 'Message'], dtype=object)

In [11]:
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
df['Status'] = df['Status'].apply(lambda x: 0 if 'ham' in x else 1)

## Text Data Preprocessing

In [44]:
# Use the Keras tokenizer
num_words = 1000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(df['Message'].values)

# Pad the data 
X = tokenizer.texts_to_sequences(df['Message'].values)
X = pad_sequences(X, maxlen=1000)

Y = to_categorical(df['Status'].values)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

In [45]:
print X.shape
print Y.shape

(5572, 1000)
(5572, 2)


## Creating the LSTM Model

In [46]:
embed_dim = 128

# Model saving callback
ckpt_callback = ModelCheckpoint('keras_model', monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

In [47]:
model = Sequential()

model.add(Embedding(num_words, embed_dim, input_length=X.shape[1]))

model.add(LSTM(units=196, recurrent_dropout=0.2, dropout=0.2))

model.add(Dense(units=2,activation='softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Training the LSTM

In [48]:
model.fit(X_train, Y_train, epochs=5, batch_size=32, validation_split=0.1, callbacks=[ckpt_callback])

Train on 4011 samples, validate on 446 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x13c389b90>

## Accuracy Tests

In [50]:
Y_pred = model.predict(X_test)

y_test_class = np.argmax(Y_test, axis=1)
y_pred_class = np.argmax(Y_pred, axis=1)

In [51]:
model.evaluate(X_test, Y_test)



[0.037049245740383062, 0.99013452968255289]

In [54]:
accuracy_score(y_test_class, y_pred_class)

0.99013452914798206

In [66]:
# 0 is not spam, 1 is spam.
CM = confusion_matrix(y_test_class, y_pred_class, labels=[0, 1])


TP = CM[1, 1]
TN = CM[0, 0]
FP = CM[0, 1]
FN = CM[1, 0]

In [67]:
print CM

[[961   1]
 [ 10 143]]


In [68]:
# CLASSIFICATION ACCURACY
print (TP + TN) / float(TP + TN + FP + FN)

0.990134529148


In [69]:
# CLASSIFICATION ERROR
print (FP + FN) / float(TP + TN + FP + FN)

0.00986547085202


In [70]:
# Sensitivity: How sensitive is the model in predicting positive instances?
print TP / float(TP + FN)

0.934640522876


In [71]:
# Specificity: When it's actually no, how often does it predict no?
# True Negative Rate
print TN / float(TN + FP)

0.99896049896


In [72]:
# False Positive Rate: When it's actually no, how often does it predict yes?
print FP / float(TN + FP)

0.0010395010395


In [73]:
# Precision: When it predicts yes, how often is it correct?
print TP / float(TP + FP)

0.993055555556
