# Implementing RNN for Spam Prediction

To start we will apply the standard RNN unit to predict a singular numerical output.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

### Loading data

In [2]:
import os
import sys

sys.path.append(os.path.abspath('..'))

In [3]:
from SpamData import SpamData
spam = SpamData()
df = spam.load_data()

In [4]:
df.head(3)

Unnamed: 0,Class,Content,Label,Text Length
0,ham,go jurong point crazy available bugis n great ...,0,16
1,ham,ok lar joking wif u oni,0,6
2,spam,free entry wkly comp win fa cup final tkts st ...,1,20


### Constants

In [5]:
EPOCHS = 20
BATCH_SIZE = 250
MAX_SEQUENCE_LENGTH = 40
RNN_SIZE = 10
EMBEDDING_SIZE = 50
MIN_WORD_FREQUENCY = 10
LEARNING_RATE = 0.0005
TRAIN_SIZE = 0.8
TEST_SIZE = 0.2

### Processing vocabulary

In [15]:
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length=MAX_SEQUENCE_LENGTH, min_frequency=MIN_WORD_FREQUENCY)
processed_text = np.array(list(vocab_processor.fit_transform(df.Content)))
vocab_size = len(vocab_processor.vocabulary_)
processed_text.shape

(5572, 40)

### Splitting data

In [16]:
input_train, input_test, label_train, label_test = train_test_split(processed_text, df[['Label']], train_size=TRAIN_SIZE, test_size=TEST_SIZE)

In [17]:
print(input_train.shape)
input_train[4]

(4457, 40)


array([ 17, 120,  68,  84,  73, 107,   0,  19,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])

In [18]:
print(input_test.shape)
input_test[4]

(1115, 40)


array([  3,   0,   0, 718,   0,   0,   1, 296, 321, 422,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])

In [19]:
print(label_train.shape)
label_train.head(3)

(4457, 1)


Unnamed: 0,Label
2687,0
265,0
1422,0


In [20]:
print(label_test.shape)
label_test.head(3)

(1115, 1)


Unnamed: 0,Label
1987,0
446,0
1415,0


### Placeholders

In [23]:
X = tf.placeholder(dtype=tf.int32, shape=[None, MAX_SEQUENCE_LENGTH], name='X')
y = tf.placeholder(dtype=tf.int32, shape=[None, 1], name='y')
keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')

print(X)
print(y)
print(keep_prob)

Tensor("X_3:0", shape=(?, 40), dtype=int32)
Tensor("y_3:0", shape=(?, 1), dtype=int32)
Tensor("keep_prob_2:0", dtype=float32)


### Embedding matrix and embedding lookup

In [25]:
embedding_mx = tf.Variable(tf.random_uniform(minval=-1.0, maxval=1.0, shape=[vocab_size, EMBEDDING_SIZE]), name='embedding_mx')
embedding_output = tf.nn.embedding_lookup(params=embedding_mx, ids=X)

print(embedding_mx)
print(embedding_output)

Tensor("embedding_mx/read:0", shape=(895, 50), dtype=float32)
Tensor("embedding_lookup_1:0", shape=(?, 40, 50), dtype=float32)


In [14]:
tf.random_uniform?

In [22]:
tf.nn.embedding_lookup?