# Working with Bag of Words

In [1]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.contrib import learn
from SpamData import SpamData
from Partition import partition_indices
%matplotlib inline

# Create the TensorFlow session that later cells use to run the graph (sess.run).
sess = tf.Session()

### Preparing Data

In [2]:
# Load the spam dataset through the project's SpamData helper.
# The preview output below shows the columns Class, Content, Label and Text Length.
spamData = SpamData()
spamDf = spamData.load_data()

In [3]:
# Preview the first rows of the loaded data.
spamDf.head()

Unnamed: 0,Class,Content,Label,Text Length
0,ham,go jurong point crazy available bugis n great ...,0,16
1,ham,ok lar joking wif u oni,0,6
2,spam,free entry wkly comp win fa cup final tkts st ...,1,20
3,ham,u dun say early hor u c already say,0,9
4,ham,nah i dont think goes usf lives around though,0,9


In [None]:
# Split the row positions (0 .. number of rows - 1) into train and test index sets.
# NOTE(review): partition_indices is a project helper; its split ratio and whether
# it shuffles are not visible here — confirm before relying on them.
trainIdx, testIdx = partition_indices(spamDf.shape[0])

In [None]:
# Select the training rows by position, then show their shape and a preview.
trainData = spamDf.iloc[trainIdx]
print(trainData.shape)
trainData.head()

In [None]:
# Select the held-out test rows by position, then show their shape and a preview.
testData = spamDf.iloc[testIdx]
print(testData.shape)
testData.head()

### Plotting Data

In [None]:
# Histogram of the 'Text Length' column using 50 bins.
spamDf.hist(column='Text Length', bins=50)

### Identifying Embedding Size

In [None]:
# Number of word ids per message; the X placeholder is declared with this length.
MAX_SENT_SIZE = 35
# Frequency threshold handed to the vocabulary processor as `min_frequency`.
MIN_WORD_FREQ = 3

vocabProcessor = learn.preprocessing.VocabularyProcessor(
    max_document_length=MAX_SENT_SIZE,
    min_frequency=MIN_WORD_FREQ,
)
# Fit the vocabulary on every message and get the word-id sequences.
wordIdx = vocabProcessor.fit_transform(spamDf['Content'])
# Width of the one-hot encoding: number of entries in the fitted vocabulary.
embeddingSize = len(vocabProcessor.vocabulary_)
print(type(wordIdx))
embeddingSize

-----
**TEST** — scratch cells that inspect the fitted vocabulary

-----

In [None]:
# Grab the dictionary held in the vocabulary's private `_mapping` attribute
# (presumably token -> id; verify against the sorted preview below).
vocab = vocabProcessor.vocabulary_._mapping

In [None]:
# Show the container type and how many entries the vocabulary holds.
print(type(vocab))
print(len(vocab))

In [None]:
# Order the (key, value) pairs by value, largest value first.
sortedVocab = sorted(
    vocab.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# First ten pairs of the value-sorted vocabulary.
sortedVocab[:10]

In [None]:
import itertools

# Peek at the first two items of wordIdx.
# NOTE(review): if wordIdx is a one-shot iterator this consumes those two items —
# check the type printed when wordIdx was created.
list(itertools.islice(wordIdx, 2))

-----

### Embedding Matrix

In [None]:
# Identity matrix used as a fixed lookup table: row k is the one-hot vector for word id k.
embeddingMx = tf.convert_to_tensor(
    np.identity(embeddingSize, dtype=np.float32)
)
embeddingMx

### Declaring Parameters and Placeholders

In [None]:
# Trainable parameters: one weight per vocabulary entry and a single bias term.
W = tf.Variable(tf.truncated_normal([embeddingSize, 1]), name='W')
b = tf.Variable(tf.zeros([1, 1]), name='b')
print(W)
print(b)

# Per-example inputs: the word ids of one message and its 1x1 label.
X = tf.placeholder(tf.int32, shape=[MAX_SENT_SIZE], name='X')
y = tf.placeholder(tf.float32, shape=[1, 1], name='y')
print(X)
print(y)

### Mapping Indices of Words

In [None]:
# Turn each word id into its one-hot row of embeddingMx ...
X_embed = tf.nn.embedding_lookup(params=embeddingMx, ids=X, name='X_embed')
# ... and sum over the word axis to get one bag-of-words count vector per message.
X_sums = tf.reduce_sum(X_embed, axis=0, name='X_sums')

print(X_embed)
print(X_sums)

### Calculating Outputs and Loss Function

In [None]:
# Add a leading axis so the bag-of-words vector can be matrix-multiplied with W.
X_sums_2D = tf.expand_dims(X_sums, 0, name='X_sums_2D')
# Linear model: logits = X_sums_2D . W + b
output = tf.add(tf.matmul(X_sums_2D, W), b, name='output')
print(X_sums_2D)
print(output)

# Probability of the positive class, used for evaluation only.
prediction = tf.sigmoid(output, name='prediction')
# The cross-entropy op consumes the raw logits (`output`), not `prediction`.
# The name 'loss' is attached to the reduced scalar that `loss` refers to,
# rather than to the inner per-element cross-entropy op.
loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=output),
    name='loss',
)
print(prediction)
print(loss)

### Training Step

In [None]:
# Plain gradient descent on the cross-entropy loss.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train = optimizer.minimize(loss, name='train')
print(train)

### Initializing Variables

In [None]:
# Build the op that initializes all graph variables (W and b here) and run it once
# in the shared session before any training step.
init = tf.global_variables_initializer()
sess.run(init)

### Training the Model

In [None]:
# Train one message at a time (stochastic gradient descent with batch size 1).
#
# Use transform(), not fit_transform(): the vocabulary was already fitted on the
# full corpus, and embeddingSize / embeddingMx / W were sized from that fit.
# Fitting again on the training subset can change the word -> id mapping and
# so desynchronize the ids from the model built above.
for (i, x_item) in enumerate(vocabProcessor.transform(trainData['Content'])):
    y_item = trainData.iloc[i]['Label']
    feedDict = {X: x_item, y: [[y_item]]}
    sess.run(train, feed_dict=feedDict)

    # Every 200 messages, report the loss on the message just trained on.
    if (i + 1) % 200 == 0:
        lossRes = sess.run(loss, feed_dict=feedDict)
        print('#{0} - loss: {1}'.format(i, lossRes))

### Measuring Test Set Accuracy

In [None]:
# One boolean per test message: True when the rounded prediction equals the label.
testAccuracy = []

# Use transform(), not fit_transform(): re-fitting the vocabulary on the test
# set can assign different ids to words than the ones the model was trained
# with, which would make the measured accuracy meaningless.
for (i, x_item) in enumerate(vocabProcessor.transform(testData['Content'])):
    y_item = testData.iloc[i]['Label']
    feedDict = {X: x_item, y: [[y_item]]}
    predictionRes = sess.run(prediction, feed_dict=feedDict)
    # `prediction` has shape [1, 1]; take the scalar so a plain boolean is stored.
    predictedLabel = np.round(predictionRes[0, 0])
    testAccuracy.append(predictedLabel == y_item)

    # Running accuracy every 50 messages.
    if (i + 1) % 50 == 0:
        print('#{0} - accuracy: {1}'.format(i, np.mean(testAccuracy)))

print('Overall Test Accuracy: {0}'.format(np.mean(testAccuracy)))