# Text Classification
*Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. Please check the pdf file for more details.*

In this exercise you will:
    
- implement a spam classifier using the **Naive Bayes method** for real-world email messages
- learn the **training and testing phases** of a Naive Bayes classifier
- get an idea of the **precision-recall** tradeoff

In [1]:
# some basic imports
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [67]:
# ham_train contains the occurrences of each word in ham emails. 1-by-N vector
ham_train = np.loadtxt('ham_train.csv', delimiter=',')
# spam_train contains the occurrences of each word in spam emails. 1-by-N vector
spam_train = np.loadtxt('spam_train.csv', delimiter=',')
# N is the size of the vocabulary.
N = ham_train.shape[0]
# There are 9034 ham emails and 3372 spam emails in the training samples.
num_ham_train = 9034
num_spam_train = 3372
# Smoothing: add 1 to every word count. Row 0 of x is ham, row 1 is spam.
x = np.vstack([ham_train, spam_train]) + 1

# ham_test contains the occurrences of each word in each ham test email.
# P-by-N matrix, where P is the number of ham test emails.
# Each line of ham_test.txt is unpacked as (row index, column index, count);
# 1 is subtracted from both indices below to make them 0-based.
i, j, ham_counts = np.loadtxt('ham_test.txt').T
# Use the builtin int: the np.int alias no longer exists in current NumPy.
i = i.astype(int)
j = j.astype(int)
ham_test_tight = scipy.sparse.coo_matrix((ham_counts, (i - 1, j - 1)))
# Build the full-width matrix directly with an explicit shape instead of
# slice-assigning into an empty CSR matrix, which is slow and emits a
# SparseEfficiencyWarning.
ham_test = scipy.sparse.csr_matrix(
    (ham_counts, (i - 1, j - 1)),
    shape=(ham_test_tight.shape[0], ham_train.shape[0]),
)

# spam_test contains the occurrences of each word in each spam test email.
# Q-by-N matrix, where Q is the number of spam test emails.
i, j, spam_counts = np.loadtxt('spam_test.txt').T
i = i.astype(int)
j = j.astype(int)
spam_test_tight = scipy.sparse.csr_matrix((spam_counts, (i - 1, j - 1)))
spam_test = scipy.sparse.csr_matrix(
    (spam_counts, (i - 1, j - 1)),
    shape=(spam_test_tight.shape[0], spam_train.shape[0]),
)




In [45]:
# Inspect the row/column index arrays currently bound to i and j.
print(i, j)

[   1    1    1 ... 1124 1124 1124] [ 2261  2509  2557 ... 75923 76519 77219]


In [25]:
# Build a lookup from word id to word. Each non-blank line of
# all_word_map.txt is split on a tab into (word, id).
all_word_map = {}
with open("all_word_map.txt", "r") as all_word_map_file:
    for line in all_word_map_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        line = line.split('\t')
        all_word_map[int(line[1])] = line[0]

letters = ham_train.shape[0]
# One tuple per word: (1-based word id, smoothed spam count, smoothed ham count).
# Sorted by the spam-to-ham count ratio, largest first, so the head of the
# list holds the words most indicative of spam. The +1 smoothing applied to
# x keeps the ham count non-zero as long as the raw counts are non-negative.
ham_train_sorted = sorted(
    [
        (word_id, int(spam_count), int(ham_count))
        for word_id, (ham_count, spam_count) in enumerate(zip(x[0], x[1]), start=1)
    ],
    key=lambda entry: entry[1] / entry[2],
    reverse=True,
)
for ch in ham_train_sorted[:10]:
    print(ch, all_word_map[ch[0]])

(30033, 386, 1) nbsp
(75526, 364, 1) viagra
(38176, 321, 1) pills
(45153, 247, 1) cialis
(9494, 244, 1) voip
(65398, 224, 1) php
(37568, 196, 1) meds
(13613, 190, 1) computron
(56930, 179, 1) sex
(9453, 151, 1) ooking


In [91]:
# Sanity check: print the sum of the first row of l.
# NOTE: l is assigned by `l = likelihood(x)` in the classifier cell further
# down, so that cell has to be executed before this one.
print(np.sum(l[0]))

1.0


## Now let's implement a ham/spam email classifier. Please refer to the PDF file for details

In [96]:
import math

from likelihood import likelihood

# Implement a ham/spam email classifier, and calculate the accuracy of your classifier

# Hint: you can directly do matrix multiply between scipy.sparse.coo_matrix and numpy.array.
# With sparse *matrix* classes "sparse_matrix * np_array" is a matrix product, whereas "*"
# between numpy arrays is an elementwise multiply. The "@" operator used below is a
# matrix product in both cases, which avoids that ambiguity.
num_ham_train = 9034
num_spam_train = 3372

# begin answer
# l is indexed below as l[0] for ham and l[1] for spam, matching the row
# order of x. NOTE(review): presumably each row holds per-word likelihoods
# P(word | class) -- confirm against likelihood.py.
l = likelihood(x)
logl = np.log(l)

# Class priors estimated from the training-set class sizes.
num_train = num_ham_train + num_spam_train
p_ham = num_ham_train / num_train
p_spam = num_spam_train / num_train
log_p_ham = math.log(p_ham)
log_p_spam = math.log(p_spam)

# Log-score of every test email under each class: word counts times
# per-word log-likelihoods, plus the log prior of that class.
ham_try_ham = ham_test @ logl[0] + log_p_ham
ham_try_spam = ham_test @ logl[1] + log_p_spam
spam_try_ham = spam_test @ logl[0] + log_p_ham
spam_try_spam = spam_test @ logl[1] + log_p_spam

total = ham_test.shape[0] + spam_test.shape[0]

# Spam is the positive class; a tie between the two scores is classified as ham.
TN = np.sum(ham_try_ham >= ham_try_spam)
TP = np.sum(spam_try_ham < spam_try_spam)
FN = np.sum(spam_try_ham >= spam_try_spam)
FP = np.sum(ham_try_ham < ham_try_spam)

print("accuracy:", (TP + TN) / total)
print("precision:", TP / (TP + FP))
print("recall:", TP / (TP + FN))

# end answer

accuracy: 0.9857315598548972
precision: 0.9750223015165032
recall: 0.9724199288256228
