# Text Classification
*Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. Please check the pdf file for more details.*

In this exercise you will:
    
- implement a spam classifier with the **Naive Bayes method** for real-world email messages
- learn the **training and testing phases** of a Naive Bayes classifier  
- get an idea of the **precision-recall** tradeoff

In [1]:
# some basic imports
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
# Occurrence count of every vocabulary word across the ham training emails.
ham_train = np.loadtxt('ham_train.csv', delimiter=',')
# Occurrence count of every vocabulary word across the spam training emails.
spam_train = np.loadtxt('spam_train.csv', delimiter=',')
# N is the vocabulary size (length of the count vectors).
N = ham_train.shape[0]
# The training set consists of 9034 ham emails and 3372 spam emails.
num_ham_train, num_spam_train = 9034, 3372
# Add-one smoothing so no word ends up with a zero count in either class.
# Row 0 holds the ham counts, row 1 the spam counts.
x = 1 + np.vstack((ham_train, spam_train))

# Each line of the *_test.txt files is a (row, column, count) triplet with
# 1-based row/column indices: row = email number, column = word id.

# ham_test holds the occurrences of each word in each ham test email as a
# sparse P-by-N matrix, where P is the number of ham test emails.
i, j, ham_test = np.loadtxt('ham_test.txt').T
# Use the builtin int: the np.int alias was removed in NumPy 1.24.
i = i.astype(int)
j = j.astype(int)
# Passing the shape up front pads the columns to the full vocabulary width,
# avoiding a slow slice assignment into an empty CSR matrix.
ham_test = scipy.sparse.csr_matrix(
    (ham_test, (i - 1, j - 1)), shape=(i.max(), ham_train.shape[0])
)

# spam_test holds the occurrences of each word in each spam test email as a
# sparse Q-by-N matrix, where Q is the number of spam test emails.
i, j, spam_test = np.loadtxt('spam_test.txt').T
i = i.astype(int)
j = j.astype(int)
spam_test = scipy.sparse.csr_matrix(
    (spam_test, (i - 1, j - 1)), shape=(i.max(), spam_train.shape[0])
)




In [3]:
# Build the id -> word lookup from the vocabulary file, whose lines have the
# form "<word>\t<id>".
word_dict = {}
with open('all_word_map.txt', 'r') as file_to_read:
    for line in file_to_read:
        # Skip blank lines (e.g. a trailing empty line) instead of failing
        # on the tuple unpacking below.
        if not line.strip():
            continue
        # Split on the last tab only, so a word containing a tab still parses.
        word, num = line.rsplit('\t', 1)
        word_dict[int(num)] = word

In [4]:
# Sanity check: show the word stored under id 2 in the vocabulary map.
print(word_dict[2])

nordisk


In [5]:
# Ratio of smoothed spam count to smoothed ham count for every word.
spam_ham_ratio = x[1] / x[0]
K = 10
# Ascending argsort, reversed and truncated: indices of the K largest ratios,
# largest first.
top_k_idx = np.argsort(spam_ham_ratio)[::-1][:K]
print(top_k_idx)

# The vocabulary map is looked up with the column index shifted by one.
for idx in top_k_idx:
    ham_count, spam_count = x[0, idx], x[1, idx]
    print(ham_count, spam_count, word_dict[idx + 1])

[30032 75525 38175 45152  9493 65397 37567 13612 56929  9452]
1.0 386.0 nbsp
1.0 364.0 viagra
1.0 321.0 pills
1.0 247.0 cialis
1.0 244.0 voip
1.0 224.0 php
1.0 196.0 meds
1.0 190.0 computron
1.0 179.0 sex
1.0 151.0 ooking


In [33]:
# Sanity check: shape of the first row of ham_test (one test email).
print(ham_test[0].shape)

(1, 77386)


## Now let's implement a ham/spam email classifier. Please refer to the PDF file for details

In [99]:
from likelihood import likelihood
# TODO
# Implement a ham/spam email classifier, and calculate the accuracy of your classifier

# Hint: you can directly do matrix multiply between scipy.sparse.coo_matrix and numpy.array.
# Specifically, you can use sparse_matrix * np_array to do this. Note that when you use "*" operator
# between numpy array, this is typically an elementwise multiply.

# Element-wise log of whatever likelihood(x) returns; likelihood is imported
# from likelihood.py and is not shown here. classify() below uses l[0] as the
# ham vector and l[1] as the spam vector.
# NOTE(review): presumably a 2-by-N array of per-class word probabilities - confirm.
l = np.log(likelihood(x))

def classify(X):
    """Label a single email as ham (0) or spam (1).

    X is the word-count row of one email (the notebook passes one row of a
    scipy.sparse matrix). For each class the score is the sum of ``X * l[c]``
    plus the log of the class prior, where the prior is that class's share of
    the training emails. Ham wins only when its score is strictly larger;
    ties go to spam.
    """
    total_train = num_ham_train + num_spam_train
    log_priors = (
        np.log(num_ham_train / total_train),
        np.log(num_spam_train / total_train),
    )
    ham_score, spam_score = (
        np.sum(X * l[label]) + log_priors[label] for label in (0, 1)
    )
    return 0 if ham_score > spam_score else 1
# begin answer
# end answer

In [103]:
# Confusion matrix: row = true class, column = predicted class (0 = ham, 1 = spam).
result = np.zeros((2, 2))
print(spam_test.shape, ham_test.shape)
spam_test_num, ham_test_num = spam_test.shape[0], ham_test.shape[0]
# Classify every ham test email, then every spam test email, one row at a time.
for true_label, emails, count in ((0, ham_test, ham_test_num),
                                  (1, spam_test, spam_test_num)):
    for i in range(count):
        result[true_label, classify(emails[i])] += 1
print(result)
# Accuracy: correctly classified emails (the diagonal) over all test emails.
print(np.trace(result) / result.sum())
# Spam is treated as the positive class.
(TN, FP), (FN, TP) = result
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print(precision, recall)

(1124, 77386) (3011, 77386)
[[2983.   28.]
 [  31. 1093.]]
0.9857315598548972
0.9750223015165032 0.9724199288256228
