# Text Classification
*Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. Please check the pdf file for more details.*

In this exercise you will:
    
- implement a of spam classifier with **Naive Bayes method** for real world email messages
- learn the **training and testing phase** for Naive Bayes classifier  
- get an idea of the **precision-recall** tradeoff

In [1]:
# some basic imports
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
# ham_train contains the occurrences of each word in ham emails. 1-by-N vector
ham_train = np.loadtxt('ham_train.csv', delimiter=',')
# spam_train contains the occurrences of each word in spam emails. 1-by-N vector
spam_train = np.loadtxt('spam_train.csv', delimiter=',')
# N is the size of vocabulary.
N = ham_train.shape[0]
# There 9034 ham emails and 3372 spam emails in the training samples
num_ham_train = 9034
num_spam_train = 3372
# Do smoothing
x = np.vstack([ham_train, spam_train]) + 1

# ham_test contains the occurences of each word in each ham test email. P-by-N vector, with P is number of ham test emails.
i,j,ham_test = np.loadtxt('ham_test.txt').T
i = i.astype(np.int)
j = j.astype(np.int)
ham_test_tight = scipy.sparse.coo_matrix((ham_test, (i - 1, j - 1)))
ham_test = scipy.sparse.csr_matrix((ham_test_tight.shape[0], ham_train.shape[0]))
ham_test[:, 0:ham_test_tight.shape[1]] = ham_test_tight
# spam_test contains the occurences of each word in each spam test email. Q-by-N vector, with Q is number of spam test emails.
i,j,spam_test = np.loadtxt('spam_test.txt').T
i = i.astype(np.int)
j = j.astype(np.int)
spam_test_tight = scipy.sparse.csr_matrix((spam_test, (i - 1, j - 1)))
spam_test = scipy.sparse.csr_matrix((spam_test_tight.shape[0], spam_train.shape[0]))
spam_test[:, 0:spam_test_tight.shape[1]] = spam_test_tight




In [3]:
word_dict = {}
with open('all_word_map.txt', 'r') as file_to_read:
    while(True):
        line = file_to_read.readline()
        if (not line):
            break
        word,num = line.split('\t')
        word_dict[int(num)] = word

In [4]:
print(word_dict[2])

nordisk


In [5]:
spam_ham_ratio = x[1] / x[0]
K = 10
top_k_idx = spam_ham_ratio.argsort()[::-1][0:K]
print(top_k_idx)

for idx in top_k_idx:
    print(x[0][idx], x[1][idx], word_dict[idx + 1])

[30032 75525 38175 45152  9493 65397 37567 13612 56929  9452]
1.0 386.0 nbsp
1.0 364.0 viagra
1.0 321.0 pills
1.0 247.0 cialis
1.0 244.0 voip
1.0 224.0 php
1.0 196.0 meds
1.0 190.0 computron
1.0 179.0 sex
1.0 151.0 ooking


In [33]:
print(ham_test[0].shape)

(1, 77386)


## Now let's implement a ham/spam email classifier. Please refer to the PDF file for details

In [6]:
from likelihood import likelihood
# TODO
# Implement a ham/spam email classifier, and calculate the accuracy of your classifier

# Hint: you can directly do matrix multiply between scipy.sparse.coo_matrix and numpy.array.
# Specifically, you can use sparse_matrix * np_array to do this. Note that when you use "*" operator
# between numpy array, this is typically an elementwise multiply.

l = np.log(likelihood(x))

#return 0:ham/1:spam
def classify(X):
    t0 = l[0]
    t1 = l[1]
    S0 = np.sum(X * t0) + np.log(num_ham_train / (num_ham_train + num_spam_train))
    S1 = np.sum(X * t1) + np.log(num_spam_train / (num_ham_train + num_spam_train))
    print (S0, S1)
    if (S0 > S1):
        return 0
    else:
        return 1
# begin answer
# end answer

In [9]:
print (np.sum(likelihood(x)[0]))

1.0


In [7]:
classify(spam_test[0])


result = np.zeros((2,2))
print(spam_test.shape, ham_test.shape)
spam_test_num = spam_test.shape[0]
ham_test_num = ham_test.shape[0]
for i in range(ham_test_num):
    #print(ham_test[i])
    result[0][classify(ham_test[i])] += 1
for i in range(spam_test_num):
    result[1][classify(spam_test[i])] += 1
print(result)
print((result[0][0] + result[1][1]) / (sum(sum(result))))
TP = result[1][1]
TN = result[0][0]
FP = result[0][1]
FN = result[1][0]
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print(precision, recall)

-790.9091661641785 -716.1462918164125
(1124, 77386) (3011, 77386)
-1186.0320537649488 -1435.4708587402774
-1727.6858744391216 -1985.1202363204957
-5095.571192237475 -5999.969574631802
-312.7398550674935 -364.72815424727304
-1749.7938059431415 -2065.914530293185
-124.91163769426299 -127.75349385926224
-1773.1146424554722 -1991.699119548766
-2189.5605167103995 -2538.695182253603
-2310.3162542643454 -2410.1256588284054
-190.94332069685854 -201.3473156907832
-1281.6498573691536 -1624.3056501331669
-108.44917144251485 -132.64251780505563
-1037.6609040868482 -1216.7079282586767
-1635.7294641254534 -1759.9910463848755
-485.87350703091164 -643.2845706539314
-116.61419594099056 -185.81137274695857
-3640.528909725178 -3871.8969408337525
-664.7330091262922 -800.6087463656147
-43.69379812583852 -45.27780389933061
-84.3308279641252 -119.79413160039182
-356.15938796464803 -393.9132322094333
-1940.5896413075463 -2134.5224093397765
-2371.6859906647046 -2581.7532048078547
-215.34135502843512 -251.10490

-165.8977948878198 -191.09949019450357
-168.86547108991633 -209.03470283759864
-1128.1515404481938 -1228.6767578700192
-266.9799185925149 -280.2569117890165
-970.3304024001476 -1058.5823699498353
-948.9119069669772 -1115.3441084328595
-731.2105525857683 -873.863051508214
-76.19603519745824 -96.5435836540435
-240.75493716003623 -323.1048790385666
-646.1338739086304 -722.1441943466477
-62.91637601295039 -94.52589693250395
-1895.396218187979 -1938.7673487446853
-85.67934693496376 -100.38344838402108
-824.4735758688469 -926.2446533346266
-251.66646360346422 -345.87660998512666
-249.6204797765156 -310.90057801244785
-2052.5490784580147 -2349.885973648022
-1228.3330974888274 -1778.3953583888203
-753.5971829143086 -822.8244678212595
-75.44925833100629 -125.34235599492824
-327.1768280984185 -389.51823477978985
-359.1118001807894 -394.17545567260527
-3014.8590208762207 -3368.7517130198653
-210.69300057341616 -244.3741329071484
-2955.9146654122146 -3167.1497898147445
-3030.2340157743592 -3701.32

-885.7345067535332 -966.0318494024233
-340.3523906527043 -396.44699297550176
-1030.7199637543229 -1077.0141688077165
-281.1665395743168 -333.7757192411476
-552.275690308179 -711.2599992212378
-175.2849252113529 -239.50576243795297
-3610.4927718150143 -4241.524488860255
-200.5906496504198 -224.7679262507095
-520.1590216499333 -599.0519354648454
-265.41670460349417 -312.40600314761804
-338.82975848885934 -437.94499497567887
-2180.6271009994884 -2600.460855903018
-74090.10287473288 -86002.80066414304
-441.79151908512904 -498.48260862896586
-238.22269713630038 -328.4265742614688
-135.2174027053606 -167.62941350533512
-2246.011513745837 -2561.5401839495153
-624.9210547003075 -817.3000271894265
-144.2133072158821 -180.09364876889762
-647.56206368759 -726.2424619969845
-1042.9066304918235 -1176.68063279244
-84.8088987907245 -121.19421928364405
-2056.787989373277 -2248.3616070442
-271.8088618513044 -311.6085739478754
-881.9976998766302 -940.2751562224448
-211.92104998687608 -264.0920115369197


-366.9343077810425 -421.93150291266653
-130.716361320131 -137.4293034685595
-1440.5689875415333 -1854.2780561292498
-84.42169634040756 -119.53071414993967
-183.87693736530403 -208.7258408557374
-1993.370540430069 -2196.997137396725
-748.5870118792235 -831.7066414922091
-75.05511228677855 -96.77650054398389
-7257.51354534569 -7867.166162973027
-611.3744207065297 -871.0827945167033
-441.310148740542 -469.23963539209944
-1442.099889937904 -1627.4253727369592
-208.83759351711 -251.91976213292003
-1052.2049970990695 -1158.8763906385793
-212.56742674728727 -259.5679576386349
-2042.0487605061496 -2381.76672936319
-458.0910876204558 -486.8747376036156
-86.04428014026745 -120.39571158742628
-10.872448495172746 -10.964399502058523
-1636.2604490406932 -2126.2582492898023
-1284.3678997840716 -1327.2111728082896
-671.6593196797394 -760.2580341919625
-159.0029684762583 -193.75283422828963
-1339.5969532348984 -1470.4542512280702
-151.1852634172655 -172.02115747844366
-2509.5716871504437 -2935.9294008

-2354.119502755091 -2533.8482615490675
-1917.4901409362117 -2048.277340112861
-2310.746469724918 -2891.681005810714
-1606.0463702906438 -2071.639900120653
-534.3498158659903 -616.1273352124321
-2024.7174506082574 -2132.049276150799
-226.39417481579608 -257.6818067669313
-157.88742953885313 -179.54902633316007
-977.9562099779838 -1058.6179542233822
-84.3308279641252 -119.79413160039182
-752.9631830682433 -827.1383827480116
-2449.100458266116 -2764.8870521196523
-8212.957530466985 -8915.469855022284
-295.7114871612678 -303.85341503762737
-210.74589702613324 -241.27116188839207
-1175.8984842159691 -1244.075186981567
-447.49139585883387 -508.1098785187518
-267.6673010414849 -305.90435502237256
-893.7549409247778 -1105.4776541911701
-2145.7147126532386 -2234.899215847613
-2538.8722192052796 -2732.28775138316
-71.18230413151072 -122.32939931653372
-840.6718172766349 -961.6445087362105
-1056.0006781335805 -1124.6506923913255
-1176.1059668476514 -1211.865031326313
-84.10183834224758 -120.78875

-75.66046653177122 -124.92114252985193
-139.08157114819738 -156.43644415116512
-5811.006977106851 -6280.14959565667
-603.1130230270002 -630.3199851952508
-1821.2128575216204 -2022.202005849199
-81.85950895796518 -114.66202678824492
-290.5914190684358 -330.7390901400768
-1861.6628162580384 -2086.7118912312676
-388.60674899106084 -448.39652205443224
-785.7471949256238 -832.057132860897
-542.7456953900651 -678.6272513016327
-527.4822921415595 -621.6679078853126
-1813.4038653068446 -2103.745527505684
-971.7945703351851 -1122.9715004104655
-5332.431237787177 -5949.207622254559
-1740.4051256371492 -2119.3309054186666
-5295.239046799991 -5772.955783610938
-247.0781857735658 -295.4103265757183
-332.02258861932694 -407.1102424651718
-320.72111480463724 -406.8241445773284
-313.9497122733254 -359.8991675286285
-1552.6148113061138 -2012.4223426645826
-2397.0241991758744 -3000.7241675291198
-2622.921391344846 -3122.616267579707
-97.83069172223524 -105.77366909911142
-1271.4501710758555 -1659.709579

-534.8464699177945 -464.52251449447357
-745.0070168541063 -721.2640006300088
-770.8912698432451 -736.1207867599503
-202.07373296585098 -176.069979418548
-854.7633578852402 -713.5538834101293
-909.0381459413961 -828.9777634251717
-839.5684977029549 -752.4832754582355
-1857.490840425825 -1698.5959879274253
-429.7792073287978 -410.886738244831
-98.18625985985705 -82.38779789437514
-3575.4273226950613 -3407.4824351851316
-323.5343306488674 -287.06406758749074
-1931.131120625156 -1841.1200040099632
-1462.2500030803822 -1319.6514364999225
-865.5080003780976 -760.4191257867449
-934.2332648089514 -830.671138612881
-665.7258940023239 -650.0035676330075
-1344.3834667430758 -1202.3911958274543
-4653.891928804438 -4534.22666642769
-1144.7588654547778 -929.6184808492619
-594.4424039452654 -555.9008989433402
-10414.042261155975 -9257.519693826212
-3970.384466417126 -4015.686769439712
-642.681207591294 -550.5181679800959
-2242.2520801553137 -2125.1472648785602
-686.7972302523017 -586.1794297348836
-5

-832.8759498840692 -802.2013415467225
-446.7267415031952 -382.16066404941864
-331.1068548255544 -296.8865774071777
-551.7927826018507 -520.7269263117295
-11668.436013444873 -10919.066774559655
-323.23248175316763 -304.5220547474087
-1297.169819803732 -1256.6893434567673
-2518.4824543028294 -2401.9781681012687
-1201.125186087136 -1037.3199598052877
-441.039115304113 -375.26287625447117
-832.2117666999757 -736.6286017305755
-565.0683796667082 -499.80567417068613
-103.51708610547664 -93.34346149835598
-1195.7113001650866 -1039.1109566322261
-8968.594015792503 -8045.177272775261
-1091.0323868851203 -1069.590490917925
-1087.7178061993832 -819.9177270029342
-1097.834809674905 -975.323710635922
-293.65547332074175 -281.10709247514467
-938.0925917189503 -884.1657551680855
-917.1992576291226 -810.3671685492241
-230.66603056714112 -215.5220606672728
-591.3708911451379 -544.853966014583
-1007.0289640007392 -953.0098573995303
-2535.9088965330884 -2249.219921816213
-18635.167127425237 -18091.670992