In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
VOCAB_SIZE=2500

TOKEN_SPAM_PROB_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/03_Testing/prob-all-tokens.txt'

TEST_FEATURE_MATRIX = 'SpamData/03_Testing/test-features.txt'
TEST_TARGET_MATRIX = 'SpamData/03_Testing/test-target.txt'

## Load the Data

In [6]:
X_test = np.loadtxt(TEST_FEATURE_MATRIX, delimiter=' ')
y_test = np.loadtxt(TEST_TARGET_MATRIX, delimiter=' ')
prob_token_spam = np.loadtxt(TOKEN_SPAM_PROB_FILE, delimiter=' ')
prob_token_ham = np.loadtxt(TOKEN_HAM_PROB_FILE, delimiter=' ')
prob_all_tokens = np.loadtxt(TOKEN_ALL_PROB_FILE, delimiter=' ')

In [10]:
X_test.shape

(1724, 2500)

In [13]:
prob_token_spam.shape



(2500,)

In [15]:
X_test.dot(prob_token_spam).shape

(1724,)

## Set the Prior

$$P(Spam \, | \, X) = \frac{P(X \, | \, Spam) \, P(Spam)} {P(X)}$$

In [17]:
PROB_SPAM = 0.3116

In [18]:
np.log(prob_token_spam)

array([ -4.40750247,  -5.25357728,  -4.98997971, ..., -11.40097663,
       -12.09412381,  -8.83602727])

## Joint Probability in log format

In [19]:
joint_log_spam = X_test.dot(np.log(prob_token_spam)- np.log(prob_all_tokens)+np.log(PROB_SPAM))

In [20]:
joint_log_spam[:5]

array([-184.44445007,  -43.31536421,  -50.53737572, -224.78751009,
       -176.55006415])

$$P(Ham \, | \, X) = \frac{P(X \, | \, Ham) \,(1 - P(Spam))} {P(X)}$$

In [24]:
joint_log_ham = X_test.dot(np.log(prob_token_ham)- np.log(prob_all_tokens)+np.log(1-PROB_SPAM))

In [26]:
joint_log_ham.size

1724

## Making Predictions
### Checking for higher joint probability
$$(Spam \, | \, X) \, > \, P(Ham \, | \, X)$$
<br>
<center> OR </center>
<br>
$$(Ham \, | \, X) \, > \, P(Spam \, | \, X)$$

In [27]:
prediction = joint_log_spam>joint_log_ham

In [29]:
prediction[-5:]*1

array([0, 0, 0, 0, 0])

In [30]:
joint_log_spam = X_test.dot(np.log(prob_token_spam)) + np.log(PROB_SPAM)
joint_log_ham = X_test.dot(np.log(prob_token_ham)) + np.log(1-PROB_SPAM)

## Metrics and Evaluation
### Accuracy

In [31]:
correct_doc = (y_test == prediction).sum()

In [33]:
correct_doc/y_test.size

0.7708816705336426

In [34]:
correct_doc

1329