In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
df = pd.read_csv("data/SpamDataPruned.csv", delimiter=",", header=None, dtype="float32")

In [3]:
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9266,9267,9268,9269,9270,9271,9272,9273,9274,9275
2794,0,3,0,3,1,3,2,0,7,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2795,0,0,0,2,0,10,7,0,4,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2796,0,5,0,0,1,0,0,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2797,0,0,0,1,0,9,3,4,4,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2798,0,2,0,3,0,10,12,3,3,1,...,,,,,,,,,,


### rows are documents and columns are word frequencies. target is 1 (spam) or 0 (ham)
### and we want to train a model to test if a document is spam or ham.

In [4]:
#Remove last row
df = df[0:2798]
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9266,9267,9268,9269,9270,9271,9272,9273,9274,9275
2793,0,3,0,0,0,5,3,2,1,0,...,0,0,0,0,0,0,0,0,0,1
2794,0,3,0,3,1,3,2,0,7,1,...,0,0,0,0,0,0,0,0,0,0
2795,0,0,0,2,0,10,7,0,4,1,...,0,0,0,0,0,0,0,0,0,1
2796,0,5,0,0,1,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2797,0,0,0,1,0,9,3,4,4,1,...,0,0,0,0,0,0,0,0,0,1


In [5]:
# create train (90%) and test set (10%)
np.random.seed(0)
np.random.shuffle(df)
train_num = int(df.shape[0] * 0.9)
X_train = df.iloc[:train_num,:-1]
Y_train = df.iloc[:train_num,-1]
X_test  = df.iloc[train_num:,:-1]
Y_test  = df.iloc[train_num:,-1]

In [6]:
X_train[Y_train == 1].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9265,9266,9267,9268,9269,9270,9271,9272,9273,9274
0,0,0,0,0,0,0,0,0,2,2,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,2,2,0,0,2,2,2,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,2,2,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,2,2,...,0,0,0,0,0,0,0,0,0,0


In [7]:
freqs = pd.DataFrame()
freqs["Spam"] = X_train[Y_train == 1].sum()
freqs.head()

Unnamed: 0,Spam
0,22
1,787
2,22
3,750
4,750


In [8]:
freqs["Ham"] = X_train[Y_train == 0].sum()
freqs

Unnamed: 0,Spam,Ham
0,22,8
1,787,6301
2,22,8
3,750,493
4,750,493
5,29,1597
6,29,1597
7,750,493
8,2824,1708
9,2824,1708


In [9]:
freqs["Ham"].head()

0       8
1    6301
2       8
3     493
4     493
Name: Ham, dtype: float32

In [10]:
probs = pd.DataFrame()

## Add One Smoothing
probs["Spam"] = (freqs["Spam"] + 1)/(sum(freqs["Spam"]) + len(freqs))
probs["Ham"] = (freqs["Ham"] + 1)/(sum(freqs["Ham"]) + len(freqs))
probs.head()

Unnamed: 0,Spam,Ham
0,1.3e-05,4e-06
1,0.00044,0.002858
2,1.3e-05,4e-06
3,0.000419,0.000224
4,0.000419,0.000224


In [11]:
P_s = 1.0*len(X_train[Y_train == 1])/len(X_train)
P_h = 1.0*len(X_train[Y_train == 0])/len(X_train)
print P_s, P_h

0.498411437649 0.501588562351


In [12]:
len(X_test.iloc[0])

9275

In [25]:
preds = np.ndarray(shape=(len(X_test),))
## 3s per loop...
for i in range(len(X_test)):
    #Spam probability
    #c_map = argmax_c[ log(P(c)) + sum of log(P(t | c)) ]
    c_spam = np.log(P_s) + X_test.iloc[i].dot(np.log(probs["Spam"]))
    c_ham = np.log(P_h) + X_test.iloc[i].dot(np.log(probs["Ham"]))
        
#     print c_spam, c_ham
    c = 1 if c_spam > c_ham else 0
    preds[i] = c

In [40]:
c_matrix = np.ndarray(shape=(2,2))
c_matrix[0,0] = sum((Y_test==0) & (preds==0))  #actual = 0, pred = 0 
c_matrix[0,1] = sum((Y_test==0) & (preds==1))  #actual = 0, pred = 1
c_matrix[1,0] = sum((Y_test==1) & (preds==0))  
c_matrix[1,1] = sum((Y_test==1) & (preds==1))  
print c_matrix

[[ 117.   17.]
 [   0.  146.]]
