# Spam Classification

In [4]:
import numpy as np

In [5]:
data = np.loadtxt('./spambase/spambase.data', delimiter=',')
print("Data layout: ", data.shape)
print("A single sample: ", data[0])

Data layout:  (4601, 58)
A single sample:  [  0.      0.64    0.64    0.      0.32    0.      0.      0.      0.
   0.      0.      0.64    0.      0.      0.      0.32    0.      1.29
   1.93    0.      0.96    0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.778   0.      0.
   3.756  61.    278.      1.   ]


We will split X and Y labels 

Here label 1 : Spam
           0 : Not Spam

In [6]:
X = data[:,0:-1]
Y = data[:,-1]

In [7]:
#Split Train data into 10- folds
from sklearn.model_selection import KFold
no_of_folds = 10
kf = KFold(n_splits=no_of_folds, shuffle = False) #Splits data into 10 folds and do not shuffle data before splitting

    

In [19]:
#Create Multinomial Naive Bayes model

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB(alpha=1, fit_prior = True) #alpha : laplace smoothing parameter to handle unseen test data, fit_prior to learn class prior probabilities
i = 1
result = []
running_sum_accuracy = 0
running_sum_fp = 0
running_sum_fn = 0
running_sum_not_spam_count = 0
running_sum_spam_count = 0

for train_index, test_index in kf.split(X): #train_index and test_index contains list of indices pointing to data in X
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    
    model.fit(X_train, y_train) #Learn the conditional probabilties P(word|class) from the train data
    predicted_labels = model.predict(X_test) #Predicts probability of each instance belonging to class Spam or Not Spam
    FP = np.sum(np.logical_and(predicted_labels == 1, y_test == 0)) #False Positiveis number of emails incorrectly classifies as Spam
    FN = np.sum(np.logical_and(predicted_labels == 0, y_test == 1)) #False Negative is number of emails incorrectly clsssified as Not Spam
    OER = (FP + FN)/len(y_test) #Overall Error Rate(OER) : %age of total examples that were misclassified
    result.append([i,FP,FN,OER])
    
    #For Averaging results
    running_sum_accuracy += OER 
    running_sum_fp += FP
    running_sum_fn += FN
    running_sum_not_spam_count += np.sum(y_test == 0)
    running_sum_spam_count += np.sum(y_test == 1)
    
    i+=1
result.append(["Average Rate", running_sum_fp/running_sum_not_spam_count,running_sum_fn/running_sum_spam_count, running_sum_accuracy/no_of_folds])




In [20]:
#Printing the results
import pandas as pd
df = pd.DataFrame(result, columns = ["Fold","False Positive", "False Negative", "Overall Error Rate"])
df.set_index('Fold', inplace=True)
df

Unnamed: 0_level_0,False Positive,False Negative,Overall Error Rate
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,160.0,0.347072
2,0.0,108.0,0.234783
3,0.0,126.0,0.273913
4,5.0,141.0,0.317391
5,64.0,0.0,0.13913
6,58.0,0.0,0.126087
7,57.0,0.0,0.123913
8,85.0,0.0,0.184783
9,57.0,0.0,0.123913
10,168.0,0.0,0.365217
