In [1]:
import numpy as np
import pandas as pd
import re
#read data for spam.csv
data_kaggle = pd.read_csv('spam.csv', header=0, encoding='latin1')

#check the column and create the 'spam' column
print(data_kaggle.columns)
data_kaggle['spam']=(data_kaggle['v1']=='spam').astype('int32')

#drop the unwanted column and drop the NA column
data_kaggle = data_kaggle.drop(['v1','Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],1)
data_kaggle.columns = ['sms','spam']
data_kaggle.dropna(how='any')

#check out the data and data's distribution.
len(data_kaggle)
data_kaggle['spam'].value_counts(normalize=True)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


0    0.865937
1    0.134063
Name: spam, dtype: float64

In [2]:
# make the train data and test data
from sklearn.model_selection import train_test_split
X=data_kaggle['sms']
Y=data_kaggle['spam']

X_train, X_test,y_train,y_test=train_test_split(X,Y,test_size=0.33)
# Because I define spam value is '1' in 'spam' column, define SPAM is '1'
SPAM = 1

#create the class consisted with body(=email content) and label(='spam' or 'ham')
class Dataset:
    def __init__(self,X,Y):
        self.body=X
        self.label=Y

#make the train_list data
trainData=[]
for index in range(len(X_train)-1):
    try:
        data_ex=Dataset(X_train[index], y_train[index])
        trainData.append(data_ex)
    except:
        continue


In [3]:
# create variables for processEmail function
trainPositive=pd.DataFrame()
trainNegative=pd.DataFrame()
positiveTotal = 0
negativeTotal = 0

#train function return the ratio of spam and ham in training dataset
def train():
    total = 0
    numSpam = 0
    for email in trainData:
        if email.label == SPAM :
            numSpam +=1
        total += 1
        processEmail(email.body , email.label)
    pA = numSpam/float(total)
    pNotA = (total - numSpam)/float(total)
    return pA, pNotA

#reading words from a specific email
def processEmail(body , label):
    for word in body:
        if label == SPAM:
            trainPositive[word] = trainPositive.get(word, 0) + 1  #the number of word(token) repeat.
            global positiveTotal
            positiveTotal += 1      #total spam words number
        else:
            trainNegative[word] = trainNegative.get(word, 0) + 1  #the number of word(token) repeat.
            global negativeTotal
            negativeTotal += 1    #total ham words number

pA, pNotA = train()
print('ratio of spam : %s'%pA)
print('ratio of ham : %s'%pNotA)
print('%s words are for spam'%positiveTotal)
print('%s words are for ham'%negativeTotal)

ratio of spam : 0.1348
ratio of ham : 0.8652
46979 words are for spam
154336 words are for ham


In [4]:
#gives the conditional probability p(B_i/A_x)
#the each token has the probability for 'spam' or 'ham'
#the result is the multiplication of each token.
def conditionalEmail(body , spam) :
    result =1.0
    for word in body:
        result *= conditionalWord(body , spam)
    return result

#classifies a new email as spam or not spam
def classify(email):
    isSpam = pA * conditionalEmail(email, True) # P (A | B)
    notSpam = pNotA * conditionalEmail(email, False) # P(¬A | B)
    return isSpam > notSpam

In [5]:
#Laplace Smoothing for the words not present in the training set
# the alpha is 1. this value is for 0 frequency
alpha=1

# for total number of words, first, i crean up the str such as '....',non alphabet.
# we make the all words lowercase
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ", string)
    string = re.sub(r"\(", " ", string)
    string = re.sub(r"\)", " ", string)
    string = re.sub(r"\?", " ", string)
    return string.strip().lower()

# make the vocab(unique words) and count words of vocab
vocab_list=[]
for i in range(len(data_kaggle)):
    str_raw=clean_str(data_kaggle['sms'][i])
    token=str_raw.split()
    vocab_list+=token
vocab = set(vocab_list)
numWords = len(vocab_list)

In [6]:
#gives the conditional probability p(B_i | A_x) with smoothing
def conditionalWord(word, spam):
    if spam:
        return (trainPositive.get(word,0)+alpha)/(float)(positiveTotal+alpha*numWords)
    return (trainNegative.get(word,0)+alpha)/(float)(negativeTotal+alpha*numWords)

In [7]:
#make the test data list like training data
testData=[]
for index in range(len(X_test)-1):
    try:
        data_ex=Dataset(X_test[index], y_test[index])
        testData.append(data_ex)
    except:
        continue
# for evaluation of our filter, count the hit and calculate the hit ratio.
hit = 0
for email_test in testData:
    predict = classify(email_test.body)
    if predict==email_test.label:
        hit+=1
hit_ratio = float(hit/len(testData))

In [9]:
print('accuracy : %s'%hit_ratio)

accuracy : 0.38305084745762713
