In [1]:
#naive bayes is natural language processing 101
#it is very naive and simple but powerful
#quite similar to logistic regression i suppose
#it is simply calculating posterior distribution from bayes theorem
#pretty much the same as another generative learning algorithm gda
#feel free to check gda in the following link
# https://github.com/tattooday/machine-learning/blob/master/gaussian%20discriminant%20analysis.ipynb

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
import re
os.chdir('d:/')

In [2]:
#this is a function to calculate top 30 frequent words
#it would help us to determine which of those are stop words
#stop words refer to the words we dont take in account for naive bayes
#such as you, i, and, of, the, an
#which are inevitable in both spam and ham
#it is a stop word helper
#we still need to filter out the words that dont belong in this list manually
def get_stopword(output):
    
    vocabulary=sorted(list(set(output)))
    
    d={}
    stopword=[]
    
    for i in vocabulary:
        d[i]=output.count(i)
    
    #get top 30 frequent words
    for j in sorted(list(d.values()))[::-1][:30]:
        temp=list(d.keys())[list(d.values()).index(j)]
        stopword.append(temp)
        del d[temp]
        
    return stopword

In [3]:
#denote vocabulary as the list of all the words in english theoretically
#in reality, it is not feasible and economic to do so
#here, we just collect all words from all emails
#we gotta filter out those stop words as well
def get_vocabulary(output,stopword):
    
    vocabulary=sorted(list(set(output)))
    
    for i in vocabulary:
        if i in stopword:
            vocabulary.remove(i)
        
    return vocabulary    

In [4]:
#calculate p(x|y=classification) for multivariate naive bayes
def multivariate_calc_prob(word,x_train,y_train,classification):
    
    #how many spam or ham from all emails
    num=list(y_train).count(classification)
    
    #check how many emails contain the given word
    temp=[i.count(word) for i in x_train[y_train==classification]]
    freq=len(temp)-temp.count(0)
    
    #calculate p(x|y=classification)
    if freq!=0:
        p=freq/num
        
    else:
        
        #laplace smoothing
        #when the given word hasnt appeared in our training set yet
        #we gotta avoid the scenario 0/num
        #even if the given word has never appeared
        #it doesnt indicate it wont appear in the future
        #therefore, we gotta use laplace smoothing
        #see the following link for more details
        # https://en.wikipedia.org/wiki/Additive_smoothing
        
        #add 1 to the numerator and k to the denominator
        #where k is the dimension of x
        #in another word, how many possible values x can take
        #in binary classification, k=2
        #x either exists in the email or it doesnt
        p=(freq+1)/(num+2)

    return p


#multivariate event model follows a bernoulli distribution
#each word has two scenario
#it either appears in the email or not
def multivariate(sample,stopword):
    
    x_train,x_test,y_train,y_test= \
    train_test_split(sample['word'],sample['spam'],test_size=0.3)
    
    #caculate p(y)
    phi_y0=list(y_train).count(0)/len(y_train)
    phi_y1=1-phi_y0
    
    forecast=[]
    
    #assume all x are independent
    #each word has its own probability of occurence in emails
    #the probability of occurence of those words are not correlated
    #hence, p(x|y) for all x can be written as
    #the product of all probabilities of x
    #which is called chain rule in bayesian network
    #see the following link for details
    # https://en.wikipedia.org/wiki/Chain_rule_%28probability%29
    for i in x_test:
        px_y0,px_y1=1,1
        for j in i:
            if j not in stopword:
                px_y0*=multivariate_calc_prob(j,x_train,y_train,0)
                px_y1*=multivariate_calc_prob(j,x_train,y_train,1)
            else:
                pass
        
        #calculate p(y|x)
        py0_x=px_y0*phi_y0
        py1_x=px_y1*phi_y1
        
        #compare p(y=1|x) with p(y=0|x)
        #we take the larger one as for all generative learning algo
        #if the probabilities of spam and ham are equal
        #we d rather see it in inbox folder instead of spam folder
        p=0 if py0_x>=py1_x else 1
        forecast.append(p)
        
    return len(y_test[forecast==y_test])/len(y_test)*100

        

In [5]:
#calculate p(x|y) for multinominal naive bayes
def multinominal_calc_prob(word,x_train,y_train,classification,vocabulary):
    
    #get the word count of all spam/ham emails
    num=sum([len(i) for i in x_train[y_train==classification]])
    
    #get the frequency of given word in all spam/ham emails
    freq=sum([i.count(word) for i in x_train[y_train==classification]])
    

    if freq!=0:
        p=freq/num
        
    else:
        
        #laplace smoothing 
        #here x can take on v possible values
        #where v is the length of vocabulary list
        #we are assuming vocabulary list covers every word in english
        p=(freq+1)/(num+len(vocabulary))

    return p



#unlike multivariate, multinominal event model follows a multinominal distribution
#the frequency of each word is taken into consideration
#the formula is pretty much the same as multivariate
#except p(x|y) is different
def multinominal(sample,stopword):
    
    x_train,x_test,y_train,y_test= \
    train_test_split(sample['word'],sample['spam'],test_size=0.3)
    
    vocabulary=[]
    for i in sample['word']:
        temp=get_vocabulary(i,stopword)
        vocabulary+=temp
    
    phi_y0=list(y_train).count(0)/len(y_train)
    phi_y1=1-phi_y0
    
    forecast=[]
    
    for i in x_test:
        px_y0,px_y1=1,1
        for j in i:
            if j not in stopword:
                px_y0*=multinominal_calc_prob(j,x_train,y_train,0,vocabulary)
                px_y1*=multinominal_calc_prob(j,x_train,y_train,1,vocabulary)
            else:
                pass
            
        py0_x=px_y0*phi_y0
        py1_x=px_y1*phi_y1
        
        p=0 if py0_x>=py1_x else 1
        forecast.append(p)
        
    return len(y_test[forecast==y_test])/len(y_test)*100

In [6]:
#create multivariate vocabulary vector for sklearn
#this is the authenticate way in naive bayes
#but using pandas is more convenient
#which is why our own implementation is kinda different
def multivariate_vector(x,y,vocabulary):
    
    #CRUCIAL!!!!
    #train_test_split would shuffle the training set
    #the shuffled index would cause a problem later
    x.reset_index(inplace=True,drop=True)
    y.reset_index(inplace=True,drop=True)
    
    #we create a m*n matrix
    #where m denotes the number of emails(rows)
    #n denotes the number of words in vocabulary(columns)
    #the value denotes if word[n] appears in email[m]
    #if so, we set the value to 1
    multivariate=pd.DataFrame()
    for i in vocabulary:
        temp=[]
        for j in x.index:
            if i in x[j]:
                temp.append(1)
            else:
                temp.append(0)
                
        multivariate[i]=temp
        
    multivariate['real y']=y
    
    return multivariate


#create multinominal vocabulary vector for sklearn
def multinominal_vector(x,y,vocabulary):
    
    #CRUCIAL!!!!!
    x.reset_index(inplace=True,drop=True)
    y.reset_index(inplace=True,drop=True)
    
    #we create a m*n matrix
    #where m denotes the number of emails(rows)
    #n denotes the number of words in vocabulary(columns)
    #the value denotes the frequency of word[n] in email[m]
    multinominal=pd.DataFrame()
    for i in vocabulary:
        temp=[]
        for j in x.index:
            temp.append(x[j].count(i))
                
        multinominal[i]=temp
        
    multinominal['real y']=y
    
    return multinominal


#using the official sklearn package
def sklearn(sample,stopword,method,nbmethod):
    
    vocabulary=[]
    for i in sample['word']:
        temp=get_vocabulary(i,stopword)
        vocabulary+=temp
    
    x_train,x_test,y_train,y_test= \
    train_test_split(sample['word'],sample['spam'],test_size=0.3)

    mn_train=method(x_train,y_train,vocabulary)
    mn_test=method(x_test,y_test,vocabulary)
    
    model=nbmethod().fit(pd.concat([mn_train[i] for i in vocabulary], \
                                      axis=1),mn_train['real y'])
    
    forecast=model.predict(pd.concat([mn_test[i] for i in vocabulary], \
                                      axis=1))
    
    return len(y_test[forecast==y_test])/len(y_test)*100

In [7]:
def main():
    
    #the raw data really comes from my email
    df=pd.read_csv('spam.csv')
    
    #deaggregation splits the emails into lists of words
    #output denotes the list of every word in all emails
    deaggregation=[]
    output=[]
    for i in df['text']:
        regex=re.findall('\w*',i.lower())
        temp=list(filter(lambda x: x!='',regex))
        deaggregation.append(temp)
        output+=temp
        
    df['word']=deaggregation
    stopword=get_stopword(output)
    
    #some adjustment on stopword
    stopword.remove('account')
    stopword.remove('application')
    stopword.remove('data')
    stopword.remove('trip')

    temp=[]
    for i in range(20):
        
        temp.append(multivariate(df,stopword))
        temp.append(multinominal(df,stopword))
        temp.append(sklearn(df,stopword,multivariate_vector,BernoulliNB))
        temp.append(sklearn(df,stopword,multinominal_vector,MultinomialNB))
    
    for j,k in enumerate(['multivariate implementation','multinominal implementation', \
                         'multivariate sklearn','multinominal sklearn']):
        print('{} accuracy: {}%'.format(k,np.mean(temp[j::4])))

In [8]:
main()

#surprisingly the accuracy of my implementation is actually higher than sklearn
#but it is a huge tradeoff for time
#i would definitely use sklearn instead
#btw, my sample size is very small
#the result is definitely biased
#it heavily depends on how train and test sets are splitted

multivariate implementation accuracy: 80.0%
multinominal implementation accuracy: 78.0%
multivariate sklearn accuracy: 75.0%
multinominal sklearn accuracy: 78.0%
