In [1]:
#naïve bayes is natural language processing 101
#it is very naïve and simple but powerful
#quite similar to logistic regression i suppose
#it is simply calculating posterior distribution from bayes theorem
#pretty much the same as another generative learning algorithm gda
#feel free to check gda in the following link
# https://github.com/je-suis-tm/machine-learning/blob/master/gaussian%20discriminant%20analysis.ipynb

import os
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
os.chdir('h:/')

In [2]:
#convert text into a list of words
#we use stemming and lemmatization to save space and improve efficiency
#for instance, we have words walked,walking,walks
#with nltk package, we can revert all of them to walk
def text2list(text,stopword,lower=True):

    temp=text if lower==False else text.lower()
    tokenizer=RegexpTokenizer(r'\w+')
    temp2=[WordNetLemmatizer().lemmatize(i) for i in tokenizer.tokenize(temp)]
    output=[PorterStemmer().stem(i) for i in temp2 if i not in stopword]
    
    #remove numbers as they are stopword as well
    for i in output:
        try:
            float(i)
            output.remove(i)
        except:
            pass
    
    return output

In [3]:
#this is a function to calculate top 30 frequent words
#it would help us to determine which of those are stop words
#stop words refer to the words we dont take in account for naïve bayes
#such as you, i, and, of, the, an
#which are inevitable in both spam and ham
#it is a stop word helper
#we still need to filter out the words that dont belong in this list manually
#for each task, stop word may be different
#e.g. if every text you need to classify has a prefix like '[Confidential]'
#confidential becomes a stop word to be excluded
def get_stopword(output):
    
    vocabulary=sorted(list(set(output)))
    
    d={}
    stopword=[]
    
    for i in vocabulary:
        d[i]=output.count(i)
    
    #get top 30 frequent words
    for j in sorted(list(d.values()))[::-1][:30]:
        temp=list(d.keys())[list(d.values()).index(j)]
        stopword.append(temp)
        del d[temp]
        
    return stopword


#there is another way to find stop words called kl divergence
#it calculates the distance from one distribution to another
#in layman’s terms, it gives us the impact of each word on classifications
#you can check Wikipedia for more details
# https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
#it is also called mutual information in machine learning
#the actual formula is kl(p(x:y)||p(x)p(y))
#we want the divergence from xy joint distribution to xy independent distribution
#in scipy, u can use scipy.stats.entropy
#in sklearn, u can use sklearn.feature_selection.mutual_info_classif
def kl_distance(df):

    #concatenate all words together
    temp=set([j for i in df['word'] for j in i])

    #probability of y is fixed
    #they are calculated before everything
    py0=len(df[df['spam']==0])/len(df)
    py1=len(df[df['spam']==1])/len(df)

    #calculate kl distance for each word
    temp2=[]
    for word in set(temp):

        kldistance=0

        countx=[]

        #to make our life easier
        #we take multivariate approach
        #we dont calculate the frequency of a word
        for j in range(len(df)):
            if df['word'][j].count(word)>0:
                countx.append(1)
            else:
                countx.append(0)

        #using dataframe even though the performance is slower
        #it is easier to do slicing by logical expression
        temp3=pd.DataFrame()
        temp3['x']=countx
        temp3['y']=df['spam']

        px0=len(temp3[temp3['x']==0])/len(temp3)
        px1=len(temp3[temp3['x']==1])/len(temp3)

        #using loops to do sum
        for k in range(2):
            for l in range(2):
                prob_joint=len(temp3[temp3['y']==k][temp3['x']==l])/len(temp3)

                probx=px0 if l==0 else px1
                proby=py0 if k==0 else py1

                #we need to avoid 2 traps
                #zero division and logarithm zero
                try:
                    entropy=prob_joint*np.log(prob_joint/(probx*proby))
                except ZeroDivisionError:
                    entropy=0

                if prob_joint==0:
                    entropy=0

                kldistance+=entropy

        temp2+=[word,kldistance]

    #transform into dataframe and sort by kl distance in descending order
    output=pd.DataFrame()
    output['word']=temp2[0::2]
    output['kl distance']=temp2[1::2]

    output.sort_values(by='kl distance',ascending=False,inplace=True)

    output.reset_index(inplace=True,drop=True)

    return output

In [4]:
stopword=stopwords.words('english')+['u',
 'beyond',
 'within',
 'around',
 'would',
 'b',
 'c',
 'e',
 'f',
 'g',
 'h',
 'j',
 'k',
 'l',
 'n',
 'p',
 'q',
 'r',
 'u',
 'v',
 'w',
 'x',
 'z',
 'first']

In [5]:
#denote vocabulary as the list of all the words in english theoretically
#in reality, it is not feasible and economic to do so
#here, we just collect all words from all emails
#we gotta filter out those stop words as well
def get_vocabulary(output,stopword):
    
    vocabulary=sorted(list(set(output)))
    
    for i in vocabulary:
        if i in stopword:
            vocabulary.remove(i)
        
    return vocabulary    

In [6]:
#calculate p(x|y=classification) for multivariate naïve bayes
def multivariate_calc_prob(word,x_train,y_train,classification):
    
    #how many spam or ham from all emails
    num=list(y_train).count(classification)
    
    #check how many emails contain the given word
    temp=[i.count(word) for i in x_train[y_train==classification]]
    freq=len(temp)-temp.count(0)
    
    #calculate p(x|y=classification)
    if freq!=0:
        p=freq/num
        
    else:
        
        #laplace smoothing
        #when the given word hasnt appeared in our training set yet
        #we gotta avoid the scenario 0/num
        #even if the given word has never appeared
        #it doesnt indicate it wont appear in the future
        #therefore, we gotta use laplace smoothing
        #see the following link for more details
        # https://en.wikipedia.org/wiki/Additive_smoothing
        
        #add 1 to the numerator and k to the denominator
        #where k is the dimension of x
        #in another word, how many possible values x can take
        #in binary classification, k=2
        #x either exists in the email or it doesnt
        p=(freq+1)/(num+2)

    return p


#multivariate event model follows a bernoulli distribution
#each word has two scenario
#it either appears in the email or not
def multivariate(sample,stopword):
    
    x_train,x_test,y_train,y_test= \
    train_test_split(sample['word'],sample['spam'],test_size=0.3)
    
    #calculate p(y)
    phi_y0=list(y_train).count(0)/len(y_train)
    phi_y1=1-phi_y0
    
    forecast=[]
    
    #assume all x are independent
    #each word has its own probability of occurrence in emails
    #the probability of occurrence of those words are not correlated
    #hence, p(x|y) for all x can be written as
    #the product of all probabilities of x
    #which is called chain rule in bayesian network
    #see the following link for details
    # https://en.wikipedia.org/wiki/Chain_rule_%28probability%29
    for i in x_test:
        px_y0,px_y1=1,1
        for j in i:
            if j not in stopword:
                px_y0*=multivariate_calc_prob(j,x_train,y_train,0)
                px_y1*=multivariate_calc_prob(j,x_train,y_train,1)
            else:
                pass
        
        #calculate p(y|x)
        py0_x=px_y0*phi_y0
        py1_x=px_y1*phi_y1
        
        #compare p(y=1|x) with p(y=0|x)
        #we take the larger one as for all generative learning algo
        #if the probabilities of spam and ham are equal
        #we d rather see it in inbox folder instead of spam folder
        p=0 if py0_x>=py1_x else 1
        forecast.append(p)
        
    return len(y_test[forecast==y_test])/len(y_test)*100

        

In [7]:
#calculate p(x|y) for multinomial naïve bayes
def multinomial_calc_prob(word,x_train,y_train,classification,vocabulary):
    
    #get the word count of all spam/ham emails
    num=sum([len(i) for i in x_train[y_train==classification]])
    
    #get the frequency of given word in all spam/ham emails
    freq=sum([i.count(word) for i in x_train[y_train==classification]])
    

    if freq!=0:
        p=freq/num
        
    else:
        
        #laplace smoothing 
        #here x can take on v possible values
        #where v is the length of vocabulary list
        #we are assuming vocabulary list covers every word in english
        p=(freq+1)/(num+len(vocabulary))

    return p



#unlike multivariate, multinomial event model follows a multinomial distribution
#the frequency of each word is taken into consideration
#the formula is pretty much the same as multivariate
#except p(x|y) is different
def multinomial(sample,stopword):
    
    x_train,x_test,y_train,y_test= \
    train_test_split(sample['word'],sample['spam'],test_size=0.3)
    
    vocabulary=[]
    for i in sample['word']:
        temp=get_vocabulary(i,stopword)
        vocabulary+=temp
    
    phi_y0=list(y_train).count(0)/len(y_train)
    phi_y1=1-phi_y0
    
    forecast=[]
    
    for i in x_test:
        px_y0,px_y1=1,1
        for j in i:
            if j not in stopword:
                px_y0*=multinomial_calc_prob(j,x_train,y_train,0,vocabulary)
                px_y1*=multinomial_calc_prob(j,x_train,y_train,1,vocabulary)
            else:
                pass
            
        py0_x=px_y0*phi_y0
        py1_x=px_y1*phi_y1
        
        p=0 if py0_x>=py1_x else 1
        forecast.append(p)
        
    return len(y_test[forecast==y_test])/len(y_test)*100

In [8]:
#create multivariate vocabulary vector for sklearn
#this is the authenticate way in naïve bayes
#but using pandas is more convenient
#which is why our own implementation is kinda different
def multivariate_vector(x,y,vocabulary):
    
    #CRUCIAL!!!!
    #train_test_split would shuffle the training set
    #the shuffled index would cause a problem later
    x.reset_index(inplace=True,drop=True)
    y.reset_index(inplace=True,drop=True)
    
    #we create a m*n matrix
    #where m denotes the number of emails(rows)
    #n denotes the number of words in vocabulary(columns)
    #the value denotes if word[n] appears in email[m]
    #if so, we set the value to 1
    multivariate=pd.DataFrame()
    for i in vocabulary:
        temp=[]
        for j in x.index:
            if i in x[j]:
                temp.append(1)
            else:
                temp.append(0)
                
        multivariate[i]=temp
        
    multivariate['real y']=y
    
    return multivariate


#create multinomial vocabulary vector for sklearn
def multinomial_vector(x,y,vocabulary):
    
    #CRUCIAL!!!!!
    x.reset_index(inplace=True,drop=True)
    y.reset_index(inplace=True,drop=True)
    
    #we create a m*n matrix
    #where m denotes the number of emails(rows)
    #n denotes the number of words in vocabulary(columns)
    #the value denotes the frequency of word[n] in email[m]
    multinomial=pd.DataFrame()
    for i in vocabulary:
        temp=[]
        for j in x.index:
            temp.append(x[j].count(i))
                
        multinomial[i]=temp
        
    multinomial['real y']=y
    
    return multinomial


#using the official sklearn package
def sklearn(sample,stopword,method,nbmethod):
    
    vocabulary=[]
    for i in sample['word']:
        temp=get_vocabulary(i,stopword)
        vocabulary+=temp
    
    x_train,x_test,y_train,y_test= \
    train_test_split(sample['word'],sample['spam'],test_size=0.3)

    mn_train=method(x_train,y_train,vocabulary)
    mn_test=method(x_test,y_test,vocabulary)
    
    model=nbmethod().fit(pd.concat([mn_train[i] for i in vocabulary], \
                                      axis=1),mn_train['real y'])
    
    forecast=model.predict(pd.concat([mn_test[i] for i in vocabulary], \
                                      axis=1))
    
    return len(y_test[forecast==y_test])/len(y_test)*100

In [9]:
#the raw data really comes from my email
df=pd.read_csv('spam.csv')

In [10]:
#tokenization
temp=[]
for i in df['text'].tolist():
    temp.append(text2list(i,stopword,lower=True))
            
df['word']=temp

In [11]:
#to review stopword
print(get_stopword([j for i in temp for j in i]))

['trip', 'pleas', 'applic', 'data', 'organ', 'student', 'account', 'person', 'use', 'follow', 'password', 'univers', 'inform', 'wolfram', 'http', 'interest', 'intern', 'price', 'regard', 'see', 'time', 'contact', 'detail', 'event', 'note', 'privaci', 'protect', 'provid', 'question', 'thank']


In [12]:
#to check each word's contribution to the result
print(kl_distance(df))



          word  kl distance
0        invit     0.290305
1       follow     0.290305
2        offic     0.290305
3         team     0.290305
4       differ     0.215762
5       ticket     0.215762
6       provid     0.215762
7        studi     0.215762
8    eventbrit     0.215762
9       friend     0.215762
10        look     0.215762
11          co     0.215762
12       quick     0.215762
13          hi     0.215762
14     neither     0.215762
15      famili     0.215762
16     student     0.215762
17      includ     0.215762
18       might     0.215762
19      travel     0.215762
20       thank     0.215762
21        love     0.215762
22       hesit     0.215762
23          uk     0.215762
24     gabriel     0.215762
25     everyon     0.215762
26      member     0.215762
27        trip     0.215762
28       organ     0.215762
29        link     0.215762
..         ...          ...
482       well     0.013025
483    protect     0.013025
484      unabl     0.013025
485     receiv     0

In [13]:
#try ten times and compare the result of self implementation and sklearn
temp=[]
for i in range(10):
        
    temp.append(multivariate(df,stopword))
    temp.append(multinomial(df,stopword))
    temp.append(sklearn(df,stopword,multivariate_vector,BernoulliNB))
    temp.append(sklearn(df,stopword,multinomial_vector,MultinomialNB))

In [14]:
for j,k in enumerate(['multivariate implementation','multinomial implementation',
                        'multivariate sklearn','multinomial sklearn']):
    print('{} accuracy: {}%'.format(k,np.mean(temp[j::4])))

multivariate implementation accuracy: 84.0%
multinomial implementation accuracy: 88.0%
multivariate sklearn accuracy: 78.0%
multinomial sklearn accuracy: 74.0%


In [15]:
#surprisingly the accuracy of my implementation is actually higher than sklearn
#but it is a huge tradeoff for time
#i would definitely use sklearn instead
#btw, my sample size is very small
#the result is definitely biased
#it heavily depends on how train and test sets are split