In [1]:
import math
import pandas as pd 
from collections import Counter

In [2]:
data_path = "../data/spam.csv"
df = pd.read_csv(data_path)

In [3]:
df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# dropping the columns
df.drop(df.columns[2:], axis = 1, inplace = True)

# renaming the columns
df.columns = ["class", "text"]

In [5]:
df.head(5)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Bernoulli Naive Bayes

In [13]:
class BernoulliNB:
    def __init__(self, num_classes, laplace_smoothing = 1.0, is_imbalance = True):
        self.num_classes = num_classes
        self.is_imbalance = is_imbalance
        self.laplace_smoothing = laplace_smoothing
        
    def preprocess(self, X):
        tokens = self.split_into_tokens(X)
        tokens = map(str.lower, filter(str.isalnum, tokens))
        return list(tokens)
    
    def split_into_tokens(self, sentence):
        return sentence.split(" ")
        
    def fit(self, X, y):
        assert len(set(y)) == self.num_classes, f'{self.num_classes} do not match with the classes present in the dataset.'
                
        # Building the vocabulary
        self.corpus = {}
        self.total_records = len(y)
        for sentence, label in zip(X, y):
            sentence_tokens = self.preprocess(sentence)
            if label not in self.corpus:
                self.corpus[label] = sentence_tokens
                continue 
            self.corpus[label].extend(sentence_tokens)          

        # computing the frequecy 
        self.corpus = {k : Counter(v) for k, v in self.corpus.items()}
        self.labelCount = Counter(y)
    
    def predict(self, X):
        # preprocessing the sentence
        X = self.preprocess(X)
        
        conditional_prob = {}
        for potential_label, total in self.labelCount.items():
            log_prob = 1 if self.is_imbalance else math.log(self.labelCount[potential_label] / self.total_records)
            for token in X:
                log_prob += (self.corpus[potential_label][token] + self.laplace_smoothing)/ \
                    (total + (self.num_classes * self.laplace_smoothing))

            conditional_prob[potential_label] =  log_prob        
            
        return max(conditional_prob, key=conditional_prob.get)

In [14]:
input_sentences = df.text.tolist()
output_vars = df['class'].tolist()


nb = BernoulliNB(num_classes = 2)
nb.fit(input_sentences, output_vars)

In [15]:
nb.corpus

{'ham': Counter({'go': 235,
          'until': 22,
          'jurong': 1,
          'available': 11,
          'only': 117,
          'in': 786,
          'bugis': 5,
          'n': 134,
          'great': 77,
          'world': 23,
          'la': 2,
          'e': 78,
          'cine': 6,
          'there': 125,
          'got': 227,
          'amore': 1,
          'ok': 156,
          'joking': 3,
          'wif': 27,
          'u': 874,
          'dun': 55,
          'say': 80,
          'so': 399,
          'early': 25,
          'c': 57,
          'already': 47,
          'then': 195,
          'nah': 7,
          'i': 2172,
          'think': 124,
          'he': 180,
          'goes': 26,
          'to': 1544,
          'lives': 2,
          'around': 51,
          'here': 75,
          'though': 16,
          'even': 55,
          'my': 741,
          'brother': 13,
          'is': 710,
          'not': 381,
          'like': 221,
          'speak': 21,
          'with': 269,


In [17]:
nb.predict("All items are free, to check out please add some cash in your cart")

'spam'