# Unigrams, bigrams, and trigrams in Naive Bayes Classifiers

In [136]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [141]:
class ngrams_bayes():
    
    def __init__(self,data,n=2,split=.75,labels=None,train_data=None,test_data=None):
        
        #Setting the labels of classes
        self.labels = labels
        
        #If the training and test datasets are passed as parameters
        if train_data is not None: self.train_data = train_data
        if test_data is not None: self.test_data = test_data
        
        #If the training and test datasets are not set
        if train_data is None and test_data is None:
            #split into training and testing data
            self.train_data, self.test_data = train_test_split(data, 
                                                               train_size = split)
        
        #convert into n grams
        self.train_data = [[item[0], self.ngrams(n, item[1])] for item in self.train_data]
        self.test_data = [[item[0], self.ngrams(n, item[1])] for item in self.test_data]
        
        #count unique n grams in training data
        flattened = [gram for message in self.train_data for gram in message[1]]
        self.unique = len(set(flattened))
        
        #init dicts
        self.trainPositive = {}
        self.trainNegative = {}
        #counters 
        self.posGramCount = 0
        self.negGramCount = 0
        self.spamCount = 0
        #priors
        self.pA = 0
        self.pNotA = 0
    
    def ngrams(self,n,text):
        text = text.split(' ')
        grams = []
        for i in range(len(text)-n+1):
            gram = ' '.join(text[i:i+n])
            grams.append(gram)
        return grams
    
    def train(self):
        
        for item in self.train_data:
            label = item[0]
            grams = item[1]
            if label == 1:
                self.spamCount +=1
            for gram in grams:
                if label == 1:
                    self.trainPositive[gram] = self.trainPositive.get(gram,0) + 1
                    self.posGramCount += 1
                else:
                    self.trainNegative[gram] = self.trainNegative.get(gram,0) + 1
                    self.negGramCount += 1
            
            self.pA = self.spamCount / float(len(self.train_data))
            self.pNotA = 1.0 - self.pA
    
    def classify(self,text,alpha=1.0):
        
        self.alpha = alpha
        isSpam = self.pA * self.conditionalText(text,1)
        notSpam = self.pNotA * self.conditionalText(text,0)
        if(isSpam > notSpam):
            return 1
        else:
            return 0
    
    def probabilities(self,text,alpha=1.0):
        
        self.alpha = alpha
        isSpam = self.pA * self.conditionalText(text,1)
        notSpam = self.pNotA * self.conditionalText(text,0)
        return isSpam/(isSpam+notSpam),notSpam/(isSpam+notSpam)
        
    def conditionalText(self,grams,label):
        result = 1.0
        for ngram in grams:
            result *= self.conditionalNgram(ngram,label)
        return result
    
    def conditionalNgram(self,ngram,label):
        alpha = self.alpha
        if label == 1:
            return ((self.trainPositive.get(ngram,0)+alpha)/
                    float(self.posGramCount+alpha*self.unique))
        
        else:
            return ((self.trainNegative.get(ngram,0)+alpha)/
                    float(self.negGramCount+alpha*self.unique))
    
    def evaluate_test_data(self):
        results = []
        for test in self.test_data:
            label = test[0]
            text = test[1]
            ruling = self.classify(text)
            if ruling == label:
                results.append(1)
            else:
                results.append(0)
        print("Probabilities for {} classes: {:.2f}%".format(len(results),100.0*sum(results)/
    float(len(results))))
        return sum(results)/float(len(results))
    
    def evaluate_test_data_prob(self):
        results = []
        for test in self.test_data:
            label = test[0]
            text = test[1]
            class0,class1 = self.probabilities(text)
            if class0>class1: index = 0
            else: index = 1
            print("Probabilities for classes: " + str(class0) + ", " + str(class1))
            print("The sample \"" + str(test) + "\" belongs to class " + str(self.labels[index]) )
        return np.array([class0,class1])

## Loading the data

In [142]:
df = pd.read_csv("example_131.csv",delimiter=",",names = ['label','message'],encoding = 'latin-1') #Loading the data from example 13.1
df2 = pd.read_csv("example_1310.csv",delimiter=",",names = ['label','message'],encoding = 'latin-1') #Loading the data from example 13.10
df3 = pd.read_csv("example_spam.csv",delimiter=",",names = ['label','message'],encoding = 'latin-1') #Loading the data from example spam detection

#label China as 1, not China as 0
df['label'] = df['label'].replace(['Not China','China'],[0,1])
#label China as 1, not China as 0
df2['label'] = df2['label'].replace(['Not China','China'],[0,1])
#label spam as 1, not spam as 0
df3['label'] = df3['label'].replace(['ham','spam'],[0,1])

data = df.values
data2 = df2.values
data3 = df3.values

print("Data for the example 13.1")
print(data)
print("\n Data for the example 13.10")
print(data2)
print("\n Data for the example spam detection")
print(data3)

Data for the example 13.1
[[1 'Chinese Beijing Chinese']
 [1 'Chinese Chinese Shanghai']
 [1 'Chinese Macao']
 [0 'Tokyo Japan Chinese']
 ['?' 'Chinese Chinese Chinese Tokyo Japan']]

 Data for the example 13.10
[[1 'Taipei Taiwan']
 [1 'Macao Taiwan Shanghai']
 [0 'Japan Sapporo']
 [0 'Sapporo Osaka']
 ['?' 'Taiwan Taiwan Sapporo']]

 Data for the example spam detection
[[1 'send us your password']
 [0 'send us your review']
 [0 'review your password']
 [1 'review us']
 [1 'send your password']
 [1 'send us your password']
 ['?' 'review us now']]


### Defining train and test data

In [143]:
#For example 13.1
x_train = df.get_values()[0:4]
x_test = df.get_values()[4:5]
print("Training and test datasets for the example 13.1")
print("Train dataset")
print(x_train)
print("Test dataset")
print(x_test)

#For example 13.10
x_train2 = df2.get_values()[0:4]
x_test2 = df2.get_values()[4:5]
print("\nTraining and test datasets for the example 13.1")
print("Train dataset")
print(x_train2)
print("Test dataset")
print(x_test2)

#For example spam detection
x_train3 = df3.get_values()[0:6]
x_test3 = df3.get_values()[6:7]
print("\nTraining and test datasets for the example 13.1")
print("Train dataset")
print(x_train3)
print("Test dataset")
print(x_test3)


Training and test datasets for the example 13.1
Train dataset
[[1 'Chinese Beijing Chinese']
 [1 'Chinese Chinese Shanghai']
 [1 'Chinese Macao']
 [0 'Tokyo Japan Chinese']]
Test dataset
[['?' 'Chinese Chinese Chinese Tokyo Japan']]

Training and test datasets for the example 13.1
Train dataset
[[1 'Taipei Taiwan']
 [1 'Macao Taiwan Shanghai']
 [0 'Japan Sapporo']
 [0 'Sapporo Osaka']]
Test dataset
[['?' 'Taiwan Taiwan Sapporo']]

Training and test datasets for the example 13.1
Train dataset
[[1 'send us your password']
 [0 'send us your review']
 [0 'review your password']
 [1 'review us']
 [1 'send your password']
 [1 'send us your password']]
Test dataset
[['?' 'review us now']]


## Instantiating the ngrams_bayes class for each dataset, unigrams

In [144]:
print("Data for the example 13.1")
unigram_bayes = ngrams_bayes(data,1,labels=['China','Not China'],train_data = x_train,test_data = x_test)
unigram_bayes.train()
unigram_bayes.evaluate_test_data_prob()

print("\nData for the example 13.10")
unigram_bayes2 = ngrams_bayes(data2,1,labels=['China','Not China'],train_data = x_train2,test_data = x_test2)
unigram_bayes2.train()
unigram_bayes2.evaluate_test_data_prob()

print("\nData for the example spam detection")
unigram_bayes3 = ngrams_bayes(data3,1,labels=['ham','spam'],train_data = x_train3,test_data = x_test3)
unigram_bayes3.train()
unigram_bayes3.evaluate_test_data_prob()


Data for the example 13.1
Probabilities for classes: 0.6897586117634673, 0.31024138823653263
The sample "['?', ['Chinese', 'Chinese', 'Chinese', 'Tokyo', 'Japan']]" belongs to class China

Data for the example 13.10
Probabilities for classes: 0.697954902988988, 0.30204509701101206
The sample "['?', ['Taiwan', 'Taiwan', 'Sapporo']]" belongs to class China

Data for the example spam detection
Probabilities for classes: 0.4413793103448275, 0.5586206896551725
The sample "['?', ['review', 'us', 'now']]" belongs to class spam


array([0.44137931, 0.55862069])

## Instantiating the ngrams_bayes class for each dataset, bigrams

In [145]:
print("Data for the example 13.1")
unigram_bayes = ngrams_bayes(data,2,labels=['China','Not China'],train_data = x_train,test_data = x_test)
unigram_bayes.train()
unigram_bayes.evaluate_test_data_prob()

print("\nData for the example 13.10")
unigram_bayes2 = ngrams_bayes(data2,2,labels=['China','Not China'],train_data = x_train2,test_data = x_test2)
unigram_bayes2.train()
unigram_bayes2.evaluate_test_data_prob()

print("\nData for the example spam detection")
unigram_bayes3 = ngrams_bayes(data3,2,labels=['ham','spam'],train_data = x_train3,test_data = x_test3)
unigram_bayes3.train()
unigram_bayes3.evaluate_test_data_prob()

Data for the example 13.1
Probabilities for classes: 0.6549865229110512, 0.34501347708894875
The sample "['?', ['Chinese Chinese', 'Chinese Chinese', 'Chinese Tokyo', 'Tokyo Japan']]" belongs to class China

Data for the example 13.10
Probabilities for classes: 0.43362831858407086, 0.5663716814159292
The sample "['?', ['Taiwan Taiwan', 'Taiwan Sapporo']]" belongs to class Not China

Data for the example spam detection
Probabilities for classes: 0.6923076923076922, 0.3076923076923077
The sample "['?', ['review us', 'us now']]" belongs to class ham


array([0.69230769, 0.30769231])