# Unigrams, bigrams, and trigrams in Naive Bayes Classifiers

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [20]:
df = pd.read_csv('./unigrams_NB_clasiffiers/spamSMS.csv', usecols = [0,1], encoding = 'latin-1')
df.columns = ['label','body']
#label spam as 1, not spam as 0
df['label'] = df['label'].replace(['ham','spam'],[0,1])
data = df.values
print(data)

[[0
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']
 [0 'Ok lar... Joking wif u oni...']
 [1
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
 ...
 [0 'Pity, * was in mood for that. So...any other suggestions?']
 [0
  "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"]
 [0 'Rofl. Its true to its name']]


In [21]:
class ngrams_bayes():
    
    def __init__(self,data,n=2,split=.75):
        
        #split into training and testing data
        self.train_data, self.test_data = train_test_split(data, 
                                                           train_size = split)
        #convert into n grams
        self.train_data = [[item[0], self.ngrams(n, item[1])] for item in self.train_data]
        self.test_data = [[item[0], self.ngrams(n, item[1])] for item in self.test_data]
        
        #count unique n grams in training data
        flattened = [gram for message in self.train_data for gram in message[1]]
        self.unique = len(set(flattened))
        
        #init dicts
        self.trainPositive = {}
        self.trainNegative = {}
        #counters 
        self.posGramCount = 0
        self.negGramCount = 0
        self.spamCount = 0
        #priors
        self.pA = 0
        self.pNotA = 0
    
    def ngrams(self,n,text):
        text = text.split(' ')
        grams = []
        for i in range(len(text)-n+1):
            gram = ' '.join(text[i:i+n])
            grams.append(gram)
        return grams
    
    def train(self):
        
        for item in self.train_data:
            label = item[0]
            grams = item[1]
            if label == 1:
                self.spamCount +=1
            for gram in grams:
                if label == 1:
                    self.trainPositive[gram] = self.trainPositive.get(gram,0) + 1
                    self.posGramCount += 1
                else:
                    self.trainNegative[gram] = self.trainNegative.get(gram,0) + 1
                    self.negGramCount += 1
            
            self.pA = self.spamCount / float(len(self.train_data))
            self.pNotA = 1.0 - self.pA
    
    def classify(self,text,alpha=1.0):
        
        self.alpha = alpha
        isSpam = self.pA * self.conditionalText(text,1)
        notSpam = self.pNotA * self.conditionalText(text,0)
        if(isSpam > notSpam):
            return 1
        else:
            return 0
        
    def conditionalText(self,grams,label):
        result = 1.0
        for ngram in grams:
            result *= self.conditionalNgram(ngram,label)
        return result
    
    def conditionalNgram(self,ngram,label):
        alpha = self.alpha
        if label == 1:
            return ((self.trainPositive.get(ngram,0)+alpha)/
                    float(self.posGramCount+alpha*self.unique))
        
        else:
            return ((self.trainNegative.get(ngram,0)+alpha)/
                    float(self.negGramCount+alpha*self.unique))
    
    def evaluate_test_data(self):
        results = []
        for test in self.test_data:
            label = test[0]
            text = test[1]
            ruling = self.classify(text)
            if ruling == label:
                results.append(0)
            else:
                results.append(1)
        
        print("Evaluated {} test cases. {:.2f}% Accuracy".format(len(results),100.0*sum(results)/
    float(len(results))))
        return sum(results)/float(len(results))

In [54]:
unigram_bayes = ngrams_bayes(data,1)
print(unigram_bayes.trainPositive)
unigram_bayes.train()
print(unigram_bayes.test_data[0])
print(unigram_bayes.unique)

{}
[0, ['Sure,', "I'll", 'see', 'if', 'I', 'can', 'come', 'by', 'in', 'a', 'bit']]
12937


In [23]:
unigram_bayes.train()

In [24]:
unigram_bayes.evaluate_test_data()

Evaluated 1393 test cases. 3.95% Accuracy


0.03948312993539124

In [25]:
bigram_sms = ngrams_bayes(data,2)
bigram_sms.train()
bigram_sms.evaluate_test_data()

Evaluated 1393 test cases. 26.35% Accuracy


0.26346015793251976

In [26]:
trigram_sms = ngrams_bayes(data,3)
trigram_sms.train()
trigram_sms.evaluate_test_data()

Evaluated 1393 test cases. 52.40% Accuracy


0.5240488155061019

# Naives Bayes Classifier for distinguishing between lines from Biggie Smalls and 2Pac

In [27]:
biggie_df = pd.read_csv('./unigrams_NB_clasiffiers/biggie_lyrics.csv',usecols=[1],encoding='latin-1',header=None)
biggie_df.columns = ['lyrics']
biggie_df['lyrics'] = biggie_df['lyrics'].str.replace('[^\w\s]','')
biggie_df['lyrics'] = biggie_df['lyrics'].str.lower()

In [28]:
biggie_df.tail()

Unnamed: 0,lyrics
11,relax and take notes while i take tokes of the...
12,good evenin ladies and gentlemen\nhows everybo...
13,who shot ya\nseperate the weak from the obsole...
14,when i die fuck it i wanna go to hell\ncause i...
15,when the lala hits ya lyrics just splits ya\nh...


In [29]:
pac_df = pd.read_csv('./unigrams_NB_clasiffiers/2pac_lyrics.csv',usecols=[1],encoding='latin-1',header=None)
pac_df.columns = ['lyrics']
pac_df['lyrics'] = pac_df['lyrics'].str.replace('[^\w\s]','')
pac_df['lyrics'] = pac_df['lyrics'].str.lower()

In [30]:
pac_df.head()

Unnamed: 0,lyrics
0,little something for my godson elijah\nand a l...
1,yo mo bee mayn drop that shit\nyou know what t...
2,rest in peace to my motherfucker biggy smallz\...
3,makaveli in this killuminati\nall through your...
4,its just me against the world\nnothin to lose\...


In [32]:
biggie_lyrics = biggie_df['lyrics'].values
biggie_lyrics = [song.split('\n') for song in biggie_lyrics]
biggie_lyrics = [line for song in biggie_lyrics for line in song]
pac_lyrics = pac_df['lyrics'].values
pac_lyrics = [song.split('\n') for song in pac_lyrics]
pac_lyrics = [line for song in pac_lyrics for line in song]

rap_lines = []

for line in biggie_lyrics:
    if len(line.split()) > 3:
        rap_lines.append(np.array([0,line]))

for line in pac_lyrics:
    if len(line.split()) > 3:
        rap_lines.append(np.array([1,line]))

rap_lines = np.array(rap_lines)

In [37]:
rap_lines = pd.DataFrame(rap_lines)
rap_lines.columns = ['label','line']
rap_lines.head()
rap_lines['label'] = rap_lines['label'].replace(['0','1'],[0,1])

In [38]:
bayes_biggie_vs_pac = ngrams_bayes(rap_lines.values,1,.9)

In [39]:
bayes_biggie_vs_pac.train()

In [40]:
bayes_biggie_vs_pac.evaluate_test_data()

Evaluated 197 test cases. 29.95% Accuracy


0.29949238578680204

In [41]:
results = []
for _ in range(10):
    unigram = ngrams_bayes(rap_lines.values,1,.9)
    unigram.train()
    results.append(unigram.evaluate_test_data())
    
print("Average Accuracy: {:.2f}".format(sum(results)/float(len(results))))

Evaluated 197 test cases. 25.89% Accuracy
Evaluated 197 test cases. 24.37% Accuracy
Evaluated 197 test cases. 27.92% Accuracy
Evaluated 197 test cases. 29.44% Accuracy
Evaluated 197 test cases. 25.38% Accuracy
Evaluated 197 test cases. 24.37% Accuracy
Evaluated 197 test cases. 29.44% Accuracy
Evaluated 197 test cases. 26.40% Accuracy
Evaluated 197 test cases. 27.41% Accuracy
Evaluated 197 test cases. 24.87% Accuracy
Average Accuracy: 0.27


#### Bigram

In [43]:
results = []
for _ in range(10):
    bigram = ngrams_bayes(rap_lines.values,1,.9)
    bigram.train()
    results.append(bigram.evaluate_test_data())
    
print("Average Accuracy: {:.2f}".format(sum(results)/float(len(results))))

Evaluated 197 test cases. 21.83% Accuracy
Evaluated 197 test cases. 20.30% Accuracy
Evaluated 197 test cases. 24.87% Accuracy
Evaluated 197 test cases. 19.29% Accuracy
Evaluated 197 test cases. 27.92% Accuracy
Evaluated 197 test cases. 22.34% Accuracy
Evaluated 197 test cases. 27.41% Accuracy
Evaluated 197 test cases. 23.35% Accuracy
Evaluated 197 test cases. 29.95% Accuracy
Evaluated 197 test cases. 30.46% Accuracy
Average Accuracy: 0.25


#### Trigram

In [44]:
results = []
for _ in range(10):
    trigram = ngrams_bayes(rap_lines.values,1,.9)
    trigram.train()
    results.append(trigram.evaluate_test_data())
    
print("Average Accuracy: {:.2f}".format(sum(results)/float(len(results))))

Evaluated 197 test cases. 29.95% Accuracy
Evaluated 197 test cases. 26.40% Accuracy
Evaluated 197 test cases. 24.37% Accuracy
Evaluated 197 test cases. 26.40% Accuracy
Evaluated 197 test cases. 22.84% Accuracy
Evaluated 197 test cases. 25.38% Accuracy
Evaluated 197 test cases. 27.92% Accuracy
Evaluated 197 test cases. 23.35% Accuracy
Evaluated 197 test cases. 25.38% Accuracy
Evaluated 197 test cases. 19.29% Accuracy
Average Accuracy: 0.25
