In [13]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class Naive_Bayes:
    """Character-level multinomial Naive Bayes language identifier.

    Documents are text files named ``<lang><index>.txt`` (for example
    ``e0.txt``) located under ``path``, where ``<lang>`` is one of the codes
    in ``self.lang``. Only the 27 symbols in ``ALPHABET`` are counted; every
    other character (newlines included) is ignored.

    Attributes:
        lang: numpy array of one-letter language codes ('e', 's', 'j').
        prior: smoothed class priors, aligned with ``lang``.
        alpha: one dict per language mapping character -> smoothed
            conditional probability (all zeros until ``get_cond_prob`` runs).
        log_liklihood: one dict per language mapping character ->
            log10 of the value in ``alpha``. The attribute name keeps its
            original spelling so existing callers continue to work.
        path: directory prefix that is concatenated in front of file names.
        train, test: lists of file names filled in by ``set_data``.
        smooth: additive smoothing constant.
    """

    # The 27 symbols the model knows about: 26 lowercase letters plus space.
    ALPHABET = "abcdefghijklmnopqrstuvwxyz "
    # Number of documents per language (indices 0 .. FILES_PER_LANG-1).
    FILES_PER_LANG = 20

    def __init__(self,smooth=0.5,path="./languageID/"):
        """Create an untrained classifier.

        Args:
            smooth: additive smoothing constant used for priors and
                conditional probabilities.
            path: directory prefix (including trailing separator) that is
                prepended to every file name.
        """
        self.lang = np.array(['e','s','j'])
        self.prior = np.zeros(len(self.lang))
        # Independent dictionaries per language, so updates never alias.
        self.alpha = [self._zero_counts() for _ in self.lang]
        self.log_liklihood = [self._zero_counts() for _ in self.lang]
        self.path = path
        self.train = []
        self.test = []
        self.smooth = smooth

    def _zero_counts(self):
        """Return a fresh ``{character: 0}`` dictionary over ``ALPHABET``."""
        return dict.fromkeys(self.ALPHABET, 0)

    @staticmethod
    def _add_char_counts(filepath, counts):
        """Add the character occurrences found in ``filepath`` to ``counts``.

        Characters that are not keys of ``counts`` (newlines, punctuation,
        upper-case letters, ...) are skipped. The file is always closed.
        """
        with open(filepath,'r') as text:
            for row in text:
                for c in row:
                    if c in counts:
                        counts[c] += 1

    def set_data(self,train_size=10):
        """Split the documents of every language into train and test lists.

        Files ``0 .. train_size-1`` of each language go to ``self.train`` and
        the remaining ones to ``self.test``. Calling the method again
        replaces the previous split instead of appending to it.
        """
        # there are only FILES_PER_LANG of each file
        assert 0 <= train_size <= self.FILES_PER_LANG
        self.train = []
        self.test = []
        for lang in self.lang:
            for i in range(train_size):
                self.train.append("%s%d.txt" % (lang, i))
            for i in range(train_size, self.FILES_PER_LANG):
                self.test.append("%s%d.txt" % (lang, i))

    def get_prior(self):
        """Estimate the smoothed class priors from ``self.train``.

        The language of a file is taken from the first character of its
        name. Files whose first character is not a known language code are
        ignored.
        """
        assert len(self.train) > 0
        n_lang = len(self.lang)
        counts = np.zeros(n_lang)
        for file in self.train:
            # Match on the first character only; a substring test would
            # misclassify any name that merely contains 'e', 's' or 'j'.
            hits = np.flatnonzero(self.lang == file[:1])
            if hits.size:
                counts[hits[0]] += 1
        tot = counts.sum() + (n_lang*self.smooth) # total files with smoothing factor added
        self.prior[:] = (counts + self.smooth)/tot

    def print_prior(self):
        """Print the three class priors on one line."""
        print("P(Y=english) = %f   P(Y=spanish) = %f   P(Y=japanese) = %f" %(self.prior[0],self.prior[1],self.prior[2]))

    def get_cond_prob(self):
        """Estimate the smoothed per-language character probabilities.

        Fills ``self.alpha`` with probabilities and ``self.log_liklihood``
        with their base-10 logarithms. Counting starts from zero on every
        call, so calling the method repeatedly gives the same result.

        Raises:
            OSError: if a training file cannot be opened.
            IndexError: if a training file name does not start with a
                known language code.
        """
        assert len(self.train) > 0
        counts = [self._zero_counts() for _ in self.lang]
        for file in self.train:
            i = np.argwhere(file[0] == self.lang)[0][0] # checks the language of the file
            self._add_char_counts(self.path+file, counts[i])
        n_symbols = len(self.ALPHABET)
        for i in range(len(self.lang)):
            tot = sum(counts[i].values()) + (n_symbols*self.smooth)
            for key, n in counts[i].items():
                theta = (n + self.smooth)/tot # smoothed, normalized character count
                self.alpha[i][key] = theta
                self.log_liklihood[i][key] = np.log10(theta)

    def get_count(self,filename):
        """Return the ``{character: count}`` dictionary of one document.

        Best effort: if the file cannot be opened or decoded, a message is
        printed and an all-zero dictionary is returned.
        """
        temp_alpha = self._zero_counts()
        try:
            self._add_char_counts(self.path+filename, temp_alpha)
        except (OSError, UnicodeError) as err:
            print("error reading file %s: %s" % (self.path+filename, err))
            # Do not hand back counts from a partially read file.
            return self._zero_counts()
        return temp_alpha

    def get_prob(self,counts):
        """Return the log10-likelihood of ``counts`` under each language.

        The class prior is not included in the returned values.

        Args:
            counts: ``{character: count}`` dictionary, e.g. from
                ``get_count``; every key must be in ``ALPHABET``.

        Returns:
            numpy array with one entry per language, aligned with ``lang``.
        """
        probs = np.zeros(len(self.lang))
        for l in range(len(self.lang)):
            for key, n in counts.items():
                # likelihood is prod(theta^count), so its log is
                # sum(count * log(theta))
                probs[l] += n*self.log_liklihood[l][key]
        return probs

In [5]:
## Problem 4.1

# Build the classifier with additive smoothing 0.5 and the default data path.
nb = Naive_Bayes(smooth=0.5)
# Files 0-9 of each language become the training set, 10-19 the test set.
nb.set_data(train_size=10)
# Estimate the smoothed class priors from the training file names ...
nb.get_prior()
# ... and print them on one line.
nb.print_prior()

P(Y=english) = 0.333333   P(Y=spanish) = 0.333333   P(Y=japanese) = 0.333333


In [6]:
## Problem 4.2 and 4.3

# Estimate the per-language character probabilities, then print one header
# per language.
nb.get_cond_prob()
for idx, code in enumerate(nb.lang):
    print("\n Probabilities for",str(code),'\n')
    # The per-character table is intentionally not printed, to reduce clutter.
    # To display it, iterate over nb.alpha[idx].items() and print each pair
    # with the format "%s: %f" %(key,round(val,5)).


 Probabilities for e 


 Probabilities for s 


 Probabilities for j 



In [7]:
## Problem 4.4

# Character counts of the first held-out English document; reused in 4.5.
temp_count = nb.get_count("e10.txt")
# The counts are intentionally not printed, to reduce clutter. To display
# them, iterate over temp_count.items() and print each pair with the format
# "%s:  %03d" %(key,val).

In [8]:
## Problem 4.5

# Score the counts from Problem 4.4 under every language and report the
# language with the largest log probability.
probs = nb.get_prob(temp_count)
best = np.argmax(probs)
print("Log Probabilities are:",probs)
print("Predicted language is %s" %(nb.lang[best]))

Log Probabilities are: [-3405.67889149 -3677.29386843 -3809.38498463]
Predicted language is e


In [12]:
## Problem 4.7

# Confusion matrix: rows are the predicted language, columns the true
# language, both in the order of nb.lang.
# The evaluation set is taken from nb.test (filled by nb.set_data) instead of
# a hard-coded index range, so it always matches the chosen train/test split.
n_lang = len(nb.lang)
conf = np.zeros((n_lang, n_lang))
for test_file in nb.test:
    col = np.argwhere(test_file[0] == nb.lang)[0][0]  # true language from the file name
    count = nb.get_count(test_file)
    probs = nb.get_prob(count)
    row = np.argmax(probs)
    conf[row,col] += 1
print(conf)

[[10.  0.  0.]
 [ 0. 10.  0.]
 [ 0.  0. 10.]]


In [22]:
## Problem 4.8

test_doc = "e10.txt"
scram_doc = "scram_" + test_doc

# Read the document and shuffle its characters. A locally seeded generator
# makes the scrambled file reproducible without touching the global RNG state.
with open(nb.path+test_doc) as file:
    text = list(file.read())
random.Random(0).shuffle(text)
scrambled = ''.join(text)

# Write the scrambled copy next to the original; `with` closes the file even
# if the write fails.
with open(nb.path+scram_doc,'w') as file_scram:
    file_scram.write(scrambled)

scram_count = nb.get_count(scram_doc)
scram_prob = nb.get_prob(scram_count)
print("Log Probabilities are:",scram_prob)

# The model only sees character counts, so shuffling/order does not matter:
# the scores must equal those of the unscrambled document (question 4.5).
assert np.allclose(scram_prob, nb.get_prob(nb.get_count(test_doc)))

Log Probabilities are: [-3405.67889149 -3677.29386843 -3809.38498463]
