# Assignment 1, Task 3

In [1]:
#To import relevant libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import *
from collections import defaultdict


In [2]:
#Pre-process a string so that all text is standardised before it is tokenised
def preprocess_string(str_arg):
    """Normalise raw text before it is split into tokens.

    Every run of characters that is neither a letter a-z (matched
    case-insensitively) nor whitespace -- digits, commas, punctuation -- is
    replaced by a single space, runs of whitespace are then collapsed to one
    space, and the result is lower-cased. Leading/trailing spaces are not
    stripped.

    Parameters
    ----------
    str_arg : str
        The text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Raw string literals are required here: '\s' inside a normal string
    # literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
    cleaned_str = re.sub(r'[^a-z\s]+', ' ', str_arg, flags=re.IGNORECASE)
    cleaned_str = re.sub(r'\s+', ' ', cleaned_str)
    return cleaned_str.lower()



In [3]:
#Naive Bayes
class NaiveBayes:
    """Multinomial Naive Bayes text classifier built on per-class word counts.

    Usage: construct with the class labels, call ``train`` with the raw
    examples and their labels, then call ``test`` with raw examples to get one
    predicted label per example. Text is cleaned with ``preprocess_string``
    and tokenised on whitespace.
    """

    def __init__(self, unique_classes):
        """Store the class labels the model can predict.

        Parameters
        ----------
        unique_classes : array-like
            The distinct class labels (e.g. ``np.unique(y_train)``).
        """
        # np.asarray keeps an ndarray as-is and also accepts a plain list/tuple,
        # so the ``.shape`` look-ups used later always work.
        self.classes = np.asarray(unique_classes)

    def addToBow(self, example, dict_index):
        """Add the tokens of one cleaned example to a class's word counts.

        Parameters
        ----------
        example : str or np.ndarray
            The cleaned text; if an array is given its first element is used.
        dict_index : int
            Index (into ``self.classes``) of the class whose bag of words is
            updated.
        """
        if isinstance(example, np.ndarray):
            example = example[0]
        for token_word in example.split():
            self.bow_dicts[dict_index][token_word] += 1

    def train(self, dataset, labels):
        """Count words per class and pre-compute priors and denominators.

        Sets ``examples``, ``labels``, ``bow_dicts`` (one word->count dict per
        class), ``vocab``, ``vocab_length`` and ``cats_info`` (one row
        ``(bow_dict, prior, denominator)`` per class).

        Parameters
        ----------
        dataset : array-like of str
            The raw training examples.
        labels : array-like
            The class label of each training example.

        Raises
        ------
        ValueError
            If no training examples are given.
        """
        self.examples = np.asarray(dataset)
        self.labels = np.asarray(labels)
        n_examples = self.labels.shape[0]
        if n_examples == 0:
            raise ValueError("Cannot train NaiveBayes on an empty dataset")

        n_classes = self.classes.shape[0]
        self.bow_dicts = np.empty(n_classes, dtype=object)
        for cat_index in range(n_classes):
            self.bow_dicts[cat_index] = defaultdict(int)

        # Plain loop instead of np.apply_along_axis: the call is made only for
        # its side effect, and apply_along_axis raises on a class that has no
        # training examples.
        for cat_index, cat in enumerate(self.classes):
            for cat_example in self.examples[self.labels == cat]:
                self.addToBow(preprocess_string(cat_example), cat_index)

        # Prior of each class = share of the training examples with that label
        prob_classes = np.array(
            [np.sum(self.labels == cat) / n_examples for cat in self.classes]
        )

        # Total number of tokens seen for each class
        cat_word_counts = np.array(
            [sum(bow.values()) for bow in self.bow_dicts], dtype=float
        )

        # Unique, sorted vocabulary over all classes
        all_words = set()
        for bow in self.bow_dicts:
            all_words.update(bow)
        self.vocab = np.array(sorted(all_words), dtype=str)
        self.vocab_length = self.vocab.shape[0]

        # Add-one (Laplace) smoothing denominator: tokens in the class + |V| + 1.
        # The single extra 1 is the slot for a token never seen in training; it
        # must be counted exactly once for the smoothed probabilities to sum to 1.
        denoms = cat_word_counts + self.vocab_length + 1

        # One row per class: (word counts, prior, denominator)
        self.cats_info = np.empty((n_classes, 3), dtype=object)
        for cat_index in range(n_classes):
            self.cats_info[cat_index, 0] = self.bow_dicts[cat_index]
            self.cats_info[cat_index, 1] = prob_classes[cat_index]
            self.cats_info[cat_index, 2] = denoms[cat_index]

    def getExampleProb(self, test_example):
        """Return the unnormalised log posterior of every class for one example.

        Parameters
        ----------
        test_example : str
            An already cleaned example; it is tokenised on whitespace.

        Returns
        -------
        np.ndarray
            One value per class: log prior + sum of the smoothed log
            likelihoods of the tokens. Logs are used to prevent underflow.
        """
        n_classes = self.classes.shape[0]
        post_prob = np.zeros(n_classes)
        tokens = test_example.split()

        # A class without training examples has prior 0; log(0) = -inf is the
        # intended score (it can never win), so silence the numpy warning.
        with np.errstate(divide="ignore"):
            for cat_index in range(n_classes):
                bow, prior, denom = self.cats_info[cat_index]
                log_likelihood = 0.0
                for test_token in tokens:
                    # .get() so that looking up a token does not insert it
                    log_likelihood += np.log((bow.get(test_token, 0) + 1) / denom)
                post_prob[cat_index] = log_likelihood + np.log(prior)

        return post_prob

    def test(self, test_set):
        """Predict the most probable class for each raw example.

        Parameters
        ----------
        test_set : iterable of str
            The raw examples to classify.

        Returns
        -------
        np.ndarray
            The predicted class label of each example, in input order.
        """
        predictions = []
        for example in test_set:
            post_prob = self.getExampleProb(preprocess_string(example))
            predictions.append(self.classes[np.argmax(post_prob)])
        return np.array(predictions)

In [4]:
#Load the word lists and their class labels (space-separated files without a header row)
wordsList = pd.read_csv('wordsList', sep=" ", header=None)
classList = pd.read_csv('classList', sep=" ", header=None)

#Name the single column of each frame, then attach the labels to the words
wordsList.columns = ['wordList']
classList.columns = ['classList']
wordsList['classList'] = classList['classList']
print(len(wordsList))

#To show the first 5 rows of wordsList
wordsList.head()

72


Unnamed: 0,wordList,classList
0,"codeine,15mg,for,203,visa,only,codeine,methylm...",1
1,"peter,with,jose,out,town,you,want,meet,once,wh...",0
2,"hydrocodone,vicodin,brand,watson,vicodin,750,1...",1
3,"yay,you,both,doing,fine,working,mba,design,str...",0
4,"you,have,everything,gain,incredib1e,gains,leng...",1


In [5]:
#Stratified sampling as requested from the question:
#stratify=y keeps the proportion of each class the same in the training and test sets,
#and the fixed seed makes the split (and every result derived from it) reproducible.
RANDOM_SEED = 42
X = wordsList.wordList
y = wordsList.classList
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.08, stratify=y, random_state=RANDOM_SEED)
print("Training no of rows X: "+str(len(X_train)))
print("Testing no of rows X: "+str(len(X_test)))
print("Training no of rows y: "+str(len(y_train)))
print("Testing no of rows y: "+str(len(y_test)))

Training no of rows X: 66
Testing no of rows X: 6
Training no of rows y: 66
Testing no of rows y: 6


In [6]:
#Fit the Naive Bayes classifier defined earlier on the training split;
#the classes are the distinct labels present in y_train
nb = NaiveBayes(unique_classes=np.unique(y_train))
nb.train(dataset=X_train, labels=y_train)

In [7]:
#Predict a class for every example in the test set
pclasses = nb.test(X_test)

#Accuracy = share of test examples whose predicted class equals the true class
n_test = y_test.shape[0]
test_acc = np.sum(pclasses == y_test) / float(n_test)
print ("Total number of Test Set examples: ", n_test)
print ("Accurarcy for Test Set: ", test_acc*100, "%")

Total number of Test Set examples:  6
Accurarcy for Test Set:  100.0 %
