In [1]:
from numba import njit, jit
import numpy as np
import csv
import re
from functools import reduce
from typing import List, Set, Dict
import math
from math import log
import time
import random

In [2]:
def read_csv(filename):
    dataframe = {}
    with open(filename, "r" , newline='') as csvfile:
        DictReader = csv.DictReader(csvfile)
        init = False
        for row in DictReader:
            for key,value in row.items():
                if not init:
                    dataframe[key] = [value]
                else:
                    dataframe[key].append(value)
            init = True
        
    return dataframe
    

In [3]:
training_base_df = read_csv("trg.csv")

In [4]:
training_base_df["class"][:5] # Sanity check for data to be loaded

['B', 'A', 'E', 'E', 'B']

In [5]:
class CrossValidation():
    """
    Cross Validation class. Custom-made for easier dataset management and testing. 
    
    Parameters: 
        dataframe(DataFrame/ Dictionary of list from read_csv() function): Data to be used in cross-validation. 
        num_split(Int): The number of split to be executed. 
    """
    def __init__(self, dataframe, num_split=10):
        self.df = dataframe
        self.split = num_split
        self.df_length = 0
        self.dfs = []
        for i in range(self.split):
            self.dfs.append(dict())
        self._shuffle_and_split()
        
    
    def _shuffle_and_split(self, random_state=42): 
        """
        Internal function. Called when CrossValidation is initialized. 
        Shuffle the given data and split them according to the number of split 
        
        Parameters:
            random_state(Int) : The random seed to be used to shuffle the data
        """
        self.df_length = len(self.df["abstract"])
        shuffle_index = list(range(self.df_length)) 
        random.Random(random_state).shuffle(shuffle_index)# Shuffle the indexes 
        for i in self.df.keys():
            shuffled_df = []
            for j in range(self.df_length):
                shuffled_df.append(self.df[i][j])
            split_chunks = np.array_split(shuffled_df, self.split)
            for j in range(len(split_chunks)):
                self.dfs[j][i] = np.asarray(split_chunks[j])
        
    def get_ith_cv(self,i):
        """
        Get the training set and test set k-th fold. 
        
        Parameters: 
            i(Int): Used to choose which fold to be used
        """
        i%=self.split
        train = dict()
        test = self.dfs[i]
        for j in self.dfs[0].keys():
            train[j] = []
        for x in range(self.split):
            if x == i:
                continue
            for j in self.dfs[x].keys():
                train[j].append(self.dfs[x][j])
        for j in train.keys():
            train[j] = np.concatenate(train[j])
        return train,test
    
    def train_and_validate(self): 
        """
        Start the k-fold validation test
        """
        
        accuracy = np.zeros(self.split, dtype=float)
        for i in range(self.split):
            print("Fold ",i+1)
            train_df, test_df = self.get_ith_cv(i)
            classifier = MultinomialNBC()
            classifier.add_ignored_words(stopwords)
            classifier.train(train_df)
            predictions = classifier.predict(test_df)
            Y_test = test_df["class"]
            accuracy[i] = get_accuracy(Y_test, predictions)
            print(accuracy[i])
            
        return accuracy
    
            
            
            


In [6]:
crossval_data = CrossValidation(training_base_df)

In [7]:
stopwords = []
with open("english_stopwords.txt", "r") as f:
    stopwords = [i.strip() for i in f.readlines()] # Read the file, strip newlines 

In [37]:
class MultinomialNBC():
    """
    Multinomial Naive Bayes Classifier that takes word counts from abstracts as its features. 
    """
    
    def __init__(self):
        self.class_mapping = dict()
        self.reverse_class_mapping = dict()
        self.word_mapping = dict()
        self.reverse_word_mapping = dict()
        self.word_counts = []
        self.probs = []
        self.words = []
        self.num_unique_words = 0
        self.alpha_i = 1
        self.ignored_words = set()
    
    def add_ignored_words(self, words): # 
        """
        Add words to be ignored by the classifier
        
        Parameters:
            words(List[String]): List of words to be ignored
            
        """
        for word in words:
            self.ignored_words.add(word)
    
    def train(self, train_data): 
        """
        Train the classifier from the given training data
        
        Parameters:
            train_data(DataFrame/ Dictionary of list from read_csv() function): Training data for the classifier
            
        """
        y = train_data["class"]
        x = train_data["abstract"]
        num_training_data = len(x)
        
        self.classes , self.classes_count = np.unique(y, return_counts=True)
        self.num_class = len(self.classes)
        print("number of classes found in train data:",self.num_class)
        print("classes: ", self.classes)
        for i in range(self.num_class):
            self.class_mapping[self.classes[i]]=i
            self.reverse_class_mapping[i] = self.classes[i]
        
        print("Finished preprocess classes: ", time.perf_counter())
        self._get_overall_word_count(x)
        print("Finished count in overall words: ",time.perf_counter())
        
        print("number of unique words=",self.num_unique_words)
        # print(self.word_counts)
        # self.word_counts_per_category = np.zeros([self.num_class, self.num_unique_words], dtype=int) 
        self.weighted_words= np.zeros([len(x), self.num_unique_words], dtype= float)
        self.words_occurence= np.zeros(self.num_unique_words, dtype= int)
        
        for i in range(num_training_data):
            words, counts = np.unique(self._get_sanitized_wordlist(x[i]), return_counts=True)
#             downweight_ratio = math.sqrt(sum(counts**2))
            
            for j in range(len(words)):
                if words[j] not in self.word_mapping:
                    continue
                word_index = self.word_mapping[words[j]] 
                self.weighted_words[i][word_index]= counts[j]
                self.words_occurence[word_index]+=1
#                 self.weighted_words[i][word_index] /= downweight_ratio

        print("Finished counting words per category at", time.perf_counter())

        self._downweight_common_words(num_training_data)
                            
        print("Finished downweighitng common words at", time.perf_counter())
        
        self.weighted_words_per_class = np.zeros([self.num_class, self.num_unique_words], dtype=float)
        self.weighted_words_class = np.zeros(self.num_class, dtype=float)
        
        for i in range(num_training_data):
            words, counts = np.unique(self._get_sanitized_wordlist(x[i]), return_counts=True)
            for j in range(len(words)):
                if words[j] not in self.word_mapping:
                    continue
                data_class_index = self.class_mapping[y[i]]
                word_index = self.word_mapping[words[j]] 
                self.weighted_words_per_class[data_class_index][word_index]+= self.weighted_words[i][word_index]
        for i in range(self.num_class):
            self.weighted_words_class[i] = sum(self.weighted_words_per_class[data_class_index])
            
                
        self.probs = np.zeros([self.num_class, self.num_unique_words+1], dtype=float)
        
        print("Finished calculating weighted words: ",time.perf_counter())
        
        
        for i in range(self.num_class):
            for j in range(self.num_unique_words):
                self.probs[i][j] = math.log(self.weighted_words_per_class[i][j] + 1) - \
                    math.log(self.weighted_words_class[i] + self.num_unique_words)
                
        print("Training finished at ", time.perf_counter())
        
    def _downweight_common_words(self, num_training_data:int):
        """
        Internal function. Feature enhancement to the weighted words to downweight common words. 
        Implementation based on the given tutorial slides
        
        Parameters: 
            num_training_data(Int): Length of training data to be processed
        """
        for i in range(num_training_data):
            for j in range(self.num_unique_words):
                self.weighted_words[i][j] *= log(num_training_data/ self.words_occurence[j])
    
    def _get_sanitized_wordlist(self, sentence):
        """
        Internal function. Get the sanitized list of words from the given sentence
        
        Parameters: 
            sentence(String): Sentence to be processed
        """
        wordlist = [x.strip().lower() for x in sentence.split(' ')] # split by space then make it lowercase
        for i in range(len(wordlist)):
            wordlist[i] = re.sub('[^a-z]+', '', wordlist[i]) # Remove everything that is not related to alphabet
        return wordlist
    
    def _construct_word_dictionary(self, sentence, word_dict= {}):
        wordlist = self._get_sanitized_wordlist(sentence)
        for word in wordlist:
            if word in self.ignored_words:
                continue
            if word in word_dict:
                word_dict[word] += 1
            else:
                word_dict[word] = 1
        return word_dict
        
    
    def _get_overall_word_count(self, data):
        """
        Internal function. Get the overall word count from the given data 
        
        Parameters: 
            data(List[String]): List of abstracts which word counts need to be analyzed
        """
        word_dict = dict()
        
        for abstract in data:
            word_dict = self._construct_word_dictionary(abstract, word_dict= word_dict)
#         print(word_dict)
            
        self.word_counts = np.zeros(len(word_dict), dtype =int)
        self.words = word_dict.keys()
        
        for key,value in word_dict.items():
            self.word_mapping[key] = self.num_unique_words
            self.reverse_word_mapping[self.num_unique_words] = key
            self.word_counts[self.num_unique_words] = value
            self.num_unique_words+=1
            
    
    def predict(self, test_data):
        """
        Predict the given test data. 
        Note: Classifier have to be trained first in order to return meaningful results! 
        
        Parameters:
            test_data(DataFrame/ Dictionary of list from read_csv() function): Test data to be predicted by the classifier
            
        """
        x = test_data["abstract"]
        prediction_probs = np.zeros([len(x), self.num_class], dtype=float)
        for i in range(len(x)):
            words, counts = np.unique(self._get_sanitized_wordlist(x[i]), return_counts=True)
            sigma_fi_fact = log(math.factorial(sum(counts)))
            pi_fi_fact = 0
            for k in range(len(words)): 
                pi_fi_fact+= log(math.factorial(counts[k]))
            
            for j in range(self.num_class):
                prediction_probs[i][j] = sigma_fi_fact - pi_fi_fact
                for k in range(len(words)):
                    if words[k] not in self.word_mapping:
                        continue
                    word_index = self.word_mapping[words[k]]
                    prediction_probs[i][j]+= self.probs[j][word_index] * counts[k]
                    
        predictions = [''] * len(x)
        for i in range(len(x)):
            prediction_i = np.argmax(prediction_probs[i], axis=0)
            predictions[i] = self.reverse_class_mapping[prediction_i]
        return predictions
 

In [38]:
train_df, test_df = crossval_data.get_ith_cv(1)
print(len(train_df["abstract"]), len(test_df["abstract"]))
print(train_df["id"][0], train_df["class"][0], train_df["abstract"][0][:100]) # Sanity check whether shuffle is performed successfuly and consistent 

3600 400
1 B the 4 202 353 bp genome of the alkaliphilic bacterium bacillus halodurans c-125 contains 4066 predic


In [39]:
classifier = MultinomialNBC()
classifier.add_ignored_words(stopwords)
print(len(classifier.ignored_words))

127


In [40]:
probs = classifier.train(train_df)

number of classes found in train data: 4
classes:  ['A' 'B' 'E' 'V']
Finished preprocess classes:  1159.5957824
Finished count in overall words:  1160.4560121
number of unique words= 23510
Finished counting words per category at 1162.0920339
Finished downweighitng common words at 1243.4408652
Finished calculating weighted words:  1245.1020893
Training finished at  1245.2496632


In [41]:
predictions = classifier.predict(test_df)
Y_test = test_df["class"]

In [42]:
def get_accuracy(test, predictions):
    correct = 0 
    false = 0 
    for i in range(len(test)):
        if test[i]==predictions[i]:
            correct+=1
        else:
            false+=1
    return correct/(correct+false)

In [43]:
get_accuracy(Y_test, predictions)

0.855

In [44]:
overall_accuracy = crossval_data.train_and_validate()

Fold  1
number of classes found in train data: 4
classes:  ['A' 'B' 'E' 'V']
Finished preprocess classes:  1283.9361662
Finished count in overall words:  1284.8006219
number of unique words= 23529
Finished counting words per category at 1286.4248422
Finished downweighitng common words at 1367.9061821
Finished calculating weighted words:  1369.5611215
Training finished at  1369.712379
0.8875
Fold  2
number of classes found in train data: 4
classes:  ['A' 'B' 'E' 'V']
Finished preprocess classes:  1370.1640729
Finished count in overall words:  1371.0326773
number of unique words= 23510
Finished counting words per category at 1372.6425347
Finished downweighitng common words at 1454.2144242
Finished calculating weighted words:  1455.8921428
Training finished at  1456.0408636
0.855
Fold  3
number of classes found in train data: 4
classes:  ['A' 'B' 'E' 'V']
Finished preprocess classes:  1456.4927917
Finished count in overall words:  1457.3636551
number of unique words= 23534
Finished counti

In [45]:
overall_accuracy

array([0.8875, 0.855 , 0.91  , 0.86  , 0.87  , 0.91  , 0.885 , 0.8875,
       0.9025, 0.8875])

In [46]:
sum(overall_accuracy)/len(overall_accuracy)

0.8854999999999998

# Get the final model

In [18]:
classifier = MultinomialNBC()
classifier.train(training_base_df)

number of classes found in train data: 4
classes:  ['A' 'B' 'E' 'V']
Finished preprocess classes:  55.1163301
Finished count in overall words:  56.0946357
number of unique words= 25069
Finished counting words per category at 58.012002
Finished downweighitng common words at 58.012507
Finished calculating weighted words:  59.8978714
Training finished at  60.0563226


array([[ -5.159973  ,  -5.79223181,  -8.73456331, ..., -12.64658631,
        -12.64658631,   0.        ],
       [ -2.84038088,  -3.81443641,  -6.11425202, ..., -12.64658631,
        -12.64658631,   0.        ],
       [ -2.52818901,  -3.52814196,  -6.48960733, ..., -11.95343913,
        -11.95343913,   0.        ],
       [ -5.15885255,  -6.02784733,  -9.65085404, ..., -12.64658631,
        -12.64658631,   0.        ]])

In [19]:
# Sanity check 
predictions = classifier.predict(test_df)
Y_test = test_df["class"]
get_accuracy(Y_test, predictions)

0.85

In [21]:
test_base_df = read_csv("tst.csv")

In [22]:
test_base_predictions = classifier.predict(test_base_df)
test_base_predictions

['B',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'B',
 'E',
 'E',
 'E',
 'E',
 'B',
 'B',
 'E',
 'E',
 'E',
 'E',
 'E',
 'B',
 'B',
 'E',
 'B',
 'B',
 'E',
 'E',
 'E',
 'E',
 'B',
 'E',
 'E',
 'B',
 'E',
 'E',
 'B',
 'E',
 'E',
 'E',
 'B',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'B',
 'B',
 'E',
 'E',
 'E',
 'E',
 'B',
 'B',
 'E',
 'E',
 'B',
 'B',
 'E',
 'E',
 'E',
 'E',
 'B',
 'E',
 'B',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'B',
 'B',
 'E',
 'E',
 'B',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'B',
 'E',
 'E',
 'E',
 'B',
 'B',
 'E',
 'E',
 'E',
 'E',
 'E',
 'B',
 'E',
 'B',
 'E',
 'E',
 'E',
 'B',
 'E',
 'E',
 'E',
 'B',
 'B',
 'E',
 'E',
 'B',
 'B',
 'E',
 'B',
 'B',
 'E',
 'E',
 'B',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'B',
 'E',
 'E',
 'E',
 'E',
 'E',
 'B',
 'B',
 'E',
 'B',
 'B',
 'E',
 'B',
 'E',
 'E',
 'B',
 'E',
 'E',
 'E',
 'B',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'B',
 'B',
 'B',
 'E',
 'E',
 'E',
 'E',
 'B',
 'B',
 'E',
 'E',
 'E'

In [33]:
filename = 'answers3.1.csv'

with open(filename, 'w+', newline='') as csvfile:
    fieldnames = ['id', 'class']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(len(test_base_predictions)):
        writer.writerow({
            'id': test_base_df['id'][i],
            'class': test_base_predictions[i]
        })
    print("Answers successfully writen to",filename)

Answers successfully writen to answers3.csv


In [36]:
check_answer = read_csv(filename)
print(check_answer["id"][:10], check_answer["class"][:10])

['1', '2', '3', '4', '5'] ['B', 'E', 'E', 'E', 'E']
