In [10]:
# Import pandas
import pandas as pd
import numpy as np
from gensim.corpora import Dictionary
from sklearn.model_selection import train_test_split

In [11]:
# Assign spreadsheet filename to `file`
file = 'Eclipse_4sourcev1.xls'
# Load spreadsheet
xl = pd.ExcelFile(file)
# Load a sheet into a DataFrame by name: df1
df = xl.parse('total')

In [12]:
print('Total: ', len(df))

Total:  7373


In [51]:
class NaiveBayesClassifer:
    def __init__(self):
        self.name = 'Naive Bayes Classifier'
    
    def train (self, X_train, y_train, number_of_unique_words = 100000):
        self.prior = np.zeros(5)
        labels = list(y_train)
        
        for i in range(5):
            self.prior[i] = labels.count(i)/len(labels)
        print(self.prior)
        self.mle_summary = np.zeros((number_of_unique_words, 5))
        self.wordCount_summary = np.zeros(5)
        
        summaries = list(X_train.loc[:,'summary'])

        self.dictionary = Dictionary()

        for summary in summaries:
            self.dictionary.add_documents([summary.split()])

        for i in range(len(summaries)):
            summaries[i] = self.dictionary.doc2idx(summaries[i].split())
        
        for i in range(len(summaries)):
            summary = summaries[i]
            for word in summary:
                self.mle_summary[word][labels[i]] += 1 
                self.wordCount_summary[labels[i]] += 1
        
        for i in range(number_of_unique_words):
            for j in range(5):
                self.mle_summary[i][j] += 1
                self.mle_summary[i][j] /= (self.wordCount_summary[j])
        
        self.mle_desc = np.zeros((number_of_unique_words, 5))
        self.wordCount_desc = np.zeros(5)
        
        descriptions = list(X_train.loc[:,'description'])
    
        for description in descriptions:
            self.dictionary.add_documents([description.split()])

        for i in range(len(descriptions)):
            descriptions[i] = self.dictionary.doc2idx(descriptions[i].split())
        
        
        for i in range(len(descriptions)):
            description = descriptions[i]
            for word in description:
                '''
                if (i==0):
                    print(word, self.wordCount_desc[word])
                '''
                self.mle_desc[word][labels[i]] += 1
                self.wordCount_desc[labels[i]] += 1
        
        for i in range(number_of_unique_words):
            for j in range(5):
                self.mle_desc[i][j] += 1
                self.mle_desc[i][j] /= self.wordCount_desc[labels[j]]
        
        pass
    
    def predict (self, X_test):
        y_test = np.ones((X_test.shape[0], 5))
        for i in range(X_test.shape[0]):
            for j in range(5):
                y_test[i][j] = (self.prior[j])
        
        summaries = list(X_test.loc[:,'summary'])
        for summary in summaries:
            self.dictionary.add_documents([summary.split()])
        for i in range(len(summaries)):
            summaries[i] = self.dictionary.doc2idx(summaries[i].split())
        
        descriptions = list(X_test.loc[:,'description'])
        for description in descriptions:
            self.dictionary.add_documents([description.split()])
        for i in range(len(descriptions)):
            descriptions[i] = self.dictionary.doc2idx(descriptions[i].split())
        
        for i in range(X_test.shape[0]):
            for j in range(5):
                log_sum_summary = 0
                log_sum_desc = 0
                for word in summaries[i]:
                    if word != -1 and self.mle_summary[word][j] != 0:
                        log_sum_summary += np.log(self.mle_summary[word][j])
                for word in descriptions[i]:
                    if word != -1 and self.mle_desc[word][j] != 0:
                        log_sum_desc += np.log(self.mle_desc[word][j])
                
                y_test[i][j] = np.log(y_test[i][j])
                y_test[i][j] *= ((log_sum_summary) )
                
                ''' + (log_sum_desc) '''
        #print(y_test)
        ret_val = np.zeros(len(y_test))
        
        cnt = 0
        for i in range(X_test.shape[0]):
            mx = 0
            mxindex = -1
            for j in range(5):
                if y_test[i][j] != 0:
                    if mxindex == -1 or y_test[i][j] > mx:
                        mxindex = j
                        mx = y_test[i][j]
            if mxindex == -1:
                cnt += 1
                mxindex = cnt % 5
            ret_val[i] = mxindex
        ret_val =  [int(round(x)) for x in ret_val] 
        print('Number of random classifications: ',cnt)
        return list(ret_val)

In [26]:

# create training and testing vars
y = df.loc[:,'label'] - 1
X = df.loc[:, 'summary':'component']

In [52]:
naiveBayesClassifier = NaiveBayesClassifer()

In [16]:
naiveBayesClassifier.train(X_train = X.loc[0:(len(X)/11-1), ], y_train = y.loc[0:(len(X)/11-1), ])

[0.04776119 0.36865672 0.3358209  0.17462687 0.07313433]


In [17]:
y_test = naiveBayesClassifier.predict(X_test = X.loc[(len(X)/11-1):, ])

Number of random classifications:  0


In [18]:
y_label = list(y.loc[(len(X)/11-1):]) 
# print(y_label)

In [19]:
print(y_test.count(0), y_label.count(0))
print(y_test.count(1), y_label.count(1))
print(y_test.count(2), y_label.count(2))
print(y_test.count(3), y_label.count(3))
print(y_test.count(4), y_label.count(4))

6256 698
44 1191
71 2756
51 1269
281 789


In [None]:
f1_scores = np.zeros((10, 5))
for i in range(1,11):
    split_point = (len(X)/11) * i
    naiveBayesClassifier.train(X_train = X.loc[0:split_point,], y_train = y.loc[0:split_point, ])
    y_test = naiveBayesClassifier.predict(X_test = X.loc[split_point:, ])
    
    
    y_label = list(y.loc[split_point:, ])
    
    #print(y_test, y_label)
    #print((y_test == y_label).count(True) / len(y_test))
    
    
    true_positive = np.zeros(5)
    
    false_positive = np.zeros(5)
    false_negative = np.zeros(5)
    
    for j in range(len(y_test)):
        if (y_test[j] == y_label[j]):
            true_positive[y_test[j]] += 1
        else:
            false_positive[y_test[j]] += 1
            false_negative[y_label[j]] += 1
    
    precision = np.zeros(5)
    recall = np.zeros(5)
    for j in range(5):
        precision[j] = true_positive[j] / (true_positive[j] + false_positive[j])
        recall[j] = true_positive[j] / (true_positive[j] + false_negative[j])
    
    for j in range(5):
        f1_scores[i-1][j] = (2 * (precision[j] * recall[j])) / (precision[j] + recall[j]) 
    print('Count: Train vs Test')
    print(y_test.count(0), y_label.count(0))
    print(y_test.count(1), y_label.count(1))
    print(y_test.count(2), y_label.count(2))
    print(y_test.count(3), y_label.count(3))
    print(y_test.count(4), y_label.count(4))
    
    print('F-measure after ', i, 'th iteration: ', f1_scores[i-1])
    pass
    

[0.04769001 0.3681073  0.33532042 0.17585693 0.07302534]


In [None]:
print(accuracies)

In [213]:
a= [1, 0, 1, 0, 1]
b = [1, 1, 0, 0, 0]
a==b

False

In [259]:
print(y_test)

[0.0, 0.0, 0.0, 4.0, 4.0, 4.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 1.0, 3.0, 0.0, 4.0, 3.0, 0.0, 4.0, 0.0, 4.0, 0.0, 4.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, -1.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 4.0, -1.0, 1.0, 0.0, 0.0, 4.0, 0.0, 4.0, 0.0, 0.0, 4.0, 4.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 1.0, 0.0, 4.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 4.0, 1.0, 3.0, 0.0, 4.0, 4.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0, 4.0, 0.0, 0.0, 4.0, 0.0, 4.0, 0.0, 0.0, 4.0, 0.0, 0.0, 4.0, 4.0, 0.0, 0.0, 2.0, 4.0, 4.0, 0.0, 0.0, 0.0, 4.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, -1.0, 0.0, 3.0, 0.0, 4.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 4.0, 4.0, 0.0, 4.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0,