In [1]:
# Third-party imports used by this notebook
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
# NOTE: a trailing `from sklearn.feature_selection import` with no imported names was
# removed here; it was a syntax error that prevented this cell from running.

In [2]:
# Path of the Excel workbook, relative to this notebook
file = '../../data/Eclipse_4sourcev1.xls'
# Open the workbook in a context manager so its file handle is released
# as soon as the sheet has been read
with pd.ExcelFile(file) as xl:
    # Load the sheet named 'total' into the DataFrame `df`
    df = xl.parse('total')

In [3]:
# Report how many rows were loaded from the sheet
print('Total: ', df.shape[0])

Total:  7373


In [4]:
class NaiveBayesClassifer:
    """Naive Bayes text classifier over the whitespace-split ``description`` column.

    Training builds a gensim ``Dictionary`` from the training descriptions,
    estimates add-one-smoothed per-class word distributions, and keeps only the
    vocabulary entries chosen by a chi-squared ``SelectKBest`` selection.
    Prediction picks the class with the highest summed log word probability
    over the selected words; the class prior is computed and stored but is not
    used for scoring.

    Parameters
    ----------
    num_classes : int, default 5
        Number of classes; labels are expected to be the integers
        ``0 .. num_classes - 1``.
    num_selected_features : int, default 10
        ``k`` passed to ``SelectKBest``. 10 is scikit-learn's own default, which
        the original ``SelectKBest(chi2)`` call relied on. The output recorded
        in this notebook reports 100 selected features, so that run evidently
        used ``k=100``; pass ``num_selected_features=100`` to reproduce it.
    """

    def __init__(self, num_classes=5, num_selected_features=10):
        self.name = 'Naive Bayes Classifier'
        self.num_classes = num_classes
        self.num_selected_features = num_selected_features

    def train(self, X_train, y_train, number_of_unique_words=100000):
        """Fit the vocabulary, word distributions and feature selection.

        Every call rebuilds all fitted state from scratch, so one instance can
        be retrained repeatedly.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Must contain a ``description`` column of strings.
        y_train : iterable of int
            One label per row of ``X_train``, each in ``0 .. num_classes - 1``.
        number_of_unique_words : int
            Ignored; kept only so existing calls keep working. The vocabulary
            size is always taken from the fitted dictionary.
        """
        labels = list(y_train)
        num_classes = self.num_classes

        # Relative class frequencies. Stored for inspection only: predict()
        # does not use them.
        self.prior = np.zeros(num_classes)
        for label in range(num_classes):
            self.prior[label] = labels.count(label) / len(labels)

        # Tokenise once and register every document with the dictionary in a
        # single call instead of one call per document.
        tokenized = [text.split() for text in X_train.loc[:, 'description']]
        self.dictionary = Dictionary()
        self.dictionary.add_documents(tokenized)

        self.number_of_unique_words = len(self.dictionary)
        print('Number of unique words:', self.number_of_unique_words)

        # Both count tables start at one (add-one smoothing).
        # NOTE(review): the per-document matrix fed to chi2 therefore also
        # carries the +1 offset, which shifts the chi-squared statistics; kept
        # as in the original so the selected features do not change.
        self.summary_feature = np.ones((len(X_train), self.number_of_unique_words))
        self.summary_word_count = np.ones((num_classes, self.number_of_unique_words))

        for row, (tokens, label) in enumerate(zip(tokenized, labels)):
            for token_id in self.dictionary.doc2idx(tokens):
                self.summary_feature[row, token_id] += 1
                self.summary_word_count[label, token_id] += 1

        # Turn each class's counts into a probability distribution over words.
        for label in range(num_classes):
            self.summary_word_count[label] /= np.sum(self.summary_word_count[label])

        # Only the support mask is needed, so fit without materialising the
        # reduced matrix.
        selector = SelectKBest(chi2, k=self.num_selected_features)
        selector.fit(self.summary_feature, np.array(labels))
        self.important_features = np.where(selector.get_support())[0]

        print('Total number of features:', self.number_of_unique_words)
        print('Number of selected features:', len(self.important_features))

    def predict(self, X_test):
        """Predict a class index for every row of ``X_test``.

        Each class is scored by the sum of ``log P(word | class)`` over the
        document's tokens that are both in the training vocabulary and among
        the selected features; the highest-scoring class wins and ties go to
        the lowest class index (so a document with no usable token is assigned
        class 0).

        The original implementation ranked classes by
        ``-(s / (sum(s) - s))`` with ``s = 1 + log-likelihood``. That ranking
        equals the plain log-likelihood ranking only while every ``s`` is
        negative; with positive scores it inverts the ranking, and it divides
        by zero when the other classes' scores cancel. The log-likelihood is
        now compared directly.

        Parameters
        ----------
        X_test : pandas.DataFrame
            Must contain a ``description`` column of strings.

        Returns
        -------
        list of int
            One predicted class index per row, in row order.
        """
        word_probs = np.asarray(self.summary_word_count)
        num_classes = word_probs.shape[0]

        # Log-probability table computed once; zero probabilities contribute
        # nothing, as in the original guard.
        with np.errstate(divide='ignore'):
            log_probs = np.where(word_probs != 0, np.log(word_probs), 0.0)

        # Set membership replaces a linear scan of the ndarray per token.
        selected = set(np.asarray(self.important_features).tolist())

        predictions = []
        fallback_count = 0
        for text in X_test.loc[:, 'description']:
            # doc2idx yields -1 for words unseen during training.
            token_ids = np.asarray(
                [t for t in self.dictionary.doc2idx(text.split())
                 if t != -1 and t in selected],
                dtype=np.intp,
            )
            scores = log_probs[:, token_ids].sum(axis=1)

            usable = np.isfinite(scores)
            if usable.any():
                predictions.append(int(np.argmax(np.where(usable, scores, -np.inf))))
            else:
                # No class has a usable score: hand classes out round-robin.
                fallback_count += 1
                predictions.append(fallback_count % num_classes)

        print('Number of random classifications: ', fallback_count)
        return predictions

In [5]:

# Build the target vector and the feature table from the loaded sheet.
# Targets: the 'label' column shifted down by one.
y = df['label'] - 1
# Features: every column from 'summary' through 'component' (label-based slice,
# both ends included).
X = df.loc[:, 'summary':'component']

In [6]:
# Single classifier instance; train() reassigns its prior, dictionary and count
# tables on every call, so the same object can be retrained for each split.
naiveBayesClassifier = NaiveBayesClassifer()

In [7]:
# Growing-window evaluation: for i = 1..10 train on roughly the first i/11 of the
# rows and test on the remainder, recording the per-class F-measure of each run.
f1_scores = np.zeros((10, 5))
f1_scores_list = []
for i in range(1,11):
    # Integer number of training rows. The original used the float
    # (len(X)/11)*i as a label in inclusive .loc slices, which puts the boundary
    # row in BOTH partitions whenever the split lands on a whole number.
    # Positional slices are disjoint by construction; the "+ 1" keeps row
    # floor(len(X)*i/11) in the training set, exactly as the inclusive label
    # slice did, so partition sizes match the recorded runs.
    split_point = (len(X) * i) // 11 + 1
    naiveBayesClassifier.train(X_train = X.iloc[:split_point], y_train = y.iloc[:split_point])
    # `y_test` holds the PREDICTED labels, `y_label` the true ones.
    y_test = naiveBayesClassifier.predict(X_test = X.iloc[split_point:])
    y_label = list(y.iloc[split_point:])

    # Per-class confusion counts.
    true_positive = np.zeros(5)
    false_positive = np.zeros(5)
    false_negative = np.zeros(5)

    for predicted, actual in zip(y_test, y_label):
        if predicted == actual:
            true_positive[predicted] += 1
        else:
            false_positive[predicted] += 1
            false_negative[actual] += 1

    # Precision, recall and F-measure per class. An empty denominator yields 0
    # instead of a NaN (and a divide warning) that has to be patched afterwards.
    precision = np.zeros(5)
    recall = np.zeros(5)
    for j in range(5):
        predicted_total = true_positive[j] + false_positive[j]
        actual_total = true_positive[j] + false_negative[j]
        if predicted_total > 0:
            precision[j] = true_positive[j] / predicted_total
        if actual_total > 0:
            recall[j] = true_positive[j] / actual_total

        if precision[j] + recall[j] > 0:
            f1_scores[i-1][j] = (2 * (precision[j] * recall[j])) / (precision[j] + recall[j])
        else:
            f1_scores[i-1][j] = 0

    # Despite the heading, the two columns are predicted vs. actual counts per class.
    print('Count: Train vs Test')
    for j in range(5):
        print(y_test.count(j), y_label.count(j))

    print('F-measure after ', i, 'th iteration: ', f1_scores[i-1])
    # NOTE: this appends a reference to the same (10, 5) array every iteration,
    # so every list element aliases the full score matrix. Kept as is because
    # the summary cell reads `f1_scores_list[0]` as that whole matrix.
    f1_scores_list.append(f1_scores)
    

Number of unique words: 4584
Total number of features: 4584
Number of selected features: 100
Number of random classifications:  0
Count: Train vs Test
743 698
3925 1191
1246 2756
749 1268
39 789
F-measure after  1 th iteration:  [0.09576683 0.30531665 0.26286857 0.22211205 0.02415459]
Number of unique words: 7196
Total number of features: 7196
Number of selected features: 100
Number of random classifications:  0
Count: Train vs Test
799 662
3058 1014
1312 2500
863 1133
0 723
F-measure after  2 th iteration:  [0.11498973 0.26866405 0.3069255  0.250501   0.        ]
Number of unique words: 10021




Total number of features: 10021
Number of selected features: 100
Number of random classifications:  0
Count: Train vs Test
762 627
2416 880
1481 2227
696 1006
7 622
F-measure after  3 th iteration:  [0.11951044 0.27002427 0.35652643 0.2479436  0.00953895]
Number of unique words: 12022
Total number of features: 12022
Number of selected features: 100
Number of random classifications:  0
Count: Train vs Test
1585 595
1728 773
828 1910
510 858
40 555
F-measure after  4 th iteration:  [0.20091743 0.2614954  0.25346969 0.21052632 0.02352941]
Number of unique words: 14122
Total number of features: 14122
Number of selected features: 100
Number of random classifications:  0
Count: Train vs Test
1231 553
1552 644
740 1618
440 730
58 476
F-measure after  5 th iteration:  [0.20852018 0.26411658 0.2663274  0.22564103 0.05992509]
Number of unique words: 16727
Total number of features: 16727
Number of selected features: 100
Number of random classifications:  0
Count: Train vs Test
1040 498
1142 533
7

In [8]:
# Average each class's F-measure over all evaluation runs; element 0 of the
# list is the full runs-by-classes score matrix.
np.asarray(f1_scores_list[0]).mean(axis=0)

array([0.18416859, 0.27159234, 0.29040998, 0.21478647, 0.03191554])

In [None]:
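# Results recorded for earlier variants of the classifier (presumably the per-class
# mean F-measure produced by the summary cell above, one value per class; TODO confirm):
# round robin classification:           [0.16300248, 0.17159368, 0.26298052, 0.1744121 , 0.12709789]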
# without prior without smoothing:      [0.21465332, 0.10047268, 0.05816234, 0.08378671, 0.09948181]
# with prior without smoothing:         [0.17769152, 0.12833085, 0.14291655, 0.10463765, 0.0911579 ]
# without prior with smoothing:         [0.21558835, 0.23368606, 0.42980058, 0.31752347, 0.2647105 ]
# with prior with smoothing:            [0.0466586 , 0.21333919, 0.50649528, 0.30572038, 0.20783305]
# with likelihood ratio with smoothing: [0.21558835, 0.23368606, 0.42980058, 0.31752347, 0.2647105 ]

# chi squared 100 features:             [0.26700822, 0.19633933, 0.31755486, 0.30805486, 0.24794675]