In [12]:
import pandas as pd
import csv
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [13]:
def load_file(fileName):
    dataset = pd.read_table(fileName, header=0, sep=",", encoding="unicode_escape")
    
    return dataset

In [14]:
# preprocess creates the term frequency matrix for the review data set
def preprocess(data):
    count_vectorizer = CountVectorizer()
    data = count_vectorizer.fit_transform(data)
    #tfidf_data = TfidfTransformer(use_idf=False).fit_transform(data)
    return data

In [15]:
class NaiveClassifier:
     def __init__(self):
            self.target_labels = None
            self.target_labels_counts = 0
            self.prob_dict = dict()
            self.prob_mat = None
            
    
     def Naive_learn_model(self,data,target):
  
        self.target_labels = target.value_counts().index    #what categories are there
        self.target_labels_counts = target.value_counts().array   #how many categories
        vector_Y = target.array
        total_Y = vector_Y.shape[0]
        matrix_X = data.toarray()
        self.prob_mat = np.zeros((len(self.target_labels),matrix_X.shape[1],2))    #3d array to store conditional prob
        t_matrix_X = matrix_X.T

        for i in range(len(self.target_labels)):
            self.prob_dict[self.target_labels[i]]= ((self.target_labels_counts[i])+1)/((total_Y)+len(self.target_labels))   #probability of labels

        for i in range(len(self.target_labels)):
            row_number =0
            for row in t_matrix_X:
                prob1 =0
                prob0 = 0
                target_index = np.where(vector_Y == self.target_labels[i])[0]
                vals, counts = np.unique(row[target_index],return_counts = True)
                self.prob_mat[i,row_number,0] = (counts[0]+1)/((len(target_index))+2)   # adding 1 for laplacian
                self.prob_mat[i,row_number,1] = (np.sum(counts[1:])+1)/((len(target_index))+2)
                row_number+=1
    
     def Naive_classify(self,testdata):
            x_test = testdata.toarray()
            category_predicted = []
            prob_list = np.zeros(len(self.target_labels))
            
            for row in range(x_test.shape[0]):
                for k in range(len(self.target_labels_counts)):
                    prob_test = 1
                    for ind, i in enumerate(x_test[row]):
                        if i ==0:
                            prob_test = prob_test* self.prob_mat[k,ind,0]
                        else:
                            prob_test = prob_test* self.prob_mat[k,ind,1]
                    prob_list[k] = prob_test * self.prob_dict[self.target_labels[k]]
                    
                max_prob_index = np.where(prob_list == np.amax(prob_list))
                category_predicted.append(self.target_labels[max_prob_index[0][0]])
            return category_predicted

In [16]:
def learn_model(data,target):
    classifier = NaiveClassifier()
    classifier.Naive_learn_model(data,target)
    return classifier

In [17]:
def classify(classifier, testdata):
    
    predicted_val= classifier.Naive_classify(testdata)
    return predicted_val

In [22]:

def Confusion_mat(actual,predicted):
    target_labels = np.unique(actual).tolist()
    confusion_mat = np.zeros((len(target_labels),len(target_labels)))
    actual = np.asarray(actual)
    
    for i in range(len(actual)):
        actual_index = target_labels.index(actual[i])
        pred_index = target_labels.index(predicted[i])
        confusion_mat[actual_index,pred_index]+=1
    return confusion_mat

def individual_confusion_mat(actual,predicted):
    
    target_labels = np.unique(actual)
    confusion_mat = Confusion_mat(actual,predicted)
    individual_prob_mat =[]
    
    for i in range(len(target_labels)):
        individual_prob_mat_a = confusion_mat[i,i]
#         individual_prob_mat_b = np.sum(confusion_mat[i,:])- individual_prob_mat_a
#         individual_prob_mat_c = np.sum(confusion_mat[:,i])- individual_prob_mat_a
        individual_prob_mat_b = confusion_mat[i,:][confusion_mat[i,:]!= individual_prob_mat_a].sum()
        individual_prob_mat_c = confusion_mat[:,i][confusion_mat[:,i]!= individual_prob_mat_a].sum()
        individual_prob_mat_d = np.sum(confusion_mat) - individual_prob_mat_a - individual_prob_mat_b - individual_prob_mat_c
        individual_prob_mat.append((individual_prob_mat_a,individual_prob_mat_b,individual_prob_mat_c,individual_prob_mat_d))
    return individual_prob_mat
    
def precision_recall_fmeasure(actual,predicted):
    individual_prob_mat = individual_confusion_mat(actual,predicted)
    precision_list= []
    recall_list= []
    fmeasure_list =[]
    precision = 0
    recall = 0
    fmeasure = 0
    actual_np = np.asarray(actual)
    unique_elements, counts_elements = np.unique(actual_np, return_counts=True)
    
    for i in individual_prob_mat:
        if((i[0]+i[2])==0):
            precision_list.append(0)
        else:
            precision_list.append((i[0])/(i[0]+i[2]))
        
        if((i[0]+i[1]) == 0):
            recall_list.append(0)    
        else:
            recall_list.append((i[0])/(i[0]+i[1]))
            
        if((2*i[0]+i[1]+i[2]) == 0):
            fmeasure_list.append(0)
        else:
            fmeasure_list.append((2*i[0])/(2*i[0]+i[1]+i[2]))
    
    for i in range(len(precision_list)):
        precision+= (precision_list[i]*counts_elements[i])
    
    for i in range(len(recall_list)):
        recall+= recall_list[i]*counts_elements[i]
    
    for i in range(len(fmeasure_list)):
        fmeasure+= fmeasure_list[i]*counts_elements[i]
    
    return (precision/(actual_np.shape[0])),(recall/(actual_np.shape[0])),(fmeasure/(actual_np.shape[0]))
    

def evaluate(actual_class, predicted_class):
    p_r_f = precision_recall_fmeasure(actual_class,predicted_class)
    accuracy = accuracy_score(actual_class, predicted_class)
    
    print("The precision score is :",p_r_f[0])
    print("The recall score is :",p_r_f[1])
    print("The f_measure score is :",p_r_f[2])
    print("The accuracy score is :",accuracy)
    
    

In [19]:
features = ["SUMMARY", "categories", "sub_categories"]

print("Loading data.....")
dataset = load_file("TextClassification_Data.csv")
data,target = dataset[features[0]].fillna(" "), dataset[features[1]]

print("preprocessing data.....")
word_vectors = preprocess(data)
    
trainingX,testX,trainingY,testY = train_test_split(word_vectors,target,test_size=0.4,random_state=43)
#print(type(trainingY))
#print(trainingY.value_counts().array)

print("Learning model.....")
model = learn_model(trainingX,trainingY)


Loading data.....
preprocessing data.....
Learning model.....


In [20]:
print("Classifying test data......")      
predictedY = classify(model, testX)


Classifying test data......


In [23]:
print("Evaluating results.....")
evaluate(testY,predictedY)

Evaluating results.....
The precision score is : 0.7338093072406146
The recall score is : 0.7165687426556991
The f_measure score is : 0.7173412550457267
The accuracy score is : 0.7165687426556991
