In [1]:
"""
Using Language Models
1. Assumption for Naive Bayes : All words are independent
slide no 38.
2. Naive Bayes is a probabilistic classifier.
3. We compute the probability of document being in class C.
4. The goal of the naive bayes class is to find the best class.
5. Naive Bayes = probability of each class + prob of each token belonging to that class.
6. Formula = argmax(log(P(c) + log(P(t/c)))
7. Smoothing in NB
8. P(t/c) = T(no of particular token) + 1 / T(sum of all tokens in a class) + V(size of vocab)
9. Naive bayes is good for predicting the class and not for estimating probabilites.
10. Naive Bayes is robust to concept drift. (change of definition of class over time).
11. For text, the independence assumption does not hold for naive bayes, but for other domains it does hold.
12. Advantages:
    1. Very Fast
    2. Low storage requirement
"""

'\nUsing Language Models\n1. Assumption for Naive Bayes : All words are independent\nslide no 38.\n2. Naive Bayes is a probabilistic classifier.\n3. We compute the probability of document being in class C.\n4. The goal of the naive bayes class is to find the best class.\n5. Naive Bayes = probability of each class + prob of each token belonging to that class.\n'

In [174]:
import re
import numpy as np


# Evaluation
class evaluation:
    def __init__(self, y_actual, y_pred, labels):
        self.y_actual = np.array(y_actual)
        #print(self.y_actual)
        self.y_pred = np.array(y_pred)
        self.labels = labels
        #print(self.y_pred)

    def confusion_matrix(self, actual, pred):
        tp = fp = tn = fn = 0
        for i, j in zip(actual, pred):
            if i == 1:
                # positive
                if i == j:
                    tp += 1
                else:
                    fp += 1
            else:
                # negative
                if i == j:
                    tn += 1
                else:
                    fn += 1
        cf = pd.DataFrame([[tp, fp], [fn, tn]], columns=["actual_pos", "actual_neg"], index=["pred_pos", "pred_neg"])
        return cf, tp, fp, tn, fn

    def recall(self, actual, pred):
        cf, tp, fp, tn, fn = self.confusion_matrix(actual, pred)
        return round(tp / (tp+fn), 2)

    def precision(self, actual, pred):
        cf, tp, fp, tn, fn = self.confusion_matrix(actual, pred)
        return round(tp / (tp+fp), 2)

    def f1(self, actual, pred):
        # harmonic mean
        pr = self.precision(actual, pred)
        re = self.recall(actual, pred)
        f1 = 2 * ((pr * re) / (pr + re))
        return round(f1, 2)

    def main(self):
        #mapping = {1: "fear", 2: "anger", 3: "guilt", 4: "joy", 5: "shame", 6: "disgust", 7: "sadness"}
        res = {}
        for cls in self.labels:
            c = 0
            #print("Class : ", mapping[cls])
            mod_y_actual = []
            for i in self.y_actual:
                if i == cls:
                    c += 1
                    mod_y_actual.append(1)
                else:
                    mod_y_actual.append(0)

            mod_y_pred = []
            for i in self.y_pred:
                if i == cls:
                    mod_y_pred.append(1)
                else:
                    mod_y_pred.append(0)
            #print(mod_y_pred)
            """
            print()
            print()
            print("Confusion Matrix : \n", self.confusion_matrix(mod_y_actual, mod_y_pred)[0])
            print("*******************************************************\n")
            print("Precision : \n", self.precision(mod_y_actual, mod_y_pred))
            print("*******************************************************\n")
            print("Recall : \n", self.recall(mod_y_actual, mod_y_pred))
            print("*******************************************************\n")
            print("F1 Score : \n", self.f1(mod_y_actual, mod_y_pred))
            print()
            print()
        """
            temp = [self.precision(mod_y_actual, mod_y_pred), self.recall(mod_y_actual, mod_y_pred), self.f1(mod_y_actual, mod_y_pred), c]
            res[cls] = temp
        res = pd.DataFrame(res, index=["Precision", "Recall", "F1-Score", "Count"]).transpose()

        """
        Macro average calculation
        """
        avg_pr = np.sum((res["Precision"]*res["Count"]))/ np.sum(res["Count"])
        avg_re = np.sum((res["Recall"]*res["Count"]))/ np.sum(res["Count"])
        avg_f1 = np.sum((res["F1-Score"]*res["Count"]))/ np.sum(res["Count"])
        res.loc["macro_average"] = [round(avg_pr, 2), round(avg_re, 2), round(avg_f1, 2), np.sum(res["Count"])]
        #res = pd.concat([res, pd.DataFrame([avg_pr, avg_re, avg_f1, np.sum(res["Count"])]).transpose()])
        flattened_index = []
        for fl_lab in self.labels:
            flattened_index.append(fl_lab+"-precision")
            flattened_index.append(fl_lab+"-recall")
            flattened_index.append(fl_lab+"-f1score")
            flattened_index.append(fl_lab+"-count")
        flattened_index = flattened_index + ["macro-average-precision", "macro-average-recall", "macro-average-F1score", "macro-average-count"]
        res_flattened = pd.DataFrame(res.to_numpy().flatten(), index=flattened_index).transpose()
        return res_flattened, res


"""
"""


# Classifier
class NaiveBayesClassifier:
    def __init__(self):
        self.vocabSize = 0
        self.trainingSize = 0
        self.train_labels = set()
        self.vocab = []
        self.fear = {}
        self.anger = {}
        self.guilt = {}
        self.joy = {}
        self.shame = {}
        self.disgust = {}
        self.sadness = {}
        self.fearSize = 0
        self.angerSize = 0
        self.guiltSize = 0
        self.joySize = 0
        self.shameSize = 0
        self.disgustSize = 0
        self.sadnessSize = 0
        self.priorDict = {}
        self.stopwords = {'hers', 'below', "wouldn't", 'nor', 'for', 'over', "hasn't", 'at', 'shouldn', 'only', 'above', 'itself', 'yourselves', 'what', "don't", "it's", 'which', 'against', "that'll", 'has', 'i', 'his', 'having', 'then', "shan't", 'myself', 'do', 'yours', 'up', 'own', 'the', 'same', 'aren', 'few', 'through', 'here', 'whom', 'o', "aren't", 'were', 'are', 'both', "didn't", 'll', 'again', 'is', 're', "wasn't", "you'll", 'm', "haven't", 'such', 'off', 'of', 'it', 'did', 'into', 'to', 'other', 'was', 'just', 've', "mustn't", 'while', 'about', 'each', 'by', 'this', 'isn', 'ourselves', 'in', 'our', 'couldn', 'until', 'where', "couldn't", 'ain', "you'd", 'all', 'when', 'does', 'before', 'weren', 'y', 'doing', 'than', 'being', 'my', 'mightn', 'yourself', 'with', 'theirs', 'so', "needn't", 'a', "doesn't", "isn't", 'its', 'your', 'if', "should've", 'ma', 'can', 'herself', 'but', 'too', 'more', 'her', "hadn't", 'hadn', 'there', "you're", 'from', 'should', 'we', 'how', 'out', 'once', 'mustn', 'won', 'their', 'don', 'had', 'he', 'or', 'didn', 'd', 'down', 't', "she's", 'that', 'himself', 'wouldn', "you've", "mightn't", 'between', 'them', 'on', 'haven', 'after', 'themselves', 'because', 'and', 'you', 'very', 's', 'these', 'no', 'now', 'him', 'been', 'those', 'during', 'doesn', 'wasn', 'am', 'under', 'an', 'some', 'have', 'me', 'any', 'who', 'shan', 'why', 'will', "shouldn't", 'not', 'they', "won't", 'needn', 'further', 'most', 'be', 'ours', 'she', 'as', 'hasn', "weren't", "a", "''"}
        self.paraMapping = {"fear": [self.fear, self.fearSize],
                            "anger": [self.anger, self.angerSize],
                            "guilt": [self.guilt, self.guiltSize],
                            "joy": [self.joy, self.joySize],
                            "shame": [self.shame, self.shameSize],
                            "disgust": [self.disgust, self.disgustSize],
                            "sadness": [self.sadness, self.sadnessSize]
                            }
        self.mapping = {"fear": 1, "anger": 2, "guilt": 3, "joy": 4, "shame": 5, "disgust": 6, "sadness": 7}
        self.reverseMapping = {1: "fear", 2: "anger", 3: "guilt", 4: "joy", 5: "shame", 6: "disgust", 7: "sadness"}

    def getVocabulary(self, text):
        tokens = [re.sub("[^A-Za-z]", "",i).strip().lower() for i in text.split(" ") if i not in self.stopwords and len(i) > 1]
        return list(set(tokens))


    def updateProbabilityDict(self, emodf, resDict):
        text = " ".join(list(emodf["X"])).split(" ")
        for word in text:
            word = re.sub("[^A-Za-z]", "", word).strip().lower()
            if word not in self.stopwords and len(word) > 1:
                if word in resDict:
                    resDict[word] += 1
                else:
                    resDict[word] = 1
        size = sum([value for key, value in resDict.items()])
        return resDict, size


    def maximum_likelihood_estimation(self, instance, emoDict, emotionCorpusSize, emotion):
        """
        instanceDict = {}
        for word in instance.split(" "):
            word = re.sub("[^A-Za-z]", "", word).strip().lower()
            if word not in self.stopwords and len(word) > 1:
                if word in instanceDict:
                    instanceDict[word] += 1
                else:
                    instanceDict[word] = 1

        """

        p_tc = 0
        tokens = [re.sub("[^A-Za-z]", "",i).strip().lower() for i in instance.split(" ") if i not in self.stopwords and len(i) > 1]
        for word in tokens:
            if word in emoDict:
                p_tc += (emoDict[word] + 1) / (emotionCorpusSize + self.vocabSize)
            else:
                p_tc += 1 / (emotionCorpusSize + self.vocabSize)
        return np.log(self.priorDict[emotion]) + np.log(p_tc)


    def labelEncoding(self, temp):
        #print(temp["Y"].value_counts())
        mapped_emotions = temp["Y"].map(self.mapping)
        temp["Y"] = mapped_emotions
        temp = temp.dropna().reset_index(drop=True)
        return temp

    def cleanData(self, input_df):
        temp = input_df[input_df["Y"].isin(["fear", "anger", "guilt", "joy", "shame", "disgust", "sadness"])]
        return temp

    def fit(self, train_df):
        """
        tokenDict... : holds the probability of occurence of each word in the class, therefore the length of dict = vocab size.
        priorDict: holds the probability of occurence of each class
        """

        train_df  = self.cleanData(train_df)
        #train_df = self.labelEncoding(train_df)
        self.train_labels = set(train_df["Y"])
        print("Labels in the training data = ", self.train_labels)
        self.vocab = self.getVocabulary(" ".join(list(train_df["X"])))
        self.vocabSize = len(self.vocab)
        self.trainingSize = train_df.shape[0]

        for label in self.train_labels:
            self.priorDict[label] = train_df[train_df["Y"] == label].shape[0]/self.trainingSize
            emoParam = self.paraMapping[label]
            emoParam[0], emoParam[1] = self.updateProbabilityDict(train_df[train_df["Y"] == label], emoParam[0])


    def predict(self, xtest):
        """
        This method uses the probabilites learned in the fit function and applies the formula to get the correct class for the given instance.
        """

        k = 0
        predictions = []
        for instance in xtest:
            k += 1
            #print(k, instance)
            prob = []
            for label in self.train_labels:
                emoPrm = self.paraMapping[label]
                proba_emo = self.maximum_likelihood_estimation(instance, emoPrm[0], emotionCorpusSize=emoPrm[1], emotion=label)
                prob.append((label, proba_emo))
           # print(sorted(prob, key = lambda x: x[1]))
            prediction = sorted(prob, key = lambda x: x[1], reverse=True)[0][0]
            predictions.append(prediction)
            #predictions.append(self.mapping[prediction])
        return predictions




In [175]:

"""
# Model 1 training and evaluation
"""


# train data loading

import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


isear_train_df = pd.read_csv("./isear/isear-train.csv", error_bad_lines=False, warn_bad_lines=False, header=None)
isear_train_df = isear_train_df.rename(columns={0: "Y", 1: "X"})
#isear_train_df = isear_train_df[isear_train_df["Y"].isin([4, 7])]
print("Shape of training data = ", isear_train_df.shape)


#test data loading

import pandas as pd
isear_test1_df = pd.read_csv("./isear/isear-test.csv", error_bad_lines=False, warn_bad_lines=False, header=None)
isear_val_df = pd.read_csv("./isear/isear-val.csv", error_bad_lines=False, warn_bad_lines=False, header=None)
isear_test_df = pd.concat([isear_test1_df, isear_val_df])
isear_test_df = isear_test_df.rename(columns={0: "Y", 1: "X"})
#isear_test_df = NaiveBayesClassifier().labelEncoding(isear_test_df)
isear_test_df = NaiveBayesClassifier().cleanData(isear_test_df)
#isear_test_df = isear_test_df[isear_test_df["Y"].isin([4, 7])]
print("Shape of test data = ", isear_test_df.shape)


trainlabels=["fear", "anger", "guilt", "joy", "shame", "disgust", "sadness"]

nbObj = NaiveBayesClassifier()
nbObj.fit(isear_train_df)
predictions = nbObj.predict(isear_test_df["X"])
evalObj = evaluation(y_actual=isear_test_df["Y"], y_pred=predictions, labels=trainlabels)
evalObj.main()[1]

Shape of training data =  (5333, 2)
Shape of test data =  (2293, 2)
Labels in the training data =  {'fear', 'guilt', 'anger', 'disgust', 'shame', 'sadness', 'joy'}


Unnamed: 0,Precision,Recall,F1-Score,Count
fear,0.56,0.55,0.55,335.0
anger,0.33,0.41,0.37,336.0
guilt,0.45,0.34,0.39,320.0
joy,0.64,0.48,0.55,313.0
shame,0.34,0.53,0.41,331.0
disgust,0.44,0.54,0.48,332.0
sadness,0.57,0.52,0.54,326.0
macro_average,0.47,0.48,0.47,2293.0


In [181]:

"""
# Model 2 training and evaluation
"""

trainlabels=["disgust", "guilt", "shame"]

# train data loading

import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


isear_train_df = pd.read_csv("./isear/isear-train.csv", error_bad_lines=False, warn_bad_lines=False, header=None)
isear_train_df = isear_train_df.rename(columns={0: "Y", 1: "X"})
isear_train_df = isear_train_df[isear_train_df["Y"].isin(trainlabels)]
print("Shape of training data = ", isear_train_df.shape)


#test data loading

import pandas as pd
isear_test1_df = pd.read_csv("./isear/isear-test.csv", error_bad_lines=False, warn_bad_lines=False, header=None)
isear_val_df = pd.read_csv("./isear/isear-val.csv", error_bad_lines=False, warn_bad_lines=False, header=None)
isear_test_df = pd.concat([isear_test1_df, isear_val_df])
isear_test_df = isear_test_df.rename(columns={0: "Y", 1: "X"})
isear_test_df = NaiveBayesClassifier().cleanData(isear_test_df)
isear_test_df = isear_test_df[isear_test_df["Y"].isin(trainlabels)]
print("Shape of test data = ", isear_test_df.shape)




nbObj = NaiveBayesClassifier()
nbObj.fit(isear_train_df)
predictions = nbObj.predict(isear_test_df["X"])
evalObj = evaluation(y_actual=isear_test_df["Y"], y_pred=predictions, labels=trainlabels)
evalObj.main()[1]

Shape of training data =  (2280, 2)
Shape of test data =  (983, 2)
Labels in the training data =  {'guilt', 'disgust', 'shame'}


Unnamed: 0,Precision,Recall,F1-Score,Count
disgust,0.55,0.71,0.62,332.0
guilt,0.74,0.52,0.61,320.0
shame,0.48,0.6,0.53,331.0
macro_average,0.59,0.61,0.59,983.0


In [178]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(isear_test_df["Y"], predictions))

[[250  85]
 [ 31 282]]
