In [1]:
"""
Using Language Models
1. Assumption for Naive Bayes : All words are independent
slide no 38.
2. Naive Bayes is a probabilistic classifier.
3. We compute the probability of document being in class C.
4. The goal of the naive bayes class is to find the best class.
5. Naive Bayes = probability of each class + prob of each token belonging to that class.
6. Formula = argmax(log(P(c) + log(P(t/c)))
7. Smoothing in NB
8. P(t/c) = T(no of particular token) + 1 / T(sum of all tokens in a class) + V(size of vocab)
9. Naive bayes is good for predicting the class and not for estimating probabilites.
10. Naive Bayes is robust to concept drift. (change of definition of class over time).
11. For text, the independence assumption does not hold for naive bayes, but for other domains it does hold.
12. Advantages:
    1. Very Fast
    2. Low storage requirement
"""

'\nUsing Language Models\n1. Assumption for Naive Bayes : All words are independent\nslide no 38.\n2. Naive Bayes is a probabilistic classifier.\n3. We compute the probability of document being in class C.\n4. The goal of the naive bayes class is to find the best class.\n5. Naive Bayes = probability of each class + prob of each token belonging to that class.\n'

In [3]:
import pandas as pd
mapping = {"fear": 1, "anger": 2, "guilt": 3, "joy": 4, "shame": 5, "disgust": 6, "sadness": 7}
isear_train_df = pd.read_csv("./isear/isear-train.csv", error_bad_lines=False, header=None)

mapped_emotions = isear_train_df[0].map(mapping)
isear_train_df[0] = mapped_emotions
isear_train_df = isear_train_df.dropna().reset_index(drop=True)
isear_train_df = isear_train_df.rename(columns={0: "Y", 1: "X"})
#isear_train_df = isear_train_df[isear_train_df["Y"].isin([4, 7])]
isear_train_df["Y"].value_counts()



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 119: expected 2 fields, saw 7\nSkipping line 1213: expected 2 fields, saw 4\nSkipping line 2323: expected 2 fields, saw 3\nSkipping line 2803: expected 2 fields, saw 3\nSkipping line 3630: expected 2 fields, saw 4\nSkipping line 4635: expected 2 fields, saw 5\nSkipping line 4797: expected 2 fields, saw 4\n'


4.0    777
3.0    766
7.0    760
2.0    758
5.0    757
6.0    757
1.0    751
Name: Y, dtype: int64

In [4]:
import re
import numpy as np

class NaiveBayesClassifier:
    def __init__(self):
        self.vocabSize = 0
        self.trainingSize = 0
        self.vocab = []
        self.fear = {}
        self.anger = {}
        self.guilt = {}
        self.joy = {}
        self.shame = {}
        self.disgust = {}
        self.sadness = {}
        self.fearSize = 0
        self.angerSize = 0
        self.guiltSize = 0
        self.joySize = 0
        self.shameSize = 0
        self.disgustSize = 0
        self.sadnessSize = 0
        self.priorDict = {}
        self.stopwords = {'hers', 'below', "wouldn't", 'nor', 'for', 'over', "hasn't", 'at', 'shouldn', 'only', 'above', 'itself', 'yourselves', 'what', "don't", "it's", 'which', 'against', "that'll", 'has', 'i', 'his', 'having', 'then', "shan't", 'myself', 'do', 'yours', 'up', 'own', 'the', 'same', 'aren', 'few', 'through', 'here', 'whom', 'o', "aren't", 'were', 'are', 'both', "didn't", 'll', 'again', 'is', 're', "wasn't", "you'll", 'm', "haven't", 'such', 'off', 'of', 'it', 'did', 'into', 'to', 'other', 'was', 'just', 've', "mustn't", 'while', 'about', 'each', 'by', 'this', 'isn', 'ourselves', 'in', 'our', 'couldn', 'until', 'where', "couldn't", 'ain', "you'd", 'all', 'when', 'does', 'before', 'weren', 'y', 'doing', 'than', 'being', 'my', 'mightn', 'yourself', 'with', 'theirs', 'so', "needn't", 'a', "doesn't", "isn't", 'its', 'your', 'if', "should've", 'ma', 'can', 'herself', 'but', 'too', 'more', 'her', "hadn't", 'hadn', 'there', "you're", 'from', 'should', 'we', 'how', 'out', 'once', 'mustn', 'won', 'their', 'don', 'had', 'he', 'or', 'didn', 'd', 'down', 't', "she's", 'that', 'himself', 'wouldn', "you've", "mightn't", 'between', 'them', 'on', 'haven', 'after', 'themselves', 'because', 'and', 'you', 'very', 's', 'these', 'no', 'now', 'him', 'been', 'those', 'during', 'doesn', 'wasn', 'am', 'under', 'an', 'some', 'have', 'me', 'any', 'who', 'shan', 'why', 'will', "shouldn't", 'not', 'they', "won't", 'needn', 'further', 'most', 'be', 'ours', 'she', 'as', 'hasn', "weren't", "a", "''"}

        self.classMapping = {1: "fear", 2: "anger", 3: "guilt", 4: "joy", 5: "shame", 6: "disgust", 7: "sadness"}

    def getVocabulary(self, text):
        tokens = [re.sub("[^A-Za-z]", "",i).strip().lower() for i in text.split(" ") if i not in self.stopwords and len(i) > 1]
        return list(set(tokens))


    def updateProbabilityDict(self, emodf, resDict):
        text = " ".join(list(emodf["X"])).split(" ")
        for word in text:
            word = re.sub("[^A-Za-z]", "", word).strip().lower()
            if word not in self.stopwords and len(word) > 1:
                if word in resDict:
                    resDict[word] += 1
                else:
                    resDict[word] = 1
        size = sum([value for key, value in resDict.items()])
        return resDict, size


    def maximum_likelihood_estimation(self, instance, emoDict, emotionCorpusSize, emotion):
        """
        instanceDict = {}
        for word in instance.split(" "):
            word = re.sub("[^A-Za-z]", "", word).strip().lower()
            if word not in self.stopwords and len(word) > 1:
                if word in instanceDict:
                    instanceDict[word] += 1
                else:
                    instanceDict[word] = 1

        """

        p_tc = 0
        tokens = [re.sub("[^A-Za-z]", "",i).strip().lower() for i in instance.split(" ") if i not in self.stopwords and len(i) > 1]
        for word in tokens:
            if word in emoDict:
                p_tc += (emoDict[word] + 1) / (emotionCorpusSize + self.vocabSize)
            else:
                p_tc += 1 / (emotionCorpusSize + self.vocabSize)
        return np.log(self.priorDict[emotion]) + np.log(p_tc)




    def fit(self, train_df):
        """
        tokenDict... : holds the probability of occurence of each word in the class, therefore the length of dict = vocab size.
        priorDict: holds the probability of occurence of each class
        """
        self.vocab = self.getVocabulary(" ".join(list(train_df["X"])))
        self.vocabSize = len(self.vocab)
        self.trainingSize = train_df.shape[0]
        self.priorDict = {"fear": train_df[train_df["Y"] == 1].shape[0]/self.trainingSize,
                          "anger": train_df[train_df["Y"] == 2].shape[0]/self.trainingSize,
                           "guilt": train_df[train_df["Y"] == 3].shape[0]/self.trainingSize,
                           "joy": train_df[train_df["Y"] == 4].shape[0]/self.trainingSize,
                           "shame": train_df[train_df["Y"] == 5].shape[0]/self.trainingSize,
                           "disgust": train_df[train_df["Y"] == 6].shape[0]/self.trainingSize,
                           "sadness": train_df[train_df["Y"] == 7].shape[0]/self.trainingSize,
                          }
        self.fear, self.fearSize = self.updateProbabilityDict(train_df[train_df["Y"] == 1], self.fear)
        self.anger, self.angerSize = self.updateProbabilityDict(train_df[train_df["Y"] == 2], self.anger)
        self.guilt, self.guiltSize = self.updateProbabilityDict(train_df[train_df["Y"] == 3], self.guilt)
        self.joy, self.joySize = self.updateProbabilityDict(train_df[train_df["Y"] == 4], self.joy)
        self.shame, self.shameSize = self.updateProbabilityDict(train_df[train_df["Y"] == 5], self.shame)
        self.disgust, self.disgustSize = self.updateProbabilityDict(train_df[train_df["Y"] == 6], self.disgust)
        self.sadness, self.sadnessSize = self.updateProbabilityDict(train_df[train_df["Y"] == 7], self.sadness)


    def predict(self, xtest):
        """
        This method uses the probabilites learned in the fit function and applies the formula to get the correct class for the given instance.
        """

        k = 0
        predictions = []
        for instance in xtest:
            k += 1
            #print(k, instance)
            prob = []

            proba_fear = self.maximum_likelihood_estimation(instance, self.fear, emotionCorpusSize=self.fearSize, emotion="fear")
            prob.append(proba_fear)
            proba_anger = self.maximum_likelihood_estimation(instance, self.anger, emotionCorpusSize=self.angerSize, emotion="anger")
            prob.append(proba_anger)
            proba_guilt = self.maximum_likelihood_estimation(instance, self.guilt, emotionCorpusSize=self.guiltSize, emotion="guilt")
            prob.append(proba_guilt)

            proba_joy = self.maximum_likelihood_estimation(instance, self.joy, emotionCorpusSize=self.joySize, emotion="joy")
            prob.append(proba_joy)

            proba_shame = self.maximum_likelihood_estimation(instance, self.shame, emotionCorpusSize=self.shameSize, emotion="shame")
            prob.append(proba_shame)
            proba_disgust = self.maximum_likelihood_estimation(instance, self.disgust, emotionCorpusSize=self.disgustSize, emotion="disgust")
            prob.append(proba_disgust)

            proba_sadness = self.maximum_likelihood_estimation(instance, self.sadness, emotionCorpusSize=self.sadnessSize, emotion="sadness")
            prob.append(proba_sadness)
            prediction = np.argmax(np.array(prob)) + 1
            predictions.append(prediction)
            #print(self.classMapping[prediction])
        return predictions




In [5]:
#test data

import pandas as pd
mapping = {"fear": 1, "anger": 2, "guilt": 3, "joy": 4, "shame": 5, "disgust": 6, "sadness": 7}
isear_test1_df = pd.read_csv("./isear/isear-test.csv", error_bad_lines=False, header=None)
isear_val_df = pd.read_csv("./isear/isear-val.csv", error_bad_lines=False, header=None)
isear_test_df = pd.concat([isear_test1_df, isear_val_df])
mapped_emotions = isear_test_df[0].map(mapping)
isear_test_df[0] = mapped_emotions
isear_test_df = isear_test_df.dropna().reset_index(drop=True)
isear_test_df = isear_test_df.rename(columns={0: "Y", 1: "X"})
#isear_test_df = isear_test_df[isear_test_df["Y"].isin([4, 7])]
isear_test_df.shape



  exec(code_obj, self.user_global_ns, self.user_ns)


(2293, 2)

In [6]:
"""
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(isear_train_df["X"], isear_train_df["Y"], stratify=isear_train_df["Y"], test_size=0.2, random_state=42)
train_df = pd.DataFrame()
train_df["X"] = X_train
train_df["Y"] = y_train
print(train_df.shape)
print(y_test.shape)
train_df = train_df.reset_index(drop=True)
"""

nbObj = NaiveBayesClassifier()
nbObj.fit(isear_train_df)
predictions = nbObj.predict(isear_test_df["X"])
"""
temp = []
for i in predictions:
    if i == 1:
        temp.append(4)
    elif i == 2:
        temp.append(7)
"""

'\ntemp = []\nfor i in predictions:\n    if i == 1:\n        temp.append(4)\n    elif i == 2:\n        temp.append(7)\n'

In [37]:
predictions

[2,
 3,
 3,
 6,
 4,
 3,
 2,
 7,
 3,
 7,
 1,
 5,
 7,
 7,
 4,
 7,
 3,
 2,
 3,
 2,
 7,
 1,
 6,
 1,
 4,
 7,
 6,
 3,
 4,
 2,
 4,
 5,
 7,
 7,
 1,
 1,
 4,
 3,
 7,
 1,
 5,
 4,
 5,
 6,
 3,
 2,
 3,
 4,
 2,
 5,
 3,
 2,
 2,
 5,
 5,
 7,
 4,
 2,
 4,
 7,
 4,
 1,
 5,
 4,
 3,
 7,
 6,
 5,
 7,
 4,
 3,
 4,
 3,
 3,
 2,
 3,
 7,
 7,
 7,
 1,
 1,
 4,
 2,
 4,
 1,
 2,
 3,
 7,
 5,
 5,
 2,
 5,
 3,
 3,
 4,
 2,
 1,
 3,
 5,
 7,
 6,
 7,
 2,
 2,
 2,
 7,
 4,
 3,
 1,
 6,
 1,
 2,
 5,
 4,
 1,
 3,
 6,
 3,
 3,
 3,
 7,
 4,
 3,
 5,
 3,
 4,
 3,
 1,
 2,
 1,
 7,
 7,
 7,
 4,
 2,
 1,
 4,
 3,
 2,
 4,
 4,
 2,
 2,
 5,
 1,
 6,
 6,
 4,
 2,
 2,
 1,
 4,
 4,
 3,
 7,
 6,
 4,
 5,
 7,
 5,
 1,
 2,
 6,
 5,
 4,
 6,
 1,
 3,
 7,
 5,
 7,
 4,
 4,
 2,
 4,
 3,
 7,
 6,
 2,
 4,
 4,
 7,
 7,
 5,
 7,
 6,
 4,
 7,
 3,
 7,
 4,
 1,
 3,
 4,
 3,
 4,
 2,
 6,
 7,
 7,
 1,
 3,
 3,
 5,
 3,
 2,
 1,
 6,
 4,
 1,
 3,
 6,
 1,
 4,
 1,
 3,
 4,
 6,
 7,
 5,
 6,
 5,
 4,
 6,
 3,
 5,
 7,
 6,
 2,
 4,
 1,
 7,
 7,
 4,
 4,
 7,
 6,
 2,
 4,
 7,
 4,
 7,
 2,
 3,
 2,
 1,
 4,
 2,
 5,
 1,


In [9]:
# Recall
import numpy as np


class evaluation:
    def __init__(self, y_actual, y_pred):
        self.y_actual = np.array(y_actual)
        #print(self.y_actual)
        self.y_pred = np.array(y_pred)
        #print(self.y_pred)

    def confusion_matrix(self, actual, pred):
        tp = fp = tn = fn = 0
        for i, j in zip(actual, pred):
            if i == 1:
                # positive
                if i == j:
                    tp += 1
                else:
                    fp += 1
            else:
                # negative
                if i == j:
                    tn += 1
                else:
                    fn += 1
        cf = pd.DataFrame([[tp, fp], [fn, tn]], columns=["actual_pos", "actual_neg"], index=["pred_pos", "pred_neg"])
        return cf, tp, fp, tn, fn

    def recall(self, actual, pred):
        cf, tp, fp, tn, fn = self.confusion_matrix(actual, pred)
        return round(tp / (tp+fn), 2)

    def precision(self, actual, pred):
        cf, tp, fp, tn, fn = self.confusion_matrix(actual, pred)
        return round(tp / (tp+fp), 2)

    def f1(self, actual, pred):
        # harmonic mean
        pr = self.precision(actual, pred)
        re = self.recall(actual, pred)
        f1 = 2 * ((pr * re) / (pr + re))
        return round(f1, 2)

    def main(self):
        mapping = {1: "fear", 2: "anger", 3: "guilt", 4: "joy", 5: "shame", 6: "disgust", 7: "sadness"}
        res = {}
        for cls in [1.0, 2.0, 3.0, 4.0, 5.0, 6.0 ,7.0]:
            c = 0
            #print("Class : ", mapping[cls])
            mod_y_actual = []
            for i in self.y_actual:
                if i == cls:
                    c += 1
                    mod_y_actual.append(1)
                else:
                    mod_y_actual.append(0)
            mod_y_pred = []
            for i in self.y_pred:
                if i == cls:
                    mod_y_pred.append(1)
                else:
                    mod_y_pred.append(0)
            #print(mod_y_pred)
            """
            print()
            print()
            print("Confusion Matrix : \n", self.confusion_matrix(mod_y_actual, mod_y_pred)[0])
            print("*******************************************************\n")
            print("Precision : \n", self.precision(mod_y_actual, mod_y_pred))
            print("*******************************************************\n")
            print("Recall : \n", self.recall(mod_y_actual, mod_y_pred))
            print("*******************************************************\n")
            print("F1 Score : \n", self.f1(mod_y_actual, mod_y_pred))
            print()
            print()
        """
            temp = [self.precision(mod_y_actual, mod_y_pred), self.recall(mod_y_actual, mod_y_pred), self.f1(mod_y_actual, mod_y_pred), c]
            res[mapping[cls]] = temp
        res = pd.DataFrame(res, index=["Precision", "Recall", "F1-Score", "Count"]).transpose()

        """
        Macro average calculation
        """
        avg_pr = np.sum((res["Precision"]*res["Count"]))/ np.sum(res["Count"])
        avg_re = np.sum((res["Recall"]*res["Count"]))/ np.sum(res["Count"])
        avg_f1 = np.sum((res["F1-Score"]*res["Count"]))/ np.sum(res["Count"])
        res.loc["Macro_Average"] = [round(avg_pr, 2), round(avg_re, 2), round(avg_f1, 2), np.sum(res["Count"])]
        #res = pd.concat([res, pd.DataFrame([avg_pr, avg_re, avg_f1, np.sum(res["Count"])]).transpose()])

        flattened_index = ["Fear-Precision", "Fear-Recall", "Fear-F1score", "Fear-Count",
                           "Anger-Precision", "Anger-Recall", "Anger-F1score", "Anger-Count",
                           "Guilt-Precision", "Guilt-Recall", "Guilt-F1score", "Guilt-Count",
                           "Joy-Precision", "Joy-Recall", "Joy-F1score", "Joy-Count",
                           "Shame-Precision", "Shame-Recall", "Shame-F1score", "Shame-Count",
                           "Disgust-Precision", "Disgust-Recall", "Disgust-F1score", "Disgust-Count",
                           "Sadness-Precision", "Sadness-Recall", "Sadness-F1score", "Sadness-Count",
                           "Macro-Average-Precision", "Macro-Average-Recall", "Macro-Average-F1score", "Macro-Average-Count"]
        res_flattened = pd.DataFrame(res.to_numpy().flatten(), index=flattened_index).transpose()
        return res_flattened, res




In [10]:

evalObj = evaluation(y_actual=isear_test_df["Y"], y_pred=predictions)
evalObj.main()[1]

Unnamed: 0,Precision,Recall,F1-Score,Count
fear,0.56,0.55,0.55,335.0
anger,0.33,0.41,0.37,336.0
guilt,0.45,0.34,0.39,320.0
joy,0.64,0.48,0.55,313.0
shame,0.34,0.53,0.41,331.0
disgust,0.44,0.54,0.48,332.0
sadness,0.57,0.52,0.54,326.0
Macro_Average,0.47,0.48,0.47,2293.0


In [9]:

c = 0
for i, j in zip(y_test, predictions):
    if int(i) == j:
        c += 1
print(c/len(y_test))

Unnamed: 0,Precision,Recall,F1-Score,Count
fear,0.64,0.578313,0.607595,150.0
anger,0.355263,0.461538,0.401487,152.0
guilt,0.45098,0.367021,0.404692,153.0
joy,0.628205,0.507772,0.561605,156.0
shame,0.342105,0.530612,0.416,152.0
disgust,0.476821,0.590164,0.527473,151.0
sadness,0.644737,0.538462,0.586826,152.0
Macro_Average,0.505629,0.510208,0.500724,1066.0


In [48]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(isear_test_df["Y"], predictions))

[[189  18  27  45   6  22  28]
 [ 41 111  72  24  25  34  29]
 [ 27  46 144  26  29  17  31]
 [ 17  13  36 199   6  10  32]
 [ 19  29  70  40 114  33  26]
 [ 23  33  41  35  25 147  28]
 [ 25  21  30  42  11  11 186]]


In [None]:

c = 0
for i, j in zip(isear_test_df["Y"], temp):
    if int(i) == j:
        c += 1
print(c/len(isear_test_df["Y"]))

"accuracy with only joy and sadness = 82%, with 80:20 train:test split"

In [10]:

c = 0
for i, j in zip(isear_test_df["Y"], temp):
    if int(i) == j:
        c += 1
print(c/len(isear_test_df["Y"]))

"accuracy with only joy and sadness = 82%, with 80:20 train:test split"

0.5056285178236398


In [131]:

c = 0
for i, j in zip(y_test, temp):
    if int(i) == j:
        c += 1
print(c/len(y_test))

"accuracy with only joy and sadness = 82%, with 80:20 train:test split"

0.8214285714285714
