# Naive Bayes Baseline Creation for GoEmotions


In [1]:
# importing libraries
import requests
import os
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# # Downloading the GoEmotions datasets directly 
# # For Colab
# !wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv
# !wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/dev.tsv
# !wget https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/test.tsv


# For Laptop
train_path = "goemotions/train.tsv"
dev_path = "goemotions/dev.tsv"
test_path = "goemotions/test.tsv"

In [3]:
# For Colab
# train_data = pd.read_csv('train.tsv', sep='\t', header=None)
# dev_data = pd.read_csv('dev.tsv', sep='\t', header=None)
# test_data = pd.read_csv('test.tsv', sep='\t', header=None)

# For Laptop
train_data = pd.read_csv(train_path, sep='\t', header=None)
dev_data = pd.read_csv(dev_path, sep='\t', header=None)

train_data.columns = ["text", "emotions", "ids"]
dev_data.columns = ["text", "emotions", "ids"]

# Mapping of emotion IDs to their names
emotion_mapping = {
    '1' : 'admiration',
    '2' : 'amusement',
    '3' : 'anger',
    '4' : 'annoyance',
    '5' : 'approval',
    '6' : 'caring',
    '7' : 'confusion',
    '8' : 'curiosity',
    '9' : 'desire',
    '10': 'disappointment',
    '11': 'disapproval',
    '12': 'disgust',
    '13': 'embarrassment',
    '14': 'excitement',
    '15': 'fear',
    '16': 'gratitude',
    '17': 'grief',
    '18': 'joy',
    '19': 'love',
    '20': 'nervousness',
    '21': 'optimism',
    '22': 'pride',
    '23': 'realization',
    '24': 'relief',
    '25': 'remorse',
    '26': 'sadness',
    '27': 'surprise',
    '28': 'neutral'
}

# function to replace a comma-separated list of emotion IDs with their names
def map_emotion_ids_to_names(emotion_ids_str):
    # Split the string by comma, replace each ID with its name, and then join them back
    return ','.join([emotion_mapping.get(e_id, 'unknown') for e_id in emotion_ids_str.split(',')])

train_data['emotions'] = train_data['emotions'].apply(map_emotion_ids_to_names)
dev_data['emotions'] = dev_data['emotions'].apply(map_emotion_ids_to_names)

## Naive Bayes as Binary Classification

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In the below code, I am only considering the first emotion as the only label. This is a multi-label dataset, which means that there can be more than one labels for a training example.

In [5]:
# Convert the texts into a matrix of tokens
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['text'])
X_dev = vectorizer.transform(dev_data['text'])

#  only consider the first emotion for multi-labels
y_train = train_data['emotions'].str.split(',').str[0]
y_dev = dev_data['emotions'].str.split(',').str[0]

# Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict on the dev set
y_dev_pred = clf.predict(X_dev)

# Evaluate the model
print("Accuracy:", accuracy_score(y_dev, y_dev_pred))
print(classification_report(y_dev, y_dev_pred, target_names=list(emotion_mapping.values())))

Accuracy: 0.3923700700331736
                precision    recall  f1-score   support

    admiration       0.74      0.25      0.37       297
     amusement       0.80      0.08      0.15       192
         anger       0.08      0.01      0.01       247
     annoyance       0.48      0.05      0.08       355
      approval       0.83      0.04      0.07       138
        caring       0.75      0.02      0.04       136
     confusion       0.43      0.03      0.05       205
     curiosity       0.00      0.00      0.00        64
        desire       0.50      0.01      0.02       129
disappointment       0.00      0.00      0.00       246
   disapproval       0.50      0.01      0.03        74
       disgust       0.00      0.00      0.00        28
 embarrassment       1.00      0.04      0.07        78
    excitement       0.00      0.00      0.00        74
          fear       0.79      0.56      0.66       297
     gratitude       0.00      0.00      0.00        10
         grief    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The code below takes account the multi-label structure of the dataset and predicts the probability of each label.

This is a more robust approch to build a baseline model.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

# Convert the text into a matrix of tokens
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['text'])
X_dev = vectorizer.transform(dev_data['text'])

In [7]:
X_train

<43410x26379 sparse matrix of type '<class 'numpy.int64'>'
	with 495838 stored elements in Compressed Sparse Row format>

In [8]:
# Convert the comma-separated emotions into a binary matrix 
mlb = MultiLabelBinarizer(classes=list(emotion_mapping.values()))
y_train_mlb = mlb.fit_transform(train_data['emotions'].str.split(','))
y_dev_mlb = mlb.transform(dev_data['emotions'].str.split(','))

  "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))


In [9]:
y_train_mlb

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
# OneVsRestClassifier with MultinomialNB as the estimator 
clf = OneVsRestClassifier(MultinomialNB())
clf.fit(X_train, y_train_mlb)

# Predict on the dev set
y_dev_pred_mlb = clf.predict(X_dev)

# Convert binary predictions back to labels
y_dev_pred_labels = mlb.inverse_transform(y_dev_pred_mlb)

# Evaluate 
print(classification_report(y_dev_mlb, y_dev_pred_mlb, target_names=mlb.classes_))

                precision    recall  f1-score   support

    admiration       0.69      0.09      0.16       303
     amusement       0.25      0.01      0.02       195
         anger       0.00      0.00      0.00       303
     annoyance       0.38      0.02      0.04       397
      approval       0.60      0.02      0.04       153
        caring       0.29      0.01      0.03       152
     confusion       0.38      0.02      0.04       248
     curiosity       0.00      0.00      0.00        77
        desire       0.25      0.01      0.01       163
disappointment       0.10      0.00      0.01       292
   disapproval       0.25      0.01      0.02        97
       disgust       0.00      0.00      0.00        35
 embarrassment       0.33      0.01      0.02        96
    excitement       0.33      0.01      0.02        90
          fear       0.92      0.52      0.66       358
     gratitude       0.00      0.00      0.00        13
         grief       0.33      0.02      0.04  

  "Label %s is present in all training examples." % str(classes[c])
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# I need to get the predict probabilities for the dev set first 
y_dev_pred_probs = clf.predict_proba(X_dev)

# now to print  the first 5 dev examples and their top 5 predictions
for i in range(5):
    text_example = dev_data['text'].iloc[i]
    actual_labels = dev_data['emotions'].iloc[i].split(',')
    predicted_indices = y_dev_pred_probs[i].argsort()[-5:][::-1]  # added sort and get the top 5 labels for text
    predicted_labels_with_probs = [(mlb.classes_[index], y_dev_pred_probs[i][index]) for index in predicted_indices]
    
    # Printing
    print(f"Text: {text_example}")
    print(f"Actual Labels: {actual_labels}")
    for label, prob in predicted_labels_with_probs:
        print(f"{label}: {prob:.2f}")
    print("\n" + "="*50 + "\n")


Text: Is this in New Orleans?? I really feel like this is New Orleans.
Actual Labels: ['surprise']
surprise: 0.11
joy: 0.00
annoyance: 0.00
confusion: 0.00
anger: 0.00


Text: You know the answer man, you are programmed to capture those codes they send you, don’t avoid them!
Actual Labels: ['annoyance', 'surprise']
surprise: 0.85
anger: 0.00
annoyance: 0.00
confusion: 0.00
fear: 0.00


Text: I've never been this sad in my life!
Actual Labels: ['remorse']
surprise: 0.02
remorse: 0.02
annoyance: 0.00
desire: 0.00
admiration: 0.00


Text: The economy is heavily controlled and subsidized by the government. In any case, I was poking at the lack of nuance in US politics today
Actual Labels: ['annoyance', 'surprise']
surprise: 0.98
annoyance: 0.00
anger: 0.00
disappointment: 0.00
confusion: 0.00


Text: He could have easily taken a real camera from a legitimate source and change the price in Word/Photoshop and then print it out.
Actual Labels: ['nervousness']
surprise: 0.81
annoyance: 0.00
co