# Preprocessing GoEmotions Dataset for BERT Finetuning

This notebook preprocesses three GoEmotions datasets and groups them into categories for finetuning a BERT model.

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the raw GoEmotions CSV.
# The path is a raw string: in a regular literal "\P" and "\g" are invalid
# escape sequences, which Python flags with a SyntaxWarning and may reject
# outright in a future version. The resulting path value is unchanged.
df1 = pd.read_csv(r"Y:\PPtryYash\goemotions_1.csv")
df1.head()

  df1 = pd.read_csv("Y:\PPtryYash\goemotions_1.csv")


Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# List every column label of the raw frame as a plain Python list.
column_names = df1.columns.tolist()
print("Column names in the dataset:", column_names, sep="\n")


Column names in the dataset:
['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [7]:
# Coarse emotion group -> the fine-grained label columns of df1 merged into it.
EMOTION_GROUPS = {
    'Joyful': ['amusement', 'excitement', 'joy'],
    'Affectionate': ['admiration', 'caring', 'love', 'gratitude', 'pride', 'approval'],
    'Positive_Outlook': ['optimism', 'relief'],
    'Anger_Frustration': ['anger', 'annoyance', 'disapproval', 'disgust'],
    'Sadness_Disappointment': ['disappointment', 'grief', 'remorse', 'sadness'],
    'Fear_Anxiety': ['fear', 'nervousness', 'embarrassment'],
    'Surprise_Confusion': ['confusion', 'curiosity', 'realization', 'surprise'],
    'Desire': ['desire'],
    'Neutral': ['neutral']
}

# Start from the 'text' column and attach one column per group in a single step.
# Each group column is the row-wise maximum over its member columns, so with
# 0/1 label columns it is 1 as soon as any emotion in the group is present.
grouped_df = df1[['text']].assign(**{
    group_name: df1[emotions].max(axis=1)
    for group_name, emotions in EMOTION_GROUPS.items()
})

# Show the first rows of the grouped frame as plain text.
print("--- Preview of your grouped emotion data ---", grouped_df.head(), sep="\n")

--- Preview of your grouped emotion data ---
                                                text  Joyful  Affectionate  \
0                                    That game hurt.       0             0   
1   >sexuality shouldn’t be a grouping category I...       0             0   
2     You do right, if you don't care then fuck 'em!       0             0   
3                                 Man I love reddit.       0             1   
4  [NAME] was nowhere near them, he was by the Fa...       0             0   

   Positive_Outlook  Anger_Frustration  Sadness_Disappointment  Fear_Anxiety  \
0                 0                  0                       1             0   
1                 0                  0                       0             0   
2                 0                  0                       0             0   
3                 0                  0                       0             0   
4                 0                  0                       0             0   

   Su

In [8]:
# Rich preview of the first five rows of the grouped frame.
grouped_df.head(n=5)

Unnamed: 0,text,Joyful,Affectionate,Positive_Outlook,Anger_Frustration,Sadness_Disappointment,Fear_Anxiety,Surprise_Confusion,Desire,Neutral
0,That game hurt.,0,0,0,0,1,0,0,0,0
1,>sexuality shouldn’t be a grouping category I...,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,1
3,Man I love reddit.,0,1,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,1


In [9]:
# Per-row total of the group indicator columns (every column except 'text'),
# i.e. how many emotion groups are active for each sample.
label_counts = grouped_df.drop(columns='text').sum(axis=1)

# Tabulate how many samples carry 0, 1, 2, ... active groups.
print(
    "Distribution of the number of labels per text sample:",
    label_counts.value_counts(),
    sep="\n",
)

Distribution of the number of labels per text sample:
1    60892
2     7395
0     1129
3      543
4       35
5        5
6        1
Name: count, dtype: int64


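Multiclass classification problem: keep only the samples with exactly one active emotion group, so that each text has a single class label.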

In [10]:
# Recompute the per-row number of active groups so this cell does not rely on
# the previous one having been run.
label_counts = grouped_df.drop(columns='text').sum(axis=1)

# Keep only the rows with exactly one active group; rows with zero or several
# active groups are dropped. `.copy()` detaches the result from grouped_df.
has_single_label = label_counts.eq(1)
final_df = grouped_df.loc[has_single_label].copy()

# Row counts before and after filtering, for the summary below.
original_size = grouped_df.shape[0]
final_size = final_df.shape[0]

print(f"Original number of samples: {original_size}")
print(f"Number of samples removed (zero-label or multi-label): {original_size - final_size}")
print(f"Final number of samples for training: {final_size}")

print("\n--- Preview of the final, clean dataset ---", final_df.head(), sep="\n")

Original number of samples: 70000
Number of samples removed (zero-label or multi-label): 9108
Final number of samples for training: 60892

--- Preview of the final, clean dataset ---
                                                text  Joyful  Affectionate  \
0                                    That game hurt.       0             0   
2     You do right, if you don't care then fuck 'em!       0             0   
3                                 Man I love reddit.       0             1   
4  [NAME] was nowhere near them, he was by the Fa...       0             0   
5  Right? Considering it’s such an important docu...       0             1   

   Positive_Outlook  Anger_Frustration  Sadness_Disappointment  Fear_Anxiety  \
0                 0                  0                       1             0   
2                 0                  0                       0             0   
3                 0                  0                       0             0   
4                 0         

# Saving the new CSV file

In [11]:
# Output file name; a bare name like this resolves against the current working directory.
my_output_filename = 'Preprocessed_goemotions_1.csv'

# Write the filtered frame as CSV; index=False leaves the row index out of the file.
final_df.to_csv(path_or_buf=my_output_filename, index=False)

print(f"DataFrame saved successfully to '{my_output_filename}'!")

DataFrame saved successfully to 'Preprocessed_goemotions_1.csv'!
