## Set up

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules
# before each cell executes, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from pathlib import Path

# Resolve the project root once. `workding_dir` [sic] is reused when it is
# already defined in the kernel namespace (e.g. from an earlier run of this
# cell, or presumably injected by whatever executes the notebook - TODO
# confirm), so re-running the cell does not climb one directory higher each
# time. The misspelled name is kept on purpose in case it is set externally.
if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)

# All relative paths below ("dataset/...") are resolved against this directory.
os.chdir(workding_dir)
# Make project-local modules importable, without stacking a duplicate entry
# on sys.path every time the cell is re-run.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("workding dir:", workding_dir)

## GoEmotions

In [None]:
import pandas as pd

# Read the cleaned GoEmotions train/test splits and show their row counts
df_train, df_test = (
    pd.read_csv("dataset/GoEmotions/train_clean.csv"),
    pd.read_csv("dataset/GoEmotions/test_clean.csv"),
)
(len(df_train), len(df_test))

In [None]:
# Loading emotion labels for GoEmotions taxonomy (one label per line)
with open("dataset/GoEmotions/emotions.txt", "r") as file:
    # Strip each line and drop blank ones: with a plain split("\n"), a file
    # ending in a newline leaves an empty-string "label" at the end of the
    # list, which is not a real column name and breaks any later column
    # selection that uses GE_taxonomy.
    GE_taxonomy = [line.strip() for line in file if line.strip()]

for emo in GE_taxonomy:
    print(emo)

In [None]:
# Computing the number of labels for each sample: the row-wise sum over the
# columns named in GE_taxonomy. The vectorised DataFrame.sum replaces a
# per-row Python sum(); skipna=False keeps that version's handling of missing
# values (a row containing NaN gets a NaN cardinality, not a partial count).
df_train["Cardinality"] = df_train[GE_taxonomy].sum(axis=1, skipna=False)
df_test["Cardinality"] = df_test[GE_taxonomy].sum(axis=1, skipna=False)

# Preview of data. One display() call renders both previews and returns None,
# so the cell no longer echoes a stray "(None, None)" tuple as its output.
display(df_train["Cardinality"].head(5), df_test["Cardinality"].head(5))

In [None]:
# Keep only the samples whose Cardinality equals 1 and renumber the
# surviving rows from 0 in the same step
df_train = df_train.loc[df_train["Cardinality"].eq(1)].reset_index(drop=True)
df_test = df_test.loc[df_test["Cardinality"].eq(1)].reset_index(drop=True)

(len(df_train), len(df_test))

In [None]:
# Discard every sample whose "neutral" column equals 1
df_train = df_train.loc[df_train["neutral"].ne(1)]
df_test = df_test.loc[df_test["neutral"].ne(1)]
(len(df_train), len(df_test))

In [None]:
# Draw fixed-size, seeded random subsets: 1400 train rows and 600 test rows
df_train = df_train.sample(n=1400, random_state=42)
df_test = df_test.sample(n=600, random_state=42)

# Renumber the sampled rows from 0
df_train, df_test = (part.reset_index(drop=True) for part in (df_train, df_test))

In [None]:
# Derive a single "Emotion" label per row: the name of the GE_taxonomy column
# that holds the row-wise maximum
df_train["Emotion"] = df_train[GE_taxonomy].idxmax(axis="columns")
df_test["Emotion"] = df_test[GE_taxonomy].idxmax(axis="columns")

In [None]:
# Columns to discard: every GE_taxonomy column plus the "Cardinality" helper
drop_cols = GE_taxonomy + ["Cardinality"]

# Expose the text column as "Text" and drop the columns listed above
df_train = df_train.rename(columns={"Clean_text": "Text"}).drop(columns=drop_cols)
df_test = df_test.rename(columns={"Clean_text": "Text"}).drop(columns=drop_cols)

# Write both splits to CSV without the index column
df_train.to_csv("dataset/GoEmotions-train.csv", index=False)
df_test.to_csv("dataset/GoEmotions-test.csv", index=False)

In [None]:
# Stack the train rows on top of the test rows, renumber from 0, and save
# the combined set without the index column
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all.to_csv("dataset/GoEmotions.csv", index=False)

In [None]:
import pandas as pd

# Read the saved train/test CSV files and show their row counts
df_train, df_test = (
    pd.read_csv("dataset/GoEmotions-train.csv"),
    pd.read_csv("dataset/GoEmotions-test.csv"),
)
(len(df_train), len(df_test))

In [None]:
# Combined row count, followed by the test and train shares in percent
total = sum(len(part) for part in (df_train, df_test))
(total, len(df_test) / total * 100, len(df_train) / total * 100)

In [None]:
# plot number of emotions
import matplotlib.pyplot as plt
import seaborn as sns


def plot_emotion_distribution(df, title):
    """Draw a bar chart of how many rows of ``df`` carry each emotion label.

    Bars are ordered by ``df["Emotion"].value_counts()`` (most frequent
    first). Each bar is labelled with its count and its share of all rows,
    and a dashed red line marks the mean count per distinct emotion.

    Args:
        df: DataFrame with an "Emotion" column.
        title: Text used as the figure title.

    Returns:
        None. The figure is rendered via ``plt.show()``.
    """
    counts = df["Emotion"].value_counts()
    total = len(df)
    average = total / len(df["Emotion"].unique())

    # Explicit figure/axes instead of repeated plt.gca() lookups.
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=df, x="Emotion", order=counts.index, ax=ax)
    ax.axhline(
        average,
        color="red",
        linestyle="--",
        label=f"Average Count: {average:.2f}",
    )

    # Cap the y-axis at 12% of the rows for better visibility, but never below
    # the tallest bar plus headroom for its label: with a fixed 12% cap, any
    # emotion above that share was clipped and its label fell off the plot.
    ax.set_ylim(0, max(total * 0.12, counts.max() * 1.15))

    # Label every bar with "<count>" over "<share>%". A single two-line label
    # anchored at the bar top stacks the same way at any axis scale, replacing
    # the data-unit offset (5 or 2) previously used to lift the count above the
    # percentage. The count is formatted without decimals because bar heights
    # may be floats and would otherwise render as e.g. "123.0".
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(
            f"{height:.0f}\n{height / total * 100:.1f}%",
            (bar.get_x() + bar.get_width() / 2.0, height),
            ha="center",
            va="bottom",
            fontsize=10,
        )

    ax.set_title(title)
    ax.legend()
    ax.tick_params(axis="x", labelrotation=45)
    ax.set_xlabel("Emotion")
    ax.set_ylabel("Count")
    fig.tight_layout()
    plt.show()

In [None]:
# Render the emotion distribution chart for the train split, then the test split
for frame, heading in (
    (df_train, "Distribution of Emotions in GoEmotions Train Set"),
    (df_test, "Distribution of Emotions in GoEmotions Test Set"),
):
    plot_emotion_distribution(frame, title=heading)