In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Imports
import pandas as pd
import json

In [5]:
# --- File Paths ---
INPUT_TSV = "/content/drive/MyDrive/266_final_project/data/train.tsv"
EMOTION_FILE = "/content/drive/MyDrive/266_final_project/data/emotions.txt"
MAPPING_DICT_JSON = "/content/drive/MyDrive/266_final_project/data/ekman_mapping.json"

OUTPUT_TSV = "/content/drive/MyDrive/266_final_project/data/new_train.tsv"
OUTPUT_EMOTION_FILE = "/content/drive/MyDrive/266_final_project/data/new_emotions.txt"

In [6]:
# --- Load data ---
data = pd.read_csv(INPUT_TSV, sep="\t", header=None, names=["text", "labels", "id"])

# Original emotions + neutral
emotions = open(EMOTION_FILE).read().splitlines() + ["neutral"]
idx2emotion = {i: t for i, t in enumerate(emotions)}

# Load mapping dictionary
with open(MAPPING_DICT_JSON, "r") as f:
    mapping_dict = json.load(f)

new_emotions = list(mapping_dict.keys())

# Identify old emotions that aren't mapped
not_found = []
for old in emotions:
    if not any(old in v for v in mapping_dict.values()):
        print(f"{old} is not found in mapping")
        not_found.append(old)

# New emotion list = mapped + unmapped
new_emotions = sorted(new_emotions + not_found)
print("New emotions:", new_emotions)

emotion2idx = {emotion: i for i, emotion in enumerate(new_emotions)}

neutral is not found in mapping
neutral is not found in mapping
New emotions: ['anger', 'disgust', 'fear', 'joy', 'neutral', 'neutral', 'sadness', 'surprise']


In [7]:
# Relabeling function
def replace_labels(label_string, idx2emotion, mapping_dict, emotion2idx):
    split = label_string.split(",")
    new_labels = []
    for label_idx in split:
        old_emotion = idx2emotion[int(label_idx)]
        found = False
        for new_emotion, group in mapping_dict.items():
            if old_emotion in group:
                new_labels.append(str(emotion2idx[new_emotion]))
                found = True
                break
        if not found:
            new_labels.append(str(emotion2idx[old_emotion]))
    assert new_labels
    return ",".join(new_labels)

In [8]:
# Apply mapping
data["labels"] = data["labels"].apply(
    replace_labels, args=(idx2emotion, mapping_dict, emotion2idx)
)

In [9]:
# Save new emotion list and remapped dataset
with open(OUTPUT_EMOTION_FILE, "w") as f:
    f.write("\n".join(new_emotions))

data.to_csv(OUTPUT_TSV, sep="\t", header=False, index=False, encoding="utf-8")

print(f"\n✔️ Saved remapped dataset to {OUTPUT_TSV}")
print(f"✔️ Saved new emotion list to {OUTPUT_EMOTION_FILE}")


✔️ Saved remapped dataset to /content/drive/MyDrive/266_final_project/data/new_train.tsv
✔️ Saved new emotion list to /content/drive/MyDrive/266_final_project/data/new_emotions.txt


In [10]:
data

Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,5,eebbqej
1,"Now if he does off himself, everyone will thin...",5,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,0,eezlygj
3,To make her feel threatened,2,ed7ypvh
4,Dirty Southern Wankers,0,ed0bdzj
...,...,...,...
43405,Added you mate well I’ve just got the bow and ...,3,edsb738
43406,Always thought that was funny but is it a refe...,7,ee7fdou
43407,What are you talking about? Anything bad that ...,0,efgbhks
43408,"More like a baptism, with sexy results!",3,ed1naf8
