# GoEmotions data 

In [35]:
from datasets import load_dataset
dataset = load_dataset("go_emotions")

dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

# XED data

In [1]:
import polars as pl
import os 
# get all files per country
path = "emotion-data/XED"
files = os.listdir(path)

In [8]:
df = pl.DataFrame()
schema = {"text":pl.String,
          "labels":pl.String,
         "language":pl.String}
for f in files:
    tmp = pl.read_csv(path+"/"+f, separator="\t",schema=schema, ignore_errors=True)
    lang = f.split("-")[0]
    tmp = tmp.with_columns(language=pl.Series([lang]*len(tmp)))
    if tmp.is_empty():
        df=tmp
    else:
        df=pl.concat([df,tmp],how="vertical")

In [9]:
df

text,labels,language
str,str,str
"""Ruumiita ripus…","""1, 3, 4""","""fi"""
"""Ei mitään mutt…","""1""","""fi"""
"""Älä anna hänen…","""1""","""fi"""
"""Laske aseet ma…","""1, 4""","""fi"""
"""Vittuun toimis…","""1""","""fi"""
…,…,…
"""美國人民 我聽到了你們的聲音…","""8""","""zh"""
"""她很擅長她的工作""","""1, 3""","""zh"""
"""湯姆...""","""8""","""zh"""
"""如果他們要和我見面怎么辦""","""2, 5""","""zh"""


In [10]:
df["labels"].unique()

labels
str
"""1, 2, 3, 4, 6"""
"""8, 2, 4"""
"""6, 7, 1"""
"""4, 5, 7"""
"""1, 7, 8"""
…
"""2, 4, 5, 7, 8"""
"""8, 2, 3, 6"""
"""2, 3, 4, 6, 7"""
"""2, 4, 6"""


In [16]:
# exclude languages not present in parlamint 4.0 dataset
# language codes by ISO 639 in parlamint
lang_codes = ["bs","bg","hr","cs","da","nl","en","et","fi","fr","de",
                 "hu","is","it","lv","el","no","pl","pt","ru","sr","sl",
                 "es","sv","tr","uk"]

# exclude langs
df = df.filter(pl.col("language").is_in(lang_codes))

In [17]:
# convert string labels to list and recode from 0 to n-1
df = df.with_columns(pl.col("labels").map_elements(lambda s: [[int(x)-1] for x in s.split(",") if x.isdigit]))

In [None]:
# assess class-imbalance problem


In [18]:
# save full dataset
df.select(pl.col("text","labels")).write_parquet("data.parquet")

In [83]:
# calculate class-weights
import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

y = df["labels"]
print(y)
#np.unique(y)
class_weights = compute_sample_weight(class_weight="balanced", y=y)


shape: (207_909,)
Series: 'labels' [list[list[i64]]]
[
	[[0], [2], [3]]
	[[0]]
	[[0]]
	[[0], [3]]
	[[0]]
	…
	[[7]]
	[[0], [2]]
	[[7]]
	[[1], [4]]
	[[1], [4]]
]


ValueError: operands could not be broadcast together with shapes (2,) (3,) 

# Combining datasets ?

In [89]:
# recode labels goem
labels_goem = {'0': "admiration",
              '1': "amusement",
              '2': "anger",
              '3': "annoyance",
              '4': "approval",
              '5': "caring",
              '6': "confusion",
              '7': "curiosity",
              '8': "desire",
              '9': "disappointment",
              '10': "disapproval",
              '11': "disgust",
              '12': "embarrassment", 
              '13': "excitement",
              '14': "fear",
              '15': "gratitude",
              '16': "grief",
              '17': "joy",
              '18': "love",
              '19': "nervousness",
              '20': "optimism",
              '21': "pride",
              '22': "realization",
              '23': "relief",
              '24': "remorse",
              '25': "sadness",
              '26': "surprise",
              '27': "neutral",
}

In [90]:
# id2labels XED
id2labels = {'1':"anger", 
            '2':"anticipation",
              '3':"disgust", 
              '4':"fear", 
              '5':"joy", 
              '6':"sadness", 
              '7':"surprise", 
              '8':"trust",
             }

In [91]:
# change order key value to recode through retrieval
labels2id = {v:k for k,v in labels_xed.items()}

# Model training notes

- Train two different models: one multi- and one single-label
- Train one model on only english data and one on translated data
- Train one model on a combination of XED and GoEmotions data