# GoEmotions data 

In [35]:
from datasets import load_dataset
dataset = load_dataset("go_emotions")

dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

# XED data

In [35]:
import polars as pl
import os 
# get all files per country
path = "emotion-data/XED"
files = os.listdir(path)

In [56]:
df = pl.DataFrame()
schema = {"text":pl.String,
          "labels":pl.String,
         "country":pl.String}
for f in files:
    tmp = pl.read_csv(path+"/"+f, separator="\t",schema=schema, ignore_errors=True)
    country = f.split("-")[0]
    tmp = tmp.with_columns(country=pl.Series([country]*len(tmp)))
    if tmp.is_empty():
        df=tmp
    else:
        df=pl.concat([df,tmp],how="vertical")

In [57]:
df

text,labels,country
str,str,str
"""Ruumiita ripus…","""1, 3, 4""","""fi"""
"""Ei mitään mutt…","""1""","""fi"""
"""Älä anna hänen…","""1""","""fi"""
"""Laske aseet ma…","""1, 4""","""fi"""
"""Vittuun toimis…","""1""","""fi"""
…,…,…
"""美國人民 我聽到了你們的聲音…","""8""","""zh"""
"""她很擅長她的工作""","""1, 3""","""zh"""
"""湯姆...""","""8""","""zh"""
"""如果他們要和我見面怎么辦""","""2, 5""","""zh"""


In [51]:
df["labels"].unique()

labels
str
"""4, 7, 1"""
"""1, 5, 7"""
"""1"""
"""4, 5, 6, 7, 8"""
"""8, 5, 6, 7"""
…
"""1, 3, 4, 6, 7,…"
"""8, 3, 6, 7"""
"""8, 2, 5, 7"""
"""1, 2, 4, 6, 7,…"


In [59]:
# convert string labels to list and recode from 0 to n-1
df = df.with_columns(pl.col("labels").map_elements(lambda s: [[int(x)-1] for x in s.split(",") if x.isdigit]))

ComputeError: AttributeError: 'Series' object has no attribute 'split'

In [60]:
df

text,labels,country
str,list[list[i64]],str
"""Ruumiita ripus…","[[0], [2], [3]]","""fi"""
"""Ei mitään mutt…",[[0]],"""fi"""
"""Älä anna hänen…",[[0]],"""fi"""
"""Laske aseet ma…","[[0], [3]]","""fi"""
"""Vittuun toimis…",[[0]],"""fi"""
…,…,…
"""美國人民 我聽到了你們的聲音…",[[7]],"""zh"""
"""她很擅長她的工作""","[[0], [2]]","""zh"""
"""湯姆...""",[[7]],"""zh"""
"""如果他們要和我見面怎么辦""","[[1], [4]]","""zh"""


In [41]:
# exclude countries not present in parlamint 4.0 dataset
# country codes by ISO 3166 in parlamint
country_codes = ["BA","BE","AT","BG","CZ","DK","EE","ES","FI","FR","GB",
                 "GR","HR","HU","IS","IT","LV","NL","NO","PL","PT","RS",
                 "SE","SI","TR","UA"]

for i in df["country"].unique():
    print(i)

mk
ml

m
h
zh
fa
annotated
2
gl
b
f
hu
d
fi-annotated
da
bg
w
k
l
a
u


# recode labels

In [89]:
labels_goem = {'0': "admiration",
              '1': "amusement",
              '2': "anger",
              '3': "annoyance",
              '4': "approval",
              '5': "caring",
              '6': "confusion",
              '7': "curiosity",
              '8': "desire",
              '9': "disappointment",
              '10': "disapproval",
              '11': "disgust",
              '12': "embarrassment", 
              '13': "excitement",
              '14': "fear",
              '15': "gratitude",
              '16': "grief",
              '17': "joy",
              '18': "love",
              '19': "nervousness",
              '20': "optimism",
              '21': "pride",
              '22': "realization",
              '23': "relief",
              '24': "remorse",
              '25': "sadness",
              '26': "surprise",
              '27': "neutral",
}

In [90]:
labels_xed = {'1':"anger", 
            '2':"anticipation",
              '3':"disgust", 
              '4':"fear", 
              '5':"joy", 
              '6':"sadness", 
              '7':"surprise", 
              '8':"trust",
             }

In [91]:
# change order key value to recode through retrieval
labels_goem = {v:k for k,v in labels_goem.items()}
labels_xed = {v:k for k,v in labels_xed.items()}

In [92]:
labels_goem

{'admiration': '0',
 'amusement': '1',
 'anger': '2',
 'annoyance': '3',
 'approval': '4',
 'caring': '5',
 'confusion': '6',
 'curiosity': '7',
 'desire': '8',
 'disappointment': '9',
 'disapproval': '10',
 'disgust': '11',
 'embarrassment': '12',
 'excitement': '13',
 'fear': '14',
 'gratitude': '15',
 'grief': '16',
 'joy': '17',
 'love': '18',
 'nervousness': '19',
 'optimism': '20',
 'pride': '21',
 'realization': '22',
 'relief': '23',
 'remorse': '24',
 'sadness': '25',
 'surprise': '26',
 'neutral': '27'}

In [93]:
recode_dict = {}
for k in labels_xed.keys():
    label = labels_goem.get(k)
    print(k,label)
    recode_dict[labels_xed[k]]=label
recode_dict

anger 2
anticipation None
disgust 11
fear 14
joy 17
sadness 25
surprise 26
trust None


{'1': '2',
 '2': None,
 '3': '11',
 '4': '14',
 '5': '17',
 '6': '25',
 '7': '26',
 '8': None}

# Model training notes

- Train two different models: one multi- and one single-label
- Train one model on only english data and one on translated data
- Train one model on a combination of XED and GoEmotions data