# XED data

In [11]:
import polars as pl
import os 
# Collect the XED annotation files found in the data directory.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the row order of the combined frame (and therefore the seeded shuffle
# and train/test split further down) reproducible across machines.
path = "emotion-data/XED"
files = sorted(os.listdir(path))

In [12]:
# Read every tab-separated XED file and stack them into a single frame with
# the columns text / labels / language.
schema = {"text":pl.String,
          "labels":pl.String,
          "language":pl.String}
frames = []
for f in files:
    lang_df = pl.read_csv(path+"/"+f, separator="\t",schema=schema, ignore_errors=True)
    # the language code is the part of the file name before the first "-"
    lang = f.split("-")[0]
    # pl.lit broadcasts the code to every row and keeps the String dtype even
    # when the file yielded zero rows
    frames.append(lang_df.with_columns(language=pl.lit(lang)))

# Concatenate once at the end. The previous in-loop version tested
# `tmp.is_empty()` instead of `df.is_empty()`, so a single empty file replaced
# the accumulated frame and silently discarded everything read before it.
df = pl.concat(frames, how="vertical") if frames else pl.DataFrame(schema=schema)

In [13]:
# Distinct raw label strings; note that only the last expression of a cell is
# rendered, so this value is computed but not displayed.
df.get_column("labels").unique()
# Render the combined frame.
df

text,labels,language
str,str,str
"""انهمخزي!""","""1, 4, 7""","""ar"""
"""انهالأفضل!""","""8""","""ar"""
"""- لا تكن مؤدب …","""1, 3, 8""","""ar"""
"""في حال رفض الس…","""1""","""ar"""
"""لكن ماذا عن ال…","""2, 7""","""ar"""
…,…,…
"""美國人民 我聽到了你們的聲音…","""8""","""zh"""
"""她很擅長她的工作""","""1, 3""","""zh"""
"""湯姆...""","""8""","""zh"""
"""如果他們要和我見面怎么辦""","""2, 5""","""zh"""


In [14]:
# Keep only the languages that are also present in the ParlaMint 4.0 dataset.
# ParlaMint language codes (ISO 639):
lang_codes = [
    "bs", "bg", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "de", "hu", "is",
    "it", "lv", "el", "no", "pl", "pt", "ru", "sr", "sl", "es", "sv", "tr", "uk",
]

# Drop every row whose language code is not in the list above.
in_parlamint = pl.col("language").is_in(lang_codes)
df = df.filter(in_parlamint)

In [15]:
# convert string labels to list and recode from 0 to n-1
def _parse_labels(s):
    """Turn a label string such as "1, 4, 7" into zero-based ints ([0, 3, 6]).

    Tokens that are not purely digits after trimming whitespace are skipped.
    """
    # strip() before the check: tokens after a comma carry a leading space and
    # " 4".isdigit() is False. The original tested the bound method `x.isdigit`
    # (no call), which is always truthy, so nothing was ever filtered.
    tokens = (part.strip() for part in s.split(","))
    return [int(tok) - 1 for tok in tokens if tok.isdigit()]

# explicit return_dtype so polars does not have to infer the list type
df = df.with_columns(pl.col("labels").map_elements(_parse_labels, return_dtype=pl.List(pl.Int64)))


In [16]:
def one_hot(lst, num_classes=8):
    """Build a multi-hot indicator vector from an iterable of class indices.

    Args:
        lst: iterable of zero-based integer class indices.
        num_classes: length of the returned vector (defaults to 8).

    Returns:
        A list of ``num_classes`` ints with 1 at every index in ``lst``
        and 0 elsewhere. An empty ``lst`` yields an all-zero vector.

    Raises:
        IndexError: if an index lies outside ``[0, num_classes)``.
    """
    vec = [0] * num_classes
    for cls in lst:
        # Check the range explicitly: a negative index such as -1 would
        # otherwise wrap around and silently flag the last class.
        if not 0 <= cls < num_classes:
            raise IndexError(f"class index {cls} out of range for {num_classes} classes")
        vec[cls] = 1
    return vec
    
# Replace each list of class indices in "labels" with the vector built by one_hot.
labels_one_hot = pl.col("labels").map_elements(one_hot)
df = df.with_columns(labels_one_hot)
# Render the transformed frame.
df

text,labels,language
str,list[i64],str
"""Няма работа за…","[0, 0, … 0]","""bg"""
"""Ако нямаше наг…","[0, 0, … 0]","""bg"""
"""Защо не пием з…","[0, 1, … 0]","""bg"""
"""Не става.""","[0, 0, … 0]","""bg"""
"""А и са безполе…","[0, 0, … 0]","""bg"""
…,…,…
"""Dur, Onu tanıy…","[0, 0, … 0]","""tr"""
"""Vakıf fonu Jan…","[0, 0, … 0]","""tr"""
"""Bilmiyorum ama…","[0, 0, … 1]","""tr"""
"""Son yirmi yıld…","[0, 0, … 0]","""tr"""


In [17]:
# exclude labels that are under 100 times occurrence
MIN_LABEL_COUNT = 100

# count how often each distinct label vector occurs (computed once, used twice)
label_counts = df.group_by(pl.col("labels")).len()

# The two filters are exact complements around the threshold. The previous
# `<99` / `>99` pair left label vectors with exactly 99 occurrences in neither list.
exclude = label_counts.filter(pl.col("len") < MIN_LABEL_COUNT)["labels"].to_list()
include = label_counts.filter(pl.col("len") >= MIN_LABEL_COUNT)["labels"].to_list()
print(len(exclude),len(include))

153 67


In [18]:
# construct the row filter list: True where the row's label vector is in `include`

# Label vectors are lists (unhashable), so convert them to tuples to get
# constant-time set membership instead of a linear scan of `include` per row.
include_set = {tuple(v) for v in include if v is not None}

# Read the "labels" column by name. The previous positional lookup (r[-2])
# points at a different column as soon as another column is appended to df.
# Null labels never match and are marked False.
filter_lst = [v is not None and tuple(v) in include_set
              for v in df["labels"].to_list()]

In [19]:
# Attach the boolean mask as column "flt", then keep only the rows flagged True.
# The "flt" column stays on the resulting frame.
keep_mask = pl.Series(filter_lst)
df_with_mask = df.with_columns(flt=keep_mask)
df = df_with_mask.filter(pl.col("flt").eq(True))

In [32]:
# Remove every row that contains a null in any column
# (author's note: there is one occurrence of null in the data).
df = df.drop_nulls()

In [33]:
# create train-test split
TEST_FRACTION = 0.1  # share of rows held out for the test set

# shuffle (fixed seed so the permutation is repeatable)
df = df.sample(fraction=1, shuffle=True, seed=42)

test_size = int(len(df)*TEST_FRACTION)

# First `test_size` rows form the test set, everything after them the train set.
# slice(test_size) is used instead of tail(-test_size): when test_size is 0,
# tail(-0) is tail(0) and would return an empty train set.
test, train = df.head(test_size), df.slice(test_size)

# the two parts must partition the shuffled frame exactly
assert len(train) + len(test) == len(df)

test.write_parquet("test.parquet")
train.write_parquet("train.parquet")

print(len(train),len(test))

151370 16818


## Labels2id for model

In [None]:
# Map each class id (stored as a string key) to its emotion name.
id2labels = {'0':"anger",
             '1':"anticipation",
             '2':"disgust",
             '3':"fear",
             '4':"joy",
             '5':"sadness",
             '6':"surprise",
             '7':"trust",
             }

# Invert the mapping (emotion name -> class id) for lookups by name.
# This previously iterated over `labels_xed`, a name that is never defined in
# this notebook, so the cell raised NameError.
labels2id = {v:k for k,v in id2labels.items()}