# XED data

In [2]:
import polars as pl
import os 
# get all files per country
path = "emotion-data/XED"
# os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting
# makes the order in which files are read (and therefore the row order that the
# seeded shuffle further down starts from) the same on every machine.
files = sorted(os.listdir(path))

In [19]:
# Column names and dtypes passed to read_csv for every file; "language" is
# overwritten below with the code taken from the file name.
schema = {"text": pl.String,
          "labels": pl.String,
          "language": pl.String}

# Collect one frame per file and concatenate once at the end, so that an empty
# file can never discard the rows read from earlier files.
frames = []
for f in files:
    tmp = pl.read_csv(path+"/"+f, separator="\t", schema=schema, ignore_errors=True)
    # The language code is whatever precedes the first "-" in the file name
    # (presumably names look like "<lang>-....tsv" — TODO confirm).
    lang = f.split("-")[0]
    # A literal is broadcast to the height of the frame.
    frames.append(tmp.with_columns(language=pl.lit(lang)))

# pl.concat rejects an empty list, so fall back to an empty, correctly typed frame.
df = pl.concat(frames, how="vertical") if frames else pl.DataFrame(schema=schema)

In [20]:
# Only the last expression of a cell is rendered, so the unique label strings
# are displayed explicitly; as a bare expression they would be computed and
# then thrown away.
display(df["labels"].unique())
df

text,labels,language
str,str,str
"""انهمخزي!""","""1, 4, 7""","""ar"""
"""انهالأفضل!""","""8""","""ar"""
"""- لا تكن مؤدب …","""1, 3, 8""","""ar"""
"""في حال رفض الس…","""1""","""ar"""
"""لكن ماذا عن ال…","""2, 7""","""ar"""
…,…,…
"""美國人民 我聽到了你們的聲音…","""8""","""zh"""
"""她很擅長她的工作""","""1, 3""","""zh"""
"""湯姆...""","""8""","""zh"""
"""如果他們要和我見面怎么辦""","""2, 5""","""zh"""


In [21]:
# Keep only the languages that are also present in the ParlaMint 4.0 dataset.
# ISO 639 language codes used by ParlaMint:
lang_codes = [
    "bs", "bg", "hr", "cs", "da", "nl", "en", "et", "fi",
    "fr", "de", "hu", "is", "it", "lv", "el", "no", "pl",
    "pt", "ru", "sr", "sl", "es", "sv", "tr", "uk",
]

# Drop every row whose language code is not in the list above.
in_parlamint = pl.col("language").is_in(lang_codes)
df = df.filter(in_parlamint)

In [22]:
def _parse_labels(s):
    """Turn a comma-separated label string such as "1, 4, 7" into zero-based ints.

    Each token is stripped before the digit check, so the spaces that follow
    the commas do not matter. Empty or non-numeric tokens are skipped.
    """
    tokens = (tok.strip() for tok in s.split(","))
    # isdigit must be *called*; a bare `tok.isdigit` is a method object and
    # therefore always truthy, which would let every token through.
    return [int(tok) - 1 for tok in tokens if tok.isdigit()]


# convert string labels to list and recode from 0 to n-1
# (the result dtype is stated explicitly so polars does not have to infer it)
df = df.with_columns(
    pl.col("labels").map_elements(_parse_labels, return_dtype=pl.List(pl.Int64))
)


In [23]:
def one_hot(lst, num_classes=8):
    """Encode a collection of class indices as a multi-hot float vector.

    Args:
        lst: iterable of integer class indices in ``range(num_classes)``.
        num_classes: length of the returned vector (defaults to 8).

    Returns:
        A list of ``num_classes`` floats holding 1.0 at every index contained
        in ``lst`` and 0.0 everywhere else.

    Raises:
        IndexError: if an index lies outside ``range(num_classes)``. Negative
            indices are rejected too, because plain list indexing would
            silently wrap them around to the end of the vector.
    """
    # model expects values to be float
    vec = [0.0] * num_classes
    for cls in lst:
        if not 0 <= cls < num_classes:
            raise IndexError(f"class index {cls} is outside 0..{num_classes - 1}")
        vec[cls] = 1.0
    return vec
    
# Replace every list of class indices in "labels" with its multi-hot float
# vector, then show the resulting frame.
df = df.with_columns(pl.col("labels").map_elements(one_hot))
df

text,labels,language
str,list[f64],str
"""Няма работа за…","[0.0, 0.0, … 0.0]","""bg"""
"""Ако нямаше наг…","[0.0, 0.0, … 0.0]","""bg"""
"""Защо не пием з…","[0.0, 1.0, … 0.0]","""bg"""
"""Не става.""","[0.0, 0.0, … 0.0]","""bg"""
"""А и са безполе…","[0.0, 0.0, … 0.0]","""bg"""
…,…,…
"""Dur, Onu tanıy…","[0.0, 0.0, … 0.0]","""tr"""
"""Vakıf fonu Jan…","[0.0, 0.0, … 0.0]","""tr"""
"""Bilmiyorum ama…","[0.0, 0.0, … 1.0]","""tr"""
"""Son yirmi yıld…","[0.0, 0.0, … 0.0]","""tr"""


In [26]:
# exclude labels that are under 100 times occurrence
MIN_LABEL_COUNT = 100

# Count every distinct label vector once and split the counts at the threshold.
# Both sides use the same cut-off, so a combination with exactly 99 rows is
# reported as excluded instead of falling into neither list.
label_counts = df.group_by(pl.col("labels")).len()
exclude = label_counts.filter(pl.col("len") < MIN_LABEL_COUNT)["labels"].to_list()
include = label_counts.filter(pl.col("len") >= MIN_LABEL_COUNT)["labels"].to_list()
print(len(exclude),len(include))

153 67


In [27]:
# Build one boolean per row: True when the row's label vector is one of the
# frequent combinations listed in `include`.
def _label_key(vec):
    """Return a hashable stand-in for a label vector (None stays None)."""
    return None if vec is None else tuple(vec)


# Lists are not hashable, so the vectors are turned into tuples; a set then
# gives constant-time membership tests instead of scanning `include` per row.
include_keys = {_label_key(vec) for vec in include}

# The column is read by name, so the result does not depend on column order
# (a positional index would point at a different column once "flt" is added).
filter_lst = [_label_key(vec) in include_keys for vec in df["labels"].to_list()]

In [28]:
# Attach the per-row mask as a column named "flt" and keep only the rows where
# that column equals True.
keep_mask = pl.Series(filter_lst)
df = df.with_columns(flt=keep_mask).filter(pl.col("flt").eq(True))

In [33]:
# create train-test split
TEST_FRACTION = 0.1

# Shuffle all rows with a fixed seed so the split is reproducible.
df = df.sample(fraction=1, shuffle=True, seed=42)
test_size = int(len(df) * TEST_FRACTION)
# Slice from test_size instead of using tail(-test_size): when the frame has
# fewer than 10 rows test_size is 0, and tail(-0) == tail(0) would return an
# empty train set.
test, train = df.head(test_size), df.slice(test_size)
# Every row must end up in exactly one of the two parts.
assert len(train) + len(test) == len(df)

train.write_parquet("train.parquet")
test.write_parquet("test.parquet")

print(len(train),len(test))

151371 16818


In [None]:
# Persist only the text and label columns of the current frame.
text_and_labels = df.select("text", "labels")
text_and_labels.write_parquet("data.parquet")

In [None]:
import pandas as pd

# Convert with polars' own exporter so column names and dtypes are carried over;
# pd.DataFrame(<polars frame>) does not go through that converter.
# NOTE(review): the following cells parse "labels" as comma-separated strings,
# so `df` is presumably expected to still hold the raw string labels here — confirm.
df = df.select("text", "labels", "language").to_pandas()

df = df.rename(columns={"language": "lang"})

In [None]:
# convert comma-separated string labels to integer arrays (values are kept as they are; nothing is recoded in this cell)
# df = df.with_columns(pl.col("labels").map_elements(lambda s: [[int(x)-1] for x in s.split(",") if x.isdigit]))
import numpy as np

def to_array(string):
    """Parse a comma-separated string of integers into a NumPy array.

    Spaces are removed before splitting, so "1, 4, 7" and "1,4,7" produce the
    same array.
    """
    tokens = string.replace(" ", "").split(",")
    return np.array([int(token) for token in tokens])

# Swap each label string for its parsed integer array, then show the frame.
df["labels"] = df["labels"].apply(to_array)
df

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer


def one_hot_encoding(ds):
    """Expand the ``labels`` column into one indicator column per label value.

    Args:
        ds: pandas DataFrame with a ``labels`` column whose entries are
            iterables of label values.

    Returns:
        A new DataFrame without ``labels`` and with one column per entry of
        the fitted binarizer's ``classes_``, aligned on the index of ``ds``.
        ``ds`` itself is left unchanged.
    """
    mlb = MultiLabelBinarizer()
    indicators = pd.DataFrame(mlb.fit_transform(ds["labels"]),
                              columns=mlb.classes_,
                              index=ds.index)
    # drop() returns a new frame, whereas pop() would remove the column from
    # the caller's frame as a side effect.
    return ds.drop(columns="labels").join(indicators)

# Replace the "labels" column with its indicator columns.
df = df.pipe(one_hot_encoding)



In [None]:
# NOTE(review): the commented-out lines in this cell use the polars API
# (with_columns / select / pl.col); they look like a disabled text-length
# experiment — confirm before re-enabling.
#df = df.with_columns(pl.col('text').map_elements(lambda x: len(x)).alias('len_text'))
# Show the current frame.
df
# Compute the mean length
#mean_length = df.select(pl.col('StringLength').mean())
#df['len_text'].mean()

In [None]:
# Convert the pandas frame to polars, as a first step towards collapsing the
# one-hot columns into a single list per row.
pl_df = pl.from_pandas(df)

In [None]:
# For every row, gather its last eight values (presumably the one-hot label
# columns) into a plain list.
labels = [list(row[-8:]) for row in pl_df.iter_rows()]

In [None]:
# `labels` holds one entry per row, so show a short preview instead of
# dumping the whole list into the notebook output.
labels[:5]

In [None]:
# delete combinations with less than 100 instances
# (not done in this cell: the lines below are disabled, only the rename runs)
#labels = df[['1', '2', '3', '4', '5', '6', '7', '8']]
#print( )
#labels_vectors = labels.values.tolist()

# Name the columns: text, language, then one string-named column per id 1-8.
df.columns = ['text', 'lang'] + [str(emotion_id) for emotion_id in range(1, 9)]

In [None]:
# Sum each of the columns '1'..'8', printing every total and collecting the
# totals in order.
freqs = []

for emotion_id in range(1, 9):
    total = df[str(emotion_id)].sum()
    freqs.append(total)
    print(emotion_id, total)
    


In [None]:
import matplotlib.pyplot as plt

# Plot how often each individual emotion occurs.
# `freqs` already holds one count per emotion, so it is drawn directly as a bar
# chart: the bars sit exactly on the emotion ids, and `df` is not rebound to a
# throw-away frame (the combination count in the next cell iterates over `df`
# and reads its columns 2..9).
emotion_ids = list(range(1, len(freqs) + 1))

fig, ax = plt.subplots()
ax.bar(emotion_ids, freqs, edgecolor='black')

# Customizing the plot
ax.set_title('Distribution of individual emotions')
ax.set_xlabel('Emotion')
ax.set_ylabel('Frequency')
ax.set_xticks(emotion_ids)

# Show the plot
plt.show()

In [None]:
from collections import defaultdict

# Count how many rows share each combination of the values found at
# positions 2..9 of a row.
d = defaultdict(int)

for _, row in df.iterrows():
    combo = tuple(row.iloc[2:10])
    d[combo] += 1

In [None]:
# Print every combination with its count, most frequent first.
for combo, count in sorted(d.items(), key=lambda item: item[1], reverse=True):
    print(combo, count)

In [None]:
# Number of distinct keys (label combinations) counted in `d`.
len(d)

## Modeling

In [None]:
# Class id (stored as a string) -> emotion name.
id2labels = {
    '0': "anger",
    '1': "anticipation",
    '2': "disgust",
    '3': "fear",
    '4': "joy",
    '5': "sadness",
    '6': "surprise",
    '7': "trust",
}

# change order key value to recode through retrieval
# (built from id2labels itself, so the two mappings cannot drift apart)
labels2id = {v: k for k, v in id2labels.items()}