# Dataset Pre-Processing

The datasets are pre-processed here, and all of the more complex emotions are remapped
to 5 baseline emotions:
- happy  
- sad  
- angry  
- surprised  
- neutral  

In [1]:
import pandas as pd
import re, json

# Do not truncate DataFrame output: show every row and every column.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Baseline labels that the tag maps below target:
# happy, sad, neutral, surprised, angry


In [2]:
def remove_quotation(text):
    """Strip the wrapping of a raw value: whitespace, quotes and a trailing comma.

    After trimming surrounding whitespace, at most one leading double quote is
    removed, then at most one trailing comma, then at most one trailing double
    quote (in that order).

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = text.strip()
    if cleaned[:1] == '"':
        cleaned = cleaned[1:]

    # Order matters: the comma sits after the closing quote, so it goes first.
    for trailing in (',', '"'):
        if cleaned[-1:] == trailing:
            cleaned = cleaned[:-1]
    return cleaned


def elf_to_df(path):
    """Build a DataFrame from a file by scanning it for `key: value` lines.

    The file is read as UTF-8 text, line by line, rather than parsed as JSON.
    Every line has its square brackets removed; a line containing a colon is
    split at the first colon into a key and a value, and the value is appended
    to the list collected for that key.

    Args:
        path: Path of the file to read.

    Returns:
        A pandas DataFrame with one column per distinct key, holding the values
        in the order they were encountered.
        NOTE(review): the DataFrame constructor needs every key to have
        collected the same number of values.
    """
    columns = {}
    with open(path, 'r', encoding="utf8") as handle:
        for raw_line in handle:
            # Square brackets are removed anywhere on the line, value included.
            stripped = re.sub(r'[\[\]]', "", raw_line).strip()
            if ":" not in stripped:
                continue

            field, content = stripped.split(':', 1)
            field = field.replace("\"", "").replace(',', "")
            # The literal escape sequences \u0092 and \u0085 are rewritten as an
            # apostrophe and an ellipsis.
            content = content.replace(r"\u0092", "'").replace(r"\u0085", "...")

            columns.setdefault(field, []).append(remove_quotation(content))
    return pd.DataFrame(columns)

def get_column_name(row):
    """Return the label of the first flagged column in a row.

    Args:
        row: A pandas Series (one DataFrame row) whose index holds the column
            names.

    Returns:
        The first index label, other than 'text', whose value equals 1, or
        None when no such column exists.
    """
    flagged = (
        label
        for label in row.index
        if label != 'text' and row[label] == 1
    )
    return next(flagged, None)

def json_lines_dump(dic, out_path):
    """Write an iterable of JSON-serializable items as newline-separated JSON.

    Each item is serialized with `json.dumps` and the results are joined with
    a single newline; no newline is written after the last item.

    Args:
        dic: Iterable of JSON-serializable objects (one output line each).
        out_path: Destination file path; an existing file is overwritten.
    """
    # Serialize everything before touching the file, so a serialization error
    # leaves an existing output file untouched.
    serialized = '\n'.join(map(json.dumps, dic))
    with open(out_path, 'w') as sink:
        sink.write(serialized)


# Emotion lines

In [3]:
# Maps the EmotionLines labels onto the five baseline emotions
# (happy, sad, angry, surprised, neutral).
em_tag_map = {
    'anger': 'angry',
    'disgust': 'angry',
    'fear': 'sad',
    'joy': 'happy',
    'neutral': 'neutral',
    # NOTE(review): this entry previously pointed at the misspelled label
    # 'suprise'. It is kept on the surprise class with the canonical spelling;
    # confirm that is the intended target for this catch-all tag.
    'non-neutral': 'surprised',
    'sadness': 'sad',
    # The baseline label is 'surprised', not 'surprise'.
    'surprise': 'surprised',
}

In [4]:
# Load the dev, test and train splits of the Friends subset, in that order.
df, df1, df2 = map(
    elf_to_df,
    (
        "./datasets/raw/EmotionLines/Friends/friends_dev.json",
        "./datasets/raw/EmotionLines/Friends/friends_test.json",
        "./datasets/raw/EmotionLines/Friends/friends_train.json",
    ),
)


In [5]:
# Stack the three splits into a single DataFrame (each split keeps its own row index).
eml_df = pd.concat([df,df1,df2])

In [6]:
# Inspect which columns the parsed files produced.
eml_df.columns

Index(['speaker', 'utterance', 'emotion', 'annotation'], dtype='object')

In [7]:
# Keep only the utterance and its emotion label, rename them to the shared
# 'text' / 'EmotionTag' schema, and discard rows with missing values.
eml_df = (
    eml_df
    .drop(columns=['speaker', 'annotation'])
    .rename(columns={'emotion': 'EmotionTag', 'utterance': 'text'})
    .dropna()
)
eml_df.head()

Unnamed: 0,text,EmotionTag
0,"Oh my God, he's lost it. He's totally lost it.",non-neutral
1,What?,surprise
2,"Or! Or, we could go to the bank, close our acc...",neutral
3,You're a genius!,joy
4,"Aww, man, now we won't be bank buddies!",sadness


In [8]:
# Translate each raw label to its baseline emotion; a label that is not a key
# of em_tag_map becomes NaN.
eml_df['EmotionTag'] = eml_df['EmotionTag'].map(em_tag_map)

In [9]:
# One dict per row, keyed by column name ('text', 'EmotionTag').
em_dict = eml_df.to_dict(orient='records')


In [10]:
# Persist the processed EmotionLines records, one JSON object per line.
json_lines_dump(em_dict,'./datasets/processed/emotion_lines.jsonl')

# Empathetic Dialogues

In [11]:
from datasets import load_dataset

# Maps the emotion labels used as keys below onto the five baseline emotions.
# Entries are grouped by the baseline emotion they collapse into.
# NOTE(review): 'disgusted' maps to 'sad' here, whereas em_tag_map and
# goemotion_tag_map both send 'disgust' to 'angry' — confirm which is intended.
emp_tag_map = {
    # happy
    'anticipating': 'happy',
    'impressed': 'happy',
    'excited': 'happy',
    'confident': 'happy',
    'grateful': 'happy',
    'faithful': 'happy',
    'trusting': 'happy',
    'prepared': 'happy',
    'joyful': 'happy',
    'hopeful': 'happy',
    'content': 'happy',
    'proud': 'happy',
    # sad
    'guilty': 'sad',
    'sad': 'sad',
    'afraid': 'sad',
    'disgusted': 'sad',
    'jealous': 'sad',
    'embarrassed': 'sad',
    'lonely': 'sad',
    'ashamed': 'sad',
    'devastated': 'sad',
    'apprehensive': 'sad',
    'anxious': 'sad',
    'terrified': 'sad',
    # angry
    'furious': 'angry',
    'angry': 'angry',
    'annoyed': 'angry',
    # surprised
    'surprised': 'surprised',
    # neutral
    'caring': 'neutral',
    'sentimental': 'neutral',
    'nostalgic': 'neutral',
    'disappointed': 'neutral',
}

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Fetch (or reuse the cached copy of) the 'empathetic_dialogues' dataset via
# the `datasets` loader; the result is iterated split by split further down.
ds = load_dataset('empathetic_dialogues')

Found cached dataset empathetic_dialogues (/root/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf)
100%|██████████| 3/3 [00:00<00:00, 140.22it/s]


In [13]:
# Flatten every split of the dataset into {'text', 'EmotionTag'} records.
#
# The utterances contain the literal token '_comma_' in place of a comma.
# NOTE(review): in the EmpatheticDialogues release the token stands in for the
# bare ',' character only, so the whitespace that followed the comma is still
# present in the text. The exact inverse is therefore ',' — substituting ', '
# would produce a double space ("people,  we") — confirm against the raw data.
#
# A `context` value missing from emp_tag_map raises KeyError, so an unmapped
# label cannot slip through silently.
empdio_dict = [
    {
        'text': example['utterance'].replace('_comma_', ','),
        'EmotionTag': emp_tag_map[example['context']],
    }
    for split_name in ds
    for example in ds[split_name]
]



In [14]:
# Persist the processed EmpatheticDialogues records, one JSON object per line.
json_lines_dump(empdio_dict,'./datasets/processed/empathetic_dialogue.jsonl')

# GoEmotions

In [15]:
# Maps the GoEmotions labels onto the five baseline emotions
# (happy, sad, angry, surprised, neutral).
# NOTE(review): several of the remaining assignments (e.g. 'amusement',
# 'nervousness', 'optimism') are judgment calls and were left unchanged.
goemotion_tag_map = {
    'admiration': 'happy',
    'amusement': 'surprised',
    'anger': 'angry',
    'annoyance': 'angry',
    'approval': 'happy',
    'caring': 'happy',
    'confusion': 'sad',
    'curiosity': 'happy',
    'desire': 'neutral',
    'disappointment': 'sad',
    'disapproval': 'angry',
    'disgust': 'angry',
    'embarrassment': 'sad',
    'excitement': 'happy',
    'fear': 'sad',
    'gratitude': 'happy',
    'grief': 'sad',
    # Was mapped to 'sad', which inverted the label.
    'joy': 'happy',
    'love': 'happy',
    'nervousness': 'surprised',
    'neutral': 'neutral',
    'optimism': 'neutral',
    'pride': 'neutral',
    'realization': 'neutral',
    'relief': 'neutral',
    'remorse': 'sad',
    'sadness': 'sad',
    # The baseline label is 'surprised', not 'surprise'.
    'surprise': 'surprised',
}

In [16]:
# Read the three GoEmotions CSV shards.
go1, go2, go3 = (
    pd.read_csv(shard_path)
    for shard_path in (
        './datasets/raw/GoEmotions/data/full_dataset/goemotions_1.csv',
        './datasets/raw/GoEmotions/data/full_dataset/goemotions_2.csv',
        './datasets/raw/GoEmotions/data/full_dataset/goemotions_3.csv',
    )
)

# Stack the shards, then discard the columns that are not used below.
go_df = pd.concat([go1, go2, go3]).drop(
    columns=[
        'id', 'author', 'subreddit', 'link_id', 'parent_id',
        'created_utc', 'rater_id', 'example_very_unclear',
    ]
)


In [17]:
# Label each row with the first column (other than 'text') whose value is 1;
# rows without such a column get None.
go_df['EmotionTag'] = go_df.apply(get_column_name, axis=1)

In [18]:
# The per-emotion indicator columns are no longer needed once 'EmotionTag'
# has been derived from them.
go_df = go_df.drop(
    labels=[
        'admiration', 'amusement', 'anger', 'annoyance',
        'approval', 'caring', 'confusion', 'curiosity',
        'desire', 'disappointment', 'disapproval', 'disgust',
        'embarrassment', 'excitement', 'fear', 'gratitude',
        'grief', 'joy', 'love', 'nervousness',
        'optimism', 'pride', 'realization', 'relief',
        'remorse', 'sadness', 'surprise', 'neutral',
    ],
    axis='columns',
)

In [19]:
# Remove rows with a missing value, e.g. rows for which get_column_name
# returned None because no emotion column was equal to 1.
go_df.dropna(inplace=True)

In [20]:
# Translate each GoEmotions label to its baseline emotion; a label that is not
# a key of goemotion_tag_map becomes NaN.
go_df['EmotionTag'] = go_df['EmotionTag'].map(goemotion_tag_map)

In [21]:
# One dict per row, keyed by column name.
go_dict = go_df.to_dict(orient='records')

In [22]:
# Persist the processed GoEmotions records, one JSON object per line.
json_lines_dump(go_dict,'./datasets/processed/go_emotions.jsonl')

# Concatenating

In [28]:
from itertools import chain

# Wrap the three record lists in one outer list. Chaining over that outer list
# yields the three lists themselves — not yet the individual records — so
# `final` is a one-shot iterator that still has to be flattened once more.
concat_dicts = [[go_dict, empdio_dict, em_dict]]
final = chain.from_iterable(concat_dicts)

In [29]:
# Flatten the iterator of record lists into a single list of records.
# Not idempotent: `final` is rebound here, so re-running this cell on its own
# would chain over the records themselves instead of over the three lists.
final = list(chain.from_iterable(final))
print(len(final))
# Sanity check: merging must neither lose nor duplicate records.
assert len(final) == sum(map(len, (go_dict, empdio_dict, em_dict)))

321963


In [None]:
# Persist the merged records from all three datasets, one JSON object per line.
# NOTE(review): the content is JSON Lines but the file is named '.json', unlike
# the per-dataset '.jsonl' outputs — confirm what downstream readers expect
# before renaming.
json_lines_dump(final,'./datasets/processed/emotions_ds.json')

In [26]:
# empdio_dict