In [451]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import os
from googletrans import Translator

In [452]:
# Locations used throughout the notebook: the folder holding the model's raw
# JSON responses, and the folder/file name of the NRC emotion lexicon.
output_folder = "./output/aya-101/"
dictionary_folder = "./dataset/"
dictionary_file = "NRC-Emotion-Lexicon-ForVariousLanguages.txt"

In [453]:
def load_dictionary(dictionary_folder, dictionary_file):
    """Load the emotion lexicon and reshape it to one row per (word, emotion).

    The file is read as tab-separated text. The 'negative' and 'positive'
    columns are dropped, the eight emotion columns are melted into a single
    'AffectCategory' column, and only associations flagged with 1 are kept.

    Parameters
    ----------
    dictionary_folder : str
        Folder containing the lexicon, with or without a trailing separator.
    dictionary_file : str
        File name of the lexicon inside ``dictionary_folder``.

    Returns
    -------
    pandas.DataFrame
        Every non-emotion column of the file (the identifier columns) plus
        'AffectCategory'.

    Raises
    ------
    KeyError
        If the file has no 'negative' or 'positive' column.
    """
    emotion_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']

    # os.path.join instead of string concatenation so the folder may be given
    # with or without a trailing path separator.
    lexicon_path = os.path.join(dictionary_folder, dictionary_file)
    lexicon = pd.read_csv(lexicon_path, sep='\t').drop(['negative', 'positive'], axis=1)

    # Everything that is not an emotion flag identifies the word
    # (for example its spelling in each language).
    identifier_columns = [col for col in lexicon.columns if col not in emotion_columns]

    # Wide -> long: one row per (word, emotion) with the 0/1 flag in 'value'.
    long_form = pd.melt(
        lexicon,
        id_vars=identifier_columns,
        var_name='AffectCategory',
        value_name='value',
    )

    # Keep only the associations that are actually set, then drop the flag.
    return long_form[long_form['value'] == 1].drop('value', axis=1)

In [454]:
# Substrings that mark a response as a refusal even when it contains braces.
_REFUSAL_MARKERS = (
    "adopt the identity",
    'racial or ethnic',
    'I cannot fulfill your request',
    'I cannot provide a response',
)


def _parse_response(value):
    """Turn one raw model response (a string) into a record dict.

    Raises json.JSONDecodeError when no JSON object can be recovered.
    """
    text = value.strip()

    # No JSON object at all, or an explicit refusal: keep the text as explanation.
    if "{" not in value or any(marker in value for marker in _REFUSAL_MARKERS):
        return {"emotion": "Cannot adopt emotion", "explanation": text}

    # Truncated object: close the dangling string (if any) and the object.
    # The check is done on the stripped text so trailing whitespace after the
    # closing quote does not lead to a doubled quote.
    if "}" not in value:
        closing = "}" if text.endswith("\"") else "\"}"
        return json.loads(text + closing)

    # Single-line response: the whole text is the JSON object.
    if '\n' not in value:
        return json.loads(text)

    # Multi-line response. The second line is tried first; the blank-line
    # separated layouts are fallbacks that are only consulted when that fails.
    candidates = [value.split('\n')[1]]
    if '\n\n' in value:
        candidates.append(value.split('\n\n')[1])
    crlf_blocks = value.split('\r\n\r\n')
    if len(crlf_blocks) >= 4:
        candidates.append(crlf_blocks[3])

    first_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate.strip())
        except json.JSONDecodeError as exc:
            if first_error is None:
                first_error = exc
    raise first_error


def report_load(output_folder, output_file):
    """Load one results file and return its responses as a DataFrame.

    The file must contain a JSON object; each of its values is one raw model
    response. Responses that cannot be parsed are printed and skipped.

    Parameters
    ----------
    output_folder : str
        Folder containing the results file.
    output_file : str
        File name with underscore-separated parts, e.g.
        ``english_Asian_man_1_results.json``. The second part names the target
        group; the third part is appended unless the group is one of
        'male'/'female' (english files) or 'Mann'/'Frau' (german files).

    Returns
    -------
    pandas.DataFrame
        One row per parsed response with the parsed keys as columns, a
        lower-cased and stripped 'emotion' column (always present) and a
        'target_group' column.

    Raises
    ------
    IndexError
        If the file name does not have enough underscore-separated parts.
    """
    path = os.path.join(output_folder, output_file)
    print(path)

    name_parts = output_file.split("_")
    target_group_name = name_parts[1]
    two_word_english = "english" in output_file and target_group_name not in ("male", "female")
    two_word_german = "german" in output_file and target_group_name not in ("Mann", "Frau")
    if two_word_english or two_word_german:
        target_group_name += " " + name_parts[2]

    # Explicit encoding so non-ASCII text is read the same on every platform.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    records = []
    for i, value in enumerate(data.values()):
        try:
            records.append(_parse_response(value))
        except Exception as e:
            # Deliberately best-effort: report the response and move on.
            print(e)
            if value != '':
                shown = value.strip() if isinstance(value, str) else repr(value)
                print(i, "\t", shown)

    emotion = pd.DataFrame(records)

    # Responses use either 'emotion' or 'Emotion' as key; merge them into 'emotion'.
    if 'Emotion' in emotion.columns:
        if 'emotion' in emotion.columns:
            emotion['emotion'] = emotion['emotion'].fillna(emotion['Emotion'])
        else:
            emotion['emotion'] = emotion['Emotion']
    elif 'emotion' not in emotion.columns:
        # Nothing usable was parsed: still return the documented columns.
        emotion['emotion'] = pd.Series(index=emotion.index, dtype=object)

    emotion['emotion'] = emotion['emotion'].str.lower().str.strip()
    emotion['target_group'] = target_group_name
    return emotion

In [455]:

# Define a function to handle the replacement logic
def replace_none(row):
    """Return the affect category of ``row``, or a placeholder label if missing.

    ``row`` must provide 'AffectCategory' and 'emotion'. A present category is
    returned unchanged. Otherwise the result is 'Cannot adopt emotion' when the
    emotion text contains that phrase (case-insensitively), and
    'Cannot map the emotion' in every other case.
    """
    category = row['AffectCategory']
    if not pd.isna(category):
        return category

    reported = row['emotion']
    if pd.isna(reported):
        return 'Cannot map the emotion'
    if 'cannot adopt emotion' in reported.lower():
        return 'Cannot adopt emotion'
    return 'Cannot map the emotion'


def merged_data(emotion, dictionary, output_folder, output_file, word_column='English Word'):
    """Attach affect categories to the reported emotions and save the result.

    Each reported emotion is looked up (lower-cased) in ``word_column`` of the
    lexicon. Rows without a match get the placeholder label produced by
    ``replace_none``. The merged table is written as a tab-separated file to
    ``<output_folder>/refactored/`` with '.json' in ``output_file`` replaced by
    '_refactored.csv', and a per-category count is printed.

    Parameters
    ----------
    emotion : pandas.DataFrame
        Must contain the columns 'emotion' and 'target_group'.
    dictionary : pandas.DataFrame
        Must contain ``word_column`` and 'AffectCategory'.
    output_folder : str
        Folder under which the 'refactored' sub-folder is created.
    output_file : str
        Name of the results file the data came from.
    word_column : str
        Lexicon column to match the emotions against.

    Returns
    -------
    pandas.DataFrame
        The merged table; one row per (response, matching category).
    """
    lookup = dictionary[[word_column, 'AffectCategory']].copy()
    lookup[word_column] = lookup[word_column].str.lower()
    # pandas matches null join keys with each other, so lexicon rows without a
    # word would be attached to every response whose emotion is missing.
    lookup = lookup.dropna(subset=[word_column])
    # Several lexicon entries can share the same word in this column; keep each
    # (word, category) pair once so one response is not counted repeatedly.
    lookup = lookup.drop_duplicates()

    merged = pd.merge(emotion, lookup, left_on='emotion', right_on=word_column, how='left')
    merged = merged.dropna(subset=['target_group'])
    # Row-wise apply on an empty frame returns a frame, not a column; skip it.
    if not merged.empty:
        merged['AffectCategory'] = merged.apply(replace_none, axis=1)

    refactored_dir = os.path.join(output_folder, "refactored")
    os.makedirs(refactored_dir, exist_ok=True)
    csv_name = output_file.replace('.json', '_refactored.csv')
    merged.to_csv(os.path.join(refactored_dir, csv_name), index=False, header=True, sep='\t')

    target_group = merged[['AffectCategory', 'target_group']].groupby('AffectCategory').count()
    print("The unique target groups are the following: \n", merged['target_group'].unique())
    print("The output is the following: \n", target_group)
    return merged

In [456]:
# Build the lexicon once, then convert every results file in the output folder.
dictionary = load_dictionary(dictionary_folder, dictionary_file)

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# processing order (and the printed log) differ between runs.
for output_file in sorted(os.listdir(output_folder)):
    # Skip the folders/files this notebook itself produces.
    if 'refactored' in output_file or 'images' in output_file:
        continue
    # Only JSON results can be parsed; ignore anything else in the folder.
    if not output_file.endswith('.json'):
        continue
    emotion = report_load(output_folder, output_file)
    # German result files are matched against the German lexicon column.
    word_column = 'German' if 'german' in output_file else "English Word"
    merged = merged_data(emotion, dictionary, output_folder, output_file, word_column=word_column)

./output/aya-101/english_Asian_man_1_results.json
The unique target groups are the following: 
 ['Asian man']
The output is the following: 
                         target_group
AffectCategory                      
Cannot map the emotion            22
anger                            418
anticipation                     115
disgust                          237
fear                             244
joy                              161
sadness                          596
surprise                          35
trust                            356
./output/aya-101/english_Asian_man_2_results.json
The unique target groups are the following: 
 ['Asian man']
The output is the following: 
                         target_group
AffectCategory                      
Cannot map the emotion            26
anger                            414
anticipation                     108
disgust                          233
fear                             240
joy                              159
sadness        