In [19]:
import pandas as pd
import csv

In [20]:
# Location of the NRC Emotion Lexicon source file that the loading cell reads.
raw_file_path = (
    "data/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-ForVariousLanguages.txt"
)

In [21]:
# Read the tab-separated lexicon from the path configured in `raw_file_path`
# (previously the same literal was repeated here, so the two could drift apart).
# The first line of the file is used as the column header.
raw_df = pd.read_table(raw_file_path, sep="\t", header=0, low_memory=False)

In [None]:
# Keep the first 11 columns (the word column plus the English emotion flags),
# drop the categories that are not used, and rename the rest to the label
# names the loader below expects.
#
# `errors='ignore'` makes the cell safe to re-run: on a second run the dropped
# columns are already gone, which previously raised a KeyError. `rename`
# already ignores missing keys, so the whole chain is idempotent.
#
# NOTE(review): 'positive' is relabelled as 'neutral' here — confirm that this
# mapping is intended, since the two are not obviously the same concept.
raw_df = (
    raw_df.iloc[:, :11]
    .drop(columns=['anticipation', 'negative', 'trust'], errors='ignore')
    .rename(columns={'anger': 'angry',
                     'joy': 'happy',
                     'sadness': 'sad',
                     'positive': 'neutral'})
)

# The loader reads the emotion flags by position, so fail loudly if the column
# layout is not the one it relies on (word column followed by these seven).
expected_emotion_columns = ['angry', 'disgust', 'fear', 'happy',
                            'neutral', 'sad', 'surprise']
assert list(raw_df.columns[1:]) == expected_emotion_columns, (
    f"Unexpected lexicon columns: {list(raw_df.columns)}"
)

csv_path = 'data/lexicon.csv'
raw_df.to_csv(csv_path, index=False, header=True)
raw_df.head(50)

Unnamed: 0,English Word,angry,disgust,fear,happy,neutral,sad,surprise
0,aback,0,0,0,0,0,0,0
1,abacus,0,0,0,0,0,0,0
2,abandon,0,0,1,0,0,1,0
3,abandoned,1,0,1,0,0,1,0
4,abandonment,1,0,1,0,0,1,1
5,abate,0,0,0,0,0,0,0
6,abatement,0,0,0,0,0,0,0
7,abba,0,0,0,0,1,0,0
8,abbot,0,0,0,0,0,0,0
9,abbreviate,0,0,0,0,0,0,0


In [15]:
def load_emotion_lexicon(file_path):
    """Load a binary emotion lexicon CSV into a ``{word: [emotions]}`` mapping.

    Every row is expected to hold a word followed by seven ``0``/``1`` flags,
    in the order angry, disgust, fear, happy, neutral, sad, surprise.

    Row handling:
      * A leading header row (flag cells equal to the emotion names) is
        skipped explicitly instead of relying on the header word happening
        to contain a space.
      * Rows whose column count is not word + 7 flags are reported with a
        warning and counted as skipped.
      * Words containing a space are ignored (they are not counted as
        skipped rows).
      * Words with no flag set are kept and map to an empty list.
      * If a word occurs more than once, the last occurrence wins.

    Args:
        file_path: Path to the UTF-8 encoded CSV file.

    Returns:
        dict mapping each word to the list of emotion names flagged ``1``,
        or ``None`` when the file is missing or cannot be processed (the
        reason is printed in that case).
    """
    # Emotion columns, in the order they follow the word column.
    emotions = ['angry', 'disgust', 'fear', 'happy',
                'neutral', 'sad', 'surprise']
    expected_columns = len(emotions) + 1  # Word + 7 emotions

    emotion_dict = {}
    row_count = 0
    skipped_rows = 0

    try:
        # newline='' is what the csv module requires so that it can handle
        # line endings (including newlines inside quoted fields) itself.
        with open(file_path, 'r', encoding='utf-8', newline='') as file:
            reader = csv.reader(file)

            for row_count, row in enumerate(reader, start=1):
                if len(row) != expected_columns:
                    print(f"Warning: Row {row_count} has incorrect number of columns: {len(row)}")
                    skipped_rows += 1
                    continue

                flags = [value.strip() for value in row[1:]]

                # The header row carries the emotion names where data rows
                # carry 0/1 flags; it is still included in the processed count.
                if row_count == 1 and [flag.lower() for flag in flags] == emotions:
                    continue

                word = row[0]

                # Multi-word entries are left out of the lexicon.
                if ' ' in word:
                    continue

                # Convert the binary flags to the list of emotion names;
                # words without any emotion get an empty list.
                emotion_dict[word] = [
                    emotion for emotion, flag in zip(emotions, flags)
                    if flag == '1'
                ]

        print(f"Processed {row_count} rows")
        print(f"Skipped {skipped_rows} rows")
        return emotion_dict

    except FileNotFoundError:
        print(f"Error: Could not find file {file_path}")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error processing file: {str(e)}")
        return None

In [16]:
sentiment_lists = load_emotion_lexicon(csv_path)

# load_emotion_lexicon prints the reason and returns None when the file is
# missing or unreadable; stop here with a clear message instead of failing
# later with an AttributeError on `None.keys()`.
if sentiment_lists is None:
    raise RuntimeError(f"Could not load emotion lexicon from {csv_path}")

# One row per word; the 'emotions' column holds the list of emotion names.
df = pd.DataFrame({'word': list(sentiment_lists.keys()),
                   'emotions': list(sentiment_lists.values())})
df

Processed 14155 rows
Skipped 0 rows


Unnamed: 0,word,emotions
0,aback,[]
1,abacus,[]
2,abandon,"[fear, sad]"
3,abandoned,"[angry, fear, sad]"
4,abandonment,"[angry, fear, sad, surprise]"
...,...,...
14149,zone,[]
14150,zoo,[]
14151,zoological,[]
14152,zoology,[]


In [17]:
# Persist the word -> emotions table with a header row and without the index.
# NOTE(review): the 'emotions' cells are Python lists, so they are presumably
# written as their string form and need parsing when read back — confirm
# against whatever consumes this file.
df.to_csv(
    'data/emotion_lists.csv',
    index=False,
    header=True,
)