In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    train_test_split,
)
import csv
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [3]:
raw_file_path = "data/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-ForVariousLanguages.txt"

In [4]:
raw_df = pd.read_table("data/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-ForVariousLanguages.txt", sep="\t", header=0, low_memory=False)

In [5]:
raw_df = raw_df.iloc[:, :11]
csv_path = 'data/lexicon.csv'
raw_df.to_csv(csv_path, index=False, header=True)
raw_df.head(50)


Unnamed: 0,English Word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,aback,0,0,0,0,0,0,0,0,0,0
1,abacus,0,0,0,0,0,0,0,0,0,1
2,abandon,0,0,0,1,0,1,0,1,0,0
3,abandoned,1,0,0,1,0,1,0,1,0,0
4,abandonment,1,0,0,1,0,1,0,1,1,0
5,abate,0,0,0,0,0,0,0,0,0,0
6,abatement,0,0,0,0,0,0,0,0,0,0
7,abba,0,0,0,0,0,0,1,0,0,0
8,abbot,0,0,0,0,0,0,0,0,0,1
9,abbreviate,0,0,0,0,0,0,0,0,0,0


In [6]:
def load_emotion_lexicon(file_path):
    emotion_dict = {}
    row_count = 0
    skipped_rows = 0
   
    # List of emotion columns in order
    emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                'negative', 'positive', 'sadness', 'surprise', 'trust']
   
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            reader = csv.reader(file)
           
            for row in reader:
                row_count += 1
               
                if len(row) != 11:  # Word + 10 emotions
                    print(f"Warning: Row {row_count} has incorrect number of columns: {len(row)}")
                    skipped_rows += 1
                    continue
   
                word = row[0]
                
                if ' ' in word:
                    continue
                # Convert the binary values to emotion list
                present_emotions = [
                    emotion for emotion, value in zip(emotions, row[1:])
                    if value.strip() == '1'
                ]
               
                # Only add words that have at least one emotion
                if present_emotions:
                    emotion_dict[word] = present_emotions
                else:
                    emotion_dict[word] = []
                   
        print(f"Processed {row_count} rows")
        print(f"Skipped {skipped_rows} rows")
        return emotion_dict
       
    except FileNotFoundError:
        print(f"Error: Could not find file {file_path}")
        return None
    except Exception as e:
        print(f"Error processing file: {str(e)}")
        return None

In [7]:
sentiment_lists = load_emotion_lexicon(csv_path)

df = pd.DataFrame({'word': sentiment_lists.keys(), 'emotions': sentiment_lists.values()})
df

Processed 14155 rows
Skipped 0 rows


Unnamed: 0,word,emotions
0,aback,[]
1,abacus,[trust]
2,abandon,"[fear, negative, sadness]"
3,abandoned,"[anger, fear, negative, sadness]"
4,abandonment,"[anger, fear, negative, sadness, surprise]"
...,...,...
14149,zone,[]
14150,zoo,[]
14151,zoological,[]
14152,zoology,[]


In [None]:
df.to_csv('data/emotion_lists.csv', header=True, index=False)

In [6]:
encoder = LabelEncoder()

y = encoder.fit_transform(raw_df['Emotion'])

print(encoder.classes_)

['anger' 'anticipation' 'disgust' 'fear' 'joy' 'negative' 'positive'
 'sadness' 'surprise' 'trust' nan]


In [7]:
train_df, test_df = train_test_split(raw_df, test_size=0.3, random_state=123)

X_train, y_train = (
    train_df.drop(columns=["Emotion"]),
    train_df["Emotion"]
)

X_test, y_test = (
    test_df.drop(columns=["Emotion"]),
    test_df["Emotion"]
)

model = LogisticRegression()
model.fit(train_df, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

ValueError: could not convert string to float: 'cash'

In [None]:
train_df.info()

In [None]:
raw_df.describe(include="all")