# **Lexicon Labeling**

© Kuncahyo Setyo Nugroho

In [None]:
import os
from collections import Counter
from json import load

import numpy as np
import pandas as pd
from textblob import TextBlob

import nltk
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.preprocessing import MultiLabelBinarizer # for multi-label binarization

In [None]:
def __build_word_affect__(self):
    """Populate the affect attributes of ``self`` from its tokenised words.

    Reads ``self.words`` and ``self.__lexicon__`` (a mapping from a word to
    its affect labels) and stores four attributes on ``self``:

    * ``affect_list`` - every affect label that was hit, in word order,
      repeats included.
    * ``affect_dict`` - matched word -> the lexicon entry for that word.
    * ``raw_emotion_scores`` - affect label -> number of hits.
    * ``affect_frequencies`` - affect label -> share of all hits. The six
      emotions fear, anger, surprise, sadness, disgust and joy are always
      present and default to 0.0.
    """
    lexicon = self.__lexicon__
    labels = []
    matched = {}

    # Collect the lexicon entry of every word the lexicon knows about.
    for token in self.words:
        if token in lexicon:
            labels.extend(lexicon[token])
            matched[token] = lexicon[token]

    hits_per_label = Counter(labels)
    total_hits = float(sum(hits_per_label.values()))

    # Start from an all-zero table so the six emotions are always reported;
    # labels outside this set are appended in first-seen order.
    shares = dict.fromkeys(
        ('fear', 'anger', 'surprise', 'sadness', 'disgust', 'joy'), 0.0
    )
    for label, hits in hits_per_label.items():
        shares[label] = float(hits) / total_hits

    self.affect_list = labels
    self.affect_dict = matched
    self.raw_emotion_scores = dict(hits_per_label)
    self.affect_frequencies = shares

def top_emotions(self):
    """Store the highest-scoring emotion(s) on ``self.top_emotions``.

    Reads ``self.affect_frequencies`` and keeps every ``(emotion, score)``
    pair whose score equals the maximum, so ties yield several entries.
    Entries follow the key order of ``self.affect_frequencies``.
    """
    scores = self.affect_frequencies
    peak = max(scores.values())
    self.top_emotions = [
        (name, peak) for name, score in scores.items() if score == peak
    ]

def emotions_value(self):
    """Store the names of all detected emotions on ``self.emotions_value``.

    Reads ``self.affect_frequencies`` and keeps every emotion whose
    frequency is greater than 0.0, in the key order of that mapping. The
    result is an empty list when no emotion has a positive frequency.
    """
    # The maximum score is irrelevant here, so it is not computed; that also
    # keeps an empty frequency table from raising.
    self.emotions_value = [
        name
        for name, score in self.affect_frequencies.items()
        if score > 0.0
    ]

class NRCLex:
    """Lexicon-based emotion analyser for a single piece of text.

    Lexicon source is (C) 2016 National Research Council Canada (NRC) and
    library is for research purposes only.
    Source: http://sentiment.nrc.ca/lexicons-for-research/

    After ``load_raw_text`` has been called the instance exposes ``text``,
    ``words`` and ``sentences``, plus the attributes filled in by the
    module-level helpers it invokes: ``affect_list``, ``affect_dict``,
    ``raw_emotion_scores``, ``affect_frequencies``, ``top_emotions`` and
    ``emotions_value``.
    """

    def __init__(self, lexicon_file='nrc_id.json'):
        """Load the word-to-emotions lexicon from a JSON file.

        Args:
            lexicon_file: Path of the JSON lexicon to read.
        """
        # JSON text is UTF-8; naming the encoding keeps non-ASCII lexicon
        # entries intact on platforms whose default encoding differs.
        with open(lexicon_file, 'r', encoding='utf-8') as json_file:
            self.__lexicon__ = load(json_file)

    def load_raw_text(self, text):
        """Analyse ``text`` and refresh every result attribute.

        The text is lower-cased, split into words and sentences with
        TextBlob, and then scored against the lexicon.

        Args:
            text: The raw string to analyse.
        """
        self.text = text.lower()
        blob = TextBlob(self.text)
        self.words = list(blob.words)
        self.sentences = list(blob.sentences)
        # Order matters: the two summaries read the frequencies that
        # __build_word_affect__ stores on the instance.
        __build_word_affect__(self)
        top_emotions(self)
        emotions_value(self)

In [None]:
# Label a text with every emotion the lexicon detects in it
def lex_label(text):
    """Return the emotions detected in ``text`` by the NRC lexicon.

    Args:
        text: The raw string to label.

    Returns:
        The analyser's ``emotions_value`` for this text.
    """
    # Building an NRCLex reads and parses the lexicon file, so create the
    # analyser on the first call and reuse it afterwards instead of paying
    # that cost for every text. load_raw_text overwrites all per-text state.
    analyzer = getattr(lex_label, '_analyzer', None)
    if analyzer is None:
        analyzer = NRCLex(lexicon_file='nrc_id.json')
        lex_label._analyzer = analyzer

    analyzer.load_raw_text(text)
    return analyzer.emotions_value

In [None]:
%%time
# Annotate each per-emotion tweet file with lexicon labels and write two
# outputs per file: the labelled tweets and a one-hot (multi-label) version.
source_folder_path = 'data/data-for-annotation'
destination_folder_path = 'data/result-lexicon-annotation'

emotion_list = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# to_csv does not create missing folders; creating the nested one up front
# also creates the destination folder itself.
os.makedirs(f'{destination_folder_path}/binarizer', exist_ok=True)

for emotion in emotion_list:
    df = pd.read_csv(
        f'{source_folder_path}/{emotion}.csv',
        usecols=['tweet'],
        encoding='utf8',
    )
    df['emotion'] = df['tweet'].apply(lex_label)
    df.to_csv(f'{destination_folder_path}/{emotion}.csv')

    # The binarizer is fitted per file, so the indicator columns are the
    # labels that actually occur in this file.
    mlb = MultiLabelBinarizer()
    # pop() removes the list-valued column from df so that only the tweet
    # and the indicator columns end up in the binarized output.
    label_matrix = mlb.fit_transform(df.pop('emotion'))
    binarizer_df = df.join(
        pd.DataFrame(label_matrix, columns=mlb.classes_, index=df.index)
    )
    binarizer_df.to_csv(f'{destination_folder_path}/binarizer/{emotion}.csv')