# User input

In [12]:
# Folder holding the extracted (final) word datasets. It is passed to
# get_word_datasets(), which looks for files with 'positive' / 'negative' in
# their names, and to save_datasets(), which writes into a 'processed' subfolder.
drive_location = '/content/drive/MyDrive/nasdaq/concatenated datasets'

# Path to the Loughran-McDonald (LM) positive words dictionary (CSV, read with pd.read_csv)
lm_positive_words_location = '/content/drive/MyDrive/datasets/source datasets/lmd_positive_words.csv'

# Path to the Loughran-McDonald (LM) negative words dictionary (CSV, read with pd.read_csv)
lm_negative_words_location = '/content/drive/MyDrive/datasets/source datasets/lmd_negative_words.csv'

# Post Processor Class

In [None]:
pip install english-words==1.1.0

In [None]:
import pandas as pd
from gensim.parsing.preprocessing import STOPWORDS
import nltk
import gdown
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from english_words import english_words_lower_alpha_set as eng_words

class PostProcessor:
  """Clean, aggregate and filter word-level SHAP datasets and LM word lists.

  The SHAP datasets are cleaned, grouped per word (summing counts and
  aggregating SHAP values), lemmatized, stripped of stop words and finally
  restricted to words found in the English dictionary. The LM datasets are
  only normalised (strip + lower case) and lemmatized.

  None of the methods modify the DataFrames they receive; every step returns
  a new frame.
  """

  def __init__(self, datasets, lm_datasets, word_column, count_column, shap_values_column, sentence_uuid_column = None):
    """Store private copies of the inputs and derive the output column names.

    Args:
      datasets: iterable of DataFrames holding the SHAP generated words.
      lm_datasets: iterable of DataFrames holding the LM dictionary words.
      word_column: name of the column that contains the word.
      count_column: name of the column that contains the word count.
      shap_values_column: name of the column that contains the SHAP value.
      sentence_uuid_column: optional name of the column with sentence ids;
        when None, no sentence ids are aggregated.
    """
    # both inputs are copied so later changes made by the caller cannot leak in
    self.datasets = [dataset.copy(True) for dataset in datasets]
    self.lm_datasets = [lm_dataset.copy(True) for lm_dataset in lm_datasets]

    self.word_column = word_column
    self.count_column = count_column
    self.shap_values_column = shap_values_column
    self.sentence_uuid_column = sentence_uuid_column
    self.lemmatizer = WordNetLemmatizer()

    self.max_shap_value_column = f'max_{self.shap_values_column}'
    self.min_shap_value_column = f'min_{self.shap_values_column}'
    self.average_shap_value_column = f'average_{self.shap_values_column}'
    self.sum_shap_value_column = f'sum_{self.shap_values_column}'

    self.total_count_column = 'total'

  def __clean_words(self, dataset):
    """Return a copy with normalised words; words shorter than 3 characters are dropped."""
    dataset = dataset.copy()
    # Lower-case FIRST: the RoBERTa tokenizer marker is the upper-case 'Ġ',
    # which only matches the 'ġ' replacement below once it has been lowered.
    dataset[self.word_column] = dataset[self.word_column].apply(
        lambda word: str(word).lower().replace('ġ', '').strip())

    # A boolean mask (rather than dropping by index label) keeps the filter
    # correct even when the index contains repeated labels.
    long_enough = dataset[self.word_column].map(len) >= 3

    return dataset[long_enough]

  def __get_wordnet_pos(self, word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # any other tag is lemmatized as a noun
    return tag_dict.get(tag, wordnet.NOUN)

  def __lemmatize_words(self, dataset):
    """Return a copy whose word column holds the WordNet lemma of every word."""
    dataset = dataset.copy()
    dataset[self.word_column] = dataset[self.word_column].apply(
        lambda word: self.lemmatizer.lemmatize(word, self.__get_wordnet_pos(word)))

    return dataset

  def __remove_stop_words(self, dataset):
    """Return only the rows whose word is not in gensim's STOPWORDS."""
    return dataset[~dataset[self.word_column].isin(STOPWORDS)]

  def __create_additional_columns(self, dataset):
    """Return a copy extended with the max/min/average SHAP and total count columns."""
    dataset = dataset.copy()

    # before grouping, max, min and average are all equal to the raw shap value
    dataset[self.max_shap_value_column] = dataset[self.shap_values_column]
    dataset[self.min_shap_value_column] = dataset[self.shap_values_column]
    dataset[self.average_shap_value_column] = dataset[self.shap_values_column]

    # total count column is, at this moment, equal to the count column
    dataset[self.total_count_column] = dataset[self.count_column]

    return dataset

  def __group_duplicate_words(self, dataset):
    """Collapse rows that share a word into one row of aggregated statistics.

    Counts and SHAP values are summed, the average/max/min columns take the
    mean/max/min, and sentence ids (when a sentence id column is configured)
    are joined with ','. The summed SHAP column is renamed to
    `sum_<shap_values_column>`.
    """
    dataset = self.__create_additional_columns(dataset)

    aggregations = {
        self.count_column: 'sum',
        self.total_count_column: 'sum',
        self.shap_values_column: 'sum',
        self.average_shap_value_column: 'mean',
        self.max_shap_value_column: 'max',
        self.min_shap_value_column: 'min',
    }
    # sentence ids are optional; a None key would make the aggregation fail
    if self.sentence_uuid_column is not None:
      aggregations[self.sentence_uuid_column] = ','.join

    grouped = dataset.groupby(by=[self.word_column]).agg(aggregations).reset_index()

    # changing the name from shap_values_column to sum_shap_value_column
    return grouped.rename(columns={self.shap_values_column: self.sum_shap_value_column})

  def process_dataset(self, dataset):
    """Clean, group, lemmatize and stop-word filter one SHAP word dataset.

    The input DataFrame is left untouched; a new DataFrame is returned.
    """
    modified_dataset = self.__clean_words(dataset)

    # NOTE(review): lemmatization runs after grouping, so different inflections
    # that share a lemma stay as separate rows - confirm this is intended.
    modified_dataset = self.__group_duplicate_words(modified_dataset)
    modified_dataset = self.__lemmatize_words(modified_dataset)
    modified_dataset = self.__remove_stop_words(modified_dataset)

    return modified_dataset

  def clean_lm_words(self, dataset):
    """Return a copy of an LM dataset with stripped, lower-cased, lemmatized words."""
    dataset = dataset.copy()
    dataset[self.word_column] = dataset[self.word_column].apply(lambda word: str(word).strip().lower())

    return self.__lemmatize_words(dataset)

  def word_exist_in_dictionary(self, word):
    """Return True when the word is part of the english_words dictionary set."""
    return word in eng_words

  def remove_non_dictionary_words_from_dataset(self, dataset):
    """Return only the rows whose word exists in the English dictionary."""
    # astype(bool) keeps the mask boolean even for an empty dataset
    is_dictionary_word = dataset[self.word_column].map(self.word_exist_in_dictionary).astype(bool)

    return dataset.loc[is_dictionary_word]

  def run_processing_on_datasets(self):
    """Process every stored dataset.

    Returns:
      A tuple (datasets, lm_datasets): the processed SHAP datasets (also
      filtered to dictionary words) and the cleaned LM datasets, each in the
      order they were given to the constructor.
    """
    # the processing steps never mutate their input, so the stored frames can be reused
    datasets = [self.process_dataset(dataset) for dataset in self.datasets]
    lm_datasets = [self.clean_lm_words(lm_dataset) for lm_dataset in self.lm_datasets]

    datasets = [self.remove_non_dictionary_words_from_dataset(dataset) for dataset in datasets]

    return datasets, lm_datasets

In [4]:
def find_unnecessary_words(df, word_column='word'):
  """Return df sorted by word, keeping only purely alphabetic words.

  Args:
    df: DataFrame with a column of words.
    word_column: name of the column holding the words (defaults to 'word').

  Returns:
    A new DataFrame sorted by the word column without the rows whose word
    contains non-alphabetic characters or is not a string at all (e.g. a
    missing value). The number of removed rows is printed.
  """
  sorted_df = df.sort_values(by=word_column)

  # isalpha() already rejects numeric strings; non-string cells (such as NaN)
  # have no isalpha() and are treated as "not a word" instead of crashing
  is_word = sorted_df[word_column].map(
      lambda value: isinstance(value, str) and value.isalpha()).astype(bool)

  print(f'Removed {int((~is_word).sum())} instances')
  return sorted_df[is_word]

In [5]:
import os
from os import listdir
from os.path import isfile, join

def get_word_datasets(location):
  """Read the positive and negative word datasets found in a folder.

  The files are recognised by the substrings 'positive' and 'negative' in
  their file name (the folder path itself is ignored). When several files
  match, the alphabetically first one is used.

  Args:
    location: folder that contains the CSV files.

  Returns:
    A tuple (positive_words_df, negative_words_df) of DataFrames.

  Raises:
    AssertionError: when the folder contains no files.
    Exception: when no positive or no negative dataset file is found.
  """
  # sorted so that the chosen file does not depend on the arbitrary listdir order
  file_names = sorted(name for name in listdir(location) if isfile(join(location, name)))

  print(f'Reading datasets from location: {location}')

  # raised explicitly (same exception type as before) so the check survives `python -O`
  if not file_names:
    raise AssertionError('No files in the provided location')

  # match on the file name only: a folder called e.g. 'positive run' must not
  # make every file in it look like a positive dataset
  pos_loc = [join(location, name) for name in file_names if 'positive' in name]
  neg_loc = [join(location, name) for name in file_names if 'negative' in name]

  if not pos_loc:
    raise Exception('Positive words dataset was not found')
  if not neg_loc:
    raise Exception('Negative words dataset was not found')

  print(f'Reading {pos_loc}, {neg_loc} ...')

  positive_words_df = pd.read_csv(pos_loc[0])
  negative_words_df = pd.read_csv(neg_loc[0])

  print('Reading datasets successfully finished ...')

  return positive_words_df, negative_words_df

def save_datasets(location, datasets_map):
  """Write every dataset as CSV into the 'processed' subfolder of location.

  Args:
    location: base folder; with or without a trailing separator. An empty
      string means the current working directory.
    datasets_map: mapping of file name -> DataFrame to save (without index).
  """
  # the trailing '' keeps the trailing separator on the folder path
  datasets_folder_loc = os.path.join(location, 'processed', '')

  # exist_ok avoids the check-then-create race of a separate os.path.exists test
  os.makedirs(datasets_folder_loc, exist_ok=True)

  for dataset_name, dataset in datasets_map.items():
    file_name = os.path.join(datasets_folder_loc, dataset_name)
    dataset.to_csv(file_name, index = False)

  print(f'Datasets saved on location: {datasets_folder_loc}')

# Post processing of the words

In [None]:
# read the extracted positive and negative word datasets from the drive folder
positive_words, negative_words = get_word_datasets(drive_location)


# read the positive and negative Loughran-McDonald (LM) word dictionaries
lm_positive_words = pd.read_csv(lm_positive_words_location)
lm_negative_words = pd.read_csv(lm_negative_words_location)

In [None]:
# create the PostProcessor with the SHAP generated sentiment datasets, the LM sentiment datasets,
# and the names of the columns that hold the words, their counts, their SHAP values and the sentence uuid
post_processor = PostProcessor([positive_words, negative_words], [lm_positive_words, lm_negative_words], 'word', 'count', 'shap_values', 'sentence_uuid')

# run the processing of the words
# output: the processed sentiment datasets, both the SHAP generated ones and the LM ones
our, lm = post_processor.run_processing_on_datasets()

# unpack the processed sentiment datasets (same order as passed to the constructor)
positive_words, negative_words = our
lm_positive_words, lm_negative_words = lm


# filter the SHAP generated sentiment datasets to drop the unnecessary "words" (the ones that are not actually words)
positive_words = find_unnecessary_words(positive_words)
negative_words = find_unnecessary_words(negative_words)


# output file name -> dataset to be written under that name
datasets_map = {'processed-positive_words.csv': positive_words, 'processed-negative_words.csv': negative_words,
                'processed-lm_positive_words.csv': lm_positive_words, 'processed-lm_negative_words.csv': lm_negative_words}

# the datasets are saved on Google Drive, in a subfolder named 'processed' of drive_location
save_datasets(drive_location, datasets_map)