# User input

In [None]:
# Enter the location of the processed datasets (a directory of CSV files).
# This path is passed to extract_processed_datasets() for reading and to
# save_lexicons(), which writes its output to a sibling "lexicons" directory.
drive_location = '/content/drive/MyDrive/nasdaq/processed/'

# ExplainableLexiconGenerator

In [None]:
import pandas as pd
import numpy as np

class ExplainableLexiconGenerator:
  """Merge SHAP-scored positive/negative word tables with the LM word lists.

  The positive/negative word datasets must contain ``word_column`` and the
  columns ``count``, ``total``, ``sum_shap_values``, ``average_shap_values``,
  ``max_shap_values``, ``min_shap_values`` and ``sentence_uuid``.  The LM
  datasets only need ``word_column``; every other column is synthesised.

  Main entry points are ``generate_merged_dataset`` (builds the merged
  lexicon) and ``normalize_dataset`` (scales numeric columns by their maximum).
  """

  def __init__(self, positive_words_dataset, negative_words_dataset, positive_lm_dataset, negative_lm_dataset, word_column, aggregate_function = None):
    """Store the input frames and the column names, prefixes and labels used later.

    Args:
      positive_words_dataset: DataFrame with the words scored as positive.
      negative_words_dataset: DataFrame with the words scored as negative.
      positive_lm_dataset: DataFrame with the positive LM words.
      negative_lm_dataset: DataFrame with the negative LM words.
      word_column: name of the column that holds the word in every dataset.
      aggregate_function: optional mapping ``column -> aggregation`` passed to
        ``DataFrameGroupBy.agg`` when rows of the same word are collapsed.
        When omitted a default mapping is used (see below).
    """
    # datasets
    self.positive_words_dataset = positive_words_dataset
    self.negative_words_dataset = negative_words_dataset
    self.datasets = [self.positive_words_dataset, self.negative_words_dataset]

    self.positive_lm_dataset = positive_lm_dataset
    self.negative_lm_dataset = negative_lm_dataset
    self.lm_datasets = [self.positive_lm_dataset, self.negative_lm_dataset]

    # columns
    self.word_column = word_column
    self.sum_shap_values_column = 'sum_shap_values'
    self.average_shap_values_column = 'average_shap_values'
    self.count_column = 'count'
    self.total_count_column = 'total'
    self.word_count_column = 'word_count'
    self.ratio_column = 'ratio'
    self.category_column = 'category'
    self.max_shap_values_column = 'max_shap_values'
    self.min_shap_values_column = 'min_shap_values'
    self.sentence_uuid_column = 'sentence_uuid'
    self.source_column = 'src'

    # prefixes
    self.opposite_prefix = 'opposite_'
    self.tm_column_prefix = 'TM_'
    self.lm_column_prefix = 'LM_'

    # columns that get an "opposite_" twin; the ratio column must stay last
    # because several methods slice it off with [:-1]
    self.columns_in_both_datasets = [self.count_column, self.sum_shap_values_column, self.average_shap_values_column,
                                     self.max_shap_values_column, self.min_shap_values_column, self.ratio_column]

    # values
    self.positive_category_value = 'positive'
    self.negative_category_value = 'negative'
    self.tm_source_value = 'OUR_WORDS'
    self.lm_source_value = 'LM'

    if aggregate_function is None:
      # default: add up counts and SHAP sums, keep the extremes and chain the sentence ids
      aggregate_function = {self.count_column: 'sum', self.total_count_column: 'sum',
                            self.sum_shap_values_column: 'sum', self.average_shap_values_column: 'mean',
                            self.max_shap_values_column: 'max', self.min_shap_values_column: 'min',
                            self.sentence_uuid_column: ','.join}
    self.aggregate_function = aggregate_function

  def __concat_frames(self, frames):
    """Stack ``frames`` row-wise with a fresh index, leaving out empty frames.

    Empty frames are skipped so that their column dtypes cannot influence the
    dtypes of the result.  If every frame is empty, an empty frame with the
    columns of the first one is returned.
    """
    non_empty_frames = [frame for frame in frames if not frame.empty]
    if not non_empty_frames:
      return frames[0].iloc[0:0].reset_index(drop=True)

    return pd.concat(non_empty_frames, ignore_index=True)

  def __find_all_duplicates(self, dataset):
    """Return the unique words that occur in more than one row of ``dataset``."""
    repeated_rows = dataset.duplicated(subset=[self.word_column])

    return dataset.loc[repeated_rows, self.word_column].unique()

  def __remove_duplicates_in_dataset(self, dataset):
    """Collapse rows sharing a word into one row using ``self.aggregate_function``."""
    duplicated_words = self.__find_all_duplicates(dataset)

    # aggregate words based on aggregate_function
    dataset = dataset.groupby(by=[self.word_column]).agg(self.aggregate_function).reset_index()

    # for collapsed words the average is recomputed as sum_shap_values / total
    # instead of keeping the aggregated average
    was_collapsed = dataset[self.word_column].isin(duplicated_words)
    if was_collapsed.any():
      dataset.loc[was_collapsed, self.average_shap_values_column] = (
        dataset.loc[was_collapsed, self.sum_shap_values_column] / dataset.loc[was_collapsed, self.total_count_column]
      )

    return dataset

  def check_for_duplicates(self, dataset):
    """Raise ``AssertionError`` if any word appears more than once in ``dataset``."""
    # raised explicitly (not via ``assert``) so the check survives ``python -O``
    if dataset.duplicated(subset=[self.word_column]).any():
      raise AssertionError("datasets contain duplicates")

  def remove_duplicates_within_datasets(self, datasets):
    """Return de-duplicated copies of ``datasets``; the inputs are left untouched."""
    datasets = [self.__remove_duplicates_in_dataset(dataset.copy(True)) for dataset in datasets]

    # check if there are no duplicates left in the datasets
    for dataset in datasets:
      self.check_for_duplicates(dataset)

    return datasets

  def add_new_columns(self, dataset, category_value):
    """Add the ratio, word-count and category columns to ``dataset`` in place and return it."""
    # ratio_column and word_count_column both start at 1
    dataset = self.__add_new_column_to_dataset(dataset, self.ratio_column, 1)
    dataset = self.__add_new_column_to_dataset(dataset, self.word_count_column, 1)
    dataset = self.__add_new_column_to_dataset(dataset, self.category_column, category_value)

    return dataset

  def __add_new_column_to_dataset(self, dataset, column, value):
    """Set ``column`` to the constant ``value`` on ``dataset`` (in place) and return it."""
    dataset[column] = value

    return dataset

  def add_opposite_columns(self, dataset):
    """Add a zero-filled ``opposite_<column>`` for every shared column (in place)."""
    for shared_column in self.columns_in_both_datasets:
      dataset = self.__add_new_column_to_dataset(dataset, f'{self.opposite_prefix}{shared_column}', 0)

    return dataset

  def rearrange_columns(self, dataset):
    """Return ``dataset`` reindexed to the fixed lexicon column order.

    Columns outside the order are dropped and missing ones are created with
    NaN, as done by ``DataFrame.reindex``.
    """
    initial_columns = [self.word_column, self.count_column, self.total_count_column, f'{self.opposite_prefix}{self.count_column}', self.category_column]

    # the count column is already part of initial_columns, so it is skipped here
    datasets_columns = self.columns_in_both_datasets[1:]
    opposite_columns = [f'{self.opposite_prefix}{column}' for column in datasets_columns]

    new_column_order = initial_columns + datasets_columns + opposite_columns + [self.word_count_column, self.sentence_uuid_column]

    return dataset.reindex(columns=new_column_order)

  def add_all_necessary_columns(self, dataset, category_value):
    """Add the derived and opposite columns, then put all columns in lexicon order."""
    dataset = self.add_new_columns(dataset, category_value)
    dataset = self.add_opposite_columns(dataset)
    dataset = self.rearrange_columns(dataset)

    return dataset

  def __find_overlapping_words_between_datasets(self, positive_dataset, negative_dataset):
    """Return the set of words present in both datasets."""
    positive_unique_words = set(positive_dataset[self.word_column].unique())
    negative_unique_words = set(negative_dataset[self.word_column].unique())

    return positive_unique_words & negative_unique_words

  def __categorize_word(self, word, positive_dataset, negative_dataset):
    """Build the single merged row for a word that occurs in both datasets.

    The side with the larger ``count * average_shap_values`` wins (ties go to
    the negative side).  The winning row receives the loser's statistics in
    its ``opposite_*`` columns, the summed total count, the ratio of the two
    averages and the concatenated sentence ids.

    Returns:
      list of row values ordered like ``positive_dataset.columns``.
    """
    # first matching row of each side as an independent, object-typed Series
    positive_row = positive_dataset.loc[positive_dataset[self.word_column] == word].iloc[0].astype(object)
    negative_row = negative_dataset.loc[negative_dataset[self.word_column] == word].iloc[0].astype(object)

    count, average = self.count_column, self.average_shap_values_column

    # categorization decision making values
    positive_decision_value = positive_row[count] * positive_row[average]
    negative_decision_value = negative_row[count] * negative_row[average]

    if positive_decision_value > negative_decision_value:
      selected, opposite = positive_row, negative_row
    else:
      selected, opposite = negative_row, positive_row

    # adding all opposite values except ratio
    for column in self.columns_in_both_datasets[:-1]:
      selected[f'{self.opposite_prefix}{column}'] = opposite[column]

    # updating total count
    selected[self.total_count_column] = selected[count] + opposite[count]

    # adding ratio and opposite ratio; when both averages cancel out the ratio
    # is undefined, so the weight is split evenly instead of dividing by zero
    denominator = selected[average] + opposite[average]
    ratio = selected[average] / denominator if denominator != 0 else 0.5
    selected[self.ratio_column] = ratio
    selected[f'{self.opposite_prefix}{self.ratio_column}'] = 1 - ratio

    # updating sentence uuids
    selected[self.sentence_uuid_column] = f'{selected[self.sentence_uuid_column]},{opposite[self.sentence_uuid_column]}'

    return selected.reindex(positive_dataset.columns).tolist()

  def __add_word_occurences_to_datasets(self, categorized_words):
    """Append categorized rows to the stored positive/negative word datasets.

    ``categorized_words`` is an iterable of row sequences ordered like
    ``self.positive_words_dataset.columns``, which must already contain the
    category column.  Rows whose category is not the positive label go to the
    negative dataset.
    """
    columns = list(self.positive_words_dataset.columns)
    category_column_index = columns.index(self.category_column)

    positive_word_occurences, negative_word_occurences = [], []

    # dividing the positive and negative word occurences
    for word_occurence in categorized_words:
      if word_occurence[category_column_index] == self.positive_category_value:
        positive_word_occurences.append(word_occurence)
      else:
        negative_word_occurences.append(word_occurence)

    # appending to the existing datasets
    new_positive_words_df = pd.DataFrame(positive_word_occurences, columns = columns)
    self.positive_words_dataset = self.__concat_frames([self.positive_words_dataset, new_positive_words_df])

    new_negative_words_df = pd.DataFrame(negative_word_occurences, columns = columns)
    self.negative_words_dataset = self.__concat_frames([self.negative_words_dataset, new_negative_words_df])

    # keep the list used by prepare_for_merging in sync with the new frames
    self.datasets = [self.positive_words_dataset, self.negative_words_dataset]

  def remove_duplicates_between_datasets_and_merge_them(self, positive_dataset, negative_dataset):
    """Combine both datasets so that every word ends up in exactly one row.

    Words found in both datasets are resolved by ``__categorize_word``; all
    other rows are kept unchanged.  Resolved rows are placed after the
    untouched positive and negative rows.
    """
    overlapping_words = self.__find_overlapping_words_between_datasets(positive_dataset, negative_dataset)

    # categorizing overlapping word occurences; sorted so the row order does
    # not depend on set iteration order
    categorized_word_occurences = [self.__categorize_word(word, positive_dataset, negative_dataset)
                                   for word in sorted(overlapping_words, key=str)]

    # removing the occurences that contain the overlapping words from both datasets
    positive_dataset = positive_dataset.loc[~positive_dataset[self.word_column].isin(overlapping_words)]
    negative_dataset = negative_dataset.loc[~negative_dataset[self.word_column].isin(overlapping_words)]

    categorized_words_df = pd.DataFrame(categorized_word_occurences, columns = positive_dataset.columns).infer_objects()

    # DataFrame.append was removed in pandas 2.0, so the frames are concatenated
    return self.__concat_frames([positive_dataset, negative_dataset, categorized_words_df])

  def remove_duplicates_from_lm_datasets_and_add_columns(self):
    """Return (positive, negative) LM frames shaped like the word datasets.

    Works on copies of the stored LM datasets: duplicate words are dropped,
    every statistic column is set to 1, every ``opposite_*`` column to 0, the
    sentence id column to an empty string and the category to the matching
    label.
    """
    # just dropping duplicated words since we don't need any aggregation
    lm_datasets = [dataset.drop_duplicates(subset=[self.word_column]).copy(True) for dataset in self.lm_datasets]

    column_to_add = [self.sum_shap_values_column, self.total_count_column, self.word_count_column,
                     self.max_shap_values_column, self.min_shap_values_column,
                     self.ratio_column, self.average_shap_values_column, self.count_column]

    for dataset in lm_datasets:
      # adding all columns regarding each dataset with value 1
      for column in column_to_add:
        self.__add_new_column_to_dataset(dataset, column, 1)

      # adding opposite columns with value 0
      self.add_opposite_columns(dataset)

      # LM words are not tied to any sentence
      self.__add_new_column_to_dataset(dataset, self.sentence_uuid_column, '')

    # adding category column to LM datasets
    lm_positive_dataset, lm_negative_dataset = lm_datasets
    lm_positive_dataset = self.__add_new_column_to_dataset(lm_positive_dataset, self.category_column, self.positive_category_value)
    lm_negative_dataset = self.__add_new_column_to_dataset(lm_negative_dataset, self.category_column, self.negative_category_value)

    # rearranging columns
    lm_positive_dataset = self.rearrange_columns(lm_positive_dataset)
    lm_negative_dataset = self.rearrange_columns(lm_negative_dataset)

    return lm_positive_dataset, lm_negative_dataset

  def __add_columns_prefix(self, dataset, prefix):
    """Return a copy of ``dataset`` with ``prefix`` on every column except the word column."""
    return dataset.rename(columns=lambda column: column if column == self.word_column else f'{prefix}{column}')

  def prepare_for_merging(self):
    """Return the de-duplicated word dataset and the de-duplicated LM dataset."""
    # removing duplicates from positive and negative datasets
    positive_dataset, negative_dataset = self.remove_duplicates_within_datasets(self.datasets)

    # adding the required columns for the next calculations
    positive_dataset = self.add_all_necessary_columns(positive_dataset, self.positive_category_value)
    negative_dataset = self.add_all_necessary_columns(negative_dataset, self.negative_category_value)

    # removing duplicates between positive and negative datasets
    combined_words_dataset = self.remove_duplicates_between_datasets_and_merge_them(positive_dataset, negative_dataset)

    # processing LM words datasets
    lm_positive_dataset, lm_negative_dataset = self.remove_duplicates_from_lm_datasets_and_add_columns()
    lm_combined_words_dataset = self.remove_duplicates_between_datasets_and_merge_them(lm_positive_dataset, lm_negative_dataset)

    return combined_words_dataset, lm_combined_words_dataset

  def merge_our_and_lm_datasets(self, combined_words_dataset, lm_combined_words_dataset):
    """Outer-join the word dataset (``TM_`` columns) with the LM dataset (``LM_`` columns).

    The input frames are not modified.  For words missing on one side the
    category becomes 'none', the source column is set to the other side's
    label, the sentence ids to '' and every remaining gap to 0.
    """
    tm_prefix, lm_prefix = self.tm_column_prefix, self.lm_column_prefix

    # adding TM_ and LM_ prefixes to the columns (on copies of the inputs)
    combined_words_dataset = self.__add_columns_prefix(combined_words_dataset, tm_prefix)
    lm_combined_words_dataset = self.__add_columns_prefix(lm_combined_words_dataset, lm_prefix)

    # adding "OUR_WORDS" and "LM" as source values to the corresponding datasets
    combined_words_dataset = self.__add_new_column_to_dataset(combined_words_dataset, f'{tm_prefix}{self.source_column}', self.tm_source_value)
    lm_combined_words_dataset = self.__add_new_column_to_dataset(lm_combined_words_dataset, f'{lm_prefix}{self.source_column}', self.lm_source_value)

    # merging our combined dataset with LM dataset with outer join
    merged_dataset = pd.merge(combined_words_dataset, lm_combined_words_dataset, how="outer", on=[self.word_column])

    # Replacement values for the gaps left by the outer join:
    #  * category -> 'none' because the word does not appear on that side
    #  * TM_src   -> 'LM' and LM_src -> 'OUR_WORDS' because the word only comes
    #    from the other source (words on both sides keep 'OUR_WORDS' / 'LM')
    #  * sentence ids -> '' because there are no sentences on that side
    fill_values = {
      f'{tm_prefix}{self.category_column}': 'none',
      f'{lm_prefix}{self.category_column}': 'none',
      f'{tm_prefix}{self.source_column}': self.lm_source_value,
      f'{lm_prefix}{self.source_column}': self.tm_source_value,
      f'{tm_prefix}{self.sentence_uuid_column}': '',
      f'{lm_prefix}{self.sentence_uuid_column}': '',
    }
    # assigned back instead of fillna(inplace=True) on a column selection,
    # which is chained assignment and unreliable under copy-on-write
    for column, fill_value in fill_values.items():
      merged_dataset[column] = merged_dataset[column].fillna(fill_value)

    # all other columns get 0 because they have no value for the other source
    # and this way they will not interfere with the calculation
    merged_dataset = merged_dataset.fillna(0)

    return merged_dataset

  def generate_merged_dataset(self):
    """Run the full pipeline and return the merged lexicon DataFrame."""
    combined_words_dataset, lm_combined_words_dataset = self.prepare_for_merging()

    return self.merge_our_and_lm_datasets(combined_words_dataset, lm_combined_words_dataset)

  def __normalize_column(self, dataset, column):
    """Divide ``column`` by its maximum in place; a maximum of 0 leaves it unchanged."""
    column_max_value = dataset[column].max()
    if column_max_value == 0:
      return dataset

    dataset[column] = dataset[column] / column_max_value

    return dataset

  def normalize_dataset(self, dataset):
    """Return a copy of ``dataset`` with every numeric column divided by its maximum.

    A frame without numeric columns is returned as an unchanged copy.
    """
    columns_to_normalize = dataset.select_dtypes(include=np.number).columns.tolist()

    print(f'Columns to normalize: {columns_to_normalize}')

    dataset_copy = dataset.copy(True)
    for column in columns_to_normalize:
      dataset_copy = self.__normalize_column(dataset_copy, column)

    print('Columns normalized')

    return dataset_copy

In [None]:
import os
from os import listdir
from os.path import isfile, join

def extract_dataset(files, dataset_type):
  """Load the CSV file whose file name contains ``dataset_type``.

  Args:
    files: iterable of file paths to choose from.
    dataset_type: substring looked for in each file name, e.g. 'positive'.

  Returns:
    DataFrame read from the first matching path in sorted order.

  Raises:
    FileNotFoundError: if no file name contains ``dataset_type``.
  """
  files = list(files)

  # Only the file name is inspected: matching on the full path would select
  # every file as soon as a parent directory name contains the substring.
  # Sorting makes the choice deterministic when several files match.
  matching_files = sorted(f for f in files if dataset_type in os.path.basename(f))

  if not matching_files:
    raise FileNotFoundError(f'{dataset_type} words dataset was not found among {files}')

  return pd.read_csv(matching_files[0])

def extract_pos_neg_datasets(files):
  """Return the ``(positive, negative)`` DataFrames found among ``files``.

  Each one is loaded with ``extract_dataset`` using the substrings
  'positive' and 'negative', in that order.
  """
  return tuple(extract_dataset(files, sentiment) for sentiment in ('positive', 'negative'))

def extract_processed_datasets(location):
  """Read the four processed datasets stored directly inside ``location``.

  Files whose name contains 'lm' are treated as LM datasets, all other files
  as our (TM) datasets.  Each group must provide a 'positive' and a
  'negative' file.

  Args:
    location: directory that holds the processed CSV files.

  Returns:
    Tuple ``(positive_words, negative_words, lm_positive, lm_negative)`` of
    DataFrames.

  Raises:
    FileNotFoundError: if ``location`` contains no files.
  """
  # sorted so the outcome does not depend on the directory listing order
  files_locations = sorted(join(location, f) for f in listdir(location) if isfile(join(location, f)))

  print(f'Reading datasets from: {location} ...')

  # the previous ``assert files_locations != 0`` compared a list with an int
  # and therefore could never fail
  if not files_locations:
    raise FileNotFoundError('No files found in the provided location')

  # only the file name decides the group, so an 'lm' inside the directory
  # path cannot turn every file into an LM file
  lm_words_files = [f for f in files_locations if 'lm' in os.path.basename(f)]
  our_words_files = [f for f in files_locations if f not in lm_words_files]

  print(f'Reading TM datasets {our_words_files} ...')
  our_dfs = extract_pos_neg_datasets(our_words_files)

  print(f'Reading LM datasets {lm_words_files} ...')
  lm_dfs = extract_pos_neg_datasets(lm_words_files)

  print('Reading datasets successfully finished ...')

  return our_dfs + lm_dfs


def save_lexicons(location, lexicons):
  """Write every lexicon as CSV into a 'lexicons' directory next to ``location``.

  The target directory is ``<parent of location>/lexicons``; it is created
  when missing.  A trailing slash on ``location`` makes no difference.

  Args:
    location: path of the processed-datasets directory (must not be empty).
    lexicons: mapping ``file name -> DataFrame`` to save.

  Raises:
    ValueError: if ``location`` is empty.
  """
  if not location:
    raise ValueError('location must be a non-empty path')

  # joining with os.pardir and normalising yields the parent directory
  # whether or not location ends with a separator
  parent_location = os.path.abspath(os.path.join(location, os.pardir))

  lexicons_location = f'{parent_location}/lexicons'

  # exist_ok avoids the race between checking for the directory and creating it
  os.makedirs(lexicons_location, exist_ok=True)

  for lexicon_name, lexicon in lexicons.items():
    loc = f'{lexicons_location}/{lexicon_name}'
    print(f'Saving {lexicon_name} on location {loc}')
    lexicon.to_csv(loc, index = False)

  print(f'Lexicons saved on location: {lexicons_location}')

# SHAP-LM lexicon generation

In [None]:
import pandas as pd

# Read the four processed datasets from drive_location: our positive/negative
# word datasets followed by the positive/negative LM word datasets.
positive_words, negative_words, lm_pos_words, lm_neg_words = extract_processed_datasets(drive_location)

# SHAP-LM lexicon generation: 'word' is the name of the word column shared by
# all four datasets; the default aggregation is used for repeated words.
explainableLexiconGenerator = ExplainableLexiconGenerator(positive_words, negative_words, lm_pos_words, lm_neg_words, 'word')
merged_dataset = explainableLexiconGenerator.generate_merged_dataset()

# Create the normalized version of the SHAP-LM lexicon (numeric columns are
# divided by their column maximum); merged_dataset itself is left unchanged.
normalized_merged_df = explainableLexiconGenerator.normalize_dataset(merged_dataset)

# Save both lexicons as CSV files; the dictionary keys are the output file
# names inside the "lexicons" folder created next to drive_location.
lexicons = {'shap_lm_lexicon.csv': merged_dataset, 'normalized-shap_lm_lexicon.csv': normalized_merged_df}
save_lexicons(drive_location, lexicons)