<a href="https://colab.research.google.com/github/hristijanpeshov/SHAP-Explainable-Lexicon-Model/blob/master/FinBERT_concatenate_result_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# User input

In [None]:
# Enter the lexicon_folder_loc value from the "initial_words_collecting.ipynb" notebook (the folder where the extracted words' datasets are stored).
drive_folder_location = '/content/drive/MyDrive/finbert process/nasdaq'

# Enter the source dataset location (a CSV file; its row count is compared against the concatenated log dataset).
dataset_file_location = '/content/drive/MyDrive/datasets/source datasets/nasdaq.csv'

# Definitions

In [None]:
import pandas as pd
import os
from os import listdir
from os.path import isfile, join

def get_execution_data_frame(files, file_type):
  """Load the first CSV in `files` whose file name contains `file_type`.

  Args:
    files: iterable of file paths (or bare file names) to search, in order.
    file_type: substring identifying the wanted file, e.g. 'log', 'positive'
      or 'negative'.

  Returns:
    A pandas DataFrame read from the first matching file.

  Raises:
    IndexError: if no file name in `files` contains `file_type`.
  """
  # Match on the file name only: a parent folder that happens to contain
  # `file_type` (e.g. ".../catalog/") must not make every file match.
  matching = [f for f in files if file_type in os.path.basename(f)]
  if not matching:
    raise IndexError(f'No file containing [{file_type}] found among: {list(files)}')

  return pd.read_csv(matching[0])

def extract_dfs_from_execution(exec, files_locations):
  """Load the log, positive-words and negative-words CSVs of one execution.

  A file belongs to the execution when the part of its file name before the
  first '--' equals `exec` exactly. A plain substring test would let
  execution 'run1' also pick up the files of 'run10', or match a directory
  name.

  Args:
    exec: execution identifier (the file-name prefix before '--'). The name
      shadows the builtin but is kept so existing callers keep working.
    files_locations: paths of all candidate files.

  Returns:
    Tuple `(log_df, positive_words_df, negative_words_df)` of DataFrames.

  Raises:
    IndexError: propagated from `get_execution_data_frame` when one of the
      three expected files is missing for this execution.
  """
  exec_files = [
    f for f in files_locations
    if os.path.basename(f).split('--')[0] == exec
  ]

  # More than the three expected files: drop those whose name contains 'tmp'
  # (presumably intermediate checkpoints next to the final outputs -- confirm).
  if len(exec_files) > 3:
    final_files = [f for f in exec_files if 'tmp' not in os.path.basename(f)]
  else:
    final_files = exec_files.copy()

  log_df = get_execution_data_frame(final_files, 'log')
  positive_words_df = get_execution_data_frame(final_files, 'positive')
  negative_words_df = get_execution_data_frame(final_files, 'negative')

  return log_df, positive_words_df, negative_words_df

def extract_final_dfs(all_executions):
  """Stack the per-execution DataFrames into three combined DataFrames.

  Args:
    all_executions: non-empty sequence of `(log_df, positive_df, negative_df)`
      tuples, one per execution.

  Returns:
    Tuple `(all_log_dfs, all_pos_dfs, all_neg_dfs)`. With several executions
    the rows are concatenated in the given order and re-indexed from 0; with a
    single execution a deep copy of its DataFrames is returned.

  Raises:
    IndexError: if `all_executions` is empty.
  """
  if not all_executions:
    raise IndexError('all_executions is empty, there is nothing to concatenate')

  def stack(position):
    frames = [execution[position] for execution in all_executions]
    if len(frames) == 1:
      return frames[0].copy(True)
    # DataFrame.append was removed in pandas 2.0; a single concat is the
    # supported equivalent and avoids re-copying the accumulated rows on
    # every iteration.
    return pd.concat(frames, ignore_index=True)

  return stack(0), stack(1), stack(2)

def save_df(df, loc):
  """Write `df` as a CSV file at `loc`, leaving the index out of the file."""
  df.to_csv(path_or_buf=loc, index=False)

def concatenate_datasets(drive_loc, dataset_loc):
  """Merge the CSVs of every execution found in `drive_loc` and save the result.

  Files directly inside `drive_loc` are grouped by execution, i.e. by the part
  of the file name before the first '--'. The per-execution log, positive and
  negative DataFrames are concatenated and written to
  '<drive_loc>/concatenated datasets/' as 'log_dataset.csv',
  'positive_words.csv' and 'negative_words.csv'.

  Args:
    drive_loc: folder holding the per-execution CSV files.
    dataset_loc: path of the source dataset CSV; its row count must equal the
      row count of the concatenated log.

  Returns:
    Tuple `(final_log_df, final_pos_df, final_neg_df)` of DataFrames.

  Raises:
    AssertionError: if `drive_loc` contains no files, or if the concatenated
      log does not have as many rows as the source dataset.
  """
  files_locations = [join(drive_loc, f) for f in listdir(drive_loc) if isfile(join(drive_loc, f))]

  # Raised explicitly (instead of `assert`) so the check survives `python -O`;
  # the exception type and message are unchanged.
  if not files_locations:
    raise AssertionError(f'No files found in the provided location: [{drive_loc}]')

  # Sorted so executions are always concatenated in the same (lexicographic)
  # order; iterating a bare set made the output row order vary between runs.
  executions = sorted({os.path.basename(f).split('--')[0] for f in files_locations})
  all_executions_dfs = [extract_dfs_from_execution(execution, files_locations) for execution in executions]

  final_log_df, final_pos_df, final_neg_df = extract_final_dfs(all_executions_dfs)

  dataset = pd.read_csv(dataset_loc)

  if len(dataset) != len(final_log_df):
    raise AssertionError('Words extraction was not executed completely. Please complete the extraction of words from the dataset')

  drive_loc_mod = drive_loc if drive_loc.endswith('/') else f'{drive_loc}/'
  conc_drive_loc = f'{drive_loc_mod}concatenated datasets'
  # exist_ok avoids the check-then-create race of a separate exists() test.
  os.makedirs(conc_drive_loc, exist_ok=True)

  save_df(final_log_df, f'{conc_drive_loc}/log_dataset.csv')
  save_df(final_pos_df, f'{conc_drive_loc}/positive_words.csv')
  save_df(final_neg_df, f'{conc_drive_loc}/negative_words.csv')

  print(f'Datasets saved to folder: {conc_drive_loc}')

  return final_log_df, final_pos_df, final_neg_df

# Concatenate datasets

In [None]:
# Concatenate the per-execution CSVs from drive_folder_location, save the merged files, and keep the resulting DataFrames.
logs, positive_words, negative_words = concatenate_datasets(drive_folder_location, dataset_file_location)