<a href="https://colab.research.google.com/github/hristijanpeshov/SHAP-Explainable-Lexicon-Model/blob/master/FinBERT_RoBERTa_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# User Input

In [None]:
import pandas as pd

# Location of the summary_df.csv file for every lexicon.
# Each entry is the path of a CSV *file* (not a folder); the file lives in the
# results folder of the corresponding lexicon.

# Root folder that contains the per-dataset folders. Change this single value
# when the Drive is mounted somewhere else or the notebook runs outside Colab.
DRIVE_ROOT = '/content/drive/MyDrive'
# Path of the summary file relative to each dataset folder.
SUMMARY_REL_PATH = 'concatenated datasets/results/summary_df.csv'

# summary files of the RoBERTa lexicons, keyed by dataset name
roberta_lexicons_results_loc = {
    'nasdaq': f'{DRIVE_ROOT}/nasdaq/{SUMMARY_REL_PATH}',
    'fpb': f'{DRIVE_ROOT}/fpb/{SUMMARY_REL_PATH}',
    'sentfin': f'{DRIVE_ROOT}/sentfin/{SUMMARY_REL_PATH}'
}

# summary files of the FinBERT lexicons, keyed by dataset name
finbert_lexicons_results_loc = {
    'nasdaq': f'{DRIVE_ROOT}/finbert process/nasdaq/{SUMMARY_REL_PATH}',
    'fpb': f'{DRIVE_ROOT}/finbert process/fpb/{SUMMARY_REL_PATH}',
    'sentfin': f'{DRIVE_ROOT}/finbert process/sentfin/{SUMMARY_REL_PATH}'
}

# FinBERT - RoBERTa Comparison

In [None]:
def get_metric_values(df, eval_df, normalized, metric):
  """Collect one metric value per word source for a single evaluation setup.

  Args:
    df: results frame with the columns 'Evaluation Dataset',
      'Lexicon Normalized', 'Words Source' and the column named by `metric`.
    eval_df: value to match in the 'Evaluation Dataset' column.
    normalized: value to match in the 'Lexicon Normalized' column.
    metric: name of the column whose value is extracted.

  Returns:
    A list with six values, one per word source, in the fixed order
    'LMD', 'OUR', 'OUR + LMD', 'LMD on LMD', 'OUR on LMD', 'OUR + LMD on LMD'.
    When several rows match a word source, the first one is used.

  Raises:
    IndexError: if no row matches one of the word sources.
    KeyError: if a required column is missing from `df`.
  """
  # the different types of evaluation
  word_sources = ['LMD', 'OUR', 'OUR + LMD', 'LMD on LMD', 'OUR on LMD', 'OUR + LMD on LMD']

  # these two conditions do not depend on the word source, so build them once
  setup_mask = (df['Evaluation Dataset'] == eval_df) & (df['Lexicon Normalized'] == normalized)

  all_metric_values = []
  for ws in word_sources:
    # extracting the selected metric values for the current word source
    matches = df.loc[setup_mask & (df['Words Source'] == ws), metric]

    if matches.empty:
      # same exception type as the bare `.values[0]` lookup, but it says what is missing
      raise IndexError(
          f'No {metric!r} value found for evaluation dataset {eval_df!r}, '
          f'normalized={normalized!r}, words source {ws!r}')

    all_metric_values.append(matches.values[0])

  return all_metric_values

def is_coef_irregular(coefs):
  """Tell whether a collection of coefficient values looks inconsistent.

  The collection is regular when it holds exactly one value, or when it
  contains the backslash string '\\\\' (presumably a placeholder for "not
  applicable" -- confirm against the code that writes the summary files).
  Everything else is reported as irregular.
  """
  is_regular = len(coefs) == 1 or '\\' in coefs
  return not is_regular

def get_coefs(df):
  """Return the first distinct value of the columns C1, C2, C3 and C4.

  A warning line is printed when any of the four columns is flagged by
  `is_coef_irregular`; the first distinct values are returned regardless.

  Returns:
    A four-element list [C1, C2, C3, C4].
  """
  distinct_per_column = [df[column].unique() for column in ('C1', 'C2', 'C3', 'C4')]

  if any(is_coef_irregular(distinct) for distinct in distinct_per_column):
    print('Missing values for coefficients')

  return [distinct[0] for distinct in distinct_per_column]

def create_summary_dataset(df, metric):
  """Reshape one lexicon's results into a wide table for a single metric.

  For every evaluation dataset found in `df`, two rows are produced: first the
  normalized lexicon, then the non-normalized one. Each row carries the lexicon
  source, the coefficients returned by `get_coefs`, and the six values returned
  by `get_metric_values`.

  Args:
    df: results frame of a single source lexicon.
    metric: name of the metric column to extract.

  Returns:
    A DataFrame whose metric columns are 'LM', 'XLex', 'XLex + LM',
    'LM on LM', 'XLex on LM' and 'XLex + LM on LM'.
  """
  output_columns = [
      'Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset', 'Decision Maker',
      'C1', 'C2', 'C3', 'C4',
      'LM', 'XLex', 'XLex + LM', 'LM on LM', 'XLex on LM', 'XLex + LM on LM',
  ]

  # the first source name found is used for every output row
  lexicon_source = df['Lexicon Source'].unique()[0]
  evaluation_datasets = df['Evaluation Dataset'].unique()
  coefficients = get_coefs(df)
  # the decision maker is always average_shap_values
  decision_maker = 'average_shap_values'

  # one row per (evaluation dataset, normalization flag); normalized comes first
  rows = [
      [lexicon_source, is_normalized, evaluation_dataset, decision_maker]
      + coefficients
      + get_metric_values(df, evaluation_dataset, is_normalized, metric)
      for evaluation_dataset in evaluation_datasets
      for is_normalized in (True, False)
  ]

  return pd.DataFrame(rows, columns = output_columns)

In [None]:
def load_lexicons(lexicon_map):
  """Read every summary CSV listed in `lexicon_map`.

  The 'Lexicon Source' value of the first row decides whether the whole column
  is overwritten: 'fpb' becomes 'financial_phrase_bank' and 'sentfin' becomes
  'fiqa_fpb_sentfin_neutral'. Any other value leaves the column untouched.

  Args:
    lexicon_map: mapping from lexicon name to the location of its CSV.

  Returns:
    A list of DataFrames in the iteration order of `lexicon_map`.
  """
  long_source_names = {
      'fpb': 'financial_phrase_bank',
      'sentfin': 'fiqa_fpb_sentfin_neutral',
  }

  loaded = []
  for location in lexicon_map.values():
    lexicon = pd.read_csv(location)

    short_name = lexicon['Lexicon Source'].values[0]
    if short_name in long_source_names:
      lexicon['Lexicon Source'] = long_source_names[short_name]

    loaded.append(lexicon)

  return loaded

def create_metric_df(lexicon_maps):
  """Build, for every model, one summary table per metric.

  Args:
    lexicon_maps: mapping from model name to a mapping of lexicon name to the
      location of its summary CSV (the shape `load_lexicons` expects).

  Returns:
    A dict {model name: {metric: DataFrame}} for the metrics 'Accuracy', 'F1'
    and 'MCC'. Each DataFrame stacks the output of `create_summary_dataset`
    for all lexicons of that model; it is an empty DataFrame when the model
    has no lexicons.
  """
  metrics = ['Accuracy', 'F1', 'MCC']

  model_map = {}
  for model_name, lexicon_map in lexicon_maps.items():
    sources = load_lexicons(lexicon_map)
    metric_dfs_map = {}

    # for each metric, extract the results for the source lexicons
    for metric in metrics:
      summaries = [create_summary_dataset(source, metric) for source in sources]

      # concatenate once instead of growing a frame inside the loop, which
      # re-copies all accumulated rows on every iteration
      if summaries:
        metric_dfs_map[metric] = pd.concat(summaries, ignore_index = True)
      else:
        metric_dfs_map[metric] = pd.DataFrame()

    model_map[model_name] = metric_dfs_map

  return model_map

In [None]:
def absolute_metric_increase(df, first_col, second_col):
  """Return mean(df[first_col]) - mean(df[second_col]).

  The result is positive when the first column has the higher mean.
  """
  return df[first_col].mean() - df[second_col].mean()

def calc_incr(df, combination):
  """Print and return the mean difference for one pair of columns.

  Args:
    df: frame holding both columns.
    combination: sequence whose first two items are the column names; the
      result is mean(first) - mean(second).

  Returns:
    The value computed by `absolute_metric_increase`.
  """
  first_col, second_col = combination[0], combination[1]
  print(f'{first_col}, {second_col}')

  difference = absolute_metric_increase(df, first_col, second_col)
  print(difference)

  return difference

def calc_for_metric(df, metric, combinations):
  """Print and collect the mean difference for every column pair.

  Args:
    df: summary frame of one model for one metric.
    metric: not used by this function; kept for the callers' signature.
    combinations: iterable of column pairs handed to `calc_incr` one by one.

  Returns:
    A list with one difference per pair, in the order of `combinations`.
  """
  print('Difference between the two columns (absolute difference):')
  differences = [calc_incr(df, pair) for pair in combinations]
  print()

  return differences

def calc_difference(metrics, models_lexicons_res_map):
  """Print how two models differ in their gain over the 'LM' column.

  For every metric and every model, the mean difference of the column pairs
  ('XLex + LM', 'LM') and ('XLex', 'LM') is printed. Then, for each pair, the
  difference between the two models' values is printed.

  The two models are identified by name rather than by position. When the map
  holds both 'RoBERTa' and 'FinBERT', the printed difference is always
  FinBERT - RoBERTa, whatever the order of the map. Otherwise the difference
  is second model - first model (insertion order) and the header shows those
  names.

  Args:
    metrics: iterable of metric names; each must be a key of every model's
      inner mapping.
    models_lexicons_res_map: mapping {model name: {metric: DataFrame}} holding
      exactly two models.

  Raises:
    ValueError: if the map does not hold exactly two models.
  """
  combinations = [['XLex + LM', 'LM'], ['XLex', 'LM']]

  for metric in metrics:
    print('Calculations for metric: ', metric)
    print()

    increases_by_model = {}
    for name in models_lexicons_res_map:
      lex_df = models_lexicons_res_map[name][metric]
      print(name)
      increases_by_model[name] = calc_for_metric(lex_df, metric, combinations)

    print()

    if len(increases_by_model) != 2:
      raise ValueError(
          f'Expected exactly two models to compare, got {len(increases_by_model)}: '
          f'{list(increases_by_model)}')

    # look the models up by name so the label always matches the sign
    if 'RoBERTa' in increases_by_model and 'FinBERT' in increases_by_model:
      baseline_name, compared_name = 'RoBERTa', 'FinBERT'
    else:
      baseline_name, compared_name = increases_by_model

    baseline_increases = increases_by_model[baseline_name]
    compared_increases = increases_by_model[compared_name]

    for position, combination in enumerate(combinations):
      print(f'Difference: {compared_name} - {baseline_name}')
      print(', '.join(combination))
      print(compared_increases[position] - baseline_increases[position])
      print()

    # two extra blank lines separate the metrics in the output
    print()
    print()

In [None]:
# summary-file locations of both models; RoBERTa is listed first, FinBERT second
models_lexicons_results_map = dict(
    RoBERTa = roberta_lexicons_results_loc,
    FinBERT = finbert_lexicons_results_loc,
)

# load the summary files and build one table per model and metric
models_lexicons_results_dfs = create_metric_df(models_lexicons_results_map)

# print the per-model gains and the difference between the two models
calc_difference(['Accuracy', 'F1', 'MCC'], models_lexicons_results_dfs)

Calculations for metric:  Accuracy

RoBERTa
Difference between the two columns (absolute difference):
XLex + LM, LM
0.42334428404339486
XLex, LM
0.40972001522165913

FinBERT
Difference between the two columns (absolute difference):
XLex + LM, LM
0.43431836577490374
XLex, LM
0.41118192332613007


Difference: FinBERT - RoBERTa
XLex + LM, LM
0.010974081731508878

Difference: FinBERT - RoBERTa
XLex, LM
0.0014619081044709437



Calculations for metric:  F1

RoBERTa
Difference between the two columns (absolute difference):
XLex + LM, LM
0.276409039748679
XLex, LM
0.18824323572215362

FinBERT
Difference between the two columns (absolute difference):
XLex + LM, LM
0.26955628472774623
XLex, LM
0.22446185739717145


Difference: FinBERT - RoBERTa
XLex + LM, LM
-0.006852755020932744

Difference: FinBERT - RoBERTa
XLex, LM
0.036218621675017826



Calculations for metric:  MCC

RoBERTa
Difference between the two columns (absolute difference):
XLex + LM, LM
0.19599610724447994
XLex, LM
0.121205492238