<a href="https://colab.research.google.com/github/hristijanpeshov/SHAP-Explainable-Lexicon-Model/blob/master/notebooks/RoBERTa%20notebooks/model_evaluation_all_metrics_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# User Input

In [None]:
# User-editable paths: location of the summary_df.csv file for each lexicon.
# Each file is located in the results folder of the corresponding lexicon.
# These paths are passed to pd.read_csv when the result tables are loaded.

nasdaq_res_loc = '/content/drive/MyDrive/nasdaq/concatenated datasets/results/summary_df.csv'
fpb_res_loc = '/content/drive/MyDrive/fpb/concatenated datasets/results/summary_df.csv'
sentfin_res_loc = '/content/drive/MyDrive/sentfin/concatenated datasets/results/summary_df.csv'

# All metrics summary

In [None]:
import pandas as pd

# Load one summary table per lexicon from the *_res_loc paths.
# The functions below read these columns from each table: 'Lexicon Source',
# 'Evaluation Dataset', 'Lexicon Normalized', 'Words Source', 'C1'..'C4',
# plus one column per metric.
nasdaq_res = pd.read_csv(nasdaq_res_loc)
fpb_res = pd.read_csv(fpb_res_loc)
sentfin_res = pd.read_csv(sentfin_res_loc)

In [None]:
# Overwrite the 'Lexicon Source' column so that every row of each table carries
# a single label; nasdaq_res is left as loaded.
# create_summary_dataset uses the first unique value of this column as the
# source name of the table.
# NOTE(review): presumably the labels stored in the CSV files differ from the
# names wanted in the final tables - confirm against the CSV contents.
sentfin_res['Lexicon Source'] = 'fiqa_fpb_sentfin_neutral'
fpb_res['Lexicon Source'] = 'financial_phrase_bank'

In [None]:
def get_metric_values(df, eval_df, normalized, metric):
  """Collect the value of one metric for every word source of one setup.

  Rows of `df` are selected where 'Evaluation Dataset' equals `eval_df` and
  'Lexicon Normalized' equals `normalized`. For each word source, in the
  fixed order listed below, the `metric` value of the first matching row
  is taken.

  Returns a list of six values, one per word source.
  Raises IndexError when no row matches one of the word sources.
  """
  # the different types of evaluation, in the order the values are returned
  word_sources = ('LMD', 'OUR', 'OUR + LMD', 'LMD on LMD', 'OUR on LMD', 'OUR + LMD on LMD')

  # rows of the requested evaluation dataset and normalization setting;
  # this part of the filter is the same for every word source
  setup_mask = (df['Evaluation Dataset'] == eval_df) & (df['Lexicon Normalized'] == normalized)

  # for each word source keep the metric value of the first matching row
  return [
    df.loc[setup_mask & (df['Words Source'] == source), metric].values[0]
    for source in word_sources
  ]

def is_coef_irregular(coefs):
  """Tell whether a collection of distinct coefficient values looks irregular.

  The values are irregular when there is not exactly one of them and none
  of them is the single-backslash string.
  NOTE(review): the backslash is presumably the placeholder written for
  rows that have no coefficient - confirm against the CSV files.
  """
  # exactly one distinct value is always regular
  if len(coefs) == 1:
    return False

  # otherwise regular only when the backslash entry is among the values
  return '\\' not in coefs

def get_coefs(df):
  """Return the coefficients C1..C4 of a results table as a four-item list.

  For each of the columns 'C1', 'C2', 'C3' and 'C4' the first distinct
  value is returned. When is_coef_irregular flags the distinct values of
  any column, a message is printed; the values are still returned.
  """
  # distinct values of every coefficient column, in the order C1..C4
  distinct_values = [df[column].unique() for column in ('C1', 'C2', 'C3', 'C4')]

  # only report the problem, the first value of each column is used anyway
  if any(is_coef_irregular(values) for values in distinct_values):
    print('Missing values for coefficients')

  return [values[0] for values in distinct_values]

def create_summary_dataset(df, metric):
  """Build the wide summary table of one metric for one lexicon's results.

  The output has one row for every combination of evaluation dataset found
  in `df` and normalization setting (True first, then False). Each row
  holds the lexicon source, the normalization setting, the evaluation
  dataset, the decision maker, the coefficients C1..C4 and the six metric
  values returned by get_metric_values.

  Returns a pandas DataFrame with the columns listed at the end of the body.
  """
  # source lexicon name: the first distinct value of the column
  lexicon_source = df['Lexicon Source'].unique()[0]
  # distinct evaluation dataset names
  evaluation_datasets = df['Evaluation Dataset'].unique()
  # the coefficients are the same for every output row
  coefficients = get_coefs(df)
  # the decision maker is average_shap_values
  decision_maker = 'average_shap_values'

  # for each evaluation dataset: the normalized lexicon row, then the non-normalized one
  rows = [
    [lexicon_source, is_normalized, dataset, decision_maker]
    + coefficients
    + get_metric_values(df, dataset, is_normalized, metric)
    for dataset in evaluation_datasets
    for is_normalized in (True, False)
  ]

  # the last six names label the values returned by get_metric_values, in its order
  columns = [
    'Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset', 'Decision Maker',
    'C1', 'C2', 'C3', 'C4',
    'LM', 'XLex', 'XLex + LM', 'LM on LM', 'XLex on LM', 'XLex + LM on LM',
  ]

  return pd.DataFrame(rows, columns = columns)

In [None]:
# metrics to summarise, and the per-lexicon result tables they are summarised for
metrics = ['Accuracy', 'F1', 'MCC', 'Precision', 'Recall']
sources = [nasdaq_res, sentfin_res, fpb_res]
# maps a metric name to the summary table combining all source lexicons
metric_dfs_map = {}

# for each metric, build the summary of every source lexicon and stack them
for metric in metrics:
  # Build all per-source tables first and concatenate them once. Growing a
  # frame from an empty pd.DataFrame() re-copies the accumulated rows on each
  # step, and concatenating with an empty frame is deprecated in recent pandas.
  per_source_summaries = [create_summary_dataset(source, metric) for source in sources]

  # ignore_index gives the combined table one consecutive row index
  metric_dfs_map[metric] = pd.concat(per_source_summaries, ignore_index = True)

In [None]:
# Accuracy summary table, sorted by lexicon source (descending), then with
# normalized lexicons (True) before non-normalized ones, then by evaluation
# dataset (ascending). sort_values returns a sorted copy; the stored table is unchanged.
metric_dfs_map['Accuracy'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.302825,0.836158,0.842938,0.752809,0.783708,0.800562
4,nasdaq,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.313433,0.721393,0.731343,0.807692,0.794872,0.820513
0,nasdaq,True,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.29572,0.7607,0.776265,0.747541,0.72623,0.765574
6,nasdaq,True,sem_eval,average_shap_values,0.3,0.1,0.1,0.5,0.274788,0.745042,0.764873,0.751938,0.72093,0.775194
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.302825,0.837288,0.842938,0.752809,0.780899,0.794944
5,nasdaq,False,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.313433,0.696517,0.706468,0.807692,0.794872,0.820513
1,nasdaq,False,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.29572,0.750324,0.767834,0.747541,0.719672,0.763934
7,nasdaq,False,sem_eval,average_shap_values,0.3,0.1,0.1,0.5,0.274788,0.745042,0.756374,0.751938,0.728682,0.75969
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.302825,0.80791,0.80791,0.752809,0.800562,0.800562
12,fiqa_fpb_sentfin_neutral,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.313433,0.696517,0.711443,0.807692,0.807692,0.846154


In [None]:
# F1 summary table, sorted by lexicon source (descending), then with
# normalized lexicons (True) before non-normalized ones, then by evaluation
# dataset (ascending). sort_values returns a sorted copy; the stored table is unchanged.
metric_dfs_map['F1'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.286817,0.451388,0.714177,0.688254,0.447723,0.731035
4,nasdaq,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.297491,0.456897,0.47316,0.789379,0.739348,0.801743
0,nasdaq,True,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.295412,0.434178,0.465934,0.725735,0.437242,0.738891
6,nasdaq,True,sem_eval,average_shap_values,0.3,0.1,0.1,0.5,0.304723,0.664449,0.720986,0.751565,0.703827,0.77514
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.286817,0.457467,0.720312,0.688254,0.445964,0.725561
5,nasdaq,False,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.297491,0.436887,0.454212,0.789379,0.739348,0.801743
1,nasdaq,False,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.295412,0.42557,0.459255,0.725735,0.433463,0.737976
7,nasdaq,False,sem_eval,average_shap_values,0.3,0.1,0.1,0.5,0.304723,0.671526,0.713171,0.751565,0.717194,0.759459
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.286817,0.450993,0.689163,0.688254,0.474451,0.731035
12,fiqa_fpb_sentfin_neutral,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.297491,0.434311,0.455665,0.789379,0.751856,0.826923


In [None]:
# MCC summary table, sorted by lexicon source (descending), then with
# normalized lexicons (True) before non-normalized ones, then by evaluation
# dataset (ascending). sort_values returns a sorted copy; the stored table is unchanged.
metric_dfs_map['MCC'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.191545,0.361564,0.452809,0.489058,0.363621,0.538721
4,nasdaq,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.215596,0.372176,0.431853,0.593824,0.491936,0.614887
0,nasdaq,True,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.208682,0.305891,0.417734,0.519298,0.312068,0.524179
6,nasdaq,True,sem_eval,average_shap_values,0.3,0.1,0.1,0.5,0.264968,0.332091,0.476881,0.566951,0.417018,0.601202
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.191545,0.382939,0.4706,0.489058,0.35949,0.531283
5,nasdaq,False,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.215596,0.312675,0.374014,0.593824,0.491936,0.614887
1,nasdaq,False,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.208682,0.280815,0.398645,0.519298,0.301408,0.525004
7,nasdaq,False,sem_eval,average_shap_values,0.3,0.1,0.1,0.5,0.264968,0.349471,0.464952,0.566951,0.436695,0.578295
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.191545,0.392105,0.433814,0.489058,0.460456,0.538721
12,fiqa_fpb_sentfin_neutral,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.215596,0.302576,0.370605,0.593824,0.522734,0.659244


In [None]:
# Precision summary table, sorted by lexicon source (descending), then with
# normalized lexicons (True) before non-normalized ones, then by evaluation
# dataset (ascending). sort_values returns a sorted copy; the stored table is unchanged.
metric_dfs_map['Precision'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.456199,0.439929,0.686288,0.684298,0.436851,0.709835
4,nasdaq,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.520018,0.456807,0.471765,0.780027,0.769444,0.791667
0,nasdaq,True,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.485867,0.429447,0.455991,0.728801,0.435152,0.733139
6,nasdaq,True,sem_eval,average_shap_values,0.3,0.1,0.1,0.5,0.525425,0.65774,0.709641,0.788138,0.716979,0.802323
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.456199,0.444206,0.690528,0.684298,0.43529,0.706073
5,nasdaq,False,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.520018,0.437381,0.453451,0.780027,0.769444,0.791667
1,nasdaq,False,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.485867,0.421067,0.450001,0.728801,0.431,0.73314
7,nasdaq,False,sem_eval,average_shap_values,0.3,0.1,0.1,0.5,0.525425,0.66301,0.703452,0.788138,0.721939,0.792785
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.456199,0.435774,0.664505,0.684298,0.459885,0.709835
12,fiqa_fpb_sentfin_neutral,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.520018,0.437637,0.455688,0.780027,0.792189,0.817143


In [None]:
# Recall summary table, sorted by lexicon source (descending), then with
# normalized lexicons (True) before non-normalized ones, then by evaluation
# dataset (ascending). sort_values returns a sorted copy; the stored table is unchanged.
metric_dfs_map['Recall'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.251752,0.470459,0.775159,0.824444,0.476696,0.845772
4,nasdaq,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.210644,0.457918,0.483562,0.814815,0.724537,0.824074
0,nasdaq,True,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.239366,0.441748,0.491661,0.794656,0.44008,0.794635
6,nasdaq,True,sem_eval,average_shap_values,0.3,0.1,0.1,0.5,0.261609,0.674788,0.771196,0.778889,0.70037,0.798889
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.251752,0.481179,0.790593,0.824444,0.475581,0.842428
5,nasdaq,False,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.210644,0.436992,0.462636,0.814815,0.724537,0.824074
1,nasdaq,False,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.239366,0.433334,0.484876,0.794656,0.437123,0.795563
7,nasdaq,False,sem_eval,average_shap_values,0.3,0.1,0.1,0.5,0.261609,0.687305,0.76564,0.778889,0.714815,0.785556
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.251752,0.50085,0.786001,0.824444,0.52125,0.845772
12,fiqa_fpb_sentfin_neutral,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.210644,0.431036,0.459078,0.814815,0.733796,0.842593
