<a href="https://colab.research.google.com/github/hristijanpeshov/SHAP-Explainable-Lexicon-Model/blob/master/notebooks/RoBERTa%20notebooks/model_evaluation_all_metrics_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# User Input

In [None]:
# Enter the location of the summary_df.csv file for each of the lexicons.
# Each file is located in the results folder of the corresponding lexicon.
# The three paths below are read with pd.read_csv in the "All metrics summary" section.
# NOTE(review): the defaults live under /content/drive - presumably Google Drive
# mounted in Colab; adjust them to your own setup.

nasdaq_res_loc = '/content/drive/MyDrive/nasdaq/concatenated datasets/results/summary_df.csv'
fpb_res_loc = '/content/drive/MyDrive/fpb/concatenated datasets/results/summary_df.csv'
sentfin_res_loc = '/content/drive/MyDrive/sentfin/concatenated datasets/results/summary_df.csv'

# All metrics summary

In [None]:
import pandas as pd

# load the summary table of each lexicon, in the order nasdaq, fpb, sentfin
nasdaq_res, fpb_res, sentfin_res = [
  pd.read_csv(location)
  for location in (nasdaq_res_loc, fpb_res_loc, sentfin_res_loc)
]

In [None]:
# overwrite the 'Lexicon Source' column of the sentfin and fpb results with one
# fixed label per table; create_summary_dataset reads the first unique value of
# this column as the name of the source lexicon
# NOTE(review): nasdaq_res is not relabelled here - presumably its
# 'Lexicon Source' column already holds the desired name; confirm
sentfin_res['Lexicon Source'] = 'fiqa_fpb_sentfin_neutral'
fpb_res['Lexicon Source'] = 'financial_phrase_bank'

In [None]:
def get_metric_values(df, eval_df, normalized, metric):
  """Collect one metric value per evaluation type for a dataset/normalization pair.

  Args:
    df: results table with the columns 'Evaluation Dataset',
      'Lexicon Normalized', 'Words Source' and the column named by `metric`.
    eval_df: value of 'Evaluation Dataset' to select.
    normalized: value of 'Lexicon Normalized' to select.
    metric: name of the metric column to read.

  Returns:
    A list of six values, ordered as 'LMD', 'OUR', 'OUR + LMD', 'LMD on LMD',
    'OUR on LMD', 'OUR + LMD on LMD'. When several rows match a combination,
    the first one is used.

  Raises:
    IndexError: if no row matches one of the six combinations.
  """
  # the different types of evaluation
  word_sources = ['LMD', 'OUR', 'OUR + LMD', 'LMD on LMD', 'OUR on LMD', 'OUR + LMD on LMD']

  # these two filters do not depend on the word source, so they are built once
  # instead of being recomputed on every loop iteration
  dataset_mask = (df['Evaluation Dataset'] == eval_df) & (df['Lexicon Normalized'] == normalized)

  all_metric_values = []

  for ws in word_sources:
    matches = df.loc[dataset_mask & (df['Words Source'] == ws), metric]

    # fail with a message naming the missing combination rather than a bare
    # "index 0 is out of bounds" error
    if matches.empty:
      raise IndexError(
        f'No row with Evaluation Dataset={eval_df!r}, '
        f'Lexicon Normalized={normalized!r}, Words Source={ws!r}'
      )

    all_metric_values.append(matches.values[0])

  return all_metric_values

def is_coef_irregular(coefs):
  """Tell whether a collection of unique coefficient values looks inconsistent.

  Exactly one value is regular. Any other number of values is regular only
  when the single-backslash placeholder string is among them.
  """
  # a single unique value is the expected case
  if len(coefs) == 1:
    return False

  has_placeholder = '\\' in coefs
  return not has_placeholder

def get_coefs(df):
  """Return the first unique value of each coefficient column C1, C2, C3, C4.

  Prints 'Missing values for coefficients' when is_coef_irregular flags the
  unique values of any of the four columns; the values are returned either way.
  """
  # unique values per coefficient column, in the order C1..C4
  unique_per_column = [df[name].unique() for name in ('C1', 'C2', 'C3', 'C4')]

  if any(is_coef_irregular(values) for values in unique_per_column):
    print('Missing values for coefficients')

  return [values[0] for values in unique_per_column]

def create_summary_dataset(df, metric):
  """Build the summary table of one metric for a single source lexicon.

  For every evaluation dataset found in `df`, two rows are produced: first for
  the normalized lexicon, then for the non-normalized one. Each row holds the
  lexicon name, the normalization flag, the evaluation dataset, the decision
  maker, the coefficients C1..C4 and the six metric values returned by
  get_metric_values (placed, in order, under the columns 'LM', 'XLex',
  'XLex + LM', 'LM on LM', 'XLex on LM', 'XLex + LM on LM').
  """
  # name of the source lexicon (first unique value of the column)
  lexicon_name = df['Lexicon Source'].unique()[0]
  # every evaluation dataset present in the results
  evaluation_datasets = df['Evaluation Dataset'].unique()
  # coefficients C1..C4, repeated on every row of the summary
  coefficients = get_coefs(df)
  # the decision maker is average_shap_values
  decision_maker = 'average_shap_values'

  # one row per (evaluation dataset, normalization flag) pair
  rows = [
    [lexicon_name, is_normalized, dataset, decision_maker]
    + coefficients
    + get_metric_values(df, dataset, is_normalized, metric)
    for dataset in evaluation_datasets
    for is_normalized in (True, False)
  ]

  columns = [
    'Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset', 'Decision Maker',
    'C1', 'C2', 'C3', 'C4',
    'LM', 'XLex', 'XLex + LM', 'LM on LM', 'XLex on LM', 'XLex + LM on LM',
  ]

  return pd.DataFrame(rows, columns=columns)

In [None]:
metrics = ['Accuracy', 'F1', 'MCC', 'Precision', 'Recall']
sources = [nasdaq_res, sentfin_res, fpb_res]
metric_dfs_map = {}

# for each metric, build the summary of every source lexicon and stack them into
# a single table keyed by the metric name; the per-source tables are collected
# first and concatenated once, instead of growing a DataFrame through repeated
# pd.concat calls that start from an empty frame
for metric in metrics:
  per_source_summaries = [create_summary_dataset(source, metric) for source in sources]
  metric_dfs_map[metric] = pd.concat(per_source_summaries, ignore_index=True)

In [None]:
# show the Accuracy summary, ordered by lexicon source (descending), then
# normalized lexicons before non-normalized ones, then evaluation dataset (ascending)
metric_dfs_map['Accuracy'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
4,nasdaq,True,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.366834,0.68593,0.756281,0.793478,0.684783,0.836957
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.306215,0.836158,0.842938,0.744505,0.791209,0.807692
6,nasdaq,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.258706,0.721393,0.701493,0.732394,0.788732,0.732394
0,nasdaq,True,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.282101,0.7607,0.765888,0.722591,0.737542,0.750831
5,nasdaq,False,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.366834,0.698492,0.756281,0.793478,0.701087,0.826087
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.306215,0.837288,0.844068,0.744505,0.785714,0.802198
7,nasdaq,False,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.258706,0.696517,0.681592,0.732394,0.774648,0.732394
1,nasdaq,False,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.282101,0.750324,0.759403,0.722591,0.725914,0.749169
12,fiqa_fpb_sentfin_neutral,True,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.366834,0.698492,0.751256,0.793478,0.711957,0.826087
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.306215,0.80791,0.80904,0.744505,0.804945,0.807692


In [None]:
# show the F1 summary, ordered by lexicon source (descending), then
# normalized lexicons before non-normalized ones, then evaluation dataset (ascending)
metric_dfs_map['F1'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
4,nasdaq,True,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.32673,0.676197,0.754981,0.77971,0.684447,0.831193
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.284005,0.451388,0.715744,0.676389,0.443727,0.731394
6,nasdaq,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.248774,0.456897,0.451254,0.70933,0.734877,0.70933
0,nasdaq,True,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.276376,0.434178,0.456577,0.694347,0.439349,0.717052
5,nasdaq,False,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.32673,0.69195,0.755465,0.77971,0.701078,0.819165
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.284005,0.457467,0.721581,0.676389,0.442659,0.725949
7,nasdaq,False,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.248774,0.436887,0.435128,0.70933,0.721569,0.70933
1,nasdaq,False,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.276376,0.42557,0.450729,0.694347,0.433546,0.716267
12,fiqa_fpb_sentfin_neutral,True,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.32673,0.695719,0.751129,0.77971,0.711539,0.819165
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.284005,0.450993,0.690291,0.676389,0.471998,0.731394


In [None]:
# show the MCC summary, ordered by lexicon source (descending), then
# normalized lexicons before non-normalized ones, then evaluation dataset (ascending)
metric_dfs_map['MCC'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
4,nasdaq,True,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.250719,0.393855,0.517006,0.584795,0.406343,0.66763
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.187135,0.361564,0.457278,0.473721,0.345545,0.53294
6,nasdaq,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.156333,0.372176,0.366498,0.458816,0.470179,0.458816
0,nasdaq,True,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.183169,0.305891,0.389196,0.470736,0.320263,0.491504
5,nasdaq,False,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.250719,0.412849,0.515172,0.584795,0.424414,0.645837
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.187135,0.382939,0.472566,0.473721,0.346857,0.525424
7,nasdaq,False,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.156333,0.312675,0.315961,0.458816,0.443137,0.458816
1,nasdaq,False,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.183169,0.280815,0.37136,0.470736,0.305452,0.492904
12,fiqa_fpb_sentfin_neutral,True,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.250719,0.403084,0.502723,0.584795,0.435359,0.645837
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.187135,0.392105,0.435424,0.473721,0.449596,0.53294


In [None]:
# show the Precision summary, ordered by lexicon source (descending), then
# normalized lexicons before non-normalized ones, then evaluation dataset (ascending)
metric_dfs_map['Precision'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
4,nasdaq,True,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.541325,0.709538,0.761097,0.811987,0.705405,0.840852
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.450286,0.439929,0.687356,0.675429,0.433276,0.707298
6,nasdaq,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.472043,0.456807,0.450774,0.708065,0.738866,0.708065
0,nasdaq,True,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.467712,0.429447,0.447605,0.701568,0.435567,0.71275
5,nasdaq,False,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.541325,0.715449,0.759171,0.811987,0.711914,0.831643
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.450286,0.444206,0.691697,0.675429,0.431992,0.70342
7,nasdaq,False,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.472043,0.437381,0.434994,0.708065,0.721569,0.708065
1,nasdaq,False,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.467712,0.421067,0.442364,0.701568,0.429296,0.712997
12,fiqa_fpb_sentfin_neutral,True,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.541325,0.705116,0.751562,0.811987,0.716139,0.831643
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.450286,0.435774,0.665355,0.675429,0.456968,0.707298


In [None]:
# show the Recall summary, ordered by lexicon source (descending), then
# normalized lexicons before non-normalized ones, then evaluation dataset (ascending)
metric_dfs_map['Recall'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
4,nasdaq,True,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.244949,0.685076,0.755934,0.774038,0.700962,0.826923
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.250471,0.470459,0.779017,0.819805,0.467532,0.842532
6,nasdaq,True,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.172352,0.457918,0.460238,0.752941,0.731373,0.752941
0,nasdaq,True,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.220588,0.441748,0.480789,0.774836,0.445344,0.783873
5,nasdaq,False,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.244949,0.697778,0.75601,0.774038,0.7125,0.814423
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.250471,0.481179,0.791239,0.819805,0.470238,0.839286
7,nasdaq,False,fiqa_labeled_df,average_shap_values,0.3,0.1,0.1,0.5,0.172352,0.436992,0.441711,0.752941,0.721569,0.752941
1,nasdaq,False,fpb_fiqa,average_shap_values,0.3,0.1,0.1,0.5,0.220588,0.433334,0.473994,0.774836,0.441836,0.785162
12,fiqa_fpb_sentfin_neutral,True,dev_df,average_shap_values,0.3,0.1,0.1,0.5,0.244949,0.69803,0.751162,0.774038,0.719231,0.814423
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.3,0.1,0.1,0.5,0.250471,0.50085,0.786647,0.819805,0.516775,0.842532
