<a href="https://colab.research.google.com/github/hristijanpeshov/SHAP-Explainable-Lexicon-Model/blob/master/notebooks/FinBERT%20notebooks/FinBERT_model_evaluation_all_metrics_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# User Input

In [None]:
# User-supplied paths: set each variable to the summary_df.csv file of the
# corresponding lexicon. The file is located in the results folder of that lexicon.
# These variables are read by the pd.read_csv calls in the next cell.

nasdaq_res_loc = '/content/drive/MyDrive/finbert process/nasdaq/concatenated datasets/results/summary_df.csv'
fpb_res_loc = '/content/drive/MyDrive/finbert process/fpb/concatenated datasets/results/summary_df.csv'
sentfin_res_loc = '/content/drive/MyDrive/finbert process/sentfin/concatenated datasets/results/summary_df.csv'

# All metrics summary

In [None]:
import pandas as pd

# Load one summary table per lexicon from the CSV locations entered in the
# "User Input" cell (nasdaq_res_loc, fpb_res_loc, sentfin_res_loc).
nasdaq_res = pd.read_csv(nasdaq_res_loc)
fpb_res = pd.read_csv(fpb_res_loc)
sentfin_res = pd.read_csv(sentfin_res_loc)

In [None]:
# Overwrite the whole 'Lexicon Source' column of the SentFin and FPB tables
# with a single fixed label (the frames are modified in place).
# nasdaq_res keeps whatever 'Lexicon Source' values were loaded from its CSV.
# NOTE(review): presumably the labels stored in these two CSVs differ from the
# names wanted in the final tables - confirm against the result files.
sentfin_res['Lexicon Source'] = 'fiqa_fpb_sentfin_neutral'
fpb_res['Lexicon Source'] = 'financial_phrase_bank'

In [None]:
def get_metric_values(df, eval_df, normalized, metric):
  """Collect one metric for every word source of a single evaluation setup.

  Args:
    df: results table holding the columns 'Evaluation Dataset',
      'Lexicon Normalized' and 'Words Source', plus the column named by
      ``metric``.
    eval_df: value to match in the 'Evaluation Dataset' column.
    normalized: value to match in the 'Lexicon Normalized' column.
    metric: name of the column to read the value from.

  Returns:
    A list with one value per word source, always in this order:
    LMD, OUR, OUR + LMD, LMD on LMD, OUR on LMD, OUR + LMD on LMD.
    If several rows match a word source, the first matching row is used.

  Raises:
    IndexError: if no row matches the requested combination for one of the
      word sources; the message names the combination that was not found.
    KeyError: if one of the required columns is missing from ``df``.
  """
  # the different types of evaluation
  word_sources = ['LMD', 'OUR', 'OUR + LMD', 'LMD on LMD', 'OUR on LMD', 'OUR + LMD on LMD']

  # these two filters do not depend on the word source, so they are built once
  setup_mask = (df['Evaluation Dataset'] == eval_df) & (df['Lexicon Normalized'] == normalized)

  all_metric_values = []

  for ws in word_sources:
    # rows of the selected setup that belong to the current word source
    matches = df.loc[setup_mask & (df['Words Source'] == ws), metric]

    if matches.empty:
      # same exception type as plain indexing would raise, but it says what is missing
      raise IndexError(
          f"no row for Evaluation Dataset={eval_df!r}, "
          f"Lexicon Normalized={normalized!r}, Words Source={ws!r}"
      )

    all_metric_values.append(matches.values[0])

  return all_metric_values

def is_coef_irregular(coefs):
  """Tell whether a collection of coefficient values looks inconsistent.

  A collection holding exactly one value is regular. Any other size is
  irregular, unless the collection contains the one-character backslash
  string, in which case it is still treated as regular.
  """
  # a single value is the expected, regular case
  if len(coefs) == 1:
    return False

  # otherwise only the presence of the backslash entry makes it regular
  return '\\' not in coefs

def get_coefs(df):
  """Return the first distinct value of each coefficient column C1..C4.

  The distinct values of every column are checked with is_coef_irregular;
  if any column is flagged, a warning line is printed. The function still
  returns the first distinct value of each column in that case.

  Returns:
    A four-element list [C1, C2, C3, C4].
  """
  # distinct values per coefficient column, in C1..C4 order
  distinct = [df[name].unique() for name in ('C1', 'C2', 'C3', 'C4')]

  # one flagged column is enough to warn
  if any(is_coef_irregular(values) for values in distinct):
    print('Missing values for coefficients')

  return [values[0] for values in distinct]

def create_summary_dataset(df, metric):
  """Build the summary table of one metric for a single source lexicon.

  For every evaluation dataset found in ``df`` two rows are produced, first
  with 'Lexicon Normalized' True and then False. Each row carries the
  lexicon source, the normalization flag, the evaluation dataset, the
  decision maker, the coefficients C1..C4 and the six metric values
  returned by get_metric_values.

  Args:
    df: results table of one source lexicon.
    metric: name of the metric column to summarize.

  Returns:
    A pandas DataFrame with one row per (evaluation dataset, normalized flag).
  """
  # the first distinct 'Lexicon Source' value names the source lexicon
  lexicon_name = df['Lexicon Source'].unique()[0]
  # evaluation dataset names, in order of first appearance
  eval_names = df['Evaluation Dataset'].unique()
  # coefficients are read once and repeated on every row
  coefs = get_coefs(df)
  # the decision maker is always average_shap_values
  decision_maker = 'average_shap_values'

  # normalized lexicon first, then the non-normalized one, per evaluation dataset
  rows = [
      [lexicon_name, flag, eval_name, decision_maker] + coefs + get_metric_values(df, eval_name, flag, metric)
      for eval_name in eval_names
      for flag in (True, False)
  ]

  # the last six labels follow, position by position, the word-source order
  # used by get_metric_values (LMD -> LM, OUR -> XLex, ...)
  columns = ['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset', 'Decision Maker', 'C1', 'C2', 'C3', 'C4',
             'LM', 'XLex', 'XLex + LM', 'LM on LM', 'XLex on LM', 'XLex + LM on LM']

  return pd.DataFrame(rows, columns = columns)

In [None]:
metrics = ['Accuracy', 'F1', 'MCC', 'Precision', 'Recall']
sources = [nasdaq_res, sentfin_res, fpb_res]
# metric name -> summary table covering all source lexicons
metric_dfs_map = {}

# for each metric, build the summary of every source lexicon and stack them
for metric in metrics:
  # collect the per-source tables first and concatenate once: this avoids
  # seeding the result with an empty DataFrame and re-copying the growing
  # table on every iteration
  summary_datasets = [create_summary_dataset(source, metric) for source in sources]

  metric_dfs_map[metric] = pd.concat(summary_datasets, ignore_index = True)

In [None]:
# Accuracy results. Rows are ordered by lexicon source (descending), then by
# the normalized flag (descending, so normalized rows come first), then by
# evaluation dataset (ascending). sort_values is not called in place here.
metric_dfs_map['Accuracy'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
4,nasdaq,True,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.40201,0.718593,0.733668,0.76555,0.755981,0.784689
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.310734,0.768362,0.732203,0.626424,0.728929,0.656036
6,nasdaq,True,fiqa_labeled_df,average_shap_values,0.5,0.1,0.1,0.5,0.313433,0.761194,0.766169,0.797468,0.835443,0.848101
0,nasdaq,True,fpb_fiqa,average_shap_values,0.5,0.1,0.1,0.5,0.293774,0.747082,0.722438,0.668142,0.756637,0.70059
5,nasdaq,False,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.40201,0.721106,0.741206,0.76555,0.741627,0.779904
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.310734,0.693785,0.683616,0.626424,0.678815,0.658314
7,nasdaq,False,fiqa_labeled_df,average_shap_values,0.5,0.1,0.1,0.5,0.313433,0.761194,0.756219,0.797468,0.848101,0.835443
1,nasdaq,False,fpb_fiqa,average_shap_values,0.5,0.1,0.1,0.5,0.293774,0.70882,0.697795,0.668142,0.724189,0.699115
12,fiqa_fpb_sentfin_neutral,True,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.40201,0.741206,0.778894,0.76555,0.746411,0.818182
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.310734,0.830508,0.79435,0.626424,0.785877,0.712984


In [None]:
# F1 results. Rows are ordered by lexicon source (descending), then by
# the normalized flag (descending, so normalized rows come first), then by
# evaluation dataset (ascending). sort_values is not called in place here.
metric_dfs_map['F1'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
4,nasdaq,True,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.338536,0.718016,0.7335,0.742927,0.751521,0.768036
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.271732,0.421435,0.413188,0.58208,0.42893,0.604851
6,nasdaq,True,fiqa_labeled_df,average_shap_values,0.5,0.1,0.1,0.5,0.293063,0.496581,0.500838,0.774929,0.803144,0.820455
0,nasdaq,True,fpb_fiqa,average_shap_values,0.5,0.1,0.1,0.5,0.277328,0.448329,0.44053,0.646707,0.472131,0.672365
5,nasdaq,False,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.338536,0.721062,0.739825,0.742927,0.735021,0.762218
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.271732,0.389312,0.392027,0.58208,0.40504,0.606725
7,nasdaq,False,fiqa_labeled_df,average_shap_values,0.5,0.1,0.1,0.5,0.293063,0.497711,0.495672,0.774929,0.815994,0.807714
1,nasdaq,False,fpb_fiqa,average_shap_values,0.5,0.1,0.1,0.5,0.277328,0.427824,0.427306,0.646707,0.453774,0.671062
12,fiqa_fpb_sentfin_neutral,True,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.338536,0.739626,0.778889,0.742927,0.744068,0.807607
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.271732,0.708175,0.683988,0.58208,0.701904,0.65262


In [None]:
# MCC results. Rows are ordered by lexicon source (descending), then by
# the normalized flag (descending, so normalized rows come first), then by
# evaluation dataset (ascending). sort_values is not called in place here.
metric_dfs_map['MCC'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
4,nasdaq,True,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.253978,0.438365,0.468336,0.530581,0.503078,0.56583
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.184529,0.314963,0.336675,0.373037,0.360861,0.390486
6,nasdaq,True,fiqa_labeled_df,average_shap_values,0.5,0.1,0.1,0.5,0.207771,0.497095,0.510956,0.569203,0.606641,0.642331
0,nasdaq,True,fpb_fiqa,average_shap_values,0.5,0.1,0.1,0.5,0.181591,0.374968,0.377897,0.418258,0.442806,0.436482
5,nasdaq,False,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.253978,0.442569,0.488762,0.530581,0.470603,0.556386
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.184529,0.278537,0.317946,0.373037,0.330772,0.392545
7,nasdaq,False,fiqa_labeled_df,average_shap_values,0.5,0.1,0.1,0.5,0.207771,0.503234,0.50201,0.569203,0.631988,0.618507
1,nasdaq,False,fpb_fiqa,average_shap_values,0.5,0.1,0.1,0.5,0.181591,0.334136,0.353755,0.418258,0.404264,0.434766
12,fiqa_fpb_sentfin_neutral,True,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.253978,0.487229,0.557778,0.530581,0.490943,0.631744
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.184529,0.454297,0.442919,0.373037,0.461729,0.445013


In [None]:
# Precision results. Rows are ordered by lexicon source (descending), then by
# the normalized flag (descending, so normalized rows come first), then by
# evaluation dataset (ascending). sort_values is not called in place here.
metric_dfs_map['Precision'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
4,nasdaq,True,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.529971,0.719984,0.734524,0.794956,0.751211,0.805284
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.424419,0.412915,0.412669,0.636628,0.426389,0.643064
6,nasdaq,True,fiqa_labeled_df,average_shap_values,0.5,0.1,0.1,0.5,0.509857,0.49652,0.500685,0.764785,0.799621,0.813704
0,nasdaq,True,fpb_fiqa,average_shap_values,0.5,0.1,0.1,0.5,0.451807,0.441445,0.438172,0.67771,0.465235,0.68566
5,nasdaq,False,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.529971,0.721383,0.747203,0.794956,0.736765,0.801569
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.424419,0.397854,0.405444,0.636628,0.415805,0.643845
7,nasdaq,False,fiqa_labeled_df,average_shap_values,0.5,0.1,0.1,0.5,0.509857,0.497773,0.496381,0.764785,0.815994,0.798984
1,nasdaq,False,fpb_fiqa,average_shap_values,0.5,0.1,0.1,0.5,0.451807,0.426595,0.430099,0.67771,0.451223,0.684899
12,fiqa_fpb_sentfin_neutral,True,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.529971,0.746428,0.778889,0.794956,0.743434,0.831398
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.424419,0.679563,0.662721,0.636628,0.68278,0.66485


In [None]:
# Recall results. Rows are ordered by lexicon source (descending), then by
# the normalized flag (descending, so normalized rows come first), then by
# evaluation dataset (ascending). sort_values is not called in place here.
metric_dfs_map['Recall'].sort_values(by=['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset'], ascending=[False, False, True])

Unnamed: 0,Lexicon Source,Lexicon Normalized,Evaluation Dataset,Decision Maker,C1,C2,C3,C4,LM,XLex,XLex + LM,LM on LM,XLex on LM,XLex + LM on LM
4,nasdaq,True,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.268535,0.718384,0.733813,0.738609,0.751867,0.762185
2,nasdaq,True,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.288207,0.472915,0.492574,0.754626,0.489715,0.766454
6,nasdaq,True,fiqa_labeled_df,average_shap_values,0.5,0.1,0.1,0.5,0.207666,0.503907,0.509283,0.805901,0.807065,0.828804
0,nasdaq,True,fpb_fiqa,average_shap_values,0.5,0.1,0.1,0.5,0.239376,0.477886,0.48485,0.746103,0.498576,0.75654
5,nasdaq,False,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.268535,0.721187,0.741591,0.738609,0.733847,0.756629
3,nasdaq,False,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.288207,0.467642,0.489489,0.754626,0.481417,0.767809
7,nasdaq,False,fiqa_labeled_df,average_shap_values,0.5,0.1,0.1,0.5,0.207666,0.506885,0.507465,0.805901,0.815994,0.819876
1,nasdaq,False,fpb_fiqa,average_shap_values,0.5,0.1,0.1,0.5,0.239376,0.466072,0.477061,0.746103,0.487299,0.755574
12,fiqa_fpb_sentfin_neutral,True,dev_df,average_shap_values,0.5,0.1,0.1,0.5,0.268535,0.740833,0.778889,0.738609,0.747526,0.801074
10,fiqa_fpb_sentfin_neutral,True,financial_phrase_bank,average_shap_values,0.5,0.1,0.1,0.5,0.288207,0.787345,0.8014,0.754626,0.791599,0.800329
