In [1]:
import glob
import pandas as pd 
import numpy as np
import os
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, jaccard_score

# Raise pandas' display limits so frames with up to 500 columns / 500 rows
# are rendered in full instead of being truncated.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

# We provide the minimal code to evaluate the CWRI results

In [5]:
# Download the answer sheet from Google Drive and place it under the data folder first.
# The first CSV column is used as the index; all remaining cells are flattened
# into a single 1-D array.
answer_sheet = pd.read_csv('../data/answer_sheet.csv', index_col=0).values.flatten()
print(answer_sheet)

[1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0 0 0 0 1 1 0 0 1 1 1
 1 1 1 0 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1 1 0 0 1 1 1 1
 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0]


# Random Evaluation

In [4]:
# Random baseline: score uniformly random 0/1 predictions against the answer sheet
# and report the mean ± std of each metric over many trials.

N_TRIALS = 1000  # number of random predictions to average over
# Local, explicitly seeded generator: the reported mean ± std become reproducible
# across kernel restarts without touching the global np.random state.
rng = np.random.default_rng(0)

recall_list = []
precision_list = []
f1_list = []
accuracy_list = []
jaccard_list = []

for _ in range(N_TRIALS):
    # Random array filled with 0 or 1 (upper bound 2 is exclusive).
    # In practice, this would be the binarized output of class-wise importance from your model.
    random_array = rng.integers(2, size=answer_sheet.shape)

    recall = recall_score(answer_sheet, random_array)
    precision = precision_score(answer_sheet, random_array)
    f1 = f1_score(answer_sheet, random_array)
    accuracy = accuracy_score(answer_sheet, random_array)
    jaccard = jaccard_score(answer_sheet, random_array)

    recall_list.append(recall)
    precision_list.append(precision)
    f1_list.append(f1)
    accuracy_list.append(accuracy)
    jaccard_list.append(jaccard)


print(f"f1: {np.mean(f1_list):.3f} ± {np.std(f1_list):.3f}")
print(f"jaccard: {np.mean(jaccard_list):.3f} ± {np.std(jaccard_list):.3f}")
print(f"Interpret accuracy: {np.mean(accuracy_list):.3f} ± {np.std(accuracy_list):.3f}")
print(f"recall: {np.mean(recall_list):.3f} ± {np.std(recall_list):.3f}")
print(f"precision: {np.mean(precision_list):.3f} ± {np.std(precision_list):.3f}")


f1: 0.495 ± 0.058
jaccard: 0.331 ± 0.051
Interpret accuracy: 0.498 ± 0.052
recall: 0.494 ± 0.071
precision: 0.498 ± 0.053


In [6]:
# Flip every label (0 <-> 1) so the same metrics can be computed against
# the reversed answer sheet.
reversed_answer_sheet = np.subtract(1, answer_sheet)
print(reversed_answer_sheet)

[0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 1 1 0 0 1 1 0 0 0
 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 1 1 0 0 0 0
 0 0 1 1 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1]


In [7]:
# Random baseline against the reversed answer sheet: score uniformly random 0/1
# predictions and report the mean ± std of each metric over many trials.

N_TRIALS = 1000  # number of random predictions to average over
# Local, explicitly seeded generator: the reported mean ± std become reproducible
# across kernel restarts without touching the global np.random state.
rng = np.random.default_rng(1)

recall_list = []
precision_list = []
f1_list = []
accuracy_list = []
jaccard_list = []

for _ in range(N_TRIALS):
    # Random array filled with 0 or 1 (upper bound 2 is exclusive).
    # In practice, this would be the binarized output of class-wise importance from your model.
    random_array = rng.integers(2, size=reversed_answer_sheet.shape)

    recall = recall_score(reversed_answer_sheet, random_array)
    precision = precision_score(reversed_answer_sheet, random_array)
    f1 = f1_score(reversed_answer_sheet, random_array)
    accuracy = accuracy_score(reversed_answer_sheet, random_array)
    jaccard = jaccard_score(reversed_answer_sheet, random_array)

    recall_list.append(recall)
    precision_list.append(precision)
    f1_list.append(f1)
    accuracy_list.append(accuracy)
    jaccard_list.append(jaccard)


print(f"f1: {np.mean(f1_list):.3f} ± {np.std(f1_list):.3f}")
print(f"jaccard: {np.mean(jaccard_list):.3f} ± {np.std(jaccard_list):.3f}")
print(f"Interpret accuracy: {np.mean(accuracy_list):.3f} ± {np.std(accuracy_list):.3f}")
print(f"recall: {np.mean(recall_list):.3f} ± {np.std(recall_list):.3f}")
print(f"precision: {np.mean(precision_list):.3f} ± {np.std(precision_list):.3f}")


f1: 0.497 ± 0.057
jaccard: 0.332 ± 0.050
Interpret accuracy: 0.498 ± 0.050
recall: 0.499 ± 0.072
precision: 0.498 ± 0.051


### In our paper, we report the result that yielded the highest F1 score

Please check our `Appendix E: Evaluation of CWRI Metrics` for further explanation.