## X-Ray Abnormality Detection | Evaluation

> **Antonopoulos Ilias** ( *p3352004* ) <br />
> **Ndoja Silva** ( *p3352017* ) <br />
> **MSc in Data Science, AUEB**

## Table of Contents


* [CNN architecture with sensible defaults](#CNN-architecture-with-sensible-defaults,-all-study-types)
* [DenseNet169 architecture, pretrained on ImageNet](#DenseNet169-architecture,-pretrained-on-ImageNet)
* [VGG19 architecture, pretrained on ImageNet](#VGG19-architecture,-pretrained-on-ImageNet)
* [Ensemble model: generic CNN + wrist CNN](#Ensemble-model:-generic-CNN-+-wrist-CNN)
* [Ensemble model: DenseNet169 + VGG19](#Ensemble-model:-DenseNet169-+-VGG19)

In [1]:
import os
import pathlib
import random
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn import metrics
from sklearn.utils import shuffle

from utils import (
    clean_up,
    F1Score,
    inspect_df,
    study_oriented_transformation,
    study_oriented_transformation_on_ensemble,
)

In [2]:
# Record the TensorFlow version this evaluation was run with.
print(tf.__version__)

2.8.0


In [3]:
# One seed shared by every random number generator touched in this notebook.
SEED = 99910123

np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = f"{SEED}"

In [4]:
# Dataset root, plus the image size passed as `target_size` to the data generators.
DATASET_DIR = "../data/MURA-v1.1/"
IMAGE_WIDTH, IMAGE_HEIGHT = 224, 224

In [5]:
# Study types, in the order the per-type reports are printed.
STUDY_TYPES = [
    f"XR_{region}"
    for region in (
        "SHOULDER",
        "ELBOW",
        "HUMERUS",
        "HAND",
        "WRIST",
        "FOREARM",
        "FINGER",
    )
]

In [6]:
# Metric objects whose `.name` labels the scores returned by `model.evaluate`
# (after the loss), in this order.
keras_metrics = tf.keras.metrics

METRICS = [
    # confusion-matrix counts
    keras_metrics.TruePositives(name="tp"),
    keras_metrics.FalsePositives(name="fp"),
    keras_metrics.TrueNegatives(name="tn"),
    keras_metrics.FalseNegatives(name="fn"),
    # threshold-based scores
    keras_metrics.BinaryAccuracy(name="binary_acc"),
    keras_metrics.Precision(name="precision"),
    keras_metrics.Recall(name="recall"),
    F1Score(name="f1_score"),
    # ranking-based scores
    keras_metrics.AUC(name="roc_auc", curve="ROC"),
    keras_metrics.AUC(name="pr_auc", curve="PR"),
    # agreement
    tfa.metrics.CohenKappa(name="cohen_kappa", num_classes=2),
]

2022-04-03 17:17:17.313690: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-03 17:17:17.340725: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-03 17:17:17.340900: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-03 17:17:17.341735: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [7]:
# The CSV is read without a header row; its single column is named `image_path`.
valid_images_csv = os.path.join(DATASET_DIR, "valid_image_paths.csv")
test_image_paths = pd.read_csv(
    valid_images_csv, header=None, names=["image_path"], index_col=False
)

In [8]:
# Prefix every CSV path with the data root, then derive the study metadata from
# the path itself, which looks like
#   ../data/MURA-v1.1/valid/<study_type>/<patient>/<study>/imageN.png
test_image_paths["image_path"] = test_image_paths["image_path"].map(
    lambda x: os.path.join("../data/", x)
)
# Index path components from the end, so the extraction does not depend on how
# many directories the "../data/" prefix contributes.
test_image_paths["study_type"] = test_image_paths["image_path"].map(
    lambda x: x.split("/")[-4]
)
test_image_paths["patient"] = test_image_paths["image_path"].map(
    lambda x: x.split("/")[-3]
)
test_image_paths["study"] = test_image_paths["image_path"].map(
    lambda x: x.split("/")[-2]
)
# Strip the trailing file name to get the study directory (the merge key used
# against the labeled studies). The dot is escaped and the pattern anchored so
# only a literal "imageN.png" suffix is removed.
test_image_paths["study_path"] = test_image_paths["image_path"].map(
    lambda x: re.sub(r"image\d+\.png$", "", x)
)

In [9]:
# Study-level ground truth: one `study_path` and its `label` per row, no header row.
valid_studies_csv = os.path.join(DATASET_DIR, "valid_labeled_studies.csv")
test_labeled_studies = pd.read_csv(
    valid_studies_csv, header=None, names=["study_path", "label"], index_col=False
)

In [10]:
# Give study paths the same "../data/" prefix as the image paths and turn labels
# into strings.
# NOTE(review): presumably strings are needed because the generators below use
# `class_mode="binary"` — confirm.
test_labeled_studies = test_labeled_studies.assign(
    study_path=test_labeled_studies["study_path"].map(
        lambda relative_path: os.path.join("../data/", relative_path)
    ),
    label=test_labeled_studies["label"].map(str),
)

In [11]:
# Attach the study-level label to every image of that study.
# `validate="many_to_one"` makes pandas raise if a study path occurs more than
# once in the labels file, instead of silently duplicating image rows.
ref_testset = pd.merge(
    test_image_paths,
    test_labeled_studies,
    how="inner",
    on="study_path",
    validate="many_to_one",
)

inspect_df(ref_testset)

shape: (3197, 6)


Unnamed: 0,image_path,study_type,patient,study,study_path,label
0,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,XR_WRIST,patient11185,study1_positive,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,1
1,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,XR_WRIST,patient11185,study1_positive,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,1
2,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,XR_WRIST,patient11185,study1_positive,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,1
3,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,XR_WRIST,patient11185,study1_positive,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,1
4,../data/MURA-v1.1/valid/XR_WRIST/patient11186/...,XR_WRIST,patient11186,study1_positive,../data/MURA-v1.1/valid/XR_WRIST/patient11186/...,1


### CNN architecture with sensible defaults, all study types

In [12]:
# Test-time generator for the plain CNN: only rescales pixel values by 1/255.
cnn_testing = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1.0 / 255)

In [13]:
# Image-level test iterator for the generic CNN.
testset = cnn_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    # Keras expects `target_size` as (height, width). Both are 224 here, so the
    # result is unchanged, but the order is now right for non-square inputs.
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [14]:
# Load the saved generic CNN; the custom F1Score class must be supplied for loading.
model = tf.keras.models.load_model(
    filepath="../models/cnn-all-study-types-3309921.h5",
    custom_objects=dict(F1Score=F1Score),
)

In [15]:
# Image-level evaluation: a list with the loss first, followed by the metric scores.
evaluation_metrics = model.evaluate(x=testset, verbose=1)

2022-04-03 17:17:31.937370: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8303




In [16]:
# Unpack instead of `pop(0)`: `evaluation_metrics` stays intact, so re-running
# this cell does not drop one more score each time and mislabel the rest.
loss, *scores = evaluation_metrics

print("=" * 32)
print(f"test loss   : {loss}")
# NOTE(review): assumes the loaded model reports its metrics in the same order
# as METRICS — confirm against how the model was compiled.
for metric, score in zip(METRICS, scores):
    # Left-align the metric name in a 12-character column.
    print(f"{metric.name:<12}: {score}")
print("=" * 32)

test loss   : 0.6806519031524658
tp          : 898.0
fp          : 416.0
tn          : 1251.0
fn          : 632.0
binary_acc  : 0.6721926927566528
precision   : 0.6834094524383545
recall      : 0.586928129196167
f1_score    : 0.6315049529075623
roc_auc     : 0.7455726861953735
pr_auc      : 0.752379298210144
cohen_kappa : 0.33934664726257324


In [17]:
# Raw per-image model outputs; the iterator is unshuffled, so rows follow `ref_testset`.
preds = model.predict(x=testset, verbose=1)



In [18]:
# Work on a fresh frame (leaving `ref_testset` untouched) with integer labels
# and the flattened image-level predictions attached.
temp_testset = ref_testset.assign(
    label=ref_testset["label"].map(int),
    prediction=preds.ravel(),
)

In [19]:
# Replace the image-level frame with the rows produced by the `utils` helper.
# NOTE(review): presumably one row per study with an aggregated prediction — confirm in utils.
study_rows = list(study_oriented_transformation(temp_testset))
temp_testset = pd.DataFrame(
    study_rows, columns=["study_type", "study", "label", "prediction"]
)

In [20]:
# Scores over every row of the study-level frame.
# NOTE(review): `prediction` looks like hard class labels at this point (kappa and
# F1 accept it); if so, the ROC AUC below reflects a single threshold — confirm.
y_true = temp_testset["label"].values
y_pred = temp_testset["prediction"].values

k = metrics.cohen_kappa_score(y_true, y_pred)
roc_auc = metrics.roc_auc_score(y_true, y_pred)
f1 = metrics.f1_score(y_true, y_pred)

print(
    "=" * 38,
    "study type       : all studies",
    "-----------------",
    f"Cohen's kappa (κ): {k}",
    f"ROC AUC          : {roc_auc}",
    f"F1               : {f1}",
    "=" * 38,
    sep="\n",
)

study type       : all studies
-----------------
Cohen's kappa (κ): 0.3868457901656811
ROC AUC          : 0.6909141269564533
F1               : 0.6413586413586414


In [21]:
# Same three scores, computed separately for each study type.
for study_type in STUDY_TYPES:
    temp = temp_testset.loc[temp_testset["study_type"] == study_type]
    y_true, y_pred = temp["label"].values, temp["prediction"].values

    k = metrics.cohen_kappa_score(y_true, y_pred)
    roc_auc = metrics.roc_auc_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)

    print(
        "=" * 38,
        f"study type       : {study_type}",
        "-----------------",
        f"Cohen's kappa (κ): {k}",
        f"ROC AUC          : {roc_auc}",
        f"F1               : {f1}",
        sep="\n",
    )

study type       : XR_SHOULDER
-----------------
Cohen's kappa (κ): 0.17390390074650397
ROC AUC          : 0.5879319510898458
F1               : 0.6666666666666666
study type       : XR_ELBOW
-----------------
Cohen's kappa (κ): 0.501328903654485
ROC AUC          : 0.7485177865612649
F1               : 0.703125
study type       : XR_HUMERUS
-----------------
Cohen's kappa (κ): 0.4371296905859118
ROC AUC          : 0.7186128182616329
F1               : 0.7205882352941175
study type       : XR_HAND
-----------------
Cohen's kappa (κ): 0.17563003377500652
ROC AUC          : 0.5760576057605761
F1               : 0.2962962962962963
study type       : XR_WRIST
-----------------
Cohen's kappa (κ): 0.5101806491120637
ROC AUC          : 0.74539764359352
F1               : 0.6823529411764705
study type       : XR_FOREARM
-----------------
Cohen's kappa (κ): 0.4361177953477713
ROC AUC          : 0.7154664855072463
F1               : 0.6542056074766355
study type       : XR_FINGER
----------------

In [22]:
# Hand the model to the `utils.clean_up` helper and drop this section's frames.
clean_up(model)
del temp
del temp_testset

### DenseNet169 architecture, pretrained on ImageNet

In [23]:
# Test-time generator that applies the Keras DenseNet preprocessing function.
densenet_preprocess = tf.keras.applications.densenet.preprocess_input
densenet_testing = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=densenet_preprocess
)

In [24]:
# Image-level test iterator using the DenseNet preprocessing.
testset = densenet_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    # Keras expects `target_size` as (height, width). Both are 224 here, so the
    # result is unchanged, but the order is now right for non-square inputs.
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [25]:
# Load the saved DenseNet169 model; the custom F1Score class must be supplied for loading.
model = tf.keras.models.load_model(
    filepath="../models/densenet169_pt_imagenet-all-study-types-12644545.h5",
    custom_objects=dict(F1Score=F1Score),
)

In [26]:
# Image-level evaluation: a list with the loss first, followed by the metric scores.
evaluation_metrics = model.evaluate(x=testset, verbose=1)

2022-04-03 17:19:40.411147: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.86GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-03 17:19:40.411176: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.86GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.




2022-04-03 17:20:05.311489: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.86GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-03 17:20:05.311521: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.86GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.




In [27]:
# Unpack instead of `pop(0)`: `evaluation_metrics` stays intact, so re-running
# this cell does not drop one more score each time and mislabel the rest.
loss, *scores = evaluation_metrics

print("=" * 32)
print(f"test loss   : {loss}")
# NOTE(review): assumes the loaded model reports its metrics in the same order
# as METRICS — confirm against how the model was compiled.
for metric, score in zip(METRICS, scores):
    # Left-align the metric name in a 12-character column.
    print(f"{metric.name:<12}: {score}")
print("=" * 32)

test loss   : 0.47125089168548584
tp          : 1054.0
fp          : 128.0
tn          : 1539.0
fn          : 476.0
binary_acc  : 0.8110728859901428
precision   : 0.8917089700698853
recall      : 0.6888889074325562
f1_score    : 0.7772861123085022
roc_auc     : 0.882323145866394
pr_auc      : 0.8807572722434998
cohen_kappa : 0.6178791522979736


In [28]:
# Raw per-image model outputs; the iterator is unshuffled, so rows follow `ref_testset`.
preds = model.predict(x=testset, verbose=1)



In [29]:
# Work on a fresh frame (leaving `ref_testset` untouched) with integer labels
# and the flattened image-level predictions attached.
temp_testset = ref_testset.assign(
    label=ref_testset["label"].map(int),
    prediction=preds.ravel(),
)

In [30]:
# Replace the image-level frame with the rows produced by the `utils` helper.
# NOTE(review): presumably one row per study with an aggregated prediction — confirm in utils.
study_rows = list(study_oriented_transformation(temp_testset))
temp_testset = pd.DataFrame(
    study_rows, columns=["study_type", "study", "label", "prediction"]
)

In [31]:
# Scores over every row of the study-level frame.
# NOTE(review): `prediction` looks like hard class labels at this point (kappa and
# F1 accept it); if so, the ROC AUC below reflects a single threshold — confirm.
y_true = temp_testset["label"].values
y_pred = temp_testset["prediction"].values

k = metrics.cohen_kappa_score(y_true, y_pred)
roc_auc = metrics.roc_auc_score(y_true, y_pred)
f1 = metrics.f1_score(y_true, y_pred)

print(
    "=" * 38,
    "study type       : all studies",
    "-----------------",
    f"Cohen's kappa (κ): {k}",
    f"ROC AUC          : {roc_auc}",
    f"F1               : {f1}",
    "=" * 38,
    sep="\n",
)

study type       : all studies
-----------------
Cohen's kappa (κ): 0.6291519630338803
ROC AUC          : 0.8074492854692396
F1               : 0.7724867724867724


In [32]:
# Same three scores, computed separately for each study type.
for study_type in STUDY_TYPES:
    temp = temp_testset.loc[temp_testset["study_type"] == study_type]
    y_true, y_pred = temp["label"].values, temp["prediction"].values

    k = metrics.cohen_kappa_score(y_true, y_pred)
    roc_auc = metrics.roc_auc_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)

    print(
        "=" * 38,
        f"study type       : {study_type}",
        "-----------------",
        f"Cohen's kappa (κ): {k}",
        f"ROC AUC          : {roc_auc}",
        f"F1               : {f1}",
        sep="\n",
    )

study type       : XR_SHOULDER
-----------------
Cohen's kappa (κ): 0.545619078036836
ROC AUC          : 0.772461456671983
F1               : 0.7608695652173914
study type       : XR_ELBOW
-----------------
Cohen's kappa (κ): 0.7177131189382338
ROC AUC          : 0.8473320158102767
F1               : 0.8205128205128205
study type       : XR_HUMERUS
-----------------
Cohen's kappa (κ): 0.718348523114088
ROC AUC          : 0.8589771729587358
F1               : 0.8527131782945736
study type       : XR_HAND
-----------------
Cohen's kappa (κ): 0.4344594036507806
ROC AUC          : 0.6972697269726973
F1               : 0.577319587628866
study type       : XR_WRIST
-----------------
Cohen's kappa (κ): 0.709736680955297
ROC AUC          : 0.8413843888070692
F1               : 0.8117647058823529
study type       : XR_FOREARM
-----------------
Cohen's kappa (κ): 0.6494786295405064
ROC AUC          : 0.8208786231884058
F1               : 0.7850467289719625
study type       : XR_FINGER
----------

In [33]:
# Hand the model to the `utils.clean_up` helper and drop this section's frames.
clean_up(model)
del temp
del temp_testset

### VGG19 architecture, pretrained on ImageNet

In [34]:
# Test-time generator that applies the Keras VGG19 preprocessing function.
vgg19_preprocess = tf.keras.applications.vgg19.preprocess_input
vgg19_testing = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=vgg19_preprocess
)

In [35]:
# Image-level test iterator using the VGG19 preprocessing.
testset = vgg19_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    # Keras expects `target_size` as (height, width). Both are 224 here, so the
    # result is unchanged, but the order is now right for non-square inputs.
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [36]:
# Load the saved VGG19 model; the custom F1Score class must be supplied for loading.
model = tf.keras.models.load_model(
    filepath="../models/vgg19_pt_imagenet-all-study-types-20024897.h5",
    custom_objects=dict(F1Score=F1Score),
)

In [37]:
# Image-level evaluation: a list with the loss first, followed by the metric scores.
evaluation_metrics = model.evaluate(x=testset, verbose=1)

2022-04-03 17:22:35.980760: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.02GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-03 17:22:35.980788: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.02GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-03 17:22:36.049683: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 939.37MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-03 17:22:36.049711: W tensorflow/core/common_runtime/bfc_allocator.cc:275] All



In [38]:
# Unpack instead of `pop(0)`: `evaluation_metrics` stays intact, so re-running
# this cell does not drop one more score each time and mislabel the rest.
loss, *scores = evaluation_metrics

print("=" * 32)
print(f"test loss   : {loss}")
# NOTE(review): assumes the loaded model reports its metrics in the same order
# as METRICS — confirm against how the model was compiled.
for metric, score in zip(METRICS, scores):
    # Left-align the metric name in a 12-character column.
    print(f"{metric.name:<12}: {score}")
print("=" * 32)

test loss   : 0.494743287563324
tp          : 1017.0
fp          : 134.0
tn          : 1533.0
fn          : 513.0
binary_acc  : 0.797622799873352
precision   : 0.8835794925689697
recall      : 0.6647058725357056
f1_score    : 0.7586721777915955
roc_auc     : 0.8708081841468811
pr_auc      : 0.8756653070449829
cohen_kappa : 0.5903308987617493


In [39]:
# Raw per-image model outputs; the iterator is unshuffled, so rows follow `ref_testset`.
preds = model.predict(x=testset, verbose=1)



In [40]:
# Work on a fresh frame (leaving `ref_testset` untouched) with integer labels
# and the flattened image-level predictions attached.
temp_testset = ref_testset.assign(
    label=ref_testset["label"].map(int),
    prediction=preds.ravel(),
)

In [41]:
# Replace the image-level frame with the rows produced by the `utils` helper.
# NOTE(review): presumably one row per study with an aggregated prediction — confirm in utils.
study_rows = list(study_oriented_transformation(temp_testset))
temp_testset = pd.DataFrame(
    study_rows, columns=["study_type", "study", "label", "prediction"]
)

In [42]:
# Scores over every row of the study-level frame.
# NOTE(review): `prediction` looks like hard class labels at this point (kappa and
# F1 accept it); if so, the ROC AUC below reflects a single threshold — confirm.
y_true = temp_testset["label"].values
y_pred = temp_testset["prediction"].values

k = metrics.cohen_kappa_score(y_true, y_pred)
roc_auc = metrics.roc_auc_score(y_true, y_pred)
f1 = metrics.f1_score(y_true, y_pred)

print(
    "=" * 38,
    "study type       : all studies",
    "-----------------",
    f"Cohen's kappa (κ): {k}",
    f"ROC AUC          : {roc_auc}",
    f"F1               : {f1}",
    "=" * 38,
    sep="\n",
)

study type       : all studies
-----------------
Cohen's kappa (κ): 0.5989064545711468
ROC AUC          : 0.7919959057190581
F1               : 0.7510729613733906


In [43]:
# Same three scores, computed separately for each study type.
for study_type in STUDY_TYPES:
    temp = temp_testset.loc[temp_testset["study_type"] == study_type]
    y_true, y_pred = temp["label"].values, temp["prediction"].values

    k = metrics.cohen_kappa_score(y_true, y_pred)
    roc_auc = metrics.roc_auc_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)

    print(
        "=" * 38,
        f"study type       : {study_type}",
        "-----------------",
        f"Cohen's kappa (κ): {k}",
        f"ROC AUC          : {roc_auc}",
        f"F1               : {f1}",
        sep="\n",
    )

study type       : XR_SHOULDER
-----------------
Cohen's kappa (κ): 0.513914063332978
ROC AUC          : 0.7562466772993088
F1               : 0.7344632768361581
study type       : XR_ELBOW
-----------------
Cohen's kappa (κ): 0.6752312435765673
ROC AUC          : 0.8246047430830038
F1               : 0.7894736842105263
study type       : XR_HUMERUS
-----------------
Cohen's kappa (κ): 0.7034270650263621
ROC AUC          : 0.8514047410008778
F1               : 0.8412698412698412
study type       : XR_HAND
-----------------
Cohen's kappa (κ): 0.4223356942843024
ROC AUC          : 0.6923192319231923
F1               : 0.5714285714285714
study type       : XR_WRIST
-----------------
Cohen's kappa (κ): 0.6734537660747091
ROC AUC          : 0.8239322533136966
F1               : 0.7882352941176471
study type       : XR_FOREARM
-----------------
Cohen's kappa (κ): 0.6189985103701157
ROC AUC          : 0.8058197463768116
F1               : 0.766355140186916
study type       : XR_FINGER
-------

In [44]:
# Hand the model to the `utils.clean_up` helper and drop this section's frames.
clean_up(model)
del temp
del temp_testset

### Ensemble model: generic CNN + wrist CNN

This is simply a "showerthought" idea: create a model where inference for a particular upper extremity region (i.e. study type) is delegated to a model that has been trained exclusively on that region (here `XR_WRIST`, but several other combinations can be tried).

**Note**: for this ensemble model to be practically meaningful, we assume that we know (or can acquire) the upper extremity region to which a given unseen image belongs. If, during inference, we only have unseen images without any additional metadata, then this approach does not hold.

In [45]:
# Image-level test iterator shared by both CNNs of the ensemble.
testset = cnn_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    # Keras expects `target_size` as (height, width). Both are 224 here, so the
    # result is unchanged, but the order is now right for non-square inputs.
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [46]:
# Load the CNN saved as "all-study-types"; F1Score must be supplied for loading.
all_types_model = tf.keras.models.load_model(
    filepath="../models/cnn-all-study-types-3309921.h5",
    custom_objects=dict(F1Score=F1Score),
)

In [47]:
# Load the CNN saved as "XR_WRIST"; F1Score must be supplied for loading.
wrist_model = tf.keras.models.load_model(
    filepath="../models/cnn-XR_WRIST-3309921.h5",
    custom_objects=dict(F1Score=F1Score),
)

In [48]:
# Generic model's raw per-image outputs, in `ref_testset` row order (unshuffled iterator).
all_types_model_preds = all_types_model.predict(x=testset, verbose=1)



In [49]:
# Wrist model's raw per-image outputs for the same iterator (all images, not only wrists).
wrist_model_preds = wrist_model.predict(x=testset, verbose=1)



In [50]:
# Fresh frame with integer labels and both models' flattened predictions side by side.
temp_testset = ref_testset.assign(
    label=ref_testset["label"].map(int),
    prediction=all_types_model_preds.ravel(),
    wrist_prediction=wrist_model_preds.ravel(),
)

In [51]:
# Replace generic predictions with the wrist model's prediction for wrist images.
# Done with a vectorized `mask` rather than a row-by-row `apply(axis=1)`.
is_wrist = temp_testset["study_type"] == "XR_WRIST"
temp_testset["prediction"] = temp_testset["prediction"].mask(
    is_wrist, temp_testset["wrist_prediction"]
)

In [52]:
# Replace the image-level frame with the rows produced by the `utils` helper.
# NOTE(review): presumably one row per study with an aggregated prediction — confirm in utils.
study_rows = list(study_oriented_transformation(temp_testset))
temp_testset = pd.DataFrame(
    study_rows, columns=["study_type", "study", "label", "prediction"]
)

In [53]:
# Scores over every row of the study-level frame.
# NOTE(review): `prediction` looks like hard class labels at this point (kappa and
# F1 accept it); if so, the ROC AUC below reflects a single threshold — confirm.
y_true = temp_testset["label"].values
y_pred = temp_testset["prediction"].values

k = metrics.cohen_kappa_score(y_true, y_pred)
roc_auc = metrics.roc_auc_score(y_true, y_pred)
f1 = metrics.f1_score(y_true, y_pred)

print(
    "=" * 38,
    "study type       : all studies",
    "-----------------",
    f"Cohen's kappa (κ): {k}",
    f"ROC AUC          : {roc_auc}",
    f"F1               : {f1}",
    "=" * 38,
    sep="\n",
)

study type       : all studies
-----------------
Cohen's kappa (κ): 0.4001677065419764
ROC AUC          : 0.697938518297724
F1               : 0.6528599605522682


In [54]:
# Same three scores, computed separately for each study type.
for study_type in STUDY_TYPES:
    temp = temp_testset.loc[temp_testset["study_type"] == study_type]
    y_true, y_pred = temp["label"].values, temp["prediction"].values

    k = metrics.cohen_kappa_score(y_true, y_pred)
    roc_auc = metrics.roc_auc_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)

    print(
        "=" * 38,
        f"study type       : {study_type}",
        "-----------------",
        f"Cohen's kappa (κ): {k}",
        f"ROC AUC          : {roc_auc}",
        f"F1               : {f1}",
        sep="\n",
    )

study type       : XR_SHOULDER
-----------------
Cohen's kappa (κ): 0.17390390074650397
ROC AUC          : 0.5879319510898458
F1               : 0.6666666666666666
study type       : XR_ELBOW
-----------------
Cohen's kappa (κ): 0.501328903654485
ROC AUC          : 0.7485177865612649
F1               : 0.703125
study type       : XR_HUMERUS
-----------------
Cohen's kappa (κ): 0.4371296905859118
ROC AUC          : 0.7186128182616329
F1               : 0.7205882352941175
study type       : XR_HAND
-----------------
Cohen's kappa (κ): 0.17563003377500652
ROC AUC          : 0.5760576057605761
F1               : 0.2962962962962963
study type       : XR_WRIST
-----------------
Cohen's kappa (κ): 0.5826057631056321
ROC AUC          : 0.7862297496318115
F1               : 0.7431693989071039
study type       : XR_FOREARM
-----------------
Cohen's kappa (κ): 0.4361177953477713
ROC AUC          : 0.7154664855072463
F1               : 0.6542056074766355
study type       : XR_FINGER
--------------

In [55]:
# Hand both CNNs to the `utils.clean_up` helper and drop this section's frames.
clean_up(all_types_model)
clean_up(wrist_model)
del temp
del temp_testset

### Ensemble model: DenseNet169 + VGG19

In [56]:
# Image-level test iterator with DenseNet preprocessing, for the ensemble.
densenet_testset = densenet_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    # Keras expects `target_size` as (height, width). Both are 224 here, so the
    # result is unchanged, but the order is now right for non-square inputs.
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [57]:
# Image-level test iterator with VGG19 preprocessing, for the ensemble.
vgg19_testset = vgg19_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    # Keras expects `target_size` as (height, width). Both are 224 here, so the
    # result is unchanged, but the order is now right for non-square inputs.
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [58]:
# Load the saved DenseNet169 model; the custom F1Score class must be supplied for loading.
densenet_model = tf.keras.models.load_model(
    filepath="../models/densenet169_pt_imagenet-all-study-types-12644545.h5",
    custom_objects=dict(F1Score=F1Score),
)

In [59]:
# Load the saved VGG19 model; the custom F1Score class must be supplied for loading.
vgg19_model = tf.keras.models.load_model(
    filepath="../models/vgg19_pt_imagenet-all-study-types-20024897.h5",
    custom_objects=dict(F1Score=F1Score),
)

In [60]:
# DenseNet169 raw per-image outputs, in `ref_testset` row order (unshuffled iterator).
densenet_preds = densenet_model.predict(x=densenet_testset, verbose=1)



In [61]:
# VGG19 raw per-image outputs, in `ref_testset` row order (unshuffled iterator).
vgg19_preds = vgg19_model.predict(x=vgg19_testset, verbose=1)



In [62]:
# Fresh frame with integer labels and each model's flattened predictions
# (A = DenseNet169, B = VGG19).
temp_testset = ref_testset.assign(
    label=ref_testset["label"].map(int),
    predictionA=densenet_preds.ravel(),
    predictionB=vgg19_preds.ravel(),
)

In [63]:
# Replace the image-level frame with the rows produced by the ensemble helper from `utils`.
# NOTE(review): presumably it combines predictionA/predictionB into one prediction
# per study — confirm in utils.
study_rows = list(study_oriented_transformation_on_ensemble(temp_testset))
temp_testset = pd.DataFrame(
    study_rows, columns=["study_type", "study", "label", "prediction"]
)

In [64]:
# Scores over every row of the study-level frame.
# NOTE(review): `prediction` looks like hard class labels at this point (kappa and
# F1 accept it); if so, the ROC AUC below reflects a single threshold — confirm.
y_true = temp_testset["label"].values
y_pred = temp_testset["prediction"].values

k = metrics.cohen_kappa_score(y_true, y_pred)
roc_auc = metrics.roc_auc_score(y_true, y_pred)
f1 = metrics.f1_score(y_true, y_pred)

print(
    "=" * 38,
    "study type       : all studies",
    "-----------------",
    f"Cohen's kappa (κ): {k}",
    f"ROC AUC          : {roc_auc}",
    f"F1               : {f1}",
    "=" * 38,
    sep="\n",
)

study type       : all studies
-----------------
Cohen's kappa (κ): 0.6290206756015813
ROC AUC          : 0.807276347091542
F1               : 0.7720042417815483


In [65]:
# Same three scores, computed separately for each study type.
for study_type in STUDY_TYPES:
    temp = temp_testset.loc[temp_testset["study_type"] == study_type]
    y_true, y_pred = temp["label"].values, temp["prediction"].values

    k = metrics.cohen_kappa_score(y_true, y_pred)
    roc_auc = metrics.roc_auc_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)

    print(
        "=" * 38,
        f"study type       : {study_type}",
        "-----------------",
        f"Cohen's kappa (κ): {k}",
        f"ROC AUC          : {roc_auc}",
        f"F1               : {f1}",
        sep="\n",
    )

study type       : XR_SHOULDER
-----------------
Cohen's kappa (κ): 0.5662727563078889
ROC AUC          : 0.7827751196172249
F1               : 0.7717391304347826
study type       : XR_ELBOW
-----------------
Cohen's kappa (κ): 0.7036152796725784
ROC AUC          : 0.8397562582345192
F1               : 0.810344827586207
study type       : XR_HUMERUS
-----------------
Cohen's kappa (κ): 0.733142982648803
ROC AUC          : 0.8663301141352062
F1               : 0.8593750000000001
study type       : XR_HAND
-----------------
Cohen's kappa (κ): 0.44984351836600234
ROC AUC          : 0.7048454845484549
F1               : 0.5918367346938775
study type       : XR_WRIST
-----------------
Cohen's kappa (κ): 0.7001725129384704
ROC AUC          : 0.8362297496318114
F1               : 0.8047337278106509
study type       : XR_FOREARM
-----------------
Cohen's kappa (κ): 0.6344480073293632
ROC AUC          : 0.8136322463768116
F1               : 0.7777777777777778
study type       : XR_FINGER
------

In [66]:
# Hand both models to the `utils.clean_up` helper and drop this section's frames.
clean_up(densenet_model)
clean_up(vgg19_model)
del temp
del temp_testset