## X-Ray Abnormality Detection | Evaluation

> **Antonopoulos Ilias** ( *p3352004* ) <br />
> **Ndoja Silva** ( *p3352017* ) <br />
> **MSc in Data Science, AUEB**

## Table of Contents


* [CNN architecture with sensible defaults](#CNN-architecture-with-sensible-defaults,-all-study-types)
* [DenseNet169 architecture, pretrained on ImageNet](#DenseNet169-architecture,-pretrained-on-ImageNet)
* [VGG19 architecture, pretrained on ImageNet](#VGG19-architecture,-pretrained-on-ImageNet)
* [Ensemble model: generic CNN + wrist CNN](#Ensemble-model:-generic-CNN-+-wrist-CNN)
* [Ensemble model: DenseNet169 + VGG19](#Ensemble-model:-DenseNet169-+-VGG19)

In [1]:
import os
import pathlib
import random
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn import metrics
from sklearn.utils import shuffle

from utils import (
    clean_up,
    F1Score,
    inspect_df,
    study_oriented_transformation,
    study_oriented_transformation_on_ensemble,
)

In [2]:
# record the TensorFlow version this evaluation was run with
print(tf.__version__)

2.8.0


In [4]:
# single seed shared by every random number generator seeded below
SEED = 99910123

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes, not this kernel — confirm
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)  # Python's built-in RNG
tf.random.set_seed(SEED)  # TensorFlow global seed
np.random.seed(SEED)  # NumPy legacy global RNG

In [5]:
# relative path to the MURA v1.1 dataset root (holds the CSV files read below)
DATASET_DIR = "../data/MURA-v1.1/"
# size every image is resized to by the data generators
IMAGE_WIDTH = 224
IMAGE_HEIGHT = 224

In [6]:
# study types (upper extremity regions) that get their own metric report in
# every per-study-type loop further down
STUDY_TYPES = [
    "XR_SHOULDER",
    "XR_ELBOW",
    "XR_HUMERUS",
    "XR_HAND",
    "XR_WRIST",
    "XR_FOREARM",
    "XR_FINGER",
]

In [7]:
# Metric objects whose `.name` is used to label the scores returned by
# `model.evaluate` in the report cells below (they are zipped positionally).
# NOTE(review): this assumes the saved models were compiled with these metrics
# in exactly this order — confirm against the training notebooks.
METRICS = [
    tf.keras.metrics.TruePositives(name="tp"),
    tf.keras.metrics.FalsePositives(name="fp"),
    tf.keras.metrics.TrueNegatives(name="tn"),
    tf.keras.metrics.FalseNegatives(name="fn"),
    tf.keras.metrics.BinaryAccuracy(name="binary_acc"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall"),
    F1Score(name="f1_score"),
    tf.keras.metrics.AUC(name="roc_auc", curve="ROC"),
    tf.keras.metrics.AUC(name="pr_auc", curve="PR"),
    tfa.metrics.CohenKappa(name="cohen_kappa", num_classes=2),
]

2022-04-02 20:19:48.692619: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-02 20:19:48.693485: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-02 20:19:48.693776: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-02 20:19:48.694016: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [8]:
# the dataset's "valid" split serves as the test set in this notebook;
# the CSV has no header row, so the single column is named explicitly
test_image_paths = pd.read_csv(
    os.path.join(DATASET_DIR, "valid_image_paths.csv"),
    names=["image_path"],
    header=None,
    index_col=False,
)

In [9]:
# prefix the paths from the CSV with the data directory used by this notebook
test_image_paths["image_path"] = test_image_paths["image_path"].map(
    lambda x: os.path.join("../data/", x)
)

# Paths end in <study_type>/<patient>/<study>/<image>.png. The components are
# indexed from the END of the path so that the extracted values do not depend on
# how many directories precede them (e.g. a different data prefix).
test_image_paths["study_type"] = test_image_paths["image_path"].map(
    lambda x: x.split("/")[-4]
)
test_image_paths["patient"] = test_image_paths["image_path"].map(
    lambda x: x.split("/")[-3]
)
test_image_paths["study"] = test_image_paths["image_path"].map(
    lambda x: x.split("/")[-2]
)
# strip the trailing file name to obtain the study directory (join key for the
# labels); the dot is escaped and the pattern anchored so only a literal
# "image<N>.png" at the end of the path is removed
test_image_paths["study_path"] = test_image_paths["image_path"].map(
    lambda x: re.sub(r"image\d+\.png$", "", x)
)

In [10]:
# one row per study: study directory and its label; the CSV has no header row
test_labeled_studies = pd.read_csv(
    os.path.join(DATASET_DIR, "valid_labeled_studies.csv"),
    names=["study_path", "label"],
    header=None,
    index_col=False,
)

In [11]:
# apply the same "../data/" prefix as for the image paths so that the
# `study_path` values of both frames can be joined on
test_labeled_studies["study_path"] = test_labeled_studies["study_path"].map(
    lambda x: os.path.join("../data/", x)
)
# labels become strings; presumably because `flow_from_dataframe` with
# class_mode="binary" expects string class labels — confirm
test_labeled_studies["label"] = test_labeled_studies["label"].map(str)

In [12]:
# attach the study-level label to every image belonging to that study
ref_testset = pd.merge(
    test_image_paths,
    test_labeled_studies,
    how="inner",
    on="study_path",
    validate="many_to_one",  # a study path must not carry more than one label
)

# an inner join silently drops images whose study has no label; every later
# cell assumes one prediction per image, so fail loudly here instead
assert len(ref_testset) == len(test_image_paths), "some images have no labeled study"

inspect_df(ref_testset)

shape: (3197, 6)


Unnamed: 0,image_path,study_type,patient,study,study_path,label
0,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,XR_WRIST,patient11185,study1_positive,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,1
1,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,XR_WRIST,patient11185,study1_positive,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,1
2,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,XR_WRIST,patient11185,study1_positive,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,1
3,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,XR_WRIST,patient11185,study1_positive,../data/MURA-v1.1/valid/XR_WRIST/patient11185/...,1
4,../data/MURA-v1.1/valid/XR_WRIST/patient11186/...,XR_WRIST,patient11186,study1_positive,../data/MURA-v1.1/valid/XR_WRIST/patient11186/...,1


### CNN architecture with sensible defaults, all study types

In [13]:
# test-time generator for the plain CNNs: only rescales pixel values by 1/255
cnn_testing = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1.0 / 255,
)

In [14]:
# stream the test images in dataframe order through the plain-CNN preprocessing
testset = cnn_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),  # Keras expects (height, width)
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [15]:
# load the saved plain CNN trained on all study types; the custom F1Score
# metric from `utils` is passed so Keras can resolve it while deserializing
model = tf.keras.models.load_model(
    "../models/cnn-all-study-types-3305473.h5", custom_objects={"F1Score": F1Score}
)

In [16]:
# image-level evaluation: the first returned value is the loss, the rest are metric scores
evaluation_metrics = model.evaluate(testset, verbose=1)

2022-04-02 20:19:51.200312: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8303




In [17]:
# Unpack WITHOUT mutating `evaluation_metrics`: the previous `pop(0)` removed an
# element on every execution, so re-running this cell shifted all scores by one
# and printed them under the wrong metric names.
loss, *scores = evaluation_metrics

print("=" * 32)
print(f"test loss   : {loss}")
# scores are matched to names positionally, so METRICS must list the metrics in
# the same order the model reports them
for metric, score in zip(METRICS, scores):
    print(f"{metric.name:<12}: {score}")
print("=" * 32)

test loss   : 0.7335954308509827
tp          : 573.0
fp          : 89.0
tn          : 1578.0
fn          : 957.0
binary_acc  : 0.6728182435035706
precision   : 0.865558922290802
recall      : 0.3745098114013672
f1_score    : 0.5228102803230286
roc_auc     : 0.7789220809936523
pr_auc      : 0.7841886878013611
cohen_kappa : 0.328785240650177


In [18]:
# raw model outputs for every test image; the generator is not shuffled, so
# they come back in the same order as the rows of `ref_testset`
preds = model.predict(testset, verbose=1)



In [19]:
# per-image working frame: integer ground truth next to the model's raw output
temp_testset = ref_testset.assign(
    label=ref_testset["label"].map(int),
    prediction=preds.ravel(),
)

In [20]:
# replace the per-image frame with the rows yielded by the study-oriented
# transformation (presumably one row per study — see utils)
temp_testset = pd.DataFrame(
    list(study_oriented_transformation(temp_testset)),
    columns=["study_type", "study", "label", "prediction"],
)

In [21]:
# study-level scores over all study types together
# NOTE(review): cohen_kappa_score / f1_score only accept discrete labels, so
# `prediction` is presumably already binarized; if so, this ROC AUC is computed
# from hard decisions rather than scores — confirm in study_oriented_transformation
k = metrics.cohen_kappa_score(temp_testset["label"], temp_testset["prediction"])
roc_auc = metrics.roc_auc_score(temp_testset["label"], temp_testset["prediction"])
f1 = metrics.f1_score(temp_testset["label"], temp_testset["prediction"])

print(
    "=" * 38,
    "study type       : all studies",
    "-----------------",
    f"Cohen's kappa (κ): {k}",
    f"ROC AUC          : {roc_auc}",
    f"F1               : {f1}",
    "=" * 38,
    sep="\n",
)

study type       : all studies
-----------------
Cohen's kappa (κ): 0.33419514321369503
ROC AUC          : 0.6573879837353565
F1               : 0.49729729729729727


In [22]:
# same three scores, reported separately for each study type
for study_type in STUDY_TYPES:
    temp = temp_testset.loc[temp_testset["study_type"] == study_type]

    k = metrics.cohen_kappa_score(temp["label"], temp["prediction"])
    roc_auc = metrics.roc_auc_score(temp["label"], temp["prediction"])
    f1 = metrics.f1_score(temp["label"], temp["prediction"])

    print(
        "=" * 38,
        f"study type       : {study_type}",
        "-----------------",
        f"Cohen's kappa (κ): {k}",
        f"ROC AUC          : {roc_auc}",
        f"F1               : {f1}",
        sep="\n",
    )

study type       : XR_SHOULDER
-----------------
Cohen's kappa (κ): 0.3331184874852293
ROC AUC          : 0.6648591174906964
F1               : 0.5492957746478874
study type       : XR_ELBOW
-----------------
Cohen's kappa (κ): 0.30396475770925113
ROC AUC          : 0.6363636363636364
F1               : 0.42857142857142855
study type       : XR_HUMERUS
-----------------
Cohen's kappa (κ): 0.4200903183169953
ROC AUC          : 0.7092844600526779
F1               : 0.6138613861386137
study type       : XR_HAND
-----------------
Cohen's kappa (κ): 0.1603598835670813
ROC AUC          : 0.5681818181818181
F1               : 0.24000000000000002
study type       : XR_WRIST
-----------------
Cohen's kappa (κ): 0.37654296005504073
ROC AUC          : 0.671281296023564
F1               : 0.5255474452554746
study type       : XR_FOREARM
-----------------
Cohen's kappa (κ): 0.38368860055607046
ROC AUC          : 0.6875
F1               : 0.5454545454545454
study type       : XR_FINGER
-------------

In [23]:
# hand the model to utils.clean_up (presumably frees its memory before the next
# model is loaded — confirm) and drop this section's working frames
clean_up(model)
del temp, temp_testset

### DenseNet169 architecture, pretrained on ImageNet

In [24]:
# test-time generator for DenseNet169: applies the preprocessing function
# shipped with the Keras DenseNet application to every image
densenet_testing = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=tf.keras.applications.densenet.preprocess_input,
)

In [25]:
# stream the test images in dataframe order through the DenseNet preprocessing
testset = densenet_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),  # Keras expects (height, width)
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [26]:
# load the saved DenseNet169 model; the custom F1Score metric from `utils` is
# passed so Keras can resolve it while deserializing
model = tf.keras.models.load_model(
    "../models/densenet169_pt_imagenet-all-study-types-12644545.h5",
    custom_objects={"F1Score": F1Score},
)

In [27]:
# image-level evaluation: the first returned value is the loss, the rest are metric scores
evaluation_metrics = model.evaluate(testset, verbose=1)



In [28]:
# Unpack WITHOUT mutating `evaluation_metrics`: the previous `pop(0)` removed an
# element on every execution, so re-running this cell shifted all scores by one
# and printed them under the wrong metric names.
loss, *scores = evaluation_metrics

print("=" * 32)
print(f"test loss   : {loss}")
# scores are matched to names positionally, so METRICS must list the metrics in
# the same order the model reports them
for metric, score in zip(METRICS, scores):
    print(f"{metric.name:<12}: {score}")
print("=" * 32)

test loss   : 0.47183555364608765
tp          : 1074.0
fp          : 159.0
tn          : 1508.0
fn          : 456.0
binary_acc  : 0.8076321482658386
precision   : 0.871046245098114
recall      : 0.7019608020782471
f1_score    : 0.7774158120155334
roc_auc     : 0.8799281120300293
pr_auc      : 0.8806084990501404
cohen_kappa : 0.6114572286605835


In [29]:
# raw model outputs for every test image; the generator is not shuffled, so
# they come back in the same order as the rows of `ref_testset`
preds = model.predict(testset, verbose=1)



In [30]:
# per-image working frame: integer ground truth next to the model's raw output
temp_testset = ref_testset.assign(
    label=ref_testset["label"].map(int),
    prediction=preds.ravel(),
)

In [31]:
# replace the per-image frame with the rows yielded by the study-oriented
# transformation (presumably one row per study — see utils)
temp_testset = pd.DataFrame(
    list(study_oriented_transformation(temp_testset)),
    columns=["study_type", "study", "label", "prediction"],
)

In [32]:
# study-level scores over all study types together
# NOTE(review): `prediction` is presumably binarized (kappa and F1 require
# discrete labels), in which case this ROC AUC is based on hard decisions — confirm
k = metrics.cohen_kappa_score(temp_testset["label"], temp_testset["prediction"])
roc_auc = metrics.roc_auc_score(temp_testset["label"], temp_testset["prediction"])
f1 = metrics.f1_score(temp_testset["label"], temp_testset["prediction"])

print(
    "=" * 38,
    "study type       : all studies",
    "-----------------",
    f"Cohen's kappa (κ): {k}",
    f"ROC AUC          : {roc_auc}",
    f"F1               : {f1}",
    "=" * 38,
    sep="\n",
)

study type       : all studies
-----------------
Cohen's kappa (κ): 0.6389900496085796
ROC AUC          : 0.8133066942618202
F1               : 0.7821576763485477


In [33]:
# same three scores, reported separately for each study type
for study_type in STUDY_TYPES:
    temp = temp_testset.loc[temp_testset["study_type"] == study_type]

    k = metrics.cohen_kappa_score(temp["label"], temp["prediction"])
    roc_auc = metrics.roc_auc_score(temp["label"], temp["prediction"])
    f1 = metrics.f1_score(temp["label"], temp["prediction"])

    print(
        "=" * 38,
        f"study type       : {study_type}",
        "-----------------",
        f"Cohen's kappa (κ): {k}",
        f"ROC AUC          : {roc_auc}",
        f"F1               : {f1}",
        sep="\n",
    )

study type       : XR_SHOULDER
-----------------
Cohen's kappa (κ): 0.5365750079626288
ROC AUC          : 0.7686868686868686
F1               : 0.7715736040609137
study type       : XR_ELBOW
-----------------
Cohen's kappa (κ): 0.6908286540752084
ROC AUC          : 0.8343214756258235
F1               : 0.8034188034188035
study type       : XR_HUMERUS
-----------------
Cohen's kappa (κ): 0.733142982648803
ROC AUC          : 0.8663301141352062
F1               : 0.8593750000000001
study type       : XR_HAND
-----------------
Cohen's kappa (κ): 0.46513919684651384
ROC AUC          : 0.7124212421242124
F1               : 0.6060606060606061
study type       : XR_WRIST
-----------------
Cohen's kappa (κ): 0.7106889448386359
ROC AUC          : 0.8429675994108984
F1               : 0.8139534883720931
study type       : XR_FOREARM
-----------------
Cohen's kappa (κ): 0.6803250543664874
ROC AUC          : 0.8365036231884059
F1               : 0.8073394495412843
study type       : XR_FINGER
-----

In [34]:
# hand the model to utils.clean_up (presumably frees its memory before the next
# model is loaded — confirm) and drop this section's working frames
clean_up(model)
del temp, temp_testset

### VGG19 architecture, pretrained on ImageNet

In [35]:
# test-time generator for VGG19: applies the preprocessing function shipped
# with the Keras VGG19 application to every image
vgg19_testing = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=tf.keras.applications.vgg19.preprocess_input,
)

In [36]:
# stream the test images in dataframe order through the VGG19 preprocessing
testset = vgg19_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),  # Keras expects (height, width)
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [37]:
# load the saved VGG19 model; the custom F1Score metric from `utils` is passed
# so Keras can resolve it while deserializing
model = tf.keras.models.load_model(
    "../models/vgg19_pt_imagenet-all-study-types-20024897.h5",
    custom_objects={"F1Score": F1Score},
)

In [38]:
# image-level evaluation: the first returned value is the loss, the rest are metric scores
evaluation_metrics = model.evaluate(testset, verbose=1)

2022-04-02 20:21:06.787414: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.02GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-02 20:21:06.787452: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.02GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-02 20:21:07.263980: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.46GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-02 20:21:07.264017: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Alloc



In [39]:
# Unpack WITHOUT mutating `evaluation_metrics`: the previous `pop(0)` removed an
# element on every execution, so re-running this cell shifted all scores by one
# and printed them under the wrong metric names.
loss, *scores = evaluation_metrics

print("=" * 32)
print(f"test loss   : {loss}")
# scores are matched to names positionally, so METRICS must list the metrics in
# the same order the model reports them
for metric, score in zip(METRICS, scores):
    print(f"{metric.name:<12}: {score}")
print("=" * 32)

test loss   : 0.45638495683670044
tp          : 1124.0
fp          : 199.0
tn          : 1468.0
fn          : 406.0
binary_acc  : 0.8107600808143616
precision   : 0.8495842814445496
recall      : 0.7346405386924744
f1_score    : 0.7879425287246704
roc_auc     : 0.881947934627533
pr_auc      : 0.8868595361709595
cohen_kappa : 0.618704080581665


In [40]:
# raw model outputs for every test image; the generator is not shuffled, so
# they come back in the same order as the rows of `ref_testset`
preds = model.predict(testset, verbose=1)



In [41]:
# per-image working frame: integer ground truth next to the model's raw output
temp_testset = ref_testset.assign(
    label=ref_testset["label"].map(int),
    prediction=preds.ravel(),
)

In [42]:
# replace the per-image frame with the rows yielded by the study-oriented
# transformation (presumably one row per study — see utils)
temp_testset = pd.DataFrame(
    list(study_oriented_transformation(temp_testset)),
    columns=["study_type", "study", "label", "prediction"],
)

In [43]:
# study-level scores over all study types together
# NOTE(review): `prediction` is presumably binarized (kappa and F1 require
# discrete labels), in which case this ROC AUC is based on hard decisions — confirm
k = metrics.cohen_kappa_score(temp_testset["label"], temp_testset["prediction"])
roc_auc = metrics.roc_auc_score(temp_testset["label"], temp_testset["prediction"])
f1 = metrics.f1_score(temp_testset["label"], temp_testset["prediction"])

print(
    "=" * 38,
    "study type       : all studies",
    "-----------------",
    f"Cohen's kappa (κ): {k}",
    f"ROC AUC          : {roc_auc}",
    f"F1               : {f1}",
    "=" * 38,
    sep="\n",
)

study type       : all studies
-----------------
Cohen's kappa (κ): 0.6472350842514489
ROC AUC          : 0.8185806117800561
F1               : 0.7910750507099392


In [44]:
# same three scores, reported separately for each study type
for study_type in STUDY_TYPES:
    temp = temp_testset.loc[temp_testset["study_type"] == study_type]

    k = metrics.cohen_kappa_score(temp["label"], temp["prediction"])
    roc_auc = metrics.roc_auc_score(temp["label"], temp["prediction"])
    f1 = metrics.f1_score(temp["label"], temp["prediction"])

    print(
        "=" * 38,
        f"study type       : {study_type}",
        "-----------------",
        f"Cohen's kappa (κ): {k}",
        f"ROC AUC          : {roc_auc}",
        f"F1               : {f1}",
        sep="\n",
    )

study type       : XR_SHOULDER
-----------------
Cohen's kappa (κ): 0.5770498776986068
ROC AUC          : 0.7884635832004253
F1               : 0.783068783068783
study type       : XR_ELBOW
-----------------
Cohen's kappa (κ): 0.6780984719864176
ROC AUC          : 0.8288866930171278
F1               : 0.7966101694915254
study type       : XR_HUMERUS
-----------------
Cohen's kappa (κ): 0.718348523114088
ROC AUC          : 0.8589771729587358
F1               : 0.8527131782945736
study type       : XR_HAND
-----------------
Cohen's kappa (κ): 0.5281343343828206
ROC AUC          : 0.7453495349534953
F1               : 0.6666666666666667
study type       : XR_WRIST
-----------------
Cohen's kappa (κ): 0.6965885985390466
ROC AUC          : 0.8405743740795287
F1               : 0.8111111111111111
study type       : XR_FOREARM
-----------------
Cohen's kappa (κ): 0.6810551558752997
ROC AUC          : 0.8376358695652174
F1               : 0.8141592920353982
study type       : XR_FINGER
-------

In [45]:
# hand the model to utils.clean_up (presumably frees its memory before the next
# model is loaded — confirm) and drop this section's working frames
clean_up(model)
del temp, temp_testset

### Ensemble model: generic CNN + wrist CNN

This is simply a "shower thought" idea: create a model where the inference for a particular upper extremity region (aka study type) is delegated to a model that has been trained exclusively on that region (in our case `XR_WRIST`, but several other combinations can be tried).

**Note**: for this ensemble model to be practically meaningful, we assume that we know (or can acquire) the upper extremity region to which a given unseen image belongs. If, during inference, we only have unseen images without any additional metadata, then this approach does not apply.

In [46]:
# both plain CNNs share the same rescaling, so one generator serves both models
testset = cnn_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),  # Keras expects (height, width)
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [47]:
# generic member of the ensemble: the plain CNN saved as "all-study-types"
all_types_model = tf.keras.models.load_model(
    "../models/cnn-all-study-types-3305473.h5", custom_objects={"F1Score": F1Score}
)

In [48]:
# specialist member of the ensemble: the plain CNN saved for XR_WRIST
wrist_model = tf.keras.models.load_model(
    "../models/cnn-XR_WRIST-3305473.h5", custom_objects={"F1Score": F1Score}
)

In [49]:
# generic model's raw outputs for every test image (all study types), in dataframe order
all_types_model_preds = all_types_model.predict(testset, verbose=1)



In [50]:
# wrist model's raw outputs for every test image (all study types, not only
# wrists); only the wrist rows are used further down
wrist_model_preds = wrist_model.predict(testset, verbose=1)



In [51]:
# per-image working frame: integer ground truth plus the raw output of each model
temp_testset = ref_testset.assign(
    label=ref_testset["label"].map(int),
    prediction=all_types_model_preds.ravel(),
    wrist_prediction=wrist_model_preds.ravel(),
)

In [52]:
# For wrist images use the wrist model's output; keep the generic model's output
# everywhere else. Done as one vectorised column operation instead of a Python
# callback per row (`apply(axis=1)`).
temp_testset["prediction"] = temp_testset["wrist_prediction"].where(
    temp_testset["study_type"] == "XR_WRIST", temp_testset["prediction"]
)

In [53]:
# replace the per-image frame with the rows yielded by the study-oriented
# transformation (presumably one row per study — see utils)
temp_testset = pd.DataFrame(
    list(study_oriented_transformation(temp_testset)),
    columns=["study_type", "study", "label", "prediction"],
)

In [54]:
# study-level scores over all study types together
# NOTE(review): `prediction` is presumably binarized (kappa and F1 require
# discrete labels), in which case this ROC AUC is based on hard decisions — confirm
k = metrics.cohen_kappa_score(temp_testset["label"], temp_testset["prediction"])
roc_auc = metrics.roc_auc_score(temp_testset["label"], temp_testset["prediction"])
f1 = metrics.f1_score(temp_testset["label"], temp_testset["prediction"])

print(
    "=" * 38,
    "study type       : all studies",
    "-----------------",
    f"Cohen's kappa (κ): {k}",
    f"ROC AUC          : {roc_auc}",
    f"F1               : {f1}",
    "=" * 38,
    sep="\n",
)

study type       : all studies
-----------------
Cohen's kappa (κ): 0.33525353478153885
ROC AUC          : 0.6589880152298252
F1               : 0.519280205655527


In [55]:
# same three scores, reported separately for each study type
for study_type in STUDY_TYPES:
    temp = temp_testset.loc[temp_testset["study_type"] == study_type]

    k = metrics.cohen_kappa_score(temp["label"], temp["prediction"])
    roc_auc = metrics.roc_auc_score(temp["label"], temp["prediction"])
    f1 = metrics.f1_score(temp["label"], temp["prediction"])

    print(
        "=" * 38,
        f"study type       : {study_type}",
        "-----------------",
        f"Cohen's kappa (κ): {k}",
        f"ROC AUC          : {roc_auc}",
        f"F1               : {f1}",
        sep="\n",
    )

study type       : XR_SHOULDER
-----------------
Cohen's kappa (κ): 0.3331184874852293
ROC AUC          : 0.6648591174906964
F1               : 0.5492957746478874
study type       : XR_ELBOW
-----------------
Cohen's kappa (κ): 0.30396475770925113
ROC AUC          : 0.6363636363636364
F1               : 0.42857142857142855
study type       : XR_HUMERUS
-----------------
Cohen's kappa (κ): 0.4200903183169953
ROC AUC          : 0.7092844600526779
F1               : 0.6138613861386137
study type       : XR_HAND
-----------------
Cohen's kappa (κ): 0.1603598835670813
ROC AUC          : 0.5681818181818181
F1               : 0.24000000000000002
study type       : XR_WRIST
-----------------
Cohen's kappa (κ): 0.39722127320350764
ROC AUC          : 0.6926362297496318
F1               : 0.6171428571428572
study type       : XR_FOREARM
-----------------
Cohen's kappa (κ): 0.38368860055607046
ROC AUC          : 0.6875
F1               : 0.5454545454545454
study type       : XR_FINGER
------------

In [56]:
# hand both models to utils.clean_up (presumably frees their memory — confirm)
# and drop this section's working frames
clean_up(all_types_model)
clean_up(wrist_model)
del temp, temp_testset

### Ensemble model: DenseNet169 + VGG19

In [57]:
# each network needs its own preprocessing, hence one generator per model;
# this one feeds DenseNet169
densenet_testset = densenet_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),  # Keras expects (height, width)
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [58]:
# same images in the same order, but with the VGG19 preprocessing
vgg19_testset = vgg19_testing.flow_from_dataframe(
    dataframe=ref_testset,
    x_col="image_path",
    y_col="label",
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),  # Keras expects (height, width)
    class_mode="binary",
    batch_size=32,
    seed=SEED,
    shuffle=False,  # this is very important in order to properly use raw predictions to evaluate on study-level!
)

Found 3197 validated image filenames belonging to 2 classes.


In [59]:
# first ensemble member: the saved DenseNet169 model
densenet_model = tf.keras.models.load_model(
    "../models/densenet169_pt_imagenet-all-study-types-12644545.h5",
    custom_objects={"F1Score": F1Score},
)

In [60]:
# second ensemble member: the saved VGG19 model
vgg19_model = tf.keras.models.load_model(
    "../models/vgg19_pt_imagenet-all-study-types-20024897.h5",
    custom_objects={"F1Score": F1Score},
)

In [61]:
# DenseNet169 raw outputs per test image, in dataframe order (generator is not shuffled)
densenet_preds = densenet_model.predict(densenet_testset, verbose=1)



In [62]:
# VGG19 raw outputs per test image, in dataframe order (generator is not shuffled)
vgg19_preds = vgg19_model.predict(vgg19_testset, verbose=1)



In [63]:
# per-image working frame: integer ground truth plus one raw-output column per
# ensemble member (A = DenseNet169, B = VGG19)
temp_testset = ref_testset.assign(
    label=ref_testset["label"].map(int),
    predictionA=densenet_preds.ravel(),
    predictionB=vgg19_preds.ravel(),
)

In [64]:
# replace the per-image frame with the rows yielded by the ensemble variant of
# the study-oriented transformation (presumably combines predictionA and
# predictionB into a single study-level prediction — see utils)
temp_testset = pd.DataFrame(
    list(study_oriented_transformation_on_ensemble(temp_testset)),
    columns=["study_type", "study", "label", "prediction"],
)

In [65]:
# study-level scores over all study types together
# NOTE(review): `prediction` is presumably binarized (kappa and F1 require
# discrete labels), in which case this ROC AUC is based on hard decisions — confirm
k = metrics.cohen_kappa_score(temp_testset["label"], temp_testset["prediction"])
roc_auc = metrics.roc_auc_score(temp_testset["label"], temp_testset["prediction"])
f1 = metrics.f1_score(temp_testset["label"], temp_testset["prediction"])

print(
    "=" * 38,
    "study type       : all studies",
    "-----------------",
    f"Cohen's kappa (κ): {k}",
    f"ROC AUC          : {roc_auc}",
    f"F1               : {f1}",
    "=" * 38,
    sep="\n",
)

study type       : all studies
-----------------
Cohen's kappa (κ): 0.646365349262074
ROC AUC          : 0.8173700431361743
F1               : 0.7880658436213993


In [66]:
# same three scores, reported separately for each study type
for study_type in STUDY_TYPES:
    temp = temp_testset.loc[temp_testset["study_type"] == study_type]

    k = metrics.cohen_kappa_score(temp["label"], temp["prediction"])
    roc_auc = metrics.roc_auc_score(temp["label"], temp["prediction"])
    f1 = metrics.f1_score(temp["label"], temp["prediction"])

    print(
        "=" * 38,
        f"study type       : {study_type}",
        "-----------------",
        f"Cohen's kappa (κ): {k}",
        f"ROC AUC          : {roc_auc}",
        f"F1               : {f1}",
        sep="\n",
    )

study type       : XR_SHOULDER
-----------------
Cohen's kappa (κ): 0.5567952396132185
ROC AUC          : 0.7785752259436471
F1               : 0.7772020725388602
study type       : XR_ELBOW
-----------------
Cohen's kappa (κ): 0.7049235993208829
ROC AUC          : 0.841897233201581
F1               : 0.8135593220338984
study type       : XR_HUMERUS
-----------------
Cohen's kappa (κ): 0.718348523114088
ROC AUC          : 0.8589771729587358
F1               : 0.8527131782945736
study type       : XR_HAND
-----------------
Cohen's kappa (κ): 0.4803471994759253
ROC AUC          : 0.71999699969997
F1               : 0.6200000000000001
study type       : XR_WRIST
-----------------
Cohen's kappa (κ): 0.7296577946768061
ROC AUC          : 0.8532768777614138
F1               : 0.8275862068965517
study type       : XR_FOREARM
-----------------
Cohen's kappa (κ): 0.634866163349348
ROC AUC          : 0.8141983695652174
F1               : 0.7818181818181819
study type       : XR_FINGER
----------

In [67]:
# hand both models to utils.clean_up (presumably frees their memory — confirm)
# and drop this section's working frames
clean_up(densenet_model)
clean_up(vgg19_model)
del temp, temp_testset