# Evaluate Segmentations of self-supervised & fine-tuned model

In [1]:
import pandas as pd
import json

from tqdm.notebook import tqdm
from monai.transforms import (
    LoadImaged,
    EnsureChannelFirstd,
    MapLabelValued,
    Lambdad,
    Compose,
    AsDiscreted,
    Spacingd,
)
from monai.metrics import DiceMetric

# private libraries
import sys

if "../scripts" not in sys.path:
    sys.path.insert(1, "../scripts")
import config
import labelmappings as lm

In [2]:
with open("../metadata/updated_metadata.json") as f:
    metadata = json.load(f)

channels = list(metadata["network_data_format"]["outputs"]["pred"]["channel_def"].values())[1:]

## Load evaluation of our own models

In [3]:
get_section = lambda x: x.split("/")[-1].split("_")[0][-1]
get_type = lambda x: x.split("/")[-1].split("_")[1].split(".")[0]


def load_scores(path):
    scores = pd.read_csv(path)
    scores.columns = ["filename"] + channels + ["mean"]
    scores["section"] = scores["filename"].apply(get_section)
    try:
        scores["dixon_type"] = scores["filename"].apply(get_type)
    except Exception:
        scores["dixon_type"] = scores["filename"].apply(lambda x: x.split("/")[-2])

    return scores

In [4]:
# Dice of self-supervised and finetuned models (test set was used
ss_scores = load_scores("../bundles/mr_segmentator/eval/ss_02.01_MeanDice_raw.csv")
ft_scores = load_scores("../bundles/mr_segmentator/eval/ft_02.01_MeanDice_raw.csv")

# Dice of finetuned model that was trained on only w-only sequences
ft_wonly_scores = load_scores("../bundles/mr_segmentator/eval/ft_w_only_03.01_MeanDice_raw.csv")

## Evaluate Totalsegmentator and MrSegmentator

In [5]:
transforms = lambda labelmapper: Compose(
    [
        LoadImaged(keys=["pred", "label"], image_only=True),
        EnsureChannelFirstd(keys=["pred", "label"]),
        Spacingd(
            keys=["pred", "label"],
            pixdim=[3, 3, 3],
            mode="nearest",
        ),
        MapLabelValued(
            keys=["pred"],
            orig_labels=labelmapper.keys(),
            target_labels=labelmapper.values(),
            dtype=int,
        ),
        AsDiscreted(keys=["pred", "label"], to_onehot=41),
        Lambdad(keys=["pred", "label"], func=lambda x: x.unsqueeze(0)),
    ]
)

mr_transforms = transforms(lm.mr2new)
total_transforms = transforms(lm.total2new)


def add_meta(df):
    df.columns = channels
    df["filename"] = ss_scores["filename"]
    df["section"] = ss_scores["section"]
    df["dixon_type"] = ss_scores["dixon_type"]
    return df

In [10]:
# ukbb data
data = pd.read_csv(config.ukbb + "test.csv")
data["label"] = config.ukbb + "annotations/" + data["label"]
data["total_pred"] = (
    config.ukbb + "preds_total/" + data["image"].apply(lambda x: x.replace("/", "_"))
)
data["mr_pred"] = (
    config.ukbb + "preds_mr_testset/" + data["image"].apply(lambda x: x.replace("/", "_"))
)

In [11]:
dice_metric = DiceMetric(include_background=False, reduction="none", num_classes=41)
dice_metric.reset()

data_list = data.rename({"total_pred": "pred"}, axis=1)[["label", "pred"]].to_dict("records")
for item in tqdm(data_list):
    _item = total_transforms(item)
    dice_metric(_item["pred"], _item["label"])
total_scores = pd.DataFrame(dice_metric.get_buffer())
total_scores = add_meta(total_scores)

dice_metric.reset()
data_list = data.rename({"mr_pred": "pred"}, axis=1)[["label", "pred"]].to_dict("records")
for item in tqdm(data_list):
    _item = mr_transforms(item)
    dice_metric(_item["pred"], _item["label"])
mr_scores = pd.DataFrame(dice_metric.get_buffer())
mr_scores = add_meta(mr_scores)

  0%|          | 0/372 [00:00<?, ?it/s]

## Compare Predictions

In [17]:
all_scores = pd.DataFrame(index=channels)
all_scores["MRSegmentator"] = mr_scores.mean(numeric_only=True)
all_scores["TotalSegmentator"] = total_scores.mean(numeric_only=True)
all_scores["self-supervised"] = ss_scores.mean(numeric_only=True)
all_scores["finetuned"] = ft_scores.mean(numeric_only=True)
all_scores["finetuned_w_only"] = ft_wonly_scores.mean(numeric_only=True)
all_scores["combined"] = all_scores.max(axis=1)
print("Result for all sequence types averaged:")
print(all_scores.mean())

Result for all sequence types averaged:
MRSegmentator       0.263249
TotalSegmentator    0.264248
self-supervised     0.582886
finetuned           0.744870
finetuned_w_only    0.309872
combined            0.750487
dtype: float64


In [12]:
select_w = lambda df: df.loc[df["dixon_type"] == "W"]
all_w_scores = pd.DataFrame(index=channels)
all_w_scores["MRSegmentator"] = select_w(mr_scores).mean(numeric_only=True)
all_w_scores["TotalSegmentator"] = select_w(total_scores).mean(numeric_only=True)
all_w_scores["self-supervised"] = select_w(ss_scores).mean(numeric_only=True)
all_w_scores["finetuned"] = select_w(ft_scores).mean(numeric_only=True)
all_w_scores["finetuned_w_only"] = select_w(ft_wonly_scores).mean(numeric_only=True)
all_w_scores["combined"] = all_w_scores.max(axis=1)
print("Result for w-only sequences:")
print(all_w_scores.mean())

Result for w-only sequences:
MRSegmentator       0.318813
TotalSegmentator    0.622426
self-supervised     0.596215
finetuned           0.754860
finetuned_w_only    0.755312
combined            0.838993
dtype: float64


In [13]:
all_w_scores

Unnamed: 0,MRSegmentator,TotalSegmentator,self-supervised,finetuned,finetuned_w_only,combined
spleen,0.825775,0.987942,0.890458,0.912804,0.931982,0.987942
right_kidney,0.710431,0.961463,0.893777,0.928857,0.933242,0.961463
left_kidney,0.734384,0.980585,0.888548,0.935495,0.940053,0.980585
gallbladder,0.550029,0.159216,0.555213,0.63238,0.645656,0.645656
liver,0.798552,0.984348,0.906824,0.916086,0.933846,0.984348
stomach,0.611308,0.861766,0.693386,0.633639,0.622397,0.861766
pancreas,0.612155,0.893635,0.682128,0.66006,0.66884,0.893635
right_adrenal_gland,0.464298,0.739766,0.511144,0.467385,0.465954,0.739766
left_adrenal_gland,0.519871,0.706952,0.520569,0.435809,0.421481,0.706952
left_lung,0.0,0.830875,0.799686,0.879226,0.877164,0.879226


## Evaluate on external data set
I use the Charité kidney tumor test set, which consists of 18 abdominal MR scans that are either T1, T2 or T1fs-postKM weighted.
Lina annotated 20 different organs in this set. I will exclude the remaining organs from this analysis.

In [14]:
# Dice of self-supervised and finetuned models on external charite test set (only for original 20 classes)
ss_charite_scores = load_scores("../bundles/mr_segmentator/eval/ss_charite_22.12_MeanDice_raw.csv")
ft_charite_scores = load_scores("../bundles/mr_segmentator/eval/ft_charite_22.12_MeanDice_raw.csv")

orig_classes = [channels[i - 1] for i in list(lm.mr2new.values())[1:]]
ss_charite_scores = ss_charite_scores[orig_classes]
ft_charite_scores = ft_charite_scores[orig_classes]

In [15]:
charite_scores = pd.DataFrame(index=orig_classes)
charite_scores["self-supervised"] = ss_scores.mean(numeric_only=True)
charite_scores["finetuned"] = ft_scores.mean(numeric_only=True)
print("Result for all sequence types averaged:")
print(charite_scores.mean())

Result for all sequence types averaged:
self-supervised    0.705679
finetuned          0.728399
dtype: float64


In [16]:
print("T1:\t", ft_charite_scores.mean(axis=1)[0:6].mean())
print("T2:\t", ft_charite_scores.mean(axis=1)[6:12].mean())
print("T1fs:\t", ft_charite_scores.mean(axis=1)[12:].mean())

T1:	 0.7352580580894738
T2:	 0.6279132709238596
T1fs:	 0.8004684009429824


In [19]:
with open("../metadata/updated_metadata.json") as f:
    metadata = json.load(f)

channels = list(metadata["network_data_format"]["outputs"]["pred"]["channel_def"].values())

In [21]:
for i, c in enumerate(channels):
    print(f"| {i} | {c} |")

| 0 | background |
| 1 | spleen |
| 2 | right_kidney |
| 3 | left_kidney |
| 4 | gallbladder |
| 5 | liver |
| 6 | stomach |
| 7 | pancreas |
| 8 | right_adrenal_gland |
| 9 | left_adrenal_gland |
| 10 | left_lung |
| 11 | right_lung |
| 12 | heart |
| 13 | aorta |
| 14 | inferior_vena_cava |
| 15 | portal_vein_and_splenic_vein |
| 16 | left_iliac_artery |
| 17 | right_iliac_artery |
| 18 | left_iliac_vena |
| 19 | right_iliac_vena |
| 20 | esophagus |
| 21 | small_bowel |
| 22 | duodenum |
| 23 | colon |
| 24 | urinary_bladder |
| 25 | spine |
| 26 | sacrum |
| 27 | left_hip |
| 28 | right_hip |
| 29 | left_femur |
| 30 | right_femur |
| 31 | left_autochthonous_muscle |
| 32 | right_autochthonous_muscle |
| 33 | left_iliopsoas_muscle |
| 34 | right_iliopsoas_muscle |
| 35 | left_gluteus_maximus |
| 36 | right_gluteus_maximus |
| 37 | left_gluteus_medius |
| 38 | right_gluteus_medius |
| 39 | left_gluteus_minimus |
| 40 | right_gluteus_minimus |
