# Evaluate segmentations of the self-supervised and fine-tuned models

In [14]:
import pandas as pd
import json

from tqdm.notebook import tqdm
from monai.transforms import (
    LoadImaged,
    EnsureChannelFirstd,
    MapLabelValued,
    Lambdad,
    Compose,
    AsDiscreted,
    Spacingd,
)
from monai.metrics import DiceMetric

# private libraries
import sys

if "../scripts" not in sys.path:
    sys.path.insert(1, "../scripts")
import config
import labelmappings as lm

In [15]:
# Read the bundle metadata that describes the network's output channels.
with open("../metadata/updated_metadata.json") as f:
    metadata = json.load(f)

# Names of the prediction channels in definition order. The first entry is
# skipped -- presumably the background class; confirm against the metadata file.
channel_def = metadata["network_data_format"]["outputs"]["pred"]["channel_def"]
channels = list(channel_def.values())[1:]

## Load evaluation of our own models

In [16]:
def get_section(path):
    """Return the last character of the first ``_``-separated token of the file name.

    The file name is everything after the last ``/`` in ``path``.
    """
    return path.split("/")[-1].split("_")[0][-1]


def get_type(path):
    """Return the second ``_``-separated token of the file name, cut at the first ``.``.

    Raises
    ------
    IndexError
        If the file name contains no ``_``.
    """
    return path.split("/")[-1].split("_")[1].split(".")[0]


def load_scores(path, channel_names=None):
    """Load a raw Dice CSV and attach ``section`` and ``dixon_type`` columns.

    Parameters
    ----------
    path : str
        CSV file whose columns are, in order: file name, one score per channel,
        and a mean score. The header of the file is replaced.
    channel_names : sequence of str, optional
        Names for the per-channel score columns. Defaults to the notebook-level
        ``channels`` list.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename``, the channel names, ``mean``, ``section`` and
        ``dixon_type``.

    Raises
    ------
    ValueError
        Raised by pandas when the CSV column count does not match
        ``len(channel_names) + 2``.
    """
    if channel_names is None:
        channel_names = channels

    scores = pd.read_csv(path)
    scores.columns = ["filename", *channel_names, "mean"]
    scores["section"] = scores["filename"].apply(get_section)
    try:
        scores["dixon_type"] = scores["filename"].apply(get_type)
    except IndexError:
        # At least one file name has no "_"-separated type token; fall back to
        # the parent directory name for the whole column.
        scores["dixon_type"] = scores["filename"].apply(lambda name: name.split("/")[-2])

    return scores

In [17]:
# First generation: Dice of the self-supervised and the fine-tuned model
ss_scores, ft_scores = map(
    load_scores,
    (
        "../bundles/mr_segmentator/eval/ss_02.01_MeanDice_raw.csv",
        "../bundles/mr_segmentator/eval/ft_02.01_MeanDice_raw.csv",
    ),
)

# Second generation: self-supervised and fine-tuned models that were
# retrained on the segmentations of the first generation
ss_scores2, ft_scores2 = map(
    load_scores,
    (
        "../bundles/mr_segmentator/eval/ss_gen2_05.01_MeanDice_raw.csv",
        "../bundles/mr_segmentator/eval/ft_gen2_05.01_MeanDice_raw.csv",
    ),
)

# Fine-tuned model that was trained on w-only sequences exclusively
ft_wonly_scores = load_scores("../bundles/mr_segmentator/eval/ft_w_only_03.01_MeanDice_raw.csv")

## Evaluate TotalSegmentator and MRSegmentator

In [18]:
def transforms(labelmapper, num_classes=41):
    """Build the preprocessing pipeline applied to one ``{"pred", "label"}`` record.

    Parameters
    ----------
    labelmapper : dict
        Maps label values found in the ``pred`` image (keys) to the label values
        used by the ``label`` image (values). Only ``pred`` is remapped.
    num_classes : int, optional
        Number of one-hot channels produced for both images. Defaults to 41,
        the value used throughout this notebook (presumably the number of
        foreground channels plus background -- confirm).

    Returns
    -------
    monai.transforms.Compose
        Callable that takes a dict with file paths under ``pred`` and ``label``.
    """
    keys = ["pred", "label"]
    return Compose(
        [
            LoadImaged(keys=keys, image_only=True),
            EnsureChannelFirstd(keys=keys),
            # Bring both volumes onto the same 3 x 3 x 3 grid; nearest-neighbour
            # interpolation keeps the label values discrete.
            Spacingd(
                keys=keys,
                pixdim=[3, 3, 3],
                mode="nearest",
            ),
            # Translate the prediction's label values via ``labelmapper``.
            MapLabelValued(
                keys=["pred"],
                orig_labels=list(labelmapper.keys()),
                target_labels=list(labelmapper.values()),
                dtype=int,
            ),
            AsDiscreted(keys=keys, to_onehot=num_classes),
            # Add a leading dimension of size one in front of the channel axis.
            Lambdad(keys=keys, func=lambda x: x.unsqueeze(0)),
        ]
    )


mr_transforms = transforms(lm.mr2new)
total_transforms = transforms(lm.total2new)


def add_meta(df, meta=None, channel_names=None):
    """Name the score columns of ``df`` and attach per-scan metadata columns.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per channel. Modified in place and returned.
    meta : pandas.DataFrame, optional
        Source of the ``filename``, ``section`` and ``dixon_type`` columns.
        Defaults to the notebook-level ``ss_scores``. The columns are assigned
        as pandas Series, i.e. aligned on the index, so ``meta`` must describe
        the same scans in the same order as ``df``.
    channel_names : sequence of str, optional
        Column names for ``df``. Defaults to the notebook-level ``channels``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself.

    Warns
    -----
    UserWarning
        If ``meta`` and ``df`` have a different number of rows, which means the
        metadata cannot describe the scored scans one-to-one.
    """
    import warnings

    if meta is None:
        meta = ss_scores
    if channel_names is None:
        channel_names = channels

    if len(meta) != len(df):
        warnings.warn(
            f"add_meta: metadata has {len(meta)} rows but the scores have {len(df)} rows; "
            "the attached filename/section/dixon_type values may not belong to these scans.",
            stacklevel=2,
        )

    df.columns = list(channel_names)
    for column in ("filename", "section", "dixon_type"):
        df[column] = meta[column]
    return df

In [19]:
# UKBB test split: turn the relative entries of test.csv into full paths for
# the annotations and for the predictions of both reference tools.
# Prediction files are named after the image path with "/" replaced by "_".
data = pd.read_csv(config.ukbb + "test.csv").assign(
    label=lambda df: config.ukbb + "annotations/" + df["label"],
    total_pred=lambda df: (
        config.ukbb + "preds_total/" + df["image"].apply(lambda x: x.replace("/", "_"))
    ),
    mr_pred=lambda df: (
        config.ukbb + "preds_mr_testset/" + df["image"].apply(lambda x: x.replace("/", "_"))
    ),
)

In [20]:
# Per-scan, per-channel Dice; the background channel is not scored.
dice_metric = DiceMetric(include_background=False, reduction="none", num_classes=41)


def _evaluate_predictions(frame, pred_column, transform):
    """Score the predictions in ``frame[pred_column]`` against ``frame["label"]``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``label`` column and ``pred_column``, both holding file paths.
    pred_column : str
        Column that is fed to the transform under the key ``pred``.
    transform : callable
        Pipeline that turns a ``{"label", "pred"}`` record of paths into tensors.

    Returns
    -------
    pandas.DataFrame
        One row per scan with the per-channel Dice scores, passed through ``add_meta``.
    """
    # Always start from an empty buffer so scores of a previous run cannot leak in.
    dice_metric.reset()
    records = frame.rename({pred_column: "pred"}, axis=1)[["label", "pred"]].to_dict("records")
    for record in tqdm(records):
        sample = transform(record)
        dice_metric(sample["pred"], sample["label"])
    return add_meta(pd.DataFrame(dice_metric.get_buffer()))


total_scores = _evaluate_predictions(data, "total_pred", total_transforms)
mr_scores = _evaluate_predictions(data, "mr_pred", mr_transforms)

  0%|          | 0/372 [00:00<?, ?it/s]

  0%|          | 0/372 [00:00<?, ?it/s]

## Compare Predictions

In [21]:
# Per-channel mean Dice over all test scans, one column per model.
all_scores = pd.DataFrame(index=channels)

reference_frames = {
    "MRSegmentator": mr_scores,
    "TotalSegmentator": total_scores,
    "self-supervised": ss_scores,
    "finetuned": ft_scores,
}
for column_name, score_frame in reference_frames.items():
    all_scores[column_name] = score_frame.mean(numeric_only=True)

# "combined" is the per-channel best of the four columns above, so it has to be
# computed before any further column is appended.
all_scores["combined"] = all_scores.max(axis=1)

followup_frames = {
    "finetuned_w_only": ft_wonly_scores,
    "self-supervised2": ss_scores2,
    "finetuned2": ft_scores2,
}
for column_name, score_frame in followup_frames.items():
    all_scores[column_name] = score_frame.mean(numeric_only=True)

print("Result for all sequence types averaged:")
print(all_scores.mean())

Result for all sequence types averaged:
MRSegmentator       0.263249
TotalSegmentator    0.264248
self-supervised     0.582886
finetuned           0.744870
combined            0.750487
finetuned_w_only    0.309872
self-supervised2    0.723866
finetuned2          0.749023
dtype: float64


In [22]:
def select_w(df):
    """Return the rows of ``df`` whose ``dixon_type`` equals ``"W"``."""
    is_w = df["dixon_type"] == "W"
    return df.loc[is_w]


# Per-channel mean Dice restricted to the W rows, one column per model.
all_w_scores = pd.DataFrame(index=channels)

w_reference_frames = {
    "MRSegmentator": mr_scores,
    "TotalSegmentator": total_scores,
    "self-supervised": ss_scores,
    "finetuned": ft_scores,
}
for w_column, w_frame in w_reference_frames.items():
    all_w_scores[w_column] = select_w(w_frame).mean(numeric_only=True)

# Per-channel best of the four columns above; computed before the remaining
# columns are appended so that they do not take part in the maximum.
all_w_scores["combined"] = all_w_scores.max(axis=1)

w_followup_frames = {
    "finetuned_w_only": ft_wonly_scores,
    "self-supervised2": ss_scores2,
    "finetuned2": ft_scores2,
}
for w_column, w_frame in w_followup_frames.items():
    all_w_scores[w_column] = select_w(w_frame).mean(numeric_only=True)

print("Result for w-only sequences:")
print(all_w_scores.mean())

Result for w-only sequences:
MRSegmentator       0.318813
TotalSegmentator    0.622426
self-supervised     0.596215
finetuned           0.754860
combined            0.838272
finetuned_w_only    0.755312
self-supervised2    0.737053
finetuned2          0.759269
dtype: float64


In [23]:
# Show the per-channel table behind the W-only averages printed above.
all_w_scores

Unnamed: 0,MRSegmentator,TotalSegmentator,self-supervised,finetuned,combined,finetuned_w_only,self-supervised2,finetuned2
spleen,0.825775,0.987942,0.890458,0.912804,0.987942,0.931982,0.889542,0.921829
right_kidney,0.710431,0.961463,0.893777,0.928857,0.961463,0.933242,0.89284,0.92929
left_kidney,0.734384,0.980585,0.888548,0.935495,0.980585,0.940053,0.881824,0.936775
gallbladder,0.550029,0.159216,0.555213,0.63238,0.63238,0.645656,0.573248,0.644494
liver,0.798552,0.984348,0.906824,0.916086,0.984348,0.933846,0.907493,0.913621
stomach,0.611308,0.861766,0.693386,0.633639,0.861766,0.622397,0.697778,0.624231
pancreas,0.612155,0.893635,0.682128,0.66006,0.893635,0.66884,0.681765,0.657899
right_adrenal_gland,0.464298,0.739766,0.511144,0.467385,0.739766,0.465954,0.517035,0.458616
left_adrenal_gland,0.519871,0.706952,0.520569,0.435809,0.706952,0.421481,0.511488,0.456126
left_lung,0.0,0.830875,0.799686,0.879226,0.879226,0.877164,0.884846,0.881684


## Evaluate on external data set
I use the Charité kidney tumor test set, which consists of 221 abdominal MR scans that are T1-, T2- or T1fs-postKM-weighted.
Lina annotated 20 different organs in this set; I will exclude the remaining organs from this analysis.

## Second Generation Model

In [44]:
# Load the annotation index and expand it to full paths. ``pred`` is derived
# from the second "/"-separated component of the original ``label`` entry and is
# computed before ``label`` itself is turned into a full path.
data = pd.read_csv(config.ukbb + "csv/charite_annotations.csv").assign(
    pred=lambda df: (
        config.ukbb + "preds_charite/" + df["label"].apply(lambda x: x.split("/")[1])
    ),
    label=lambda df: config.mr_label_path + df["label"],
)

221


In [33]:
# Score the Charité predictions against the annotations.
dice_metric.reset()

# NOTE(review): "label" and "pred" are swapped on purpose, it seems:
# ``mr_transforms`` remaps label values (lm.mr2new) only for the key "pred", so
# the swap routes the annotation files through that mapping. Each score is
# still computed between the same annotation/prediction pair -- confirm that
# the annotations really use the label scheme that lm.mr2new expects.
data_list = data.rename({"label": "pred", "pred": "label"}, axis=1)[["label", "pred"]].to_dict(
    "records"
)
for item in tqdm(data_list):
    _item = mr_transforms(item)
    dice_metric(_item["pred"], _item["label"])

charite_scores = pd.DataFrame(dice_metric.get_buffer())
# Name the columns directly instead of calling ``add_meta``: that helper copies
# filename/section/dixon_type from the UKBB score table, which does not describe
# these scans.
charite_scores.columns = channels
charite_scores["filename"] = data["label"].to_numpy()

In [38]:
# Per-channel mean Dice of the second-generation fine-tuned model on UKBB next
# to the Charité scores, both aligned on the channel names.
multiple_datasets_scores = pd.DataFrame(
    {
        "UKBB dataset": ft_scores2.mean(numeric_only=True),
        "Charite dataset": charite_scores.mean(numeric_only=True),
    },
    index=channels,
)
multiple_datasets_scores

Unnamed: 0,UKBB dataset,Charite dataset
spleen,0.880869,0.830295
right_kidney,0.91725,0.840691
left_kidney,0.92256,0.836642
gallbladder,0.611149,0.429986
liver,0.901898,0.896576
stomach,0.629968,0.803173
pancreas,0.634437,0.683904
right_adrenal_gland,0.431962,0.450028
left_adrenal_gland,0.479641,0.515056
left_lung,0.882304,0.0
