# Evaluate Segmentations of self-supervised & fine-tuned model

In [1]:
import pandas as pd
import json

from tqdm.notebook import tqdm
from monai.transforms import (
    LoadImaged,
    EnsureChannelFirstd,
    MapLabelValued,
    Lambdad,
    Compose,
    AsDiscreted,
    Spacingd,
)
from monai.metrics import DiceMetric

# private libraries
import sys

if "../scripts" not in sys.path:
    sys.path.insert(1, "../scripts")
import config
import labelmappings as lm

In [2]:
# Read the model metadata that describes the network's output channels.
with open("../metadata/updated_metadata.json") as metadata_file:
    metadata = json.load(metadata_file)

# Names of the "pred" output channels; the first entry of the definition is skipped.
channel_def = metadata["network_data_format"]["outputs"]["pred"]["channel_def"]
channels = [name for position, name in enumerate(channel_def.values()) if position > 0]

## Evaluate Totalsegmentator and MrSegmentator
Calculate Dice Scores for all models:
1. **nnUNet self-supervised** (trained on a combination of predictions of TotalSegmentator and a first version of MRSegmentator on more than 8000 UKBB scans (sections 1 to 3))
2. **nnUNet fine-tuned V1** ((1), but fine-tuned on manually annotated UKBB & Charité scans (sections 1 to 3))
3. **nnUNet fine-tuned V2** ((1), but fine-tuned on manually annotated UKBB & Charité scans (sections 0 to 4))

I removed the evaluation of the previous models of the MONAI framework, because they were not trained on sections 0 and 4 and perform much worse on them.

In [3]:
def transforms(labelmapper):
    """Build the preprocessing pipeline applied to one prediction/label pair.

    Args:
        labelmapper: Mapping from label values in the prediction to target label
            values. Its keys and values are handed to ``MapLabelValued`` as
            ``orig_labels`` and ``target_labels``; only the ``"pred"`` entry is
            remapped.

    Returns:
        A ``Compose`` object that takes a dict with the keys ``"pred"`` and
        ``"label"`` and returns the transformed dict.
    """
    keys = ["pred", "label"]
    return Compose(
        [
            LoadImaged(keys=keys, image_only=True),
            # Drop the first and the last slice along the third axis
            # (applied before the channel dimension is added).
            Lambdad(keys=keys, func=lambda x: x[:, :, 1:-1]),
            EnsureChannelFirstd(keys=keys),
            # Nearest-neighbour resampling so that label values stay discrete.
            Spacingd(keys=keys, pixdim=[5, 5, 5], mode="nearest"),
            MapLabelValued(
                keys=["pred"],
                # Materialise the dict views: the transform documents sequences.
                orig_labels=list(labelmapper.keys()),
                target_labels=list(labelmapper.values()),
                dtype=int,
            ),
            AsDiscreted(keys=keys, to_onehot=41),
            # Prepend a leading axis of size one.
            Lambdad(keys=keys, func=lambda x: x[None]),
        ]
    )


def add_meta(df, meta_df):
    """Label the score columns and attach per-scan metadata to ``df``.

    ``df`` is modified in place and also returned.

    Args:
        df: DataFrame of scores with one column per entry of ``channels`` and
            one row per row of ``meta_df``, in the same order.
        meta_df: DataFrame providing the columns ``eid``, ``section`` and
            ``dixon_type``.

    Returns:
        ``df`` with its columns renamed to ``channels`` and the three metadata
        columns appended.

    Raises:
        ValueError: If ``df`` and ``meta_df`` differ in their number of rows, or
            if the number of columns of ``df`` differs from ``len(channels)``.
    """
    df.columns = channels
    # Copy positionally: the rows of ``df`` follow the row order of ``meta_df``,
    # whereas index-aligned assignment would silently produce NaN / shifted
    # values whenever ``meta_df`` does not carry a default 0..n-1 index.
    for column in ("eid", "section", "dixon_type"):
        df[column] = meta_df[column].to_numpy()
    return df


def calc_dice(data, model_name, labelmapper=None):
    """Compute Dice scores of one model's predictions against the labels.

    Args:
        data: DataFrame with a ``label`` column, a column named ``model_name``
            (both are fed to the loading pipeline) and the metadata columns
            ``eid``, ``section`` and ``dixon_type``.
        model_name: Name of the column in ``data`` that holds the predictions.
        labelmapper: Optional mapping applied to the label values of the
            prediction before scoring. Defaults to ``{0: 0}``.

    Returns:
        DataFrame with one row per row of ``data``: one score column per entry
        of ``channels`` plus the metadata columns added by ``add_meta``.
    """
    if labelmapper is None:
        labelmapper = {0: 0}
    dice_metric = DiceMetric(include_background=False, reduction="none", num_classes=41)

    # Build the pipeline once; it does not depend on the individual item.
    pipeline = transforms(labelmapper)

    data_list = data.rename(columns={model_name: "pred"})[["label", "pred"]].to_dict("records")
    for item in tqdm(data_list, desc=f'Calc. Dice for  model "{model_name}"'):
        _item = pipeline(item)
        # The metric accumulates the per-item results in its internal buffer.
        dice_metric(_item["pred"], _item["label"])

    return add_meta(pd.DataFrame(dice_metric.get_buffer()), data)

In [5]:
# ukbb data
data = pd.read_csv(config.ukbb + "valid_finetuning.csv")
data = data.loc[data["section"].isin([0, 1, 2, 3, 4])].reset_index(drop=True)
data["label"] = config.ukbb + "annotations/" + data["label"]

# All prediction files are named after the image path with "/" replaced by "_".
pred_file_names = data["image"].apply(lambda image_path: image_path.replace("/", "_"))

# TotalSegmentator
data["total"] = config.ukbb + "preds_total/" + pred_file_names

# nnUNet prediction folders, keyed by the column that receives the file paths.
nnunet_pred_dirs = {
    "nnUNet_ss_1to3": "preds4",  # Self-supervised nnUnet (trained on section 1 to 3)
    "nnUNet_ft_1to3": "preds5",  # Fine-tuned nnUnet (trained on section 1 to 3)
    "nnUNet_ft_0to4": "preds8",  # Fine-tuned nnUnet (trained on section 0 to 4)
}
for column, pred_dir in nnunet_pred_dirs.items():
    data[column] = config.ukbb_cache + "nnUNet/data/" + pred_dir + "/" + pred_file_names

In [8]:
models = ["nnUNet_ss_1to3", "nnUNet_ft_1to3", "nnUNet_ft_0to4"]

# TotalSegmentator evaluation is currently disabled:
# scores["total"] = calc_dice(data, "total", lm.total2new)

# Score every model in the list. Skipping one would leave its entry without a
# score table, and `compare` needs a DataFrame for each key of `scores`.
scores = {}
for _model in models:
    scores[_model] = calc_dice(data, _model)

Calc. Dice for  model "nnUNet_ss_1to3":   0%|          | 0/120 [00:00<?, ?it/s]

Calc. Dice for  model "nnUNet_ft_1to3":   0%|          | 0/120 [00:00<?, ?it/s]

Calc. Dice for  model "nnUNet_ft_0to4":   0%|          | 0/120 [00:00<?, ?it/s]

## Compare Predictions
The test dataset is very small (only six subjects with 4*5 scans each). Additionally, we can observe significant annotation bias: segmentations of TotalSegmentator have an increased Dice score, because our annotations were pre-segmented with TotalSegmentator as well. Another problem is incomplete annotation in challenging regions. There, segmentations can be much better than the annotation, which paradoxically results in a worse Dice score. Lastly, if a model fails to segment organs at image borders, the Dice score for that instance will be zero, even if it is in fact only a few voxels.

Consequently, we need to visually inspect our segmentations as well. Visual analysis shows that the model **nnUNet fine-tuned** has the best segmentations (even though the model without fine-tuning has a better Dice score).

In [9]:
def compare(scores, func=lambda x: x):
    """Tabulate the mean score per channel for every model.

    Args:
        scores: Dict mapping a model name to its score DataFrame.
        func: Callable applied to each score DataFrame before averaging,
            e.g. to select a subset of rows. Defaults to the identity.

    Returns:
        DataFrame indexed by ``channels`` with one column per key of ``scores``.
    """
    column_means = {
        model_name: func(model_scores).mean(numeric_only=True)
        for model_name, model_scores in scores.items()
    }
    # Re-indexing to `channels` keeps only the per-channel means.
    return pd.DataFrame(column_means, index=channels)


def select_w(df):
    """Return the rows of ``df`` whose ``dixon_type`` equals ``"W"``."""
    is_water = df["dixon_type"] == "W"
    return df[is_water]

In [17]:
# Mean over all channels, using every scan.
overall_mean = compare(scores).mean()
print("Result for all sequence types averaged:")
print(overall_mean)

# Same, restricted to rows with dixon_type "W".
water_mean = compare(scores, func=select_w).mean()
print("\nResult for water only sequenes averaged:")
print(water_mean)

# Same, restricted to the sections 0 and 4.
outer_section_mean = compare(
    scores,
    func=lambda frame: frame.loc[frame["section"].isin([0, 4])],
).mean()
print("\nResult for section 0 and 4:")
print(outer_section_mean)

Result for all sequence types averaged:
nnUNet_ss_1to3    0.752353
nnUNet_ft_1to3    0.741897
nnUNet_ft_0to4    0.766982
dtype: float64

Result for water only sequenes averaged:
nnUNet_ss_1to3    0.774470
nnUNet_ft_1to3    0.769988
nnUNet_ft_0to4    0.785220
dtype: float64

Result for section 0 and 4:
nnUNet_ss_1to3    0.382550
nnUNet_ft_1to3    0.397909
nnUNet_ft_0to4    0.589267
dtype: float64


In [19]:
# Mean score per channel (rows) for every model in `scores` (columns).
compare(scores)

Unnamed: 0,nnUNet_ss_1to3,nnUNet_ft_1to3,nnUNet_ft_0to4
spleen,0.943936,0.900945,0.899683
right_kidney,0.94458,0.946559,0.941391
left_kidney,0.957828,0.953009,0.949052
gallbladder,0.562427,0.605011,0.610802
liver,0.95245,0.871341,0.861531
stomach,0.652022,0.702345,0.698713
pancreas,0.715827,0.671353,0.664474
right_adrenal_gland,0.515189,0.503454,0.504408
left_adrenal_gland,0.560421,0.505688,0.476177
left_lung,0.892081,0.864747,0.917996


## Evaluate on external data set
I use the Charité kidney tumor test set, which consists of 221 abdominal MR scans that are either T1, T2 or T1fs-postKM weighted.
Lina annotated 20 different organs in this set. I will exclude the remaining organs from this analysis.

## Second Generation Model (Monai)

In [49]:
# load annotation doc
data = pd.read_csv(config.ukbb + "csv/charite_annotations.csv")

# Both paths are derived from the relative label path stored in the CSV.
prediction_files = config.ukbb + "preds_charite/" + data["label"].apply(
    lambda relative_path: relative_path.split("/")[1]
)
annotation_files = config.mr_label_path + data["label"]
data["pred"] = prediction_files
data["label"] = annotation_files

# This data set has no values for the metadata columns that calc_dice attaches.
for meta_column in ("eid", "section", "dixon_type"):
    data[meta_column] = None

# Switch label and prediction and calculate dice (same logic but easier to implement)
data = data.rename(columns={"label": "pred", "pred": "label"})
charite_scores = calc_dice(data, "pred", lm.mr2new)

Calc. Dice for  model "pred":   0%|          | 0/221 [00:00<?, ?it/s]

In [51]:
# Side-by-side table of the mean score per channel on both data sets.
multiple_datasets_scores = pd.DataFrame(index=channels)
# NOTE(review): no visible cell stores a "monai_ft" entry in `scores` (the scoring
# cell above only fills the nnUNet_* keys), so this line looks like it depends on
# state from an earlier kernel session and would raise KeyError on a fresh
# "Restart & Run All" - confirm which model is meant here.
multiple_datasets_scores["UKBB dataset"] = scores["monai_ft"].mean(numeric_only=True)
multiple_datasets_scores["Charite dataset"] = charite_scores.mean(numeric_only=True)

# Values of lm.mr2new shifted down by one, with the first entry dropped.
# Presumably these are the positions in `channels` of the organs annotated in the
# Charite set (background removed) - TODO confirm against labelmappings.
orig_channels = [c - 1 for c in lm.mr2new.values()][1:]
# NOTE(review): these Series are indexed by channel name, so selecting with a list
# of integers relies on pandas' positional fallback; `.iloc[orig_channels]` (as
# used for the table below) would make the positional intent explicit.
print(multiple_datasets_scores["UKBB dataset"][orig_channels].mean())
print(multiple_datasets_scores["Charite dataset"][orig_channels].mean())
multiple_datasets_scores.iloc[orig_channels]

0.7361883714795112
0.69536716


Unnamed: 0,UKBB dataset,Charite dataset
spleen,0.825297,0.823757
right_kidney,0.919115,0.830899
left_kidney,0.920938,0.827756
gallbladder,0.633903,0.434405
liver,0.883581,0.893879
stomach,0.714419,0.79792
aorta,0.872198,0.811834
inferior_vena_cava,0.764746,0.641341
portal_vein_and_splenic_vein,0.327156,0.401341
pancreas,0.636187,0.672319
