# Evaluate Segmentations of self-supervised & fine-tuned model

In [1]:
import pandas as pd
import json

from tqdm.notebook import tqdm
from monai.transforms import (
    LoadImaged,
    EnsureChannelFirstd,
    MapLabelValued,
    Lambdad,
    Compose,
    AsDiscreted,
    Spacingd,
)
from monai.metrics import DiceMetric

# private libraries
import sys

if "../scripts" not in sys.path:
    sys.path.insert(1, "../scripts")
import config
import labelmappings as lm

In [2]:
# Read the model metadata that describes the network outputs.
with open("../metadata/updated_metadata.json") as metadata_file:
    metadata = json.load(metadata_file)

# Names of the predicted output channels in metadata order. The first entry is
# skipped -- presumably the background class, which the Dice evaluation excludes.
channel_def = metadata["network_data_format"]["outputs"]["pred"]["channel_def"]
channels = list(channel_def.values())[1:]

## Evaluate Totalsegmentator and MrSegmentator
Calculate Dice Scores for all models:
1. **TotalSegmentator** (which we used for the creation of presegmented masks on water only sequences)
2. **MRSegmentator V1** (trained with 20 classes on the Charité dataset)
3. **MRSegmentator V2 self-supervised** (trained on a combination of the predictions of (1) and (2) on more than 8000 UKBB scans)
4. **MRSegmentator V2 fine-tuned** ((3), but fine-tuned on manually annotated UKBB scans)
5. **nnUNet self-supervised** (like (3), but with nnUNet framework)
6. **nnUNet fine-tuned** ((5), but fine-tuned on manually annotated UKBB & Charité scans)

In [14]:
def transforms(labelmapper):
    """Build the preprocessing pipeline for one ``{"pred": ..., "label": ...}`` item.

    Parameters
    ----------
    labelmapper : dict
        Maps label values found in the prediction (keys) to the label values
        used by the annotation (values). Only the ``pred`` entry is remapped.

    Returns
    -------
    Compose
        Pipeline that loads both volumes, resamples them to a common spacing
        and one-hot encodes them into 41 channels.
    """
    both = ["pred", "label"]

    # Runs before the channel axis is added: drops the first and the last
    # index along the third axis of the loaded volume.
    trim_third_axis = Lambdad(keys=both, func=lambda x: x[:, :, 1:-1])

    # Nearest-neighbour resampling keeps the label values discrete.
    resample = Spacingd(keys=both, pixdim=[5, 5, 5], mode="nearest")

    remap_prediction = MapLabelValued(
        keys=["pred"],
        orig_labels=labelmapper.keys(),
        target_labels=labelmapper.values(),
        dtype=int,
    )

    # Prepends a singleton axis (presumably the batch axis expected by the metric).
    add_leading_axis = Lambdad(keys=both, func=lambda x: x[None])

    return Compose(
        [
            LoadImaged(keys=both, image_only=True),
            trim_third_axis,
            EnsureChannelFirstd(keys=both),
            resample,
            remap_prediction,
            AsDiscreted(keys=both, to_onehot=41),
            add_leading_axis,
        ]
    )


def add_meta(df, meta_df):
    """Name the score columns after ``channels`` and attach per-scan metadata.

    Parameters
    ----------
    df : pandas.DataFrame
        Score table with one column per entry of ``channels``. It is modified
        in place and also returned.
    meta_df : pandas.DataFrame
        Table providing the ``eid``, ``section`` and ``dixon_type`` columns.
        Its rows must be in the same order as the rows of ``df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with renamed score columns and the three metadata columns added.

    Raises
    ------
    ValueError
        If ``meta_df`` does not have the same number of rows as ``df``.
    """
    df.columns = channels
    # Copy the metadata by position, not by index label: assigning a Series
    # aligns on the index and would silently produce NaN or shifted metadata
    # whenever ``meta_df`` does not carry a default 0..n-1 index.
    for column in ("eid", "section", "dixon_type"):
        df[column] = meta_df[column].to_numpy()
    return df


def calc_dice(data, model_name, labelmapper=None):
    """Compute per-class Dice scores of one model's predictions against the labels.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per scan. Needs a ``label`` column and a ``model_name`` column
        (both are handed to ``LoadImaged``, so presumably file paths), plus the
        ``eid``, ``section`` and ``dixon_type`` columns read by ``add_meta``.
    model_name : str
        Name of the column in ``data`` that holds the predictions to evaluate.
    labelmapper : dict, optional
        Maps label values of the prediction to the label values of the
        annotation. Defaults to ``{0: 0}``.

    Returns
    -------
    pandas.DataFrame
        Dice scores with one column per entry of ``channels`` (background is
        excluded) followed by the metadata columns added by ``add_meta``.
    """
    if labelmapper is None:
        labelmapper = {0: 0}
    dice_metric = DiceMetric(include_background=False, reduction="none", num_classes=41)

    # The pipeline only depends on the label mapping, so build it once instead
    # of re-creating every transform for each scan.
    preprocess = transforms(labelmapper)

    data_list = data.rename({model_name: "pred"}, axis=1)[["label", "pred"]].to_dict("records")
    for item in tqdm(data_list, desc=f'Calc. Dice for  model "{model_name}"'):
        _item = preprocess(item)
        # Each call appends this scan's per-class scores to the metric's buffer.
        dice_metric(_item["pred"], _item["label"])
    scores = add_meta(pd.DataFrame(dice_metric.get_buffer()), data)

    return scores

In [4]:
# ukbb data: keep only the scans of sections 1-3
data = pd.read_csv(config.ukbb + "valid_finetuning.csv")
data = data.loc[data["section"].isin([1, 2, 3])].reset_index(drop=True)
data["label"] = config.ukbb + "annotations/" + data["label"]

# Every model stores its prediction under the image path with "/" replaced by "_"
pred_file = data["image"].apply(lambda path: path.replace("/", "_"))

# Prediction directory of each model (column name -> directory)
pred_dirs = {
    # TotalSegmentator
    "total": config.ukbb + "preds_total/",
    # First Model trained on 20 classes
    "monai": config.ukbb + "preds_mr_testset/",
    # Self-supervised Model
    "monai_ss": config.ukbb + "preds_testset_ss/",
    # Self-supervised + fine-tuned Model
    "monai_ft": config.ukbb + "preds_testset_ft_charite/",
    # Self-supervised nnUnet
    "nnUNet_ss": config.ukbb_cache + "nnUNet/data/pred_ss/",
    # Fine-tuned nnUnet
    "nnUNet_ft": config.ukbb_cache + "nnUNet/data/preds_ft_lowLR/",
}
for column, directory in pred_dirs.items():
    data[column] = directory + pred_file

In [5]:
models = ["total", "monai", "monai_ss", "monai_ft", "nnUNet_ss", "nnUNet_ft"]

# Only the first two models predict in a different label scheme and need their
# outputs remapped; all other models fall back to calc_dice's default mapping.
labelmappers = {"total": lm.total2new, "monai": lm.mr2new}

scores = dict.fromkeys(models)
for _model in models:
    scores[_model] = calc_dice(data, _model, labelmappers.get(_model))

Calc. Dice for  model "total":   0%|          | 0/72 [00:00<?, ?it/s]

Calc. Dice for  model "monai":   0%|          | 0/72 [00:00<?, ?it/s]

Calc. Dice for  model "monai_ss":   0%|          | 0/72 [00:00<?, ?it/s]

Calc. Dice for  model "monai_ft":   0%|          | 0/72 [00:00<?, ?it/s]

Calc. Dice for  model "nnUNet_ss":   0%|          | 0/72 [00:00<?, ?it/s]

Calc. Dice for  model "nnUNet_ft":   0%|          | 0/72 [00:00<?, ?it/s]

## Compare Predictions
The test dataset is very small (only six subjects with 4*3 scans each). Additionally, we can observe significant annotation bias: segmentations of TotalSegmentator have an increased Dice score, because our annotations were presegmented with TotalSegmentator as well. Another problem is incomplete annotation in challenging regions. There, segmentations can be much better than the annotation, which paradoxically results in a worse Dice score. Lastly, if a model fails to segment an organ at the image border, the Dice score for that instance will be zero, even if it is in fact only a few voxels.

Consequently, we need to visually inspect our segmentations as well. Visual analysis shows that the model **nnUNet fine-tuned** has the best segmentations (even though the model without fine-tuning has a better Dice score).

In [6]:
def compare(scores, func=lambda x: x):
    """Summarise the mean score per channel for every model.

    Parameters
    ----------
    scores : dict
        Maps a model name to its score DataFrame.
    func : callable, optional
        Applied to each score DataFrame before averaging, e.g. to select a
        subset of rows. The default leaves the DataFrame unchanged.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``channels`` with one column per model holding the column
        means of that model's (filtered) numeric scores.
    """
    summary = pd.DataFrame(index=channels)
    for model_name, model_scores in scores.items():
        # Assignment aligns on the index, so only rows named in ``channels`` are kept.
        summary[model_name] = func(model_scores).mean(numeric_only=True)
    return summary


def select_w(df):
    """Return the rows of ``df`` whose ``dixon_type`` equals ``"W"``."""
    is_water = df["dixon_type"] == "W"
    return df.loc[is_water]

In [7]:
# Mean over all channels of the per-channel means, once for every scan ...
print("Result for all sequence types averaged:")
per_channel_all = compare(scores)
print(per_channel_all.mean())

# ... and once restricted to the rows with dixon_type == "W".
print("\nResult for water only sequenes averaged:")
per_channel_water = compare(scores, func=select_w)
print(per_channel_water.mean())

Result for all sequence types averaged:
total        0.267239
monai        0.272690
monai_ss     0.749801
monai_ft     0.755038
nnUNet_ss    0.794217
nnUNet_ft    0.779648
dtype: float64

Result for water only sequenes averaged:
total        0.629633
monai        0.329073
monai_ss     0.760822
monai_ft     0.768524
nnUNet_ss    0.806877
nnUNet_ft    0.796827
dtype: float64


In [8]:
# Per-channel mean score of every model (rows: channels, columns: models)
compare(scores)

Unnamed: 0,total,monai,monai_ss,monai_ft,nnUNet_ss,nnUNet_ft
spleen,0.292369,0.6542,0.913644,0.825297,0.940338,0.896074
right_kidney,0.431626,0.721886,0.918324,0.919115,0.945428,0.946523
left_kidney,0.425045,0.758141,0.924406,0.920938,0.959969,0.953996
gallbladder,0.034795,0.424178,0.56163,0.633903,0.551892,0.597977
liver,0.463384,0.676894,0.914737,0.883581,0.956161,0.871747
stomach,0.281708,0.616515,0.676617,0.714419,0.661219,0.704996
pancreas,0.285281,0.484669,0.646103,0.636187,0.716874,0.660256
right_adrenal_gland,0.179797,0.36456,0.49437,0.493037,0.580788,0.548708
left_adrenal_gland,0.130184,0.331388,0.37203,0.433115,0.512167,0.466088
left_lung,0.430197,0.0,0.878782,0.889345,0.889998,0.89272


## Evaluate on external data set
I use the Charité kidney tumor test set, which consists of 221 abdominal MR scans that are either T1, T2 or T1fs-postKM weighted.
Lina annotated 20 different organs in this set. I will exclude the remaining organs from this analysis.

## Second Generation Model (Monai)

In [49]:
# load annotation doc
data = pd.read_csv(config.ukbb + "csv/charite_annotations.csv")

# The prediction file is named after the second path component of the label entry
pred_names = data["label"].apply(lambda rel_path: rel_path.split("/")[1])
data["pred"] = config.ukbb + "preds_charite/" + pred_names
data["label"] = config.mr_label_path + data["label"]

# add_meta reads these columns; no values are set for this data set
for meta_column in ("eid", "section", "dixon_type"):
    data[meta_column] = None

# Switch label and prediction and calculate dice (same logic but easier to implement):
# calc_dice only remaps the label values of the "pred" entry, so the annotation is
# passed as "pred" to be remapped with lm.mr2new. Dice is symmetric in its two inputs.
data = data.rename(columns={"label": "pred", "pred": "label"})
charite_scores = calc_dice(data, "pred", lm.mr2new)

Calc. Dice for  model "pred":   0%|          | 0/221 [00:00<?, ?it/s]

In [51]:
# Mean Dice per channel of the fine-tuned model on both data sets
multiple_datasets_scores = pd.DataFrame(index=channels)
multiple_datasets_scores["UKBB dataset"] = scores["monai_ft"].mean(numeric_only=True)
multiple_datasets_scores["Charite dataset"] = charite_scores.mean(numeric_only=True)

# Row positions (within `channels`) of the classes that lm.mr2new maps to.
# `channels` omits the first channel_def entry, hence the shift by one; the first
# mapping entry is skipped as well (presumably background).
orig_channels = [c - 1 for c in lm.mr2new.values()][1:]

# Select by position explicitly with .iloc: the index holds channel names, and plain
# Series[...] with integer keys only works through a deprecated positional fallback.
print(multiple_datasets_scores["UKBB dataset"].iloc[orig_channels].mean())
print(multiple_datasets_scores["Charite dataset"].iloc[orig_channels].mean())
multiple_datasets_scores.iloc[orig_channels]

0.7361883714795112
0.69536716


Unnamed: 0,UKBB dataset,Charite dataset
spleen,0.825297,0.823757
right_kidney,0.919115,0.830899
left_kidney,0.920938,0.827756
gallbladder,0.633903,0.434405
liver,0.883581,0.893879
stomach,0.714419,0.79792
aorta,0.872198,0.811834
inferior_vena_cava,0.764746,0.641341
portal_vein_and_splenic_vein,0.327156,0.401341
pancreas,0.636187,0.672319
