# Split UKBB Data
I downloaded scans from 1000 subjects of the UKBB dataset. We have manual annotations for 31 of them.
I split the data as follows:
- Training set:  925 subjects
- Validation set: 30 subjects
- Test set: 31 patients
- Exclude: 14 patients (more than 5 sections exist for some subjects and I'm not sure why; other subjects were badly segmented)

In [1]:
import pandas as pd
import os
import random

from monai.transforms import (
    Compose,
    LoadImage,
    EnsureChannelFirst,
    Spacing,
    Lambda,
)

# private libraries
import sys

if "../scripts" not in sys.path:
    sys.path.insert(1, "../scripts")
import config

In [2]:
# Read the manifest that lists every downloaded scan.
data = pd.read_csv(config.ukbb + "manifest.csv")

# Exclude patients.
# Lina said segmentation isn't working well for these samples.
bad_samples = [
    1013238,
    1013386,
    1017226,
    1017752,
    1018450,
    1018829,
    1010497,
    1010973,
    1008327,
]
# Patients that have a scan with section number 6 are excluded as well.
patients2remove = list(data.loc[data["section"] == 6, "eid"].unique()) + bad_samples
data = data.loc[~data["eid"].isin(patients2remove)].reset_index(drop=True)
print(f"Dropped {len(patients2remove)} patients.")

# Keep only sections 1, 2 and 3.
data = data.loc[data["section"].isin([1, 2, 3])].reset_index(drop=True)


def _to_water_label(image_path):
    """Turn an image path into the file name of its water-only label."""
    label_name = image_path.replace("/", "_")
    # NOTE(review): plain substring replacement; any other "in", "opp" or "F"
    # in the path would be rewritten too.
    for sequence_token in ("in", "opp", "F"):
        label_name = label_name.replace(sequence_token, "W")
    return label_name


# Map every sequence type onto the label of the water-only scan.
data["label"] = data["image"].apply(_to_water_label)

Dropped 14 patients.


## Choose good segmentations based on segmentation volume
The Dice score of a segmentation correlates strongly with its segmentation volume. Therefore, I limit the training and validation sets to segmentations with a high volume.

In [4]:
# Calculate volume (takes around five minutes for 16 cores).
# The pipeline loads a segmentation, resamples it to pixdim [3, 3, 3] and
# returns the number of non-zero voxels as a plain Python number.
calc_volume = Compose(
    [
        LoadImage(image_only=True),
        EnsureChannelFirst(),
        Spacing(pixdim=[3, 3, 3]),
        Lambda(func=lambda x: (x != 0).sum()),
        Lambda(func=lambda x: x.item()),
    ]
)

# Calculate volume for water-only sequences.
# Other sequence types will have the same volume as they were derived from water only.
# The volume is looked up through the shared "label" column instead of being
# copied by row position, so a missing or reordered scan can no longer shift
# volumes onto the wrong rows.
data["seg_volume"] = 0
_water_labels = data.loc[data["dixon_type"] == "W", "label"]
_volume_by_label = {
    label: calc_volume(config.ukbb + "preds_combined/" + label)
    for label in _water_labels.unique()
}
# Same values the positional version produced for the water-only rows.
_volume = [_volume_by_label[label] for label in _water_labels]

_known_seq = data["dixon_type"].isin(["in", "opp", "W", "F"])
_unmatched = set(data.loc[_known_seq, "label"]) - set(_volume_by_label)
if _unmatched:
    # Fail loudly rather than leave rows without a water-only counterpart at 0.
    raise ValueError(
        f"No water-only segmentation found for {len(_unmatched)} label(s), "
        f"e.g. {sorted(_unmatched)[:5]}"
    )
data.loc[_known_seq, "seg_volume"] = data.loc[_known_seq, "label"].map(_volume_by_label)

In [5]:
def calc_quantile_thr(volumes, quantile=0.7):
    """Return the value below which the lowest ``1 - quantile`` share of values lies.

    The values are sorted in ascending order and the element at position
    ``int(len(volumes) * (1 - quantile))`` is returned. The returned element
    itself belongs to the upper ``quantile`` share, so a strict ``>``
    comparison against the threshold excludes it.

    Args:
        volumes: Iterable of comparable numeric values. It is not modified.
        quantile: Share of the values, between 0 and 1, that should be at or
            above the threshold.

    Returns:
        The selected element of the sorted values.

    Raises:
        ValueError: If ``volumes`` is empty or ``quantile`` is outside [0, 1].
    """
    if not 0 <= quantile <= 1:
        # A quantile above 1 would produce a negative index and silently wrap around.
        raise ValueError(f"quantile must be within [0, 1], got {quantile!r}")

    ordered = sorted(volumes)
    if not ordered:
        raise ValueError("volumes must contain at least one value")

    # quantile == 0 would point one past the end; clamp to the largest value.
    position = min(int(len(ordered) * (1 - quantile)), len(ordered) - 1)
    return ordered[position]


# One volume threshold per section, computed from that section's scans only.
_section_thresholds = {
    section_id: calc_quantile_thr(data.loc[data["section"] == section_id, "seg_volume"])
    for section_id in (1, 2, 3)
}
thr_sec1 = _section_thresholds[1]
thr_sec2 = _section_thresholds[2]
thr_sec3 = _section_thresholds[3]

# Keep the scans whose segmentation volume lies strictly above the threshold
# of their section, stacked in section order.
_selected_per_section = []
for section_id, threshold in _section_thresholds.items():
    above_threshold = (data["section"] == section_id) & (data["seg_volume"] > threshold)
    _selected_per_section.append(data.loc[above_threshold])
high_volume = pd.concat(_selected_per_section).reset_index(drop=True)

## Split on subject level

In [6]:
# Split on patient level: collect every patient id that is left in the data.
patients = set(data["eid"].unique())

# Patients with a manual annotation form the test set. The patient id is the
# part of the annotation file name in front of the first underscore.
with os.scandir(config.ukbb + "annotations") as _entries:
    annotations = [entry.name for entry in _entries if entry.name.endswith(".nii.gz")]
test_patients = {int(file_name.split("_")[0]) for file_name in annotations}
assert test_patients.issubset(patients), " Annoation incorrectly included"
print(f"Found manual annotations for {len(test_patients)} patients.")

# Train and validation split: draw 30 validation patients from the rest.
# NOTE(review): the draw depends on the iteration order of the set; sorting
# before sampling would be more robust but would change the existing split.
random.seed(42)
patients_remaining = patients.difference(test_patients)
_validation_draw = random.sample(list(patients_remaining), 30)
valid_patients = set(_validation_draw)
train_patients = patients_remaining.difference(valid_patients)

Found manual annotations for 31 patients.


In [7]:
# Training and validation sets only use scans with a high segmentation volume.
_in_train = high_volume["eid"].isin(train_patients)
_in_valid = high_volume["eid"].isin(valid_patients)
train_set = high_volume.loc[_in_train].reset_index(drop=True)
valid_set = high_volume.loc[_in_valid].reset_index(drop=True)

# The test set keeps every scan of the annotated patients.
_in_test = data["eid"].isin(test_patients)
test_set = data.loc[_in_test].reset_index(drop=True)

# Report the number of scans per section for train, validation and test.
for _subset in (train_set, valid_set, test_set):
    for p in (1, 2, 3):
        scans_in_section = (_subset["section"] == p).sum()
        print(f" Scans in section {p}:", scans_in_section)

 Scans in section 1: 2580
 Scans in section 2: 2576
 Scans in section 3: 2596
 Scans in section 1: 96
 Scans in section 2: 96
 Scans in section 3: 92
 Scans in section 1: 124
 Scans in section 2: 124
 Scans in section 3: 124


## Save data frames

In [8]:
# Write the three splits next to the manifest. Only the training rows are
# shuffled (fixed seed) before they are written.
_shuffled_train = train_set.sample(frac=1, random_state=13)
for _frame, _file_name in (
    (test_set, "test.csv"),
    (valid_set, "valid.csv"),
    (_shuffled_train, "train.csv"),
):
    _frame.to_csv(config.ukbb + _file_name, index=False)

## Save water-only data frames

In [10]:
def select_w(df):
    """Return the rows of ``df`` whose dixon_type is "W", re-indexed from 0."""
    is_water = df["dixon_type"] == "W"
    return df.loc[is_water].reset_index(drop=True)


# Water-only variants of the three splits; the training rows are shuffled with
# a fixed seed before writing.
test_w = select_w(test_set)
test_w.to_csv(config.ukbb + "test_w_only.csv", index=False)

valid_w = select_w(valid_set)
valid_w.to_csv(config.ukbb + "valid_w_only.csv", index=False)

train_w = select_w(train_set).sample(random_state=13, frac=1)
train_w.to_csv(config.ukbb + "train_w_only.csv", index=False)