Creating a consistent, stratified (with respect to images with calcifications), case-wise split of the database into train/validation/test, with the test portion containing 50% of the cases.

We have shared this split with our colleagues from another team.

In [11]:
import sys

# Put the directory two levels above the working directory at the front of the
# module search path (presumably so the project-local `database` package
# imported in the next cell can be resolved -- confirm against the repo layout).
sys.path.insert(0, '../..')

In [2]:
from pathlib import Path

from database.dataset import *

# Keyword configuration for the INBreast dataset wrapper, collected in one
# mapping so the settings can be read separately from the constructor call.
# Only `db.df` and `db.rois_df` are used by the cells below.
_inbreast_kwargs = dict(
    # Which data to load.
    partitions=['train', 'validation', 'test'],
    level='image',
    cropped_imgs=True,
    # Lesion handling.
    return_lesions_mask=True,
    lesion_types=None,
    max_lesion_diam_mm=None,
    # use_muscle_mask=True,
    # Patch extraction.
    extract_patches=True,
    extract_patches_method='all',
    patch_size=224,
    stride=100,
    min_breast_fraction_roi=0.5,
)
db = INBreast_Dataset(**_inbreast_kwargs)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [3]:
# Cases that own at least one ROI whose lesion type is 'cluster' or 'calcification'.
is_calc_roi = db.rois_df['lesion_type'].isin(['cluster', 'calcification'])
calc_cases = db.rois_df[is_calc_roi]['case_id'].unique()

# Every case present in the image-level table.
all_cases = db.df['case_id'].unique()

# Remaining cases, kept in the order they appear in `all_cases`: the seeded
# random draw performed later in the notebook depends on this ordering.
non_calc_cases = list(filter(lambda case_id: case_id not in calc_cases, all_cases))

In [4]:
# Sanity check: number of calcification cases, all cases, and non-calcification cases.
tuple(map(len, (calc_cases, all_cases, non_calc_cases)))

(89, 108, 19)

In [5]:
 # Seed NumPy's global generator so the sampled split is reproducible.
 np.random.seed(10)
 # Draw half of each stratum (rounded up) for train, without replacement.
 # The calcification draw must come first: both draws consume the same seeded
 # random stream, so swapping them would produce a different split.
 train_calc_cases = np.random.choice(calc_cases, size=(len(calc_cases) + 1) // 2, replace=False)
 train_non_calc_cases = np.random.choice(non_calc_cases, size=(len(non_calc_cases) + 1) // 2, replace=False)
 # Whatever was not drawn for train becomes test, in the original case order.
 test_calc_cases = list(filter(lambda case_id: case_id not in train_calc_cases, calc_cases))
 test_non_calc_cases = list(filter(lambda case_id: case_id not in train_non_calc_cases, non_calc_cases))

In [6]:
# Stratum sizes: train/test calcification cases, then train/test non-calcification cases.
tuple(map(len, (train_calc_cases, test_calc_cases, train_non_calc_cases, test_non_calc_cases)))

(45, 44, 10, 9)

In [7]:
# Merge the calcification and non-calcification strata into one case list per
# partition. The train strata are NumPy arrays and are converted to lists
# first; the test strata are already plain lists.
train_cases = [*train_calc_cases.tolist(), *train_non_calc_cases.tolist()]
test_cases = [*test_calc_cases, *test_non_calc_cases]

print(f'{len(train_cases)} {len(test_cases)}')

55 53


In [8]:
# Translate each partition's case ids into the distinct image ids of those cases.
train_images = db.df['img_id'][db.df['case_id'].isin(train_cases)].unique()
test_images = db.df['img_id'][db.df['case_id'].isin(test_cases)].unique()

In [9]:
# Number of distinct images assigned to train and to test.
tuple(map(len, (train_images, test_images)))

(210, 200)

In [10]:
# Build one two-column frame (image id, partition label) per partition, then
# stack them into a single table with a fresh 0..n-1 index.
# NOTE(review): packing the ids and the string labels into one array gives
# both columns a single common dtype, so non-string ids are likely stored as
# strings here -- confirm this is intended.
partition_frames = [
    pd.DataFrame(
        np.asarray([image_ids, [label] * len(image_ids)]).T,
        columns=['image_id', 'partition'],
    )
    for image_ids, label in ((train_images, 'train'), (test_images, 'test'))
]
train_data, test_data = partition_frames

partitioning_df = pd.concat(partition_frames, ignore_index=True)

In [16]:
# Write the partition table to <parent of the current working directory>/data/standard_partitions.csv.
# The destination therefore depends on where the notebook kernel was started.
partitions_csv = Path.cwd().parent.joinpath('data', 'standard_partitions.csv')
partitioning_df.to_csv(partitions_csv)