# LUNA Sampling Strategy - Evaluation of SOTA Nodule Detection Algorithms Trained on Public Dataset Tested on Dataset Representative of LCS

## Description

- There are 888 LUNA scans with 1186 nodules.
- There are 601 scans with nodules and 287 scans without nodules

- MONAI Detection only used scans with nodules in heir published metrics on LUNA performance
- This project aims to establish the impact of training with and without noduleless scans

- The experiment set up is as follows:

### Assumptions
5-folds i.e., use 4 folds for training and 1 for validation

### Two arms
Arm 1: Train on balanced combination of nodule/ no-nodule scans, validate on combination nodule & non-nodule
Arm 2: Train on no-nodule scans only, validate on combination nodule & non-nodule

|Arm 1 Composition||Arm 2 Composition||
|----|----|----|----|
|Name: Arm1_Fold1||Name: Arm2_Fold1||
|\# Scans with nodules|120|# Scans with nodules|120|
|\# Scans without nodules|58|# Scans without nodules|0|
|\# Total|178|# Total|120|
|Name: Arm1_Fold2||Name: Arm2_Fold2||
|\# Scans with nodules|120|# Scans with nodules|120|
|\# Scans without nodules|57|# Scans without nodules|0|
|\# Total|177|# Total|120|
|Name: Arm1_Fold3||Name: Arm2_Fold3|
|\# Scans with nodules|120|# Scans with nodules|120|
|\# Scans without nodules|57|# Scans without nodules|0|
|\# Total|177|# Total|120|
|Name: Arm1_Fold4||Name: Arm2_Fold4||
|\# Scans with nodules|120|# Scans with nodules|120|
|\# Scans without nodules|57|# Scans without nodules|0|
|\# Total|177|# Total|120|
|Name: Arm1_Fold5||Name: Arm2_Fold5||
|\# Scans with nodules|121|# Scans with nodules|121|
|\# Scans without nodules|58|# Scans without nodules|0|
|\# Total|179|# Total|121|
||||
|Total scans with nodules|601|Total scans with nodules|601|
|Total scans without nodules|287|Total scans without nodules|0|
|Total scans|888|Total scans|601|
|Training Scheme||Training Scheme||
|1. Train on Arm1_Fold1, 2, 3, 4 & Validate on Arm1_Fold 5|Trn #: 709, Val # 179|1. Train on Arm2_Fold1, 2, 3, 4 & Validate on Arm1_Fold 5|Trn #: 480, Val # 179|
|2. Train on Arm1_Fold1, 2, 3, 5 & Validate on Arm1_Fold 4|Trn #: 711, Val # 177|2. Train on Arm2_Fold1, 2, 3, 5 & Validate on Arm1_Fold 4|Trn #: 481, Val # 177|
|3. Train on Arm1_Fold1, 2, 4, 5 & Validate on Arm1_Fold 3|Trn #: 711, Val # 177|3. Train on Arm2_Fold1, 2, 4, 5 & Validate on Arm1_Fold 3|Trn #: 481, Val # 177|
|4. Train on Arm1_Fold1, 3, 4, 5 & Validate on Arm1_Fold 2|Trn #: 711, Val # 177|4. Train on Arm2_Fold1, 3, 4, 5 & Validate on Arm1_Fold 2|Trn #: 481, Val # 177|
|5. Train on Arm1_Fold1, 2, 3, 4 & Validate on Arm1_Fold 5|Trn #: 710, Val # 178|5. Train on Arm2_Fold1, 2, 3, 4 & Validate on Arm1_Fold 5|Trn #: 481, Val # 178|





In [5]:
import pandas as pd
import numpy as np
import os

workspacedirectory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [10]:


luna_scans = pd.read_csv(f'{workspacedirectory}/scans/luna16/metadata/candidates.csv', usecols=['seriesuid']).drop_duplicates()
luna_annotations = pd.read_csv(f'{workspacedirectory}/scans/luna16//metadata/annotations.csv')

luna_scans_with_nodules = luna_scans[luna_scans['seriesuid'].isin(luna_annotations['seriesuid'])]
luna_scans_without_nodules = luna_scans[~luna_scans['seriesuid'].isin(luna_annotations['seriesuid'])]

print(f'Number of scans with nodules: {len(luna_scans_with_nodules)}')
print(f'Number of scans without nodules: {len(luna_scans_without_nodules)}')

Number of scans with nodules: 601
Number of scans without nodules: 287


In [11]:

splits = {
    'fold1': {
        'with_nodules' : 120,
        'without_nodules' : 58
    },
    'fold2': {
        'with_nodules' : 120,
        'without_nodules' : 57
    },
    'fold3': {
        'with_nodules' : 120,
        'without_nodules' : 57
    },
    'fold4': {
        'with_nodules' : 120,
        'without_nodules' : 57
    },
    'fold5': {
        'with_nodules' : 121,
        'without_nodules' : 58
    }
}

# Generate arm 1

copy_luna_scans_with_nodules = luna_scans_with_nodules.copy()
copy_luna_scans_without_nodules = luna_scans_without_nodules.copy()

arm1_fold_ds = {
    'fold1': {'scans' : None, 'annotations' : None}, 
    'fold2': {'scans' : None, 'annotations' : None},  
    'fold3': {'scans' : None, 'annotations' : None}, 
    'fold4': {'scans' : None, 'annotations' : None}, 
    'fold5': {'scans' : None, 'annotations' : None}
}

for fold, split in splits.items():
    with_nodules = copy_luna_scans_with_nodules.sample(n=split['with_nodules'], replace=False, random_state=42)
    copy_luna_scans_with_nodules.drop(with_nodules.index, inplace=True)

    without_nodules = copy_luna_scans_without_nodules.sample(n=split['without_nodules'], replace=False, random_state=42)
    copy_luna_scans_without_nodules.drop(without_nodules.index, inplace=True)

    arm1_fold_ds[fold]['scans'] = pd.concat([with_nodules, without_nodules])
    arm1_fold_ds[fold]['annotations'] = luna_annotations[luna_annotations['seriesuid'].isin(arm1_fold_ds[fold]['scans']['seriesuid'])]

    print(f'Arm 1 - Fold {fold} - Scans with nodules: {len(with_nodules)}, Scans without nodules: {len(without_nodules)} , Annotations: {len(arm1_fold_ds[fold]["annotations"])}')

# Generate arm 2
    
copy_luna_scans_with_nodules = luna_scans_with_nodules.copy()
copy_luna_scans_without_nodules = luna_scans_without_nodules.copy()

arm2_fold_ds = {
    'fold1': {'scans' : None, 'annotations' : None}, 
    'fold2': {'scans' : None, 'annotations' : None},  
    'fold3': {'scans' : None, 'annotations' : None}, 
    'fold4': {'scans' : None, 'annotations' : None}, 
    'fold5': {'scans' : None, 'annotations' : None}
}

for fold, split in splits.items():
    arm2_fold_ds[fold]['scans'] = arm1_fold_ds[fold]['scans']
    arm2_fold_ds[fold]['annotations'] = luna_annotations[luna_annotations['seriesuid'].isin(arm2_fold_ds[fold]['scans']['seriesuid'])]

    print(f'Arm 2 - Fold {fold} - Scans with nodules: {len(with_nodules)}, Scans without nodules: {len(without_nodules)} , Annotations: {len(arm2_fold_ds[fold]["annotations"])}')

Arm 1 - Fold fold1 - Scans with nodules: 120, Scans without nodules: 58 , Annotations: 246
Arm 1 - Fold fold2 - Scans with nodules: 120, Scans without nodules: 57 , Annotations: 219
Arm 1 - Fold fold3 - Scans with nodules: 120, Scans without nodules: 57 , Annotations: 258
Arm 1 - Fold fold4 - Scans with nodules: 120, Scans without nodules: 57 , Annotations: 224
Arm 1 - Fold fold5 - Scans with nodules: 121, Scans without nodules: 58 , Annotations: 239
Arm 2 - Fold fold1 - Scans with nodules: 121, Scans without nodules: 58 , Annotations: 246
Arm 2 - Fold fold2 - Scans with nodules: 121, Scans without nodules: 58 , Annotations: 219
Arm 2 - Fold fold3 - Scans with nodules: 121, Scans without nodules: 58 , Annotations: 258
Arm 2 - Fold fold4 - Scans with nodules: 121, Scans without nodules: 58 , Annotations: 224
Arm 2 - Fold fold5 - Scans with nodules: 121, Scans without nodules: 58 , Annotations: 239


In [12]:
from pathlib import Path

# Write out the arm 1 datasets
for fold, split in arm1_fold_ds.items():

    output_path = Path(f'{workspacedirectory}/output/metadata/luna/arm1')
    output_path.mkdir(parents=True, exist_ok=True)

    split['scans'].to_csv(f'{workspacedirectory}/output/metadata/luna/arm1/fold{fold}_scans.csv', index=False)
    split['annotations'].to_csv(f'{workspacedirectory}/output/metadata/luna/arm1/fold{fold}_annotations.csv', index=False)

# Write out the arm 2 datasets
for fold, split in arm2_fold_ds.items():

    output_path = Path(f'{workspacedirectory}/output/metadata/luna/arm2')
    output_path.mkdir(parents=True, exist_ok=True)

    split['scans'].to_csv(f'{workspacedirectory}/output/metadata/luna/arm2/fold{fold}_scans.csv', index=False)
    split['annotations'].to_csv(f'{workspacedirectory}/output/metadata/luna/arm2/fold{fold}_annotations.csv', index=False)

# Detection

- Convert the LUNA folds into dataset_json formats

In [None]:
import json

def convert_folds_to_dataset_json(folds, fold):

    dataset_json = {
        'training': [],
        'validation': [],
        'test': []
    }

    for f, split in folds.items():
        

        for idx, scan in split['scans'].iterrows():
            annotation_dict = {'box' : [], 'image' : '', 'label' : []}
            annotation_dict['image'] = f"{scan['seriesuid']}/{scan['seriesuid']}.nii.gz"

            annotations = split['annotations'][split['annotations']['seriesuid'] == scan['seriesuid']]
            for idx, annotation in annotations.iterrows():
                annotation_dict['box'].append(
                    [
                        annotation['coordX'],
                        annotation['coordY'], 
                        annotation['coordZ'], 
                        annotation['diameter_mm'], 
                        annotation['diameter_mm'], 
                        annotation['diameter_mm']
                    ])

                annotation_dict['label'].append(0)

            if f == fold:
                dataset_json['validation'].append(annotation_dict)
            else:
                dataset_json['training'].append(annotation_dict)

    return dataset_json


In [13]:

# Write out the arm 1 datasets
for fold, split in arm1_fold_ds.items():

    dataset_json = convert_folds_to_dataset_json(arm1_fold_ds, fold)

    output_path = Path(f'{workspacedirectory}models/detection/datasplits/LUNA/arm1')
    output_path.mkdir(parents=True, exist_ok=True)

    with open(f'{workspacedirectory}/models/detection/datasplits/LUNA/arm1/dataset_{fold}.json', 'w') as f:
        json.dump(dataset_json, f, indent=4)

# Write out the arm 2 datasets
for fold, split in arm2_fold_ds.items():

    dataset_json = convert_folds_to_dataset_json(arm2_fold_ds, fold)

    output_path = Path(f'{workspacedirectory}models/detection/datasplits/LUNA/arm2')
    output_path.mkdir(parents=True, exist_ok=True)

    with open(f'{workspacedirectory}/models/detection/datasplits/LUNA/arm2/dataset_{fold}.json', 'w') as f:
        json.dump(dataset_json, f, indent=4)

# SUMMIT

- Moving onto the SUMMIT data
- LUNA is generated on DSH and sits in metadata/SUMMIT/full_eval
- This needs to be converted to dataset_json

In [15]:
from pathlib import Path
import json

def convert_metadata_to_json(metadata_path, output_path):
    
    metadata_json = {
        'training': [],
        'validation': [],
        'test': []
    }


    for ds, split in metadata_json.items():

        scans = pd.read_csv(f'{metadata_path}/{ds}_scans.csv')

        for idx, scan in scans.iterrows():
            annotation_dict = {'box' : [], 'image' : '', 'label' : []}

            scan_metadata = pd.read_csv(f'{metadata_path}/{ds}_metadata.csv')
            scan_metadata = scan_metadata[scan_metadata['scan_id'] == scan.scan_id]
            annotation_dict['image'] = f"{scan.scan_id}/{scan.scan_id}.nii.gz"

            for idx, annotation in scan_metadata.iterrows():

                annotation_dict['box'].append(
                    [
                        annotation['radiology_report_nodule_x_coordinate'],
                        annotation['radiology_report_nodule_y_coordinate'], 
                        annotation['radiology_report_nodule_z_coordinate'], 
                        annotation['radiology_report_nodule_diameter_mm'], 
                        annotation['radiology_report_nodule_diameter_mm'], 
                        annotation['radiology_report_nodule_diameter_mm']
                    ])

                annotation_dict['label'].append(0)

            metadata_json[ds].append(annotation_dict)
 

    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    with open(Path(output_path, 'dataset.json'), 'w') as f:
        json.dump(metadata_json, f, indent=4)

convert_metadata_to_json(
    f'{workspacedirectory}/output/metadata/SUMMIT/full_eval',
    f'{workspacedirectory}/models/detection/datasplits/SUMMIT/full_eval'
)    

  scans = pd.read_csv(f'{metadata_path}/{ds}_scans.csv')
