# LUNA 5 Folds

Convert the LUNA metadata into five per-fold dataset JSON files, so that models can be trained on each fold to create a baseline comparison.


In [7]:
from importlib import metadata
import json
import pandas as pd

def json_from_metadata(fold_number, scan_ids, training_scans, validation_scans, test_scans, metadata, recode=None):
    """Build the training/validation/test dataset dictionary for one fold.

    Every scan in ``scan_ids`` becomes an entry holding its image path, one
    box per metadata row belonging to that scan, and one ``0`` label per box.
    The entry is filed under the first split (training, then validation, then
    test) whose scan collection contains the scan id; a scan that is in none
    of the three collections is left out of the result.

    Args:
        fold_number: Fold identifier; only used in the printed summary line.
        scan_ids: Iterable of scan ids to process; fixes the output order.
        training_scans: Scan ids that belong to the ``training`` split.
        validation_scans: Scan ids that belong to the ``validation`` split.
        test_scans: Scan ids that belong to the ``test`` split.
        metadata: DataFrame that, after ``recode`` is applied, has the columns
            ``scan_id``, ``nodule_x_coordinate``, ``nodule_y_coordinate``,
            ``nodule_z_coordinate`` and ``nodule_diameter_mm``.
        recode: Optional ``{old_name: new_name}`` column mapping applied to
            ``metadata`` before it is read. ``None`` means no renaming.

    Returns:
        A dict with the keys ``training``, ``validation`` and ``test``, each a
        list of ``{'box': [...], 'image': str, 'label': [...]}`` dicts.
    """
    # None stands in for "no renaming": a literal {} default would be a single
    # dict object shared by every call.
    metadata = metadata.rename(columns={} if recode is None else recode)

    # Hash each split once so the per-scan membership tests below are O(1)
    # instead of a linear scan of a list / NumPy array.
    training_ids = set(training_scans)
    validation_ids = set(validation_scans)
    test_ids = set(test_scans)

    # Partition the metadata by scan in one pass rather than re-filtering the
    # whole frame for every scan. sort=False keeps the original row order.
    rows_by_scan = {key: rows for key, rows in metadata.groupby('scan_id', sort=False)}
    no_rows = metadata.iloc[0:0]

    dataset_dict = {'training': [], 'validation': [], 'test': []}

    for scan_id in scan_ids:

        # One box per nodule row: x, y, z followed by the diameter three times
        # (presumably the box extent along each axis -- confirm with consumer).
        boxes = [
            [
                row.nodule_x_coordinate,
                row.nodule_y_coordinate,
                row.nodule_z_coordinate,
                row.nodule_diameter_mm,
                row.nodule_diameter_mm,
                row.nodule_diameter_mm,
            ]
            for _, row in rows_by_scan.get(scan_id, no_rows).iterrows()
        ]

        scan_dict = {
            'box': boxes,
            'image': f'{scan_id}/{scan_id}.nii.gz',
            'label': [0] * len(boxes),  # every box gets label 0
        }

        if scan_id in training_ids:
            dataset_dict['training'].append(scan_dict)

        elif scan_id in validation_ids:
            dataset_dict['validation'].append(scan_dict)

        elif scan_id in test_ids:
            dataset_dict['test'].append(scan_dict)

    print(f"Fold{fold_number} counts: {len(dataset_dict['training'])}, {len(dataset_dict['validation'])}, {len(dataset_dict['test'])}")
    return dataset_dict

# Load the nodule metadata and the scan ids that are distributed over the folds.
luna_metadata   = pd.read_csv('../output/metadata/luna/test_fold1_metadata.csv')
luna_scans      = pd.read_csv('../output/metadata/luna/test_fold1_scans.csv').scan_id.values

how_many = 5

# Interleaved split: fold i takes every how_many-th scan, starting at offset i.
split_scans = [luna_scans[i::how_many] for i in range(how_many)]

# Each fold's slice is used as validation once; all other scans are training.
for idx, validation_scans in enumerate(split_scans):

    # Hash the validation ids once per fold; testing `in` against the NumPy
    # array directly would rescan the whole array for every scan.
    validation_ids = set(validation_scans)
    training_scans = [scan for scan in luna_scans if scan not in validation_ids]
    test_scans = []  # no test split is written for these folds

    dataset_dict = json_from_metadata(
                        idx+1,
                        luna_scans,
                        training_scans,
                        validation_scans,
                        test_scans,
                        luna_metadata,
                    )

    # One JSON file per fold, numbered from 1.
    with open(f'../models/detection/datasplits/LUNA/5_folds/dataset_fold{idx+1}.json', 'w') as f:
        json.dump(dataset_dict, f, indent=4)


Fold1 counts: 710, 178, 0
Fold2 counts: 710, 178, 0
Fold3 counts: 710, 178, 0
Fold4 counts: 711, 177, 0
Fold5 counts: 711, 177, 0


In [4]:
from importlib import metadata
import json
import pandas as pd

def json_from_metadata(scan_ids, metadata, splits=(.8, .2, .0), recode=None):
    """Split ``scan_ids`` by position and build the dataset dictionary.

    The first ``splits[0]`` fraction of ``scan_ids`` becomes training, the
    next ``splits[1]`` fraction becomes validation and everything after that
    becomes test. ``splits[2]`` is never read: the test split is simply the
    remainder.

    Every scan becomes an entry holding its image path, one box per metadata
    row belonging to that scan, and one ``0`` label per box.

    Args:
        scan_ids: Sliceable sequence of scan ids; its order decides both the
            split assignment and the output order.
        metadata: DataFrame that, after ``recode`` is applied, has the columns
            ``scan_id``, ``nodule_x_coordinate``, ``nodule_y_coordinate``,
            ``nodule_z_coordinate`` and ``nodule_diameter_mm``.
        splits: Sequence of fractions ``(training, validation, test)``.
        recode: Optional ``{old_name: new_name}`` column mapping applied to
            ``metadata`` before it is read. ``None`` means no renaming.

    Returns:
        A dict with the keys ``training``, ``validation`` and ``test``, each a
        list of ``{'box': [...], 'image': str, 'label': [...]}`` dicts.
    """
    n_scans = len(scan_ids)

    # Round away float noise before truncating: .7 + .2 is 0.8999999999999999,
    # so a bare int() would put the boundary for 10 scans at 8 instead of 9.
    train_end = int(round(n_scans * splits[0], 9))
    validation_end = int(round(n_scans * (splits[0] + splits[1]), 9))

    # Sets give O(1) membership tests in the loop below.
    training_ids = set(scan_ids[:train_end])
    validation_ids = set(scan_ids[train_end:validation_end])
    test_ids = set(scan_ids[validation_end:])

    # None stands in for "no renaming": a literal {} default would be a single
    # dict object shared by every call.
    metadata = metadata.rename(columns={} if recode is None else recode)

    # Partition the metadata by scan in one pass rather than re-filtering the
    # whole frame for every scan. sort=False keeps the original row order.
    rows_by_scan = {key: rows for key, rows in metadata.groupby('scan_id', sort=False)}
    no_rows = metadata.iloc[0:0]

    dataset_dict = {'training': [], 'validation': [], 'test': []}

    for scan_id in scan_ids:

        # One box per nodule row: x, y, z followed by the diameter three times
        # (presumably the box extent along each axis -- confirm with consumer).
        boxes = [
            [
                row.nodule_x_coordinate,
                row.nodule_y_coordinate,
                row.nodule_z_coordinate,
                row.nodule_diameter_mm,
                row.nodule_diameter_mm,
                row.nodule_diameter_mm,
            ]
            for _, row in rows_by_scan.get(scan_id, no_rows).iterrows()
        ]

        scan_dict = {
            'box': boxes,
            'image': f'{scan_id}/{scan_id}.nii.gz',
            'label': [0] * len(boxes),  # every box gets label 0
        }

        # Same precedence as the slices above: training, validation, test.
        if scan_id in training_ids:
            dataset_dict['training'].append(scan_dict)

        elif scan_id in validation_ids:
            dataset_dict['validation'].append(scan_dict)

        elif scan_id in test_ids:
            dataset_dict['test'].append(scan_dict)

    return dataset_dict


# processed.txt has no header row, so every line is read as data.
processed = pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/models/detection/datasplits/LUNA/processed.txt', header=None)

luna_metadata   = pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/output/metadata/luna/test_fold1_metadata.csv')
all_scan_ids = pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/output/metadata/luna/test_fold1_scans.csv').scan_id.values

# Collect every value of the processed file into a set once; testing against
# `processed.values` inside the comprehension would rebuild and rescan the
# array for each scan id.
processed_ids = set(processed.values.ravel())
missing_scan_ids = [scan_id for scan_id in all_scan_ids if scan_id not in processed_ids]

print(len(missing_scan_ids))

# splits=[1, .0, .0] files every missing scan under 'training'.
dataset_dict = json_from_metadata(missing_scan_ids, luna_metadata, splits=[1, .0, .0])


with open('/Users/john/Projects/SOTAEvaluationNoduleDetection/models/detection/datasplits/LUNA/dataset_missing.json', 'w') as f:
    json.dump(dataset_dict, f, indent=4)

287
