# Double-check that the sample counts are correct for training on SUMMIT

In [1]:
import pandas as pd
from pathlib import Path
import os

# Workspace root: two directory levels above the current working directory.
workspace_path = Path.cwd().parent.parent

print(workspace_path)


# One entry per split; each is filled below with its 'scans', 'metadata'
# and 'excludes' DataFrames.
datasets = {split_name: {} for split_name in ('Training', 'Validation', 'Test')}

for ds, frames in datasets.items():
    # Common prefix shared by the three CSV files of this split.
    prefix = f'{workspace_path}/metadata/summit/partial/{ds}'

    frames['scans'] = pd.read_csv(f'{prefix}_scans.csv')

    # metadata and excludes are keyed by participant_id; derive the scan_id
    # used in the scans file by appending the fixed '_Y0_BASELINE_A' suffix.
    nodules = pd.read_csv(f'{prefix}_metadata.csv')
    frames['metadata'] = nodules.assign(scan_id=nodules['participant_id'] + '_Y0_BASELINE_A')

    excluded = pd.read_csv(f'{prefix}_excludes.csv')
    frames['excludes'] = excluded.assign(scan_id=excluded['participant_id'] + '_Y0_BASELINE_A')

    # Per-split row counts.
    print(f'{ds} scans: {len(frames["scans"])}')
    print(f'{ds} metadata: {len(frames["metadata"])}')
    print(f'{ds} unique metadata scans: {frames["metadata"]["scan_id"].nunique()}')
    print(f'{ds} excludes: {len(frames["excludes"])}')

# Totals across the three splits.
print('Total scans in all datasets:', sum(len(entry['scans']) for entry in datasets.values()))
print('Total metadata in all datasets:', sum(len(entry['metadata']) for entry in datasets.values()))
print('Unique scans in metadata:', sum(entry['metadata']['scan_id'].nunique() for entry in datasets.values()))
print('Total excludes in all datasets:', sum(len(entry['excludes']) for entry in datasets.values()))


/Users/john/Projects/SOTAEvaluationNoduleDetection
Training scans: 4754
Training metadata: 6073
Training unique metadata scans: 2657
Training excludes: 1473
Validation scans: 392
Validation metadata: 468
Validation unique metadata scans: 210
Validation excludes: 112
Test scans: 797
Test metadata: 1082
Test unique metadata scans: 438
Test excludes: 210
Total scans in all datasets: 5943
Total metadata in all datasets: 7623
Unique scans in metadata: 3305
Total excludes in all datasets: 1795


In [2]:
# List the column names of the Training split's scans and metadata tables.
for label, table in (('Scans', 'scans'), ('Metadata', 'metadata')):
    print(f'{label} fields:', datasets['Training'][table].columns)

Scans fields: Index(['scan_id'], dtype='object')
Metadata fields: Index(['form_instance_id', 'form_instance_status', 'participant_id',
       'form_instance_index', 'nodule_brock_score', 'nodule_category',
       'nodule_diameter_mm', 'nodule_lesion_id', 'nodule_lung_rads',
       'nodule_mass', 'nodule_mass_core', 'nodule_mass_double_time_core',
       'nodule_mass_doubling_time', 'nodule_reliable_segment', 'nodule_site',
       'nodule_size_volume_cub_mm', 'nodule_slice_number',
       'nodule_spiculation', 'nodule_subsolid_major_axis_diameter',
       'nodule_type', 'nodule_volume_core', 'nodule_volume_doubling_time',
       'nodule_volume_percentage_change',
       'nodule_volume_volume_double_time_core', 'nodule_x_coordinate',
       'nodule_y_coordinate', 'nodule_z_coordinate',
       'radiology_report_management_plan_value', 'management_plan',
       'radiology_report_malignancy_diagnosis',
       'radiology_report_malignancy_criteria',
       'radiology_report_malignancy_primar

# Create dataset.json for MONAI Detection

In [3]:
import json
from collections import defaultdict

# One list of annotation entries per split. Each entry has the form
#   {"box": [[x, y, z, d, d, d], ...], "image": "<participant>/<scan_id>.nii.gz", "label": [0, ...]}
# NOTE(review): the box layout looks like centre + size with the nodule
# diameter reused for all three extents; confirm it matches the box mode
# configured for training.
dataset_json = {
    'Training': [],
    'Validation': [],
    'Test': []
}

for ds in datasets.keys():
    metadata = datasets[ds]['metadata']

    # Bucket the nodule boxes by scan in ONE pass over the metadata instead of
    # re-filtering the whole frame for every scan. Row order within each scan
    # is preserved, and .tolist() yields plain Python scalars for json.dump.
    boxes_by_scan = defaultdict(list)
    for scan_id, x, y, z, diameter in zip(
        metadata['scan_id'].tolist(),
        metadata['nodule_x_coordinate'].tolist(),
        metadata['nodule_y_coordinate'].tolist(),
        metadata['nodule_z_coordinate'].tolist(),
        metadata['nodule_diameter_mm'].tolist(),
    ):
        boxes_by_scan[scan_id].append([x, y, z, diameter, diameter, diameter])

    # Training only lists scans that have at least one annotated nodule (taken
    # from the metadata); Validation and Test list every scan in the scans
    # file, so scans without nodules get empty box/label lists.
    if ds == 'Training':
        scan_ids = metadata['scan_id'].sort_values().unique().tolist()
    else:
        scan_ids = datasets[ds]['scans']['scan_id'].sort_values().tolist()

    for scan_id in scan_ids:
        # Copy so that a repeated scan_id never shares one list between entries.
        boxes = list(boxes_by_scan.get(scan_id, ()))

        dataset_json[ds].append({
            "box": boxes,
            # Images live in a folder named after the part of the scan_id
            # before the first underscore.
            "image": f"{scan_id.split('_')[0]}/{scan_id}.nii.gz",
            # Single class: every box gets label 0.
            "label": [0] * len(boxes),
        })

# Check numbers in json
for split_name, entries in dataset_json.items():
    print(f'{split_name}:', len(entries))
    print(f'{split_name} boxes:', sum(len(entry['box']) for entry in entries))

# Save json
with open(f'{workspace_path}/models/detection/datasplits/summit/partial/dataset_partial.json', 'w') as f:
    json.dump(dataset_json, f, indent=4)

Training: 2657
Training boxes: 6073
Validation: 392
Validation boxes: 468
Test: 797
Test boxes: 1082


# Check that the files exist on the server before starting training

In [4]:
# Detection
# Verify that every image referenced in dataset_json appears in the file
# listing captured from the server (detection_listings.txt).

# Read inside a context manager so the file handle is closed deterministically.
with open('detection_listings.txt', 'r') as listing_file:
    server_listings = [x.replace('./','').replace('\n','') for x in listing_file]

# Set for O(1) membership tests; server_listings itself stays a list.
listed_images = set(server_listings)

missed = 0
for ds in datasets:
    for image in dataset_json[ds]:
        image_path = image['image']
        if image_path not in listed_images:
            print(f'Image {image_path} not found in server listings')
            missed += 1

print('Missed Detection:', missed)

Missed Detection: 0


In [7]:
# Grt123
# Verify that every scan in every split has a preprocessed '<scan_id>_clean.npy'
# entry in the file listing captured from the server (grt123_listings.txt).

# NOTE(review): nothing in this cell uses `scan`; this looks like an accidental
# IDE auto-import. Kept in case another cell relies on it - confirm and remove.
from yaml import scan


# Read inside a context manager so the file handle is closed deterministically.
with open('grt123_listings.txt', 'r') as listing_file:
    server_listings = [x.replace('_clean.npy','').replace('\n','') for x in listing_file]

# Set for O(1) membership tests; server_listings itself stays a list.
listed_scans = set(server_listings)

missed = 0
for ds in datasets:
    for scan_id in datasets[ds]['scans']['scan_id'].values:
        if scan_id not in listed_scans:
            print(f'Scan {scan_id} not found in server listings')
            missed += 1

print('Missed GRT123:', missed)

Missed GRT123: 0
