# Double check sample numbers are correct for training SUMMIT

In [1]:
import pandas as pd
from pathlib import Path
import os

# Repository root: two directory levels above the notebook's working directory.
workspace_path = Path(os.getcwd()).parent.parent

print(workspace_path)


# One (initially empty) entry per split; the loop below fills in the
# 'scans', 'metadata' and 'excludes' frames for each of them.
datasets = {split_name: {} for split_name in ('Training', 'Validation', 'Test')}

# Directory holding the per-split CSV files of the "partial" data split.
partial_dir = f'{workspace_path}/metadata/summit/partial'


def _with_scan_id(frame):
    """Return `frame` with a `scan_id` column: participant_id + '_Y0_BASELINE_A'."""
    return frame.assign(scan_id=frame.participant_id + '_Y0_BASELINE_A')


for ds in datasets:
    split = datasets[ds]

    split['scans'] = pd.read_csv(f'{partial_dir}/{ds}_scans.csv')
    split['metadata'] = _with_scan_id(pd.read_csv(f'{partial_dir}/{ds}_metadata.csv'))
    split['excludes'] = _with_scan_id(pd.read_csv(f'{partial_dir}/{ds}_excludes.csv'))

    # Per-split row counts; metadata has one row per nodule, so the number of
    # distinct scan ids is reported separately.
    print(f'{ds} scans: {len(split["scans"])}')
    print(f'{ds} metadata: {len(split["metadata"])}')
    print(f'{ds} unique metadata scans: {split["metadata"]["scan_id"].nunique()}')
    print(f'{ds} excludes: {len(split["excludes"])}')

# Totals summed over the three splits.
total_scans = sum(len(split['scans']) for split in datasets.values())
total_metadata = sum(len(split['metadata']) for split in datasets.values())
total_unique_scans = sum(split['metadata']['scan_id'].nunique() for split in datasets.values())
total_excludes = sum(len(split['excludes']) for split in datasets.values())

print('Total scans in all datasets:', total_scans)
print('Total metadata in all datasets:', total_metadata)
print('Unique scans in metadata:', total_unique_scans)
print('Total excludes in all datasets:', total_excludes)


/Users/john/Projects/SOTAEvaluationNoduleDetection
Training scans: 4753
Training metadata: 6145
Training unique metadata scans: 2672
Training excludes: 1428
Validation scans: 297
Validation metadata: 387
Validation unique metadata scans: 173
Validation excludes: 79
Test scans: 891
Test metadata: 1087
Test unique metadata scans: 458
Test excludes: 288
Total scans in all datasets: 5941
Total metadata in all datasets: 7619
Unique scans in metadata: 3303
Total excludes in all datasets: 1795


In [2]:
# Show the column names of the Training scans and metadata tables
training_frames = datasets['Training']
for heading, frame_key in (('Scans fields:', 'scans'), ('Metadata fields:', 'metadata')):
    print(heading, training_frames[frame_key].columns)

Scans fields: Index(['scan_id'], dtype='object')
Metadata fields: Index(['form_instance_id', 'form_instance_status', 'participant_id',
       'form_instance_index', 'nodule_brock_score', 'nodule_category',
       'nodule_diameter_mm', 'nodule_lesion_id', 'nodule_lung_rads',
       'nodule_mass', 'nodule_mass_core', 'nodule_mass_double_time_core',
       'nodule_mass_doubling_time', 'nodule_reliable_segment', 'nodule_site',
       'nodule_size_volume_cub_mm', 'nodule_slice_number',
       'nodule_spiculation', 'nodule_subsolid_major_axis_diameter',
       'nodule_type', 'nodule_volume_core', 'nodule_volume_doubling_time',
       'nodule_volume_percentage_change',
       'nodule_volume_volume_double_time_core', 'nodule_x_coordinate',
       'nodule_y_coordinate', 'nodule_z_coordinate',
       'radiology_report_management_plan_value', 'management_plan',
       'radiology_report_malignancy_diagnosis',
       'radiology_report_malignancy_criteria',
       'radiology_report_malignancy_primar

# Create dataset.json for MONAI Detection

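As the dataset file has only `Training` and `Test` lists, the Validation scans are added to `Training`. Training and Validation entries are built from the nodule metadata (scans with at least one nodule row), while the Test list contains every test scan.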

In [3]:
import json

# Annotation lists written to the detection dataset file. There is no separate
# validation list: Validation scans are appended to 'Training' below.
dataset_json = {
    'Training': [],
    'Test': []
}

# Metadata columns that make up one box, in output order: the x/y/z coordinates
# followed by the nodule diameter repeated for each of the three axes.
BOX_COLUMNS = [
    'nodule_x_coordinate',
    'nodule_y_coordinate',
    'nodule_z_coordinate',
    'nodule_diameter_mm',
    'nodule_diameter_mm',
    'nodule_diameter_mm',
]


def build_annotation(scan_id, nodules):
    """Build the dataset entry for one scan.

    Parameters
    ----------
    scan_id : str
        Scan identifier; the text before its first underscore is used as the
        image sub-directory.
    nodules : pandas.DataFrame
        Metadata rows belonging to this scan (may be empty).

    Returns
    -------
    dict
        ``{"box": [[x, y, z, d, d, d], ...], "image": "<dir>/<scan_id>.nii.gz",
        "label": [0, ...]}`` with one box and one label per metadata row.
    """
    annotation = {"box": [], "image": f"{scan_id.split('_')[0]}/{scan_id}.nii.gz", "label": []}

    for _, row in nodules.iterrows():
        annotation['box'].append([row[column] for column in BOX_COLUMNS])
        # Every nodule gets the same class label.
        annotation['label'].append(0)

    return annotation


for ds in datasets.keys():

    metadata = datasets[ds]['metadata']

    if ds == 'Test':
        # Test: every scan of the split, including scans without metadata rows.
        scan_ids = datasets[ds]['scans']['scan_id'].sort_values().tolist()
    else:
        # Training / Validation: just add positive samples, i.e. scans that
        # appear in the metadata.
        scan_ids = metadata['scan_id'].sort_values().unique().tolist()

    # Group the metadata once per split instead of re-filtering the whole
    # frame for every scan; row order inside each group is preserved.
    nodules_by_scan = {sid: rows for sid, rows in metadata.groupby('scan_id', sort=False)}
    no_nodules = metadata.iloc[0:0]

    target = dataset_json['Test' if ds == 'Test' else 'Training']
    for scan_id in scan_ids:
        target.append(build_annotation(scan_id, nodules_by_scan.get(scan_id, no_nodules)))

# Check numbers in json
print('Training:', len(dataset_json['Training']))
print('Training boxes:', sum([len(x['box']) for x in dataset_json['Training']]))

print('Test:', len(dataset_json['Test']))
print('Test boxes:', sum([len(x['box']) for x in dataset_json['Test']]))

# Save json
with open(f'{workspace_path}/models/detection/datasplits/summit/partial/dataset_partial.json', 'w') as f:
    json.dump(dataset_json, f, indent=4)

Training: 2845
Training boxes: 6532
Test: 891
Test boxes: 1087


# Check that the files exist on the server before starting training

In [4]:
# Detection

# Read the saved server directory listing; the `with` block closes the file
# handle once the lines are read.
with open('detection_listings.txt', 'r') as listing_file:
    server_listings = listing_file.readlines()
# Remove every '/' and the trailing newline from each line.
server_listings = [x.replace('/','').replace('\n','') for x in server_listings]


# Split each line into whitespace-separated columns
columns = [line.split() for line in server_listings]

# Create a dataframe from the columns, discarding the first line of the listing
# (presumably the `total ...` header of `ls -l` -- TODO confirm)
df = pd.DataFrame(columns).drop(0,axis=0)

# Set column names
df.columns = ['permissions', 'links', 'owner', 'group', 'size', 'month', 'day', 'time', 'filename']

display(df.head())

# Start with every file unassigned so the `dataset` column exists even when no
# image is matched below.
df['dataset'] = None

# File names present on the server, for exact-match membership tests.
files_on_server = set(df.filename.values)

missed = 0
# Visit each dataset_json list exactly once. Looping over `datasets` instead
# would walk the 'Training' list twice (once for Training, once for Validation)
# and double-count any missing training image.
for split, images in dataset_json.items():
    for image in images:
        image_id = image['image'].split('/')[1]
        if image_id not in files_on_server:
            print(f'Image {image_id} not found in server listings')
            missed += 1
        else:
            # update listings df with dataset; literal (non-regex) substring
            # match, and rows without a filename never match
            df.loc[df.filename.str.contains(image_id, regex=False, na=False), 'dataset'] = split

print('Missed Detection:', missed)
# Print the size of the training data
training_bytes = df[df.dataset == 'Training']['size'].astype(int).sum()
print('Training size:', training_bytes/1e9, 'GB')

Unnamed: 0,permissions,links,owner,group,size,month,day,time,filename
1,-rw-r--r--,1,jmccabe,summit,107275,Feb,8,16:14,listing.sh
2,-rw-r--r--,1,jmccabe,summit,315490,Feb,9,14:59,listings.txt
3,-rw-r--r--,1,jmccabe,summit,185564258,Sep,6,2023,summit-2222-djr_Y0_BASELINE_A.nii.gz
4,-rw-r--r--,1,jmccabe,summit,215327282,Sep,7,2023,summit-2223-yts_Y0_BASELINE_A.nii.gz
5,-rw-r--r--,1,jmccabe,summit,175178964,Sep,7,2023,summit-2224-eju_Y0_BASELINE_A.nii.gz


Missed Detection: 0
Training size: 472.614774511 GB


In [5]:
import warnings

# NOTE(review): this silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

import pandas as pd
# Grt123

# Read the saved server directory listing; the `with` block closes the file
# handle once the lines are read.
with open('grt123_listings.txt', 'r') as listing_file:
    server_listings = [x.replace('\n','') for x in listing_file]


# Split each line into whitespace-separated columns
columns = [line.split() for line in server_listings]

# Create a dataframe from the columns, discarding the first line of the listing
# (presumably the `total ...` header of `ls -l` -- TODO confirm)
df = pd.DataFrame(columns).drop(0,axis=0)

# Set column names
df.columns = ['permissions', 'links', 'owner', 'group', 'size', 'month', 'day', 'time', 'filename']

display(df.head())

# Start with every file unassigned so the `dataset` column exists even when no
# scan is matched below.
df['dataset'] = None

# Scan ids with a `*_clean.npy` file on the server. Both calls match the suffix
# literally (regex=False) and rows without a filename are skipped (na=False).
has_clean_file = df.filename.str.contains('_clean.npy', regex=False, na=False)
scans_on_server = df[has_clean_file].filename.str.replace('_clean.npy', '', regex=False).values
# Set copy for constant-time membership tests in the loop below.
server_scan_ids = set(scans_on_server)

missed = 0
for ds in datasets:

    for scan_id in datasets[ds]['scans']['scan_id'].values:

        if scan_id not in server_scan_ids:
            print(f'Scan {scan_id} not found in server listings')
            missed += 1

        else:
            # update listings df with dataset: tags every file whose name
            # contains the scan id (literal substring match)
            df.loc[df.filename.str.contains(scan_id, regex=False, na=False), 'dataset'] = ds

print('Missed GRT123:', missed)

# Total size in bytes of the files tagged 'Training'.
# The name `bytes` shadows the builtin but is kept because the next cell reads it.
bytes = df[df.dataset == 'Training']['size'].astype(int).sum()


Unnamed: 0,permissions,links,owner,group,size,month,day,time,filename
1,drwxr-sr-x,2,jmccabe,summit,307200,Mar,20,15:39,exclusions
2,-rw-r--r--,1,jmccabe,summit,0,Apr,24,12:37,grt123_listings.txt
3,-rw-r--r--,1,jmccabe,summit,22270688,Feb,9,16:01,summit-2223-yts_Y0_BASELINE_A_clean.npy
4,-rw-r--r--,1,jmccabe,summit,160,Feb,9,16:01,summit-2223-yts_Y0_BASELINE_A_label.npy
5,-rw-r--r--,1,jmccabe,summit,16011992,Feb,8,23:04,summit-2224-gak_Y0_BASELINE_A_clean.npy


Missed GRT123: 0


In [6]:
# `bytes` is the byte total computed at the end of the GRT123 listing cell
# (it shadows the builtin of the same name); report it in gigabytes.
training_size_gb = bytes/1e9
print('Training size:', training_size_gb, 'GB')

Training size: 86.670393324 GB


In [11]:

# Check nodule includes and exclude counts
import pandas as pd
from pathlib import Path
import os

# Repository root: two directory levels above the notebook's working directory.
workspace_path = Path(os.getcwd()).parent.parent

# Management plans counted as "includes" and as "excludes" respectively.
include_plans = ['3_MONTH_FOLLOW_UP_SCAN','URGENT_REFERRAL', 'ALWAYS_SCAN_AT_YEAR_1']
exclude_plans = ['RANDOMISATION_AT_YEAR_1']

for flavour in ['test_balanced', 'male_only', 'white_only']:

    for ds in ['training','validation','test']:

        df = pd.read_csv(f'{workspace_path}/metadata/summit/{flavour}/{ds}_metadata.csv')

        print('*'*20)

        # Number of rows whose management plan is in each group.
        plans = df.management_plan
        n_includes = plans[plans.isin(include_plans)].count()
        print(f'{flavour} {ds} includes:', n_includes)

        n_excludes = plans[plans.isin(exclude_plans)].count()
        print(f'{flavour} {ds} excludes:', n_excludes)

        # Share of included rows among all rows of the file, in percent.
        n_rows = df.shape[0]
        includes_percentage = n_includes / n_rows * 100
        print(f'{flavour} {ds} includes percentage:', round(includes_percentage,2))
        print(f'{flavour} {ds} total:', n_rows)


        

********************
test_balanced training includes: 2466
test_balanced training excludes: 4075
test_balanced training includes percentage: 37.7
test_balanced training total: 6541
********************
test_balanced validation includes: 130
test_balanced validation excludes: 247
test_balanced validation includes percentage: 34.48
test_balanced validation total: 377
********************
test_balanced test includes: 230
test_balanced test excludes: 469
test_balanced test includes percentage: 32.9
test_balanced test total: 699
********************
male_only training includes: 700
male_only training excludes: 1418
male_only training includes percentage: 33.05
male_only training total: 2118
********************
male_only validation includes: 52
male_only validation excludes: 69
male_only validation includes percentage: 42.98
male_only validation total: 121
********************
male_only test includes: 162
male_only test excludes: 318
male_only test includes percentage: 33.75
male_only test 

In [11]:
# Check the gender and ethnic-group breakdown of the test scans and nodules for each flavour
import pandas as pd
from pathlib import Path
import os

# Repository root: two directory levels above the notebook's working directory.
workspace_path = Path(os.getcwd()).parent.parent

# Management plans that mark a nodule as actionable.
actionable_plans = ['3_MONTH_FOLLOW_UP_SCAN','URGENT_REFERRAL', 'ALWAYS_SCAN_AT_YEAR_1']


def _show_counts(heading, frame, column):
    """Print `heading`, then display the value counts of `frame[column]`."""
    print(heading)
    display(frame[column].value_counts())


for flavour in ['test_balanced', 'male_only', 'white_only']:

    flavour_dir = f'{workspace_path}/metadata/summit/{flavour}'

    # Scan-level table: one row per test scan.
    df = pd.read_csv(f'{flavour_dir}/test_scans_metadata.csv')
    print('*'*20)
    print(f'flavour: {flavour}')
    print('Scans:', df.shape[0])
    _show_counts('Gender', df, 'participant_details_gender')
    _show_counts('Ethnic Group', df, 'lung_health_check_demographics_race_ethnicgroup')


    # Nodule-level table, with a boolean `actionable` flag derived from the
    # management plan.
    nodules_raw = pd.read_csv(f'{flavour_dir}/test_metadata.csv')
    df = nodules_raw.assign(actionable=nodules_raw.management_plan.isin(actionable_plans))

    print('Nodules')
    print('Total:', df.shape[0])
    _show_counts('Gender', df, 'gender')
    _show_counts('Ethnic Group', df, 'ethnic_group')

    # The same breakdowns restricted to actionable nodules.
    for heading, column in (('Actionable Gender', 'gender'), ('Actionable Ethnic Group', 'ethnic_group')):
        print(heading)
        display(df[df.actionable][column].value_counts())
    

********************
flavour: test_balanced
Scans: 594
Gender


MALE      344
FEMALE    250
Name: participant_details_gender, dtype: int64

Ethnic Group


Black                     198
Asian or Asian British    198
White                     198
Name: lung_health_check_demographics_race_ethnicgroup, dtype: int64

Nodules
Total: 699
Gender


MALE      400
FEMALE    299
Name: gender, dtype: int64

Ethnic Group


Black                     250
White                     242
Asian or Asian British    207
Name: ethnic_group, dtype: int64

Actionable Gender


MALE      120
FEMALE    110
Name: gender, dtype: int64

Actionable Ethnic Group


White                     103
Asian or Asian British     65
Black                      62
Name: ethnic_group, dtype: int64

********************
flavour: male_only
Scans: 420
Gender


MALE    420
Name: participant_details_gender, dtype: int64

Ethnic Group


White                     140
Asian or Asian British    140
Black                     140
Name: lung_health_check_demographics_race_ethnicgroup, dtype: int64

Nodules
Total: 480
Gender


MALE    480
Name: gender, dtype: int64

Ethnic Group


White                     188
Black                     171
Asian or Asian British    121
Name: ethnic_group, dtype: int64

Actionable Gender


MALE    162
Name: gender, dtype: int64

Actionable Ethnic Group


White                     82
Asian or Asian British    43
Black                     37
Name: ethnic_group, dtype: int64

********************
flavour: white_only
Scans: 798
Gender


FEMALE    399
MALE      399
Name: participant_details_gender, dtype: int64

Ethnic Group


White    798
Name: lung_health_check_demographics_race_ethnicgroup, dtype: int64

Nodules
Total: 1105
Gender


MALE      553
FEMALE    552
Name: gender, dtype: int64

Ethnic Group


White    1105
Name: ethnic_group, dtype: int64

Actionable Gender


FEMALE    216
MALE      198
Name: gender, dtype: int64

Actionable Ethnic Group


White    414
Name: ethnic_group, dtype: int64

In [26]:
import pandas as pd

# Compare the nodule coordinates and diameters in this project's test metadata
# with the TiCNet annotation file for the same split.
# NOTE(review): both paths are absolute and machine-specific.
ticnet_annotations = pd.read_csv('/Users/john/Projects/TiCNet-main/annotations/summit/test_balanced/test_metadata.csv')
metadata = pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/summit/test_balanced/test_metadata.csv').assign(seriesuid=lambda df: df.participant_id + '_Y0_BASELINE_A')

# Number the rows of each scan 1, 2, 3, ... in file order so that the k-th row
# of a scan in one table is paired with the k-th row of that scan in the other.
# Assumes both files list a scan's nodules in the same order -- TODO confirm.
metadata = metadata.assign(seriesuid_counter=lambda df: df.groupby('seriesuid').cumcount() + 1)
ticnet_annotations = ticnet_annotations.assign(seriesuid_counter=lambda df: df.groupby('seriesuid').cumcount() + 1)

# Outer join keeps rows that exist in only one of the two tables.
df = pd.merge(metadata, ticnet_annotations, on=['seriesuid','seriesuid_counter'], how='outer')

# Ratio between the diameter in the metadata and the diameter in the TiCNet file.
df['diameter_proportion'] = df['nodule_diameter_mm'] / df['diameter_mm']

# Only the last expression of a cell is displayed, so the range of the ratio is
# printed explicitly instead of being left as a bare (discarded) expression.
print('diameter_proportion (min, max):', (df['diameter_proportion'].min(), df['diameter_proportion'].max()))

# Side-by-side view of the coordinates from both sources.
df[[
    'seriesuid',
    'nodule_x_coordinate',
    'nodule_y_coordinate',
    'nodule_z_coordinate',
    'coordX',
    'coordY',
    'coordZ',
    ]]


Unnamed: 0,seriesuid,nodule_x_coordinate,nodule_y_coordinate,nodule_z_coordinate,coordX,coordY,coordZ
0,summit-2294-pxb_Y0_BASELINE_A,-57.090,72.70,-168.30,76.210,181.20,89.200
1,summit-2298-djm_Y0_BASELINE_A,-91.860,-53.27,-117.39,60.840,48.73,144.610
2,summit-2323-kha_Y0_BASELINE_A,65.820,52.25,-129.88,193.320,168.25,88.620
3,summit-2323-kha_Y0_BASELINE_A,53.860,81.70,-169.06,181.360,197.70,49.440
4,summit-2323-kha_Y0_BASELINE_A,96.625,66.25,-162.50,224.125,182.25,56.000
...,...,...,...,...,...,...,...
694,summit-9967-eya_Y0_BASELINE_A,-121.530,21.67,-205.95,38.470,145.17,122.425
695,summit-9967-eya_Y0_BASELINE_A,-40.440,21.33,-53.65,119.560,144.83,274.725
696,summit-9967-eya_Y0_BASELINE_A,-99.130,32.74,-93.57,60.870,156.24,234.805
697,summit-9967-rsb_Y0_BASELINE_A,105.240,21.78,-37.30,227.240,116.28,183.075
