# Process Slicer Markup files - Tranche 1 

Description: 

This notebook processes the Slicer markup files that were generated by Joe & Daisuke from the raw Veolity outputs.

They removed non-nodules and added in nodules that were not picked up by Veolity.

These final LSUT nodule locations will need to be tied to the LSUT annotations file to add the additional nodule detail.

Some error resolution will also be needed where there are discrepancies between this new nodule identification process and the original one carried out on LSUT.

<strong>Steps</strong>
1. Load the markup files into a dataframe
2. Compare the raw Veolity output with the adjusted markup files
3. Review metrics for Veolity
4. Merge in annotations file and assign characteristics to nodules where possible / check
5. Generate a spreadsheet with nodule data, including data entry capability to add in nodule-type and nodule-diameter-mm


Cases to exclude:
- UCLH_43663037 - too many nodules to manually annotate; Veolity has identified 75 candidates, annotations.Total_no_nods: 25
- UCLH_45634500 - too many ground-glass nodules; Veolity only identified 10 and the annotations say 15, but in reality there were hundreds of non-solid nodules
- UCLH_59066126 - too many nodules to consider trying to mark up and confirm them all
- UCLH_50882667 - no mhd file on cluster
- UCLH_27847999 - no mhd file on cluster



In [177]:
import json
import pandas as pd
from pathlib import Path

# 1-3. Load & combine markup files, compare and generate metrics

In [181]:

# Scan ids that are excluded from the tranche 1 scan list (reasons are given in the notebook introduction).
blacklist = [
    'UCLH_43663037',
    'UCLH_45634500',
    'UCLH_59066126',
    'UCLH_50882667',
    'UCLH_27847999',
]

def read_markup(file_path):
    """Read one Slicer markup JSON file into a dataframe of control points.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Markup file to read. The file name without its extension is used as
        the ``patient_id`` of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per control point of the first entry in ``markups``, with the
        columns ``patient_id``, ``label``, ``X``, ``Y``, ``Z`` (the three
        components of ``position``) and ``orientation``. A markup without
        control points gives an empty dataframe with no columns.

    Raises
    ------
    KeyError, IndexError
        If the JSON does not contain ``markups[0]['controlPoints']`` or a
        control point lacks ``label``, ``position`` or ``orientation``.
    """
    patient_id = Path(file_path).stem

    # Context manager so the file handle is closed as soon as the JSON is parsed
    # (many files are read in a row by the callers).
    with open(file_path) as markup_file:
        markup_json = json.load(markup_file)

    control_points = [
        {
            'patient_id' : patient_id,
            'label' : control_point['label'],
            'X' : control_point['position'][0],
            'Y' : control_point['position'][1],
            'Z' : control_point['position'][2],
            'orientation' : control_point['orientation']
        }
        for control_point in markup_json['markups'][0]['controlPoints']
    ]
    return pd.DataFrame(control_points)

# Every markup JSON found (recursively) under the tranche 1 folder is named after
# its patient; files called 'clean' are skipped.
all_patients_ids = []
for markup_path in Path('RadiologistReview/tranche1').rglob('*.json'):
    if markup_path.stem == 'clean':
        continue
    all_patients_ids.append(markup_path.stem)

# The same stem can occur in several sub-folders, so collapse to unique ids.
all_patient_ids = list(set(all_patients_ids))

print('Number of patients:', len(all_patient_ids))

def _load_markups(markup_dir):
    """Read every ``*.json`` markup directly inside ``markup_dir`` into one dataframe.

    Files are read in sorted name order: ``Path.glob`` yields entries in an
    arbitrary order, and a fixed order keeps the row order (and therefore the
    row index of the combined frames) reproducible between runs.

    Raises ``ValueError`` (from ``pd.concat``) when the folder holds no JSON file.
    """
    return pd.concat([
        read_markup(markup_file)
        for markup_file in sorted(Path(markup_dir).glob('*.json'))
    ])


tranche1_dir = Path('RadiologistReview/tranche1')

# Markups as stored directly in each reader folder (the "original" set).
reader1_original_markup_data = _load_markups(tranche1_dir / 'reader1')
reader2_original_markup_data = _load_markups(tranche1_dir / 'reader2')

original_markup_data = pd.concat(
    [reader1_original_markup_data, reader2_original_markup_data]
).reset_index(drop=True)

# Markups from each reader's 'corrected' sub-folder.
reader1_corrected_markup_data = _load_markups(tranche1_dir / 'reader1' / 'corrected')
reader2_corrected_markup_data = _load_markups(tranche1_dir / 'reader2' / 'corrected')

corrected_markup_data = pd.concat(
    [reader1_corrected_markup_data, reader2_corrected_markup_data]
).reset_index(drop=True)

# Detection metrics of the original markups against the corrected markups.
# Per patient, a true positive is a control-point label present in both sets;
# original points without a counterpart are false positives and corrected
# points without a counterpart are false negatives.
# NOTE(review): the match is on 'label' only - if a patient has the same label
# more than once (e.g. the patient appears in both reader folders) the inner
# merge multiplies rows; confirm labels are unique per patient.
scan_count = 0
tp_counts = 0
fp_counts = 0
fn_counts = 0
for patient_id in all_patient_ids:

    original_patient_data = original_markup_data[original_markup_data.patient_id == patient_id]
    corrected_patient_data = corrected_markup_data[corrected_markup_data.patient_id == patient_id]
    scan_count += 1

    # Scans without any control point in either set add nothing to the counts.
    if len(original_patient_data) == 0 and len(corrected_patient_data) == 0:
        continue

    tp_cnt = original_patient_data.merge(corrected_patient_data, on=['label'], how='inner').shape[0]
    tp_counts += tp_cnt
    fp_counts += len(original_patient_data) - tp_cnt
    fn_counts += len(corrected_patient_data) - tp_cnt

# Guard the ratios so that an empty selection reports nan instead of raising ZeroDivisionError.
reference_nodule_count = tp_counts + fn_counts
sensitivity = tp_counts / reference_nodule_count if reference_nodule_count else float('nan')
fp_per_scan = fp_counts / scan_count if scan_count else float('nan')

print('Scan count:', scan_count)
print('True positives:', tp_counts, 'False negatives:', fn_counts)
print('Sensitivity:', round(sensitivity, 1))
print('False positives:', fp_counts, 'False positive per scan rate:', round(fp_per_scan, 1))


# Double negative cases i.e. scans that had no control points in corrected markup
patients_with_corrected_points = set(corrected_markup_data.patient_id)

blank_markup_ids = {
    patient_id
    for patient_id in all_patient_ids
    if patient_id not in patients_with_corrected_points
}

print('Empty markup count:', len(blank_markup_ids))

Number of patients: 131
Scan count: 131
True positives: 751 False negatives: 158
Sensitivity: 0.8
False positives: 257 False positive per scan rate: 2.0
Empty markup count: 15


In [182]:
# 4. Load and merge annotations file

def pixel_to_real_world(offset, spacing, pixel_value):
    """Return ``offset + pixel_value * spacing`` rounded to two decimal places.

    Converts an index along one axis (``pixel_value``) to a coordinate, given
    the coordinate of index 0 (``offset``) and the step per index (``spacing``).
    """
    real_world_value = offset + pixel_value * spacing
    return round(real_world_value, 2)

# Raw annotations, one row per scan (keyed by 'ScananonID').
annotations = pd.read_csv('annotations.csv')

display(annotations.Total_no_nods.value_counts())
# Number of rows with a non-null type for the first nodule.
display(annotations.Nod1_type.value_counts().sum())

# Strip the '.mhd' extension so scan_id can be joined to ScananonID.
# regex=False makes '.mhd' a literal: with the regex default of pandas < 2.0 the
# '.' would match any character (and pandas warns about the changing default).
metaio_metadata = pd.read_csv('lung_metadata.csv').assign(
    scan_id=lambda x: x['scan_id'].str.replace('.mhd', '', regex=False)
)

# Left join on the metadata: every scan in lung_metadata.csv is kept, and its
# annotation columns are null when no annotation row has the same id.
# NOTE: from here on `annotations` is the merged frame, not the raw CSV.
annotations = pd.merge(
    metaio_metadata,
    annotations,
    left_on='scan_id',
    right_on='ScananonID',
    how='left'
)

def _flipped_location(row, loc_column):
    """Return ``slices - <loc_column>`` for one row, or None when the location is null."""
    if pd.notnull(row[loc_column]):
        return row['slices'] - row[loc_column]
    return None


def _real_world_location(row, floc_column):
    """Convert the flipped location of one row with the row's z-offset and z-spacing (None when null)."""
    if pd.notnull(row[floc_column]):
        return pixel_to_real_world(row['z-offset'], row['z-spacing'], row[floc_column])
    return None


# Same derivation for both annotated nodules. The flipped locations of both
# nodules are added first, then both real-world values, so the new columns
# appear in the order Nod1_floc, Nod2_floc, Nod1_real_world, Nod2_real_world.
for nodule in ('Nod1', 'Nod2'):
    annotations[f'{nodule}_floc'] = annotations.apply(
        _flipped_location, axis=1, args=(f'{nodule}_loc',)
    )

for nodule in ('Nod1', 'Nod2'):
    annotations[f'{nodule}_real_world'] = annotations.apply(
        _real_world_location, axis=1, args=(f'{nodule}_floc',)
    )

# Per-nodule annotation fields; 'Nod1_<field>' and 'Nod2_<field>' are both
# renamed to the common 'Nod_<field>' so the two nodules can be stacked.
_NODULE_FIELDS = ('diam', 'type', 'type_other', 'real_world', 'pos', 'pos_other')

nod1_recode = {f'Nod1_{field}': f'Nod_{field}' for field in _NODULE_FIELDS}

nod2_recode = {f'Nod2_{field}': f'Nod_{field}' for field in _NODULE_FIELDS}

# One row per annotated nodule: take the Nod1_* / Nod2_* columns, rename them to
# the shared Nod_* names and keep only rows that have a real-world location.
scan_columns = ['ScananonID', 'Total_no_nods']

nod1_data = (
    annotations[scan_columns + list(nod1_recode)]
    .rename(columns=nod1_recode)
    .loc[lambda frame: frame['Nod_real_world'].notnull()]
)
nod2_data = (
    annotations[scan_columns + list(nod2_recode)]
    .rename(columns=nod2_recode)
    .loc[lambda frame: frame['Nod_real_world'].notnull()]
)

nod_data = pd.concat([nod1_data, nod2_data]).reset_index(drop=True)

display(nod_data.head())

for summary_column in ('Nod_type', 'Nod_pos', 'Nod_pos_other'):
    display(nod_data[summary_column].value_counts())

0.0     580
1.0     115
2.0      24
3.0       9
10.0      8
4.0       6
5.0       5
15.0      3
8.0       3
6.0       2
20.0      1
16.0      1
25.0      1
50.0      1
12.0      1
Name: Total_no_nods, dtype: int64

158

  metaio_metadata = pd.read_csv('lung_metadata.csv').assign(scan_id=lambda x: x['scan_id'].str.replace('.mhd', ''))


Unnamed: 0,ScananonID,Total_no_nods,Nod_diam,Nod_type,Nod_type_other,Nod_real_world,Nod_pos,Nod_pos_other
0,UCLH_00134949,1.0,6.0,SN,,-1452.8,subpleural (<5mm from pleura),
1,UCLH_00239233,1.0,15.0,PSN,airspace,1786.1,parenchymal,
2,UCLH_07024905,10.0,22.0,SN,,1721.7,subpleural (<5mm from pleura),
3,UCLH_22801382,2.0,2.5,SN,,2118.1,parenchymal,
4,UCLH_23344772,1.0,6.0,SN,,1854.5,parenchymal,


SN       99
pGGN     25
PSN      18
Other     4
Name: Nod_type, dtype: int64

subpleural (<5mm from pleura)    70
parenchymal                      55
other                            20
Name: Nod_pos, dtype: int64

parenchymal             13
perifissural             2
pleural based            2
interfissural            1
central bronchogenic     1
parenchyma               1
Name: Nod_pos_other, dtype: int64

# Annotations indicate nodules but review said no nodules

## Downgraded

In [183]:
# Scans whose annotation reports at least one nodule ...
has_annotated_nodules = annotations['Total_no_nods'] > 0
annotations_with_nodule_cnt_ids = set(annotations.loc[has_annotated_nodules, 'ScananonID'])

# ... but whose corrected markup holds no control points.
downgraded_ids = blank_markup_ids & annotations_with_nodule_cnt_ids
print('Scans with nodules but with blank mark up files:', len(downgraded_ids))

Scans with nodules but with blank mark up files: 8


# Cases that had annotations nod count = 0 but had control points in corrected markup

## Upgraded

In [184]:
# Scans annotated with zero nodules, restricted to the reviewed patients ...
no_annotated_nodules = annotations['Total_no_nods'] == 0
annotations_without_nodule_cnt_ids = set(annotations.loc[no_annotated_nodules, 'ScananonID'])
zero_nodule_selection = annotations_without_nodule_cnt_ids & set(all_patient_ids)

# ... whose corrected markup nevertheless holds control points.
upgraded_ids = zero_nodule_selection.difference(blank_markup_ids)
print('Scans that were predicted no nods but had markups', len(upgraded_ids))


Scans that were predicted no nods but had markups 13


# Match up the annotation data with the corrected markup data

### Validation purposes only

In [107]:
# Now match up the annotations with the corrected markup data but only for the cases that
# have been corrected i.e., all_patient_ids

# A markup point matches an annotated nodule of the same patient when their Z
# values differ by at most this fraction of the annotated diameter.
MATCH_TOLERANCE = 0.8

# found: annotation row index -> list of matching markup row indices
# used:  markup row index -> annotation row index (None when unmatched; when
#        several annotations match one markup point the last one is kept)
found = {idx : [] for idx in nod_data.query('ScananonID in @all_patient_ids').index}
used = dict.fromkeys(corrected_markup_data.index)

for patient_id in corrected_markup_data.patient_id.unique():

    annotation_rows = nod_data[nod_data.ScananonID == patient_id]
    markup_z = corrected_markup_data.loc[corrected_markup_data.patient_id == patient_id, 'Z']

    for idx, annotation_nodule in annotation_rows.iterrows():
        annotated_z = annotation_nodule['Nod_real_world']
        max_distance = annotation_nodule['Nod_diam'] * MATCH_TOLERANCE

        for mdx, z_position in markup_z.items():
            if abs(annotated_z - z_position) <= max_distance:
                found[idx].append(mdx)
                used[mdx] = idx

# Markup index -> matched annotation index, as a two-column frame.
used_df = pd.DataFrame(list(used.items()), columns=['markup_idx', 'annotation_idx'])

output_columns = [
    'patient_id',
    'label',
    'X',
    'Y',
    'Z',
    'Total_no_nods',
    'orientation',
    'Nod_diam',
    'Nod_type',
    'Nod_type_other',
    'Nod_real_world',
    'Nod_pos',
    'Nod_pos_other'
]

# 1. attach the matched annotation index to every corrected markup point
markup_with_match = corrected_markup_data.merge(
    used_df, left_index=True, right_on='markup_idx', how='left'
)

# 2. pull in the nodule detail of the matched annotation (null when unmatched);
#    the scan-level columns are dropped here and re-added per patient in step 3
markup_with_nodule_detail = (
    markup_with_match
    .merge(nod_data, left_on='annotation_idx', right_index=True, how='left')
    .drop(columns=['ScananonID','Total_no_nods'])
)

# 3. add the annotated nodule count of the scan and keep the output columns
scan_nodule_counts = annotations[['ScananonID','Total_no_nods']]
lsut_nodule_data = (
    markup_with_nodule_detail
    .merge(scan_nodule_counts, left_on='patient_id', right_on='ScananonID', how='left')
    .filter(output_columns)
)

lsut_nodule_data.to_csv('lsut_nodule_data.csv', index=False)
lsut_nodule_data.head()

Unnamed: 0,patient_id,label,X,Y,Z,Total_no_nods,orientation,Nod_diam,Nod_type,Nod_type_other,Nod_real_world,Nod_pos,Nod_pos_other
0,UCLH_00134949,F-0,28.125,-25.0,-1442.4,1.0,"[-1.0, -0.0, -0.0, -0.0, -1.0, -0.0, 0.0, 0.0,...",,,,,,
1,UCLH_00134949,F-1,24.375,-15.625,-1449.6,1.0,"[-1.0, -0.0, -0.0, -0.0, -1.0, -0.0, 0.0, 0.0,...",6.0,SN,,-1452.8,subpleural (<5mm from pleura),
2,UCLH_00134949,F-2,24.375,-15.625,-1449.6,1.0,"[-1.0, -0.0, -0.0, -0.0, -1.0, -0.0, 0.0, 0.0,...",6.0,SN,,-1452.8,subpleural (<5mm from pleura),
3,UCLH_00134949,F-3,-70.0,61.25,-1566.4,1.0,"[-1.0, -0.0, -0.0, -0.0, -1.0, -0.0, 0.0, 0.0,...",,,,,,
4,UCLH_00134949,UCLH_00134949-2,-35.178335,17.050104,-1439.9,1.0,"[-1.0, -0.0, -0.0, -0.0, -1.0, -0.0, 0.0, 0.0,...",,,,,,


# Investigate unmatched annotation nodules with markup data

In [112]:
# Annotation index -> list of matched markup indices, as a two-column frame.
found_df = pd.DataFrame(list(found.items()), columns=['annotation_idx', 'markup_idx'])
annotations_with_matches = nod_data.merge(found_df, left_index=True, right_on='annotation_idx')

# Keep only the annotated nodules for which no markup point was matched.
has_no_match = annotations_with_matches['markup_idx'].apply(lambda matched: len(matched) == 0)
found_nod_data = annotations_with_matches[has_no_match]
found_nod_data


Unnamed: 0,ScananonID,Total_no_nods,Nod_diam,Nod_type,Nod_type_other,Nod_real_world,Nod_pos,Nod_pos_other,annotation_idx,markup_idx
3,UCLH_22801382,2.0,2.5,SN,,2118.1,parenchymal,,3,[]
11,UCLH_26601243,1.0,2.0,SN,,-1035.8,subpleural (<5mm from pleura),,13,[]
20,UCLH_33128697,1.0,4.0,SN,,34.0,parenchymal,,22,[]
22,UCLH_05198655,1.0,17.0,pGGN,,1649.4,parenchymal,,24,[]
25,UCLH_44541434,1.0,2.0,pGGN,,-796.9,parenchymal,,27,[]
28,UCLH_46651351,1.0,4.2,SN,,1925.0,subpleural (<5mm from pleura),,31,[]
29,UCLH_46718385,1.0,15.0,SN,,1876.5,subpleural (<5mm from pleura),,32,[]
31,UCLH_40522666,1.0,16.0,pGGN,,1691.4,parenchymal,,35,[]
43,UCLH_47356601,1.0,3.5,SN,,-1767.5,subpleural (<5mm from pleura),,48,[]
44,UCLH_53038494,2.0,14.0,pGGN,,1004.5,subpleural (<5mm from pleura),,49,[]


# Cell used to copy data from cluster to local machine

This is used when attributing diameter and nodule type to the markups

In [121]:
import shutil
import subprocess



# Locations used by the copy; edit here when running on another machine.
CLUSTER_SCAN_ROOT = 'jmccabe@localhost:/cluster/project0/lung-triage/lsut/LUNG'
LOCAL_SCAN_CACHE = Path('/Users/john/Projects/SOTAEvaluationNoduleDetection/cache/sota/lsut/LUNG')
LOCAL_MARKUP_ROOT = Path('/Users/john/Projects/SOTAEvaluationNoduleDetection/data/lsut')
SCANS_PER_BATCH = 4

patient_ids_with_markups = lsut_nodule_data.patient_id.unique()
print(len(patient_ids_with_markups))

batch_numbers = [] # update this list to process the batches
for batch_number in batch_numbers:
    batch_start = batch_number * SCANS_PER_BATCH
    batch = patient_ids_with_markups[batch_start:batch_start + SCANS_PER_BATCH]

    # batch = ['UCLH_46718385'] used to override the copying of a single scan
    print(f'Processing batch: {batch}')
    for patient_id in batch:
        print('Copying patient:', patient_id)
        scan_destination = LOCAL_SCAN_CACHE / patient_id

        # Argument list instead of a shell string: nothing in the ids is
        # interpreted by a shell, and the exit status can be checked.
        scp_result = subprocess.run([
            'scp', '-P', '2222', '-r',
            f'{CLUSTER_SCAN_ROOT}/{patient_id}',
            str(scan_destination),
        ])
        if scp_result.returncode != 0:
            print('scp failed for patient:', patient_id, '- markup not copied')
            continue

        # Reset per patient so a markup of the previous patient can never be
        # copied by accident. When both readers have a corrected file the
        # reader2 file is used, as before.
        markup_file = None
        for reader in ('reader1', 'reader2'):
            candidate = LOCAL_MARKUP_ROOT / reader / 'corrected' / f'{patient_id}.json'
            if candidate.exists():
                markup_file = candidate

        if markup_file is None:
            print('No corrected markup found for patient:', patient_id)
            continue

        shutil.copy(markup_file, scan_destination)

    # NOTE(review): this break stops after the first entry of batch_numbers, so
    # only one batch is copied per run - confirm that this is intended.
    break

116


# Combine into single list of useable scans

In [190]:
# Combine the reviewed scans with the listed no-nodule scans into a single set of usable tranche 1 scan ids.

# Ids listed in the no-nodule file, one per line. The file is read once and
# closed; blank lines and surrounding whitespace are dropped so that an empty
# string can never be counted as a scan id.
no_nodule_list_ids = {
    line.strip()
    for line in Path('tranche1_soft_recon_patients_with_no_nodules.txt').read_text().splitlines()
    if line.strip()
}
excluded_ids = set(blacklist)

tranche1_nodule_ids = set(all_patient_ids) - excluded_ids

# Reviewed scans that are neither downgraded nor upgraded.
tranche1_dbl_pos_ids = tranche1_nodule_ids - downgraded_ids - upgraded_ids
# Listed no-nodule scans that were not part of the review.
tranche1_dbl_neg_ids = no_nodule_list_ids - tranche1_nodule_ids - excluded_ids

print('Number of tranche 1, double pos. scans:', len(tranche1_dbl_pos_ids))
print('Number of tranche 1, double neg. scans:', len(tranche1_dbl_neg_ids))

reviewed_no_nodule_ids = no_nodule_list_ids & set(all_patient_ids)
print('Warning: number of dble neg scans that were pulled in as part of 20 neg:', len(reviewed_no_nodule_ids))
print('downgraded_ids:', len(downgraded_ids))
print('upgraded_ids:', len(upgraded_ids))

# downgraded_ids / upgraded_ids are not filtered by the blacklist when they are
# built, so the exclusion is applied once more to the final union.
tranche1_all_ids = (
    tranche1_dbl_pos_ids
    | tranche1_dbl_neg_ids
    | downgraded_ids
    | upgraded_ids
) - excluded_ids

print('Total', len(tranche1_all_ids))



Number of tranche 1, double pos. scans: 107
Number of tranche 1, double neg. scans: 99
downgraded_ids: 8
upgraded_ids: 13
Total 227


# Write out LSUT scan ids and scan_metadata

NOTE: for use in generating labels and analysis for detection models

In [147]:

# NOTE: this rebinds `annotations` to the raw CSV (without the merged metadata columns).
annotations = pd.read_csv('annotations.csv')

# Annotation rows of the usable tranche 1 scans, selected once for both outputs.
tranche1_annotations = annotations[annotations['ScananonID'].isin(tranche1_all_ids)]

# Full annotation rows ...
tranche1_annotations.to_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/tranche1_scan_metdata.csv', index=False)
# ... and the scan ids on their own.
tranche1_annotations['ScananonID'].to_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/tranche1_scans.csv', index=False)

# Read Nodule Data Associated With Tranche 1 Scans

Convert to standard nodule metadata format and push to standard directory

In [192]:
import pandas as pd

# Nodule sheet for tranche 1: tag every row with the tranche number and drop
# rows that have no patient id.
tranche1_nodule_sheet = pd.read_csv('tranche1_nodule_data.csv', encoding='iso-8859-1')
tranche1_nodule_sheet = tranche1_nodule_sheet.assign(tranche=1)
tranche1_nodule_data = tranche1_nodule_sheet[tranche1_nodule_sheet['patient_id'].notnull()]

# Annotation nodule-type codes -> nodule type names used in the output metadata.
nodule_type_recode = dict(
    SN='SOLID',
    PSN='PART-SOLID',
    pGGN='NON-SOLID',
    Perifissural='PERIFISSURAL',
)

def is_actionable(row):
    """Return True when the nodule described by ``row`` counts as actionable.

    ``row`` is indexed by ``'nodule_type'`` and ``'nodule_diameter_mm'``.
    PART-SOLID nodules are always actionable; SOLID nodules from 6 mm and
    NON-SOLID nodules from 10 mm. Every other type, and any missing diameter,
    gives False.
    """
    nodule_type = row['nodule_type']

    if nodule_type == 'PART-SOLID':
        return True

    # Minimum diameter (mm) per size-dependent type; None for all other types.
    minimum_diameter_mm = {'SOLID': 6, 'NON-SOLID': 10}.get(nodule_type)
    if minimum_diameter_mm is None:
        return False

    return bool(row['nodule_diameter_mm'] >= minimum_diameter_mm)


# Rename to the standard nodule metadata column names ...
tranche1_metadata = tranche1_nodule_data.rename(columns={
    'patient_id' : 'scan_id',
    'X' : 'nodule_x_coordinate',
    'Y' : 'nodule_y_coordinate',
    'Z' : 'nodule_z_coordinate',
    'Nod_diam' : 'nodule_diameter_mm'
})

# ... recode the nodule type (codes missing from the recode table become null) ...
tranche1_metadata = tranche1_metadata.assign(
    nodule_type=tranche1_metadata['Nod_type'].map(nodule_type_recode)
)

# ... flag actionable nodules row by row and write the result.
tranche1_metadata = tranche1_metadata.assign(
    actionable=tranche1_metadata.apply(is_actionable, axis=1)
)
tranche1_metadata.to_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/tranche1_metadata.csv', index=False)

  pd.read_csv('tranche1_nodule_data.csv', encoding='iso-8859-1')
