# # Combine Tranche1 and Tranche2 data into a single file

In [14]:
import pandas as pd

In [15]:
# Scan IDs that are filtered out of every combined table built from the tranche files.
blacklist = [
    'UCLH_43663037',
    'UCLH_45634500',
    'UCLH_59066126',
    'UCLH_50882667',
    'UCLH_27847999',
    'UCLH_92436946',
]

# Scans

In [16]:
# Read the scan listing of each tranche.
tranche1_scans = pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/tranche1_scans.csv')
tranche2_scans = pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/tranche2_scans.csv')

# Stack both tranches and keep only the scans that are not blacklisted.
lsut_scans = (
    pd.concat([tranche1_scans, tranche2_scans], ignore_index=True)
    .loc[lambda scans: ~scans['scan_id'].isin(blacklist)]
)
lsut_scans.to_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/lsut_scans.csv', index=False)

# Unique scan ids of the combined listing, used by the cache checks further down.
lsut_scans_ids = set(lsut_scans['scan_id'].values)

# Scans metadata

In [18]:
# Read the per-scan metadata of each tranche.
tranche1_scans_metadata = pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/tranche1_scan_metadata.csv')
tranche2_scans_metadata = pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/tranche2_scan_metadata.csv')

# Stack both tranches and drop blacklisted scans; this table keys scans by
# 'ScananonID' rather than 'scan_id'.
lsut_scans_metadata = (
    pd.concat([tranche1_scans_metadata, tranche2_scans_metadata], ignore_index=True)
    .loc[lambda meta: ~meta['ScananonID'].isin(blacklist)]
)
lsut_scans_metadata.to_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/lsut_scans_metadata.csv', index=False)

# Nodule Metadata

In [19]:
import pandas as pd
import numpy as np


# Read the nodule annotations of each tranche.
tranche1_nodules = pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/tranche1_metadata.csv')
tranche2_nodules = pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/tranche2_metadata.csv')

# Stack both tranches and discard nodules that belong to blacklisted scans.
lsut_metadata = (
    pd.concat([tranche1_nodules, tranche2_nodules], ignore_index=True)
    .loc[lambda nodules: ~nodules['scan_id'].isin(blacklist)]
)

def nms(df, scan_id_col, x_col, y_col, z_col, diameter_col, iou_threshold=0.5):
    """Suppress near-duplicate nodule rows within each scan.

    Every row is treated as a sphere centred at (x, y, z) with the given
    diameter. Within a scan the rows are visited from last to first; a visited
    row is kept, and every not-yet-visited row whose score against it is
    greater than or equal to ``iou_threshold`` is discarded.

    Note: despite its name, the score is not a volumetric IoU. It is the radius
    ratio ``min(r1, r2) / max(r1, r2)`` when the centre distance is smaller
    than ``r1 + r2``, and 0 otherwise.

    Args:
        df: DataFrame with one row per nodule.
        scan_id_col: Name of the column identifying the scan of each row.
        x_col, y_col, z_col: Names of the centre-coordinate columns.
        diameter_col: Name of the diameter column.
        iou_threshold: Rows scoring at or above this value against a kept row
            are removed.

    Returns:
        A DataFrame with the same columns as ``df`` (including ``scan_id_col``)
        and a fresh 0..n-1 index. Scans appear in sorted scan-id order and the
        kept rows of each scan appear in reverse input order. Rows with a
        missing scan id are dropped (the ``groupby`` default).
    """

    def iou(box1, box2):
        # Radius-ratio similarity of two spheres given as (x, y, z, diameter).
        x1, y1, z1, d1 = box1
        x2, y2, z2, d2 = box2

        r1 = d1 / 2
        r2 = d2 / 2

        dist = np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2 + (z1 - z2) ** 2)
        return min(r1, r2) / max(r1, r2) if dist < r1 + r2 else 0

    def nms_single_scan(scan_df):
        boxes = scan_df[[x_col, y_col, z_col, diameter_col]].values

        # All nodules carry the same priority, so visit them from last to first.
        # This is what a stable descending sort of equal scores yields, but it
        # does not depend on the sort implementation.
        indices = np.arange(len(boxes))[::-1]
        keep = []

        while len(indices) > 0:
            current = indices[0]
            keep.append(current)

            remaining = indices[1:]
            scores = np.array(
                [iou(boxes[current], boxes[other]) for other in remaining],
                dtype=float,
            )
            # Only rows that are sufficiently different from the kept one survive.
            indices = remaining[scores < iou_threshold]

        return scan_df.iloc[keep]

    # Iterate over the groups explicitly instead of using groupby.apply: newer
    # pandas versions no longer pass the grouping column to the applied
    # function, which would make the scan id column vanish from the result.
    kept_per_scan = [nms_single_scan(scan_df) for _, scan_df in df.groupby(scan_id_col)]

    if not kept_per_scan:
        # No groups at all (e.g. empty input): return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(kept_per_scan, ignore_index=True)

# Run the per-scan suppression on the combined nodule table and save the result.
lsut_metadata = nms(
    lsut_metadata,
    scan_id_col='scan_id',
    x_col='nodule_x_coordinate',
    y_col='nodule_y_coordinate',
    z_col='nodule_z_coordinate',
    diameter_col='nodule_diameter_mm',
)
lsut_metadata.to_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/metadata/lsut/lsut_metadata.csv', index=False)


# Number of distinct scans that still have at least one nodule row.
lsut_metadata['scan_id'].nunique()

146

# Check that we have preprocessed everything for GRT123

In [11]:
# Parse the GRT123 cache listing. The row labelled 0 (the first line of the
# file) is dropped -- presumably a non-file entry; confirm against the listing.
# The '.npy' text is removed from each name, and the name is then split at its
# last underscore into a scan id (left part) and a file type (right part).
grt123_listings = (
    pd.read_csv('grt123_cache_listings.txt', header=None, names=['filename'])
    .drop(index=0)
    .assign(filename=lambda listing: listing['filename'].str.replace('.npy', '', regex=False))
    .assign(
        scan_id=lambda listing: listing['filename'].map(lambda name: name.rpartition('_')[0]),
        filetype=lambda listing: listing['filename'].map(lambda name: name.rpartition('_')[2]),
    )
)

# Scan ids that have a "clean" file in the cache.
grt123_listings_ids = set(
    grt123_listings.loc[grt123_listings['filetype'] == 'clean', 'scan_id'].values
)

# Expected scans, cached scans, and the expected scans missing from the cache.
print(
    len(lsut_scans_ids),
    len(grt123_listings_ids),
    lsut_scans_ids - grt123_listings_ids,
)

297 297 set()


# Check that we have preprocessed everything for TicNet

In [12]:
# Parse the TicNet cache listing. The row labelled 0 (the first line of the
# file) is dropped -- presumably a non-file entry; confirm against the listing.
# A name with exactly two underscores is tagged with its last underscore-separated
# piece; every other name is tagged as 'image'.
ticnet_listings = (
    pd.read_csv('ticnet_cache_listings.txt', header=None, names=['filename'])
    .drop(index=0)
    .assign(filename=lambda listing: listing['filename'].str.replace('.npy', '', regex=False))
    .assign(
        filetype=lambda listing: listing['filename'].map(
            lambda name: name.rsplit('_', 1)[-1] if name.count('_') == 2 else 'image'
        )
    )
)

# For 'image' entries the (extension-stripped) file name itself is the scan id.
ticnet_preprocessed_ids = set(
    ticnet_listings.loc[ticnet_listings['filetype'] == 'image', 'filename'].values
)

# Expected scans, preprocessed scans, and the expected scans not yet preprocessed.
print(
    len(lsut_scans_ids),
    len(ticnet_preprocessed_ids),
    lsut_scans_ids - ticnet_preprocessed_ids,
)

297 297 set()


# Check that we have preprocessed everything for Detection

In [13]:
# Parse the Detection cache listing, one folder name per row. The row labelled 0
# (the first line of the file) is dropped -- presumably a non-folder entry; confirm.
detection_listings = pd.read_csv(
    'detection_cache_listings.txt',
    header=None,
    names=['foldername'],
).drop(index=0)

# Each remaining folder name is compared directly against the scan ids.
detection_preprocessed_ids = set(detection_listings['foldername'].values)

# Expected scans, preprocessed scans, and the expected scans not yet preprocessed.
print(
    len(lsut_scans_ids),
    len(detection_preprocessed_ids),
    lsut_scans_ids - detection_preprocessed_ids,
)

297 297 set()


# Generate the dataset json

In [14]:
import json

# Every LSUT scan is written to the 'test' split; the other two splits stay empty.
dataset_json = {'trainings': [], 'validations': [], 'test': []}

# Columns that make up one box entry, in output order.
# NOTE(review): each box is [x, y, z, diameter] -- confirm that the consumer of
# dataset.json expects this 4-value form.
box_columns = [
    'nodule_x_coordinate',
    'nodule_y_coordinate',
    'nodule_z_coordinate',
    'nodule_diameter_mm',
]

# lsut_scans_ids is a set, and the iteration order of a set of strings can differ
# between interpreter runs. Sorting keeps the entry order of dataset.json stable
# across re-runs (key=str keeps the sort well-defined even for a non-string id).
for scan_id in sorted(lsut_scans_ids, key=str):

    group = lsut_metadata.query('scan_id == @scan_id')

    # A scan without nodule rows yields empty 'box' and 'label' lists, so no
    # separate branch is needed for it.
    dataset_json['test'].append({
        'image' : f'{scan_id}/{scan_id}.nii.gz',
        'box' : group[box_columns].values.tolist(),
        'label' : [0] * group.shape[0]
    })

with open('/Users/john/Projects/SOTAEvaluationNoduleDetection/models/detection/datasplits/lsut/dataset.json', 'w') as f:
    json.dump(dataset_json, f)