In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import subprocess

workspace_path = '/Users/john/Projects/SOTAEvaluationNoduleDetection'

if os.path.basename(os.getcwd()).upper() == 'SOTAEVALUATIONNODULEDETECTION':
    sys.path.append('utilities')
    sys.path.append('notebooks')
else:
    sys.path.append('../../../utilities')
    sys.path.append('../../../notebooks')

from summit_utils import SummitScan, xyz2irc, XyzTuple

def get_voxel_coords(scan, x, y, z):
    # Convert real-world coordinates to voxel coordinates
    voxel_coords = xyz2irc(
        XyzTuple(x, y, z),
        scan.origin,
        scan.voxel_size,
        scan.orientation
    )
    return voxel_coords

def copy_scan_from_cluster(scan_id):
    study_id = scan_id.split('_')[0]
   # now copy the src file
    if not os.path.exists(f"{workspace_path}/data/summit/scans/{study_id}/{scan_id}.mhd"):
        os.makedirs(f"{workspace_path}/data/summit/scans/{study_id}", exist_ok=True)

        command = [
            "scp",
            "-P 2222",
            f"jmccabe@localhost:/cluster/project2/SummitLung50/{study_id}/{scan_id}.*",
            f"{workspace_path}/data/summit/scans/{study_id}/."
        ]
        result = subprocess.run(command)
        print(result.stdout)
    else:
        print(f'{scan_id} already exists')

    return f"{workspace_path}/data/summit/scans/{study_id}/{scan_id}.mhd"

def display_nodules(scan_id, scan_path, nodule_data):

    scan = SummitScan.load_scan(scan_path)

    # Create a new figure based on the number of nodules
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(13, 5))
    fig.suptitle(f"{scan_id} - nodules")

    irc = get_voxel_coords(
        scan,
        nodule_data.nodule_x_coordinate,
        nodule_data.nodule_y_coordinate,
        nodule_data.nodule_z_coordinate
    )

    img = scan.image[irc.index, :, :]
    axs[0].imshow(img, cmap='gray')

    if nodule_data.nodule_diameter_mm == 0:
        diameter = 15
    else:
        diameter = nodule_data.nodule_diameter_mm

    # Add a rectangle centered on irc
    rect = plt.Rectangle(
        (
            irc.col - diameter,
            irc.row - diameter
        ),
        diameter * 2,
        diameter * 2,
        linewidth=1,
        edgecolor='r',
        facecolor='none'
    )

    axs[0].add_patch(rect)
    axs[0].set_title(f"{nodule_data.nodule_type} ({nodule_data.nodule_diameter_mm}mm)\nLoction: {irc.index}, {irc.row}, {irc.col}")

    # Scatter plot for each timepoint
    timepoints = ['y0', 'y0_3m', 'y0_6m', 'y1', 'y1_3m', 'y1_6m', 'y2', 'y2_3m']
    colors = {'SOLID': 'blue', 'PART_SOLID': 'green', 'NON_SOLID': 'red'}
    
    y_value = 0  # Initialize y_value before the loop

    for tdx, tp in enumerate(timepoints):
        type_val = nodule_data[f'{tp}_radiology_report_nodule_type']
        mass_val = nodule_data[f'{tp}_radiology_report_nodule_mass']
        category_val = nodule_data[f'{tp}_radiology_report_nodule_category']

        color = colors.get(type_val, 'black')

        size = float(mass_val)

        if category_val == 'GROWING':
            y_value += 5
        elif category_val == 'SHRINKING':
            y_value -= 5
        else:
            y_value = y_value

        axs[1].scatter(tdx + 1, y_value, color=color, s=size, alpha=0.6, edgecolors='w', linewidth=0.5)

        axs[1].set_ylim(-10, 10)
        axs[1].set_xticks(np.arange(1, len(timepoints) + 1))
        axs[1].set_xticklabels(timepoints, rotation=90)
        axs[1].set_title(f'Nodule Tracking Data for {scan_id}\nMalignant: {group.RedCap_NoduleMalignancy_diagnosis}-{group.RedCap_NoduleMalignancy_criteria}-{group.NodulePrimaryOrder}')
        axs[1].set_xlabel('Timepoint')
        axs[1].set_ylabel('Nodule Diameter (mm)')

    plt.tight_layout()
    plt.show()



In [19]:
scan_data = pd.read_csv(f"{workspace_path}/data/summit/data/24-Nov-07/ScansData.csv")
scan_data.head()

  scan_data = pd.read_csv(f"{workspace_path}/data/summit/data/24-Nov-07/ScansData.csv")


Unnamed: 0,LDCT_RADIOLOGY_REPORT_main_form_instance_id,LDCT_RADIOLOGY_REPORT_main_form_instance_status,LDCT_RADIOLOGY_REPORT_main_participant_id,radiology_report_incidental_coronary_calcium_cir,radiology_report_incidental_coronary_calcium_lmlad,radiology_report_incidental_coronary_calcium_rca,radiology_report_incidental_family_history_of_lung_cancer,radiology_report_incidental_mediastinal,radiology_report_incidental_mediastinal_enum_anterior_mediastinal_mass,radiology_report_incidental_mediastinal_enum_aortic_valve_calcification,...,mpi_RadiologyAlgorithmValidationReport_ChosenManagementPlan_Y1_NODULE_FOLLOWUP_3M,mpi_RadiologyAlgorithmValidationReport_SubmittedDate_Y1_NODULE_FOLLOWUP_6M,mpi_RadiologyAlgorithmValidationReport_ProcedureType_Y1_NODULE_FOLLOWUP_6M,mpi_RadiologyAlgorithmValidationReport_ChosenManagementPlan_Y1_NODULE_FOLLOWUP_6M,mpi_RadiologyAlgorithmValidationReport_SubmittedDate_Y2,mpi_RadiologyAlgorithmValidationReport_ProcedureType_Y2,mpi_RadiologyAlgorithmValidationReport_ChosenManagementPlan_Y2,mpi_RadiologyAlgorithmValidationReport_SubmittedDate_Y2_NODULE_FOLLOWUP_3M,mpi_RadiologyAlgorithmValidationReport_ProcedureType_Y2_NODULE_FOLLOWUP_3M,mpi_RadiologyAlgorithmValidationReport_ChosenManagementPlan_Y2_NODULE_FOLLOWUP_3M
0,--5w-A7DB3TUIldq0ib4xrOf1PSlV_ZgM0y8xV2FO1c,COMPLETED,summit-8898-hhm,MODERATE,MODERATE,SEVERE,NOT_ANSWERED,AORTIC_VALVE_CALCIFICATION,False,True,...,,,,,2021-12-13 15:20:27,Y2,SUMMIT_COMPLETE_NO_FURTHER_FOLLOW_UP_REQUIRED,,,
1,--7WIzX7ljkTx-O5-LpqFlOuMGdkaMpEbRhw0jb5QOs,COMPLETED,summit-7773-unh,NONE,MILD,NONE,NOT_ANSWERED,NONE,False,False,...,,,,,,,,,,
2,--Dsio6MVVt2UmtquPiB0JvArO7W2xxjgK873KtZYzo,COMPLETED,summit-9632-adu,MODERATE,SEVERE,MODERATE,NOT_ANSWERED,NONE,False,False,...,,,,,2022-01-12 17:50:03,Y2,SUMMIT_COMPLETE_NO_FURTHER_FOLLOW_UP_REQUIRED,,,
3,-0bIe02Bf_wnpcZ7C_wzxlNUNYWFDqA0R-vu_fYbXNs,COMPLETED,summit-8824-sch,NONE,NONE,NONE,NOT_ANSWERED,NONE,False,False,...,,,,,2022-05-26 09:05:13,Y2,SUMMIT_COMPLETE_NO_FURTHER_FOLLOW_UP_REQUIRED,,,
4,-0ierlyJNHwpFZgNHe-p_8H96K4PxW-PVxBKPgRRo6o,COMPLETED,summit-7489-jrw,NONE,MILD,NONE,NOT_ANSWERED,AORTIC_VALVE_CALCIFICATION,False,True,...,,,,,,,,,,


In [27]:
workspace_path = '/Users/john/Projects/SOTAEvaluationNoduleDetection'
nodule_data = pd.read_csv(
    f"{workspace_path}/data/summit/data/nodule_data.csv",
    usecols=[
        'participant_id',
        'radiology_report_nodule_lesion_id',
        'radiology_report_nodule_x_coordinate',
        'radiology_report_nodule_y_coordinate',
        'radiology_report_nodule_z_coordinate',
        'radiology_report_nodule_diameter_mm',
        'radiology_report_nodule_type'
    ]
).assign(uid=lambda x: x.participant_id + '_Y0_BASELINE_A_' + x.radiology_report_nodule_lesion_id.astype(str))
nodule_data.head()

Unnamed: 0,participant_id,radiology_report_nodule_diameter_mm,radiology_report_nodule_lesion_id,radiology_report_nodule_type,radiology_report_nodule_x_coordinate,radiology_report_nodule_y_coordinate,radiology_report_nodule_z_coordinate,uid
0,summit-2222-djr,7.6,1.0,PERIFISSURAL,73.05,56.84,-190.33,summit-2222-djr_Y0_BASELINE_A_1.0
1,summit-2222-djr,10.0,2.0,SOLID,58.76,61.71,-187.04,summit-2222-djr_Y0_BASELINE_A_2.0
2,summit-2222-djr,4.8,3.0,PERIFISSURAL,-92.18,-51.99,-230.21,summit-2222-djr_Y0_BASELINE_A_3.0
3,summit-2222-zmd,6.1,1.0,PERIFISSURAL,29.85,54.13,-69.19,summit-2222-zmd_Y0_BASELINE_A_1.0
4,summit-2223-sbv,5.3,1.0,PERIFISSURAL,138.97,48.41,-195.11,summit-2223-sbv_Y0_BASELINE_A_1.0


In [28]:
# Pull in multi-timepoint data
import re

wide_nodule_data = (
    pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/data/summit/data/24-Nov-07/NoduleData.csv')
    .rename(columns={'LDCT_RADIOLOGY_REPORT_radiology_report_nodule_brock_score_participant_id' : 'study_id'}) 
    .assign(uid=lambda x: x.study_id + '_Y0_BASELINE_A_' + x.radiology_report_nodule_lesion_id.astype(str))   
)

wide_nodule_data.rename(columns={
    col : re.sub(r'^(radiology_report_nodule_.*)$', r'y0_\1', col)
    for col in wide_nodule_data.columns
}, inplace=True)

char_vars = []
for col in wide_nodule_data.columns:
    if re.search(r'(type|category|diameter_mm|mass)$', col):
        char_vars.append(col)

nodule_tracking_data = wide_nodule_data[['uid'] + char_vars]
nodule_tracking_data.head()

cancer_nodules = (
    pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/data/summit/data/24-Nov-07/MalignantNodules.csv', delimiter='\t')
    .rename(columns={'RedCap_NoduleMalignancy_participant_id' : 'study_id'})
    .assign(uid=lambda x: x.study_id + '_Y0_BASELINE_A_' + x.RedCap_NoduleMalignancy_cancer_nodule_id.astype(str))
)

df = pd.merge(nodule_data, nodule_tracking_data, on='uid', how='left')

df = pd.merge(df, cancer_nodules, on='uid', how='left')

print(df.shape)


  pd.read_csv('/Users/john/Projects/SOTAEvaluationNoduleDetection/data/summit/data/24-Nov-07/NoduleData.csv')


(17987, 45)


In [33]:
study_id = 'summit-4679-wxc'
uid = "summit-4679-wxc_Y0_BASELINE_A_2.0"
scan_path = copy_scan_from_cluster(scan_id)
df[df.participant_id == study_id]

# display(scan_data[scan_data['study_id'] == scan_id.split('_')[0]])
# display(nodule_data[nodule_data['study_id'] == scan_id.split('_')[0]])
# display_nodules(scan_id, )

None


ssh: connect to host localhost port 2222: Connection refused
scp: Connection closed


Unnamed: 0,participant_id,radiology_report_nodule_diameter_mm,radiology_report_nodule_lesion_id,radiology_report_nodule_type,radiology_report_nodule_x_coordinate,radiology_report_nodule_y_coordinate,radiology_report_nodule_z_coordinate,uid,y0_radiology_report_nodule_category,y0_radiology_report_nodule_diameter_mm,...,y2_radiology_report_nodule_type,y2_3m_radiology_report_nodule_category,y2_3m_radiology_report_nodule_diameter_mm,y2_3m_radiology_report_nodule_mass,y2_3m_radiology_report_nodule_type,study_id,RedCap_NoduleMalignancy_diagnosis,RedCap_NoduleMalignancy_cancer_nodule_id,RedCap_NoduleMalignancy_criteria,NodulePrimaryOrder
6206,summit-4679-wxc,0.0,1.0,SOLID,149.53555,26.845901,-73.625,summit-4679-wxc_Y0_BASELINE_A_1.0,BASELINE,0.0,...,,,,,,,,,,
