In [1]:
import os
from pathlib import Path
import pylidc as pl
from utils import cluster_annots, get_cropped_annot
import SimpleITK as sitk
from radiomics import featureextractor
import pandas as pd
import radiomics

### 1. Radiomics

- Get Cropped Images and Maks from Scans

> We began by setting up a directory to store data, creating folders for each patient and each nodule.

> For each scan, we checked if the annotations for that scan exist, and if so, we used the **cluster_annots()** function to group similar annotations and the **get_cropped_annot()** function to obtain cropped images of the nodules and their respective masks.


> The cropped data is converted from NumPy arrays to SimplesITK images and then saved in NIfTI (.nii) format, which supports 3D medical imaging

**Note:** The decision to use 3D instead of 2D feature extraction is driven by the fact that lung nodules are three-dimensional structures, and capturing their full spatial characteristics seems important for accurate analysis.

In [4]:
data_path = Path('../data_cleaned')
os.makedirs(data_path, exist_ok=True)

#import all scans
#scans = pl.query(pl.Scan).filter(pl.Scan.patient_id == 'LIDC-IDRI-0001').all()
scans = pl.query(pl.Scan).all()
for scan in scans:
    #print(scan.patient_id)
    patient_dir = Path('../data_cleaned/')/scan.patient_id
    os.makedirs(patient_dir, exist_ok=True)
    try:
        if len(scan.annotations) == 0:
            # Scan has no annotations, there is nothing to do
            continue
        
        nodules = cluster_annots(scan)
        croped, mask = get_cropped_annot(scan, nodules, False)

    except RuntimeError as e:
        print(e)
    
    #save images and maks
    for i in range(len(mask)):
        nodule_id = nodules[i][0].id
        #print(nodule_id)
        nod_path = Path(f'../data_cleaned/{scan.patient_id}/{nodule_id}')
        os.makedirs(nod_path, exist_ok=True)
        image_path = Path(f'{nod_path}/image')
        mask_path = Path(f'{nod_path}/mask')
        os.makedirs(image_path, exist_ok=True)
        os.makedirs(mask_path, exist_ok=True)

        #numpy array to SimpleITK image
        mask_sitk = sitk.GetImageFromArray(mask[i].astype(int))
        image_sitk = sitk.GetImageFromArray(croped[i])

        #save SimpleITK images as.nii (suports 3D)
        mask_path = Path(f'{mask_path}/mask.nii')
        image_path = Path(f'{image_path}/image.nii')
        sitk.WriteImage(mask_sitk, mask_path)
        sitk.WriteImage(image_sitk, image_path)

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some n

- Extract Features using pyradiomics library


> This function processes all nodules for a given patient. For each nodule, the image and mask files are loaded using SImpleITK, and radiomics features are extracted by the **RadiomicsFeatureExatractor()**. Then, the extracted features are converted into a dataframe.

In [5]:
def extract_features_from_patient(patient_dir):
    extractor = featureextractor.RadiomicsFeatureExtractor()
    patient_id = os.path.basename(patient_dir)
    
    patient_results = []
    
    for nodule_dir in os.listdir(patient_dir):
        nodule_id = nodule_dir
        nodule_path = os.path.join(patient_dir, nodule_dir)

        image_path = os.path.join(nodule_path, 'image', 'image.nii')
        mask_path = os.path.join(nodule_path, 'mask', 'mask.nii')

        #verify if the files exist and import them
        if os.path.exists(image_path) and os.path.exists(mask_path):
            image = sitk.ReadImage(image_path)
            mask = sitk.ReadImage(mask_path)

            features = extractor.execute(image, mask)

            #convert to dictionary
            feature_dict = dict(features)
            feature_dict['patient_id'] = patient_id
            feature_dict['nodule_id'] = nodule_id

            patient_results.append(feature_dict)

    df = pd.DataFrame(patient_results)

    return df

In [7]:
radiomics_df = pd.DataFrame()

for patient_dir in os.listdir('../data_cleaned'):
    patient_dir = os.path.join('../data_cleaned',patient_dir)
    patient_df = extract_features_from_patient(patient_dir)

    #remove libraries versions, configuration details, hashs, ...
    patient_df = patient_df[patient_df.columns.drop(list(patient_df.filter(regex='Versions')))]
    patient_df = patient_df[patient_df.columns.drop(list(patient_df.filter(regex='Configuration')))]
    patient_df = patient_df[patient_df.columns.drop(list(patient_df.filter(regex='Hash')))]
    patient_df = patient_df[patient_df.columns.drop(list(patient_df.filter(regex='diagnostics')))]

    radiomics_df = pd.concat([radiomics_df, patient_df], ignore_index=True)

radiomics_df.to_csv('radiomics.csv')
radiomics_df


GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Avera

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,patient_id,nodule_id
0,0.823485,0.422484,3.597711,8.515614,8.06225774829855,9.848857801796104,10.04987562112089,10.246950765959598,139.95833333333334,7.012477,...,0.9999999999999993,0.013513513513513514,5329.0,0.18134343196454217,1.39271505025801,0.00970299955544391,7.781722304553213e-05,1.16543070367493,LIDC-IDRI-0848,5523
1,0.822030,0.332701,5.249749,15.779186,19.026297590440446,18.24828759089466,15.0,19.05255888325765,747.75,12.970964,...,-3.203426503814917e-16,0.0013123359580052493,0.0,0.0,1000000.0,0.0,0.0,0.0,LIDC-IDRI-0159,1338
2,0.737988,0.325554,2.055999,6.315384,5.0990195135927845,7.0710678118654755,7.0710678118654755,7.14142842854285,39.541666666666664,4.660675,...,-3.203426503814917e-16,0.022222222222222223,0.0,0.0,1000000.0,0.0,0.0,0.0,LIDC-IDRI-0159,1339
3,0.594263,0.285487,14.952101,52.373975,45.17742799230607,48.83646178829912,56.515484603779164,63.419239982831705,9251.583333333334,31.123901,...,1.2987949406953976,0.0008605851979345956,9426162.0,2.2761686585623435,0.11002898547043735,0.0019553602040750843,2.4993082078398973e-06,0.10174109988706166,LIDC-IDRI-0829,5630
4,0.935982,0.601938,4.003552,6.651108,7.280109889280518,8.06225774829855,7.211102550927978,8.12403840463596,110.58333333333333,6.225316,...,-3.203426503814917e-16,0.008547008547008548,0.0,0.0,1000000.0,0.0,0.0,0.0,LIDC-IDRI-0830,5627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2656,0.612233,0.352484,2.305650,6.541145,4.123105625617661,7.280109889280518,7.0,7.3484692283495345,31.833333333333332,4.004703,...,-3.203426503814917e-16,0.02702702702702703,0.0,0.0,1000000.0,0.0,0.0,0.0,LIDC-IDRI-0201,1700
2657,0.789584,0.411954,2.864976,6.954600,8.0,8.54400374531753,6.082762530298219,8.54400374531753,74.75,5.491238,...,-3.203426503814917e-16,0.012345679012345678,0.0,0.0,1000000.0,0.0,0.0,0.0,LIDC-IDRI-0201,1702
2658,0.871072,0.394275,2.357012,5.978092,6.082762530298219,6.4031242374328485,6.082762530298219,7.280109889280518,44.291666666666664,5.207351,...,-3.203426503814917e-16,0.02,0.0,0.0,1000000.0,0.0,0.0,0.0,LIDC-IDRI-0201,1699
2659,0.824329,0.585070,6.185429,10.572110,9.848857801796104,12.649110640673518,12.529964086141668,12.649110640673518,393.375,8.714894,...,-3.203426503814917e-16,0.0024813895781637717,0.0,0.0,1000000.0,0.0,0.0,0.0,LIDC-IDRI-0676,6581
