## Notebook to create .npy files and store the respective .xml
- aggregate all scan images (.dcm) from a patient into a np.array [with shape (scans, 512, 512)]
- and create a folder to store the arrays and the .xml data that describes the nodule characteristics of the given CT scans

In [29]:
#path to directory were CT scans (*.dcm) are
#LIDC_PATH = f"/media/joaob/Data-LINUX/LIDC-IDRI/" #linux-desktop
LIDC_PATH = f"./LIDC-IDRI/" #windows-laptop

#path to directory were xml files are
#XML_PATH = f"/media/joaob/Data-LINUX/LIDC-XML/" #linux-desktop
XML_PATH = "C:/Users/joaob/Documents/tcia-lidc-xml/" #windows-laptop

#path to store the arrays & xml
SAVE_PATH = f"./LIDC-DATASET/" #linux-desktop

In [40]:
import pydicom
import os
import numpy as np
import shutil

#### Function to aggregate CT scans from a patient

In [22]:
def aggregate_scans(folder_path):
    """Stack every DICOM slice found in ``folder_path`` into one 3D array.

    Files are selected by their ``.dcm`` extension (case-insensitive) and read
    with ``pydicom.dcmread``. The slices are ordered along the scan axis:

    1. by the z component of ``ImagePositionPatient`` when every slice has it,
    2. otherwise by ``InstanceNumber`` when every slice has it,
    3. otherwise in sorted file-name order.

    File names alone are not a reliable ordering (they need not follow the
    slice position, and un-padded numbers sort lexicographically), which is
    why the DICOM metadata is preferred.

    Args:
        folder_path: Directory that directly contains the ``.dcm`` files.

    Returns:
        np.ndarray of shape ``(n_slices, rows, cols)`` built with ``np.stack``
        from each file's ``pixel_array``.

    Raises:
        ValueError: If the folder contains no ``.dcm`` file (or, from
            ``np.stack``, if the slices do not all share the same shape).
        OSError: If ``folder_path`` cannot be listed.
    """
    # Each entry is (z_position, instance_number, pixel_array); only the pixel
    # data is kept so the full datasets can be released as we go.
    entries = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.dcm'):
            continue

        dicom_data = pydicom.dcmread(os.path.join(folder_path, filename))

        position = getattr(dicom_data, 'ImagePositionPatient', None)
        has_position = position is not None and len(position) == 3
        z_position = float(position[2]) if has_position else None

        instance = getattr(dicom_data, 'InstanceNumber', None)
        instance_number = int(instance) if instance not in (None, '') else None

        entries.append((z_position, instance_number, dicom_data.pixel_array))

    if not entries:
        raise ValueError(f"No .dcm files found in {folder_path}")

    # list.sort is stable, so slices with equal keys keep their file-name order.
    if all(z_position is not None for z_position, _, _ in entries):
        entries.sort(key=lambda entry: entry[0])
    elif all(instance_number is not None for _, instance_number, _ in entries):
        entries.sort(key=lambda entry: entry[1])

    # Stack the slices into a 3D NumPy array (slices, rows, cols)
    return np.stack([pixels for _, _, pixels in entries], axis=0)

#### loop through all xml files

In [43]:
import xml.etree.ElementTree as ET

# --- Run switches -------------------------------------------------------------
# DRY_RUN = True only locates the scans that belong to each .xml file and prints
# the nodule characteristics; nothing is written to disk. Set it to False to
# create SAVE_PATH/<patient>/scan.npy and copy the .xml file next to it.
DRY_RUN = True
# PROCESS_ALL_XML = False stops after the first .xml file (quick check).
# Set it to True to go through every .xml file under XML_PATH.
PROCESS_ALL_XML = False

# XML namespace of the annotation files; required by every element lookup.
namespaces = {'ns': 'http://www.nih.gov'}

xml_folders = sorted(os.listdir(XML_PATH))

# The patient listing does not change while the loop runs, so read it once.
patients = sorted(os.listdir(LIDC_PATH))

for xml_f in xml_folders:
    xml_folder_path = os.path.join(XML_PATH, xml_f)
    if not os.path.isdir(xml_folder_path):
        continue  # stray file next to the xml folders
    xml_files = sorted(os.listdir(xml_folder_path))

    for file in xml_files:
        if not file.lower().endswith('.xml'):
            continue  # only annotation files can be parsed

        xml_file_path = os.path.join(xml_folder_path, file)
        print(f"XML: {file}")
        tree = ET.parse(xml_file_path)
        root = tree.getroot()

        # first folder level below the patient folder
        study_instance_uid = root.findtext('.//ns:StudyInstanceUID', namespaces=namespaces)
        # subfolder of the study that holds the .dcm files
        series_instance_uid = root.findtext('.//ns:SeriesInstanceUid', namespaces=namespaces)

        if not study_instance_uid or not series_instance_uid:
            # Without both UIDs the scans cannot be located; skip instead of crashing.
            print(f"Skipping {xml_file_path}: StudyInstanceUID/SeriesInstanceUid not found")
            continue

        # Search the patient folders for the study referenced by the .xml file
        # and, unless DRY_RUN is set, aggregate its CT slices into one .npy file.
        for patient in patients:
            study_path = os.path.join(LIDC_PATH, patient, study_instance_uid)
            if not os.path.isdir(study_path):
                continue

            # The .xml file corresponds to this patient's scans.
            scans_path = os.path.join(study_path, series_instance_uid)

            if DRY_RUN:
                break

            patient_dir = os.path.join(SAVE_PATH, patient)
            if os.path.exists(patient_dir):  # expected to never be true
                print(f'LIDC-DATASET PATH: {LIDC_PATH}{patient}/{study_instance_uid}/{series_instance_uid}/')
                print(f'XML file: {XML_PATH}{xml_f}/{file}')
                break

            # Build the array first: if reading the scans fails, no empty
            # patient folder is left behind to trip the check above next time.
            scan_array = aggregate_scans(scans_path)
            os.makedirs(patient_dir)  # also creates SAVE_PATH when it is missing
            np.save(os.path.join(patient_dir, 'scan.npy'), scan_array)
            shutil.copy(xml_file_path, patient_dir)  # keep the .xml next to the array
            break

        # Show the characteristics of every nodule annotated in this .xml file
        # (useful if we want to filter at the same time we aggregate the scans).
        characteristics = root.findall('.//ns:unblindedReadNodule/ns:characteristics', namespaces)
        print("StudyInstanceUID:", study_instance_uid)
        print("SeriesInstanceUid:", series_instance_uid)
        print("=====Caracteristicas=====")
        i = 1
        for charac in characteristics:
            print(f'===== Nodolo {i} =====')
            i += 1

            # findtext returns None when the element is absent, so an incomplete
            # <characteristics> entry is printed as None instead of raising.
            subtlety = charac.findtext('ns:subtlety', namespaces=namespaces)
            internal_structure = charac.findtext('ns:internalStructure', namespaces=namespaces)
            calcification = charac.findtext('ns:calcification', namespaces=namespaces)
            sphericity = charac.findtext('ns:sphericity', namespaces=namespaces)
            margin = charac.findtext('ns:margin', namespaces=namespaces)
            lobulation = charac.findtext('ns:lobulation', namespaces=namespaces)
            spiculation = charac.findtext('ns:spiculation', namespaces=namespaces)
            texture = charac.findtext('ns:texture', namespaces=namespaces)
            malignancy = charac.findtext('ns:malignancy', namespaces=namespaces)

            # Print the characteristics values
            print("Characteristics:")
            print("  Subtlety:", subtlety)
            print("  Internal Structure:", internal_structure)
            print("  Calcification:", calcification)
            print("  Sphericity:", sphericity)
            print("  Margin:", margin)
            print("  Lobulation:", lobulation)
            print("  Spiculation:", spiculation)
            print("  Texture:", texture)
            print("  Malignancy:", malignancy)

        if not PROCESS_ALL_XML:
            break
    if not PROCESS_ALL_XML:
        break

XML: 158.xml
StudyInstanceUID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.339170810277323131167631068432
SeriesInstanceUid: 1.3.6.1.4.1.14519.5.2.1.6279.6001.303494235102183795724852353824
=====Caracteristicas=====
===== Nodolo 1 =====
Characteristics:
  Subtlety: 5
  Internal Structure: 1
  Calcification: 6
  Sphericity: 3
  Margin: 4
  Lobulation: 1
  Spiculation: 1
  Texture: 5
  Malignancy: 3
===== Nodolo 2 =====
Characteristics:
  Subtlety: 4
  Internal Structure: 1
  Calcification: 6
  Sphericity: 4
  Margin: 4
  Lobulation: 1
  Spiculation: 2
  Texture: 5
  Malignancy: 3
===== Nodolo 3 =====
Characteristics:
  Subtlety: 5
  Internal Structure: 1
  Calcification: 4
  Sphericity: 3
  Margin: 5
  Lobulation: 2
  Spiculation: 3
  Texture: 5
  Malignancy: 4
===== Nodolo 4 =====
Characteristics:
  Subtlety: 5
  Internal Structure: 1
  Calcification: 6
  Sphericity: 4
  Margin: 2
  Lobulation: 4
  Spiculation: 1
  Texture: 5
  Malignancy: 5
===== Nodolo 5 =====
Characteristics:
  Subtlety: 4
  