## Notebook to create .npy files and store the respective .xml files
- Aggregate all scan images (.dcm) from a patient into a single np.array [with shape (scans, 512, 512)]
- Create a folder to store the arrays and the .xml data that describes the nodule characteristics for the given CT scans

In [5]:
# Configuration: locations of the raw data and of the generated dataset.
# Trailing slashes are kept so the values can be used with plain string
# concatenation as well as with os.path.join.

# Directory where the CT scans (*.dcm) are stored.
LIDC_PATH = "/media/joaob/Data-LINUX/LIDC-IDRI/"
# Directory where the XML annotation files are stored.
XML_PATH = "/media/joaob/Data-LINUX/LIDC-XML/"
# Directory in which the arrays and XML files are saved.
SAVE_PATH = "./LIDC-DATASET/"

In [6]:
import pydicom
import os
import numpy as np

In [3]:
def load_dicom_slices(folder_path, sort_by_position=False):
    """Load every ``.dcm`` file in a folder and stack the pixel data.

    Parameters
    ----------
    folder_path : str
        Directory that holds the DICOM files to load. Only entries whose
        name ends with ``.dcm`` are read; everything else is ignored.
    sort_by_position : bool, optional
        When ``False`` (default) the slices are stacked in file-name order.
        When ``True`` the slices are stacked in ascending order of the third
        (z) component of each file's ``ImagePositionPatient`` attribute;
        slices sharing the same z value keep their file-name order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number_of_slices, rows, cols)``. The notebook
        assumes 512x512 slices, but that is not enforced here.

    Raises
    ------
    ValueError
        If the folder contains no ``.dcm`` file, or (raised by ``np.stack``)
        if the slices do not all have the same shape.
    AttributeError
        If ``sort_by_position`` is true and a file has no
        ``ImagePositionPatient`` attribute.
    """
    dicom_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.dcm')
    )

    # np.stack fails with an unhelpful message on an empty list, so report
    # the actual problem (and the offending folder) up front.
    if not dicom_names:
        raise ValueError(f"No .dcm files found in '{folder_path}'")

    # Keep only (z, pixels) per slice so the full datasets are not all held
    # in memory at once.
    slices = []
    for name in dicom_names:
        dicom_data = pydicom.dcmread(os.path.join(folder_path, name))
        z_position = (
            float(dicom_data.ImagePositionPatient[2]) if sort_by_position else 0.0
        )
        slices.append((z_position, dicom_data.pixel_array))

    if sort_by_position:
        # list.sort is stable: ties on z fall back to file-name order.
        slices.sort(key=lambda item: item[0])

    # Stack the 2D slices into one 3D array (slices, rows, cols).
    return np.stack([pixels for _, pixels in slices], axis=0)

# Usage: load one example series as a sanity check.
# The path is built from LIDC_PATH so it follows the configured dataset root.
# The two UID-like components are presumably the study and series folders
# (TODO confirm against the dataset layout); replace them to load another scan.
example_patient = "LIDC-IDRI-0001"
example_study = "1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288633453246975630178"
example_series = "1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636438705059720603192"
folder_path = os.path.join(LIDC_PATH, example_patient, example_study, example_series)
dicom_array = load_dicom_slices(folder_path)

print(dicom_array.shape)  # Output should be (number_of_slices, 512, 512)

(133, 512, 512)


#### Loop through all XML files (the cell currently stops after the first file)

In [73]:
import xml.etree.ElementTree as ET

# Print the study/series UIDs and the nodule characteristics stored in the
# XML annotation files found under XML_PATH/<folder>/<file>.xml.

# XML namespace prefix mapping used by every lookup below (loop-invariant).
namespaces = {'ns': 'http://www.nih.gov'}

# (XML tag, printed label) for each characteristic, in the order printed.
characteristic_fields = [
    ('subtlety', 'Subtlety'),
    ('internalStructure', 'Internal Structure'),
    ('calcification', 'Calcification'),
    ('sphericity', 'Sphericity'),
    ('margin', 'Margin'),
    ('lobulation', 'Lobulation'),
    ('spiculation', 'Spiculation'),
    ('texture', 'Texture'),
    ('malignancy', 'Malignancy'),
]

xml_folders = sorted(os.listdir(XML_PATH))

for xml_f in xml_folders:
    xml_folder_path = os.path.join(XML_PATH, xml_f)

    # Skip stray files that sit next to the annotation folders; calling
    # os.listdir on them would raise NotADirectoryError.
    if not os.path.isdir(xml_folder_path):
        continue

    # Only parse XML files; anything else in the folder is ignored.
    xml_files = sorted(
        name for name in os.listdir(xml_folder_path)
        if name.lower().endswith('.xml')
    )

    for file in xml_files:
        tree = ET.parse(os.path.join(xml_folder_path, file))
        root = tree.getroot()

        # findtext returns None when the element is absent (instead of the
        # AttributeError raised by find(...).text) and '' when it is empty.

        # first folder
        study_instance_uid = root.findtext('.//ns:StudyInstanceUID', namespaces=namespaces)
        print("StudyInstanceUID:", study_instance_uid)

        # subfolder
        series_instance_uid = root.findtext('.//ns:SeriesInstanceUid', namespaces=namespaces)
        print("SeriesInstanceUid:", series_instance_uid)

        characteristics = root.findall('.//ns:unblindedReadNodule/ns:characteristics', namespaces)

        # One "Characteristics:" section per element found; a missing field
        # is printed as None rather than aborting the whole cell.
        for charac in characteristics:
            print("Characteristics:")
            for tag, label in characteristic_fields:
                value = charac.findtext(f'ns:{tag}', namespaces=namespaces)
                print(f"  {label}:", value)

        # Preview only: stop after the first XML file of the folder ...
        break
    # ... and after the first folder that was processed.
    break




StudyInstanceUID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.339170810277323131167631068432
SeriesInstanceUid: 1.3.6.1.4.1.14519.5.2.1.6279.6001.303494235102183795724852353824
Characteristics:
  Subtlety: 5
  Internal Structure: 1
  Calcification: 6
  Sphericity: 3
  Margin: 4
  Lobulation: 1
  Spiculation: 1
  Texture: 5
  Malignancy: 3
Characteristics:
  Subtlety: 4
  Internal Structure: 1
  Calcification: 6
  Sphericity: 4
  Margin: 4
  Lobulation: 1
  Spiculation: 2
  Texture: 5
  Malignancy: 3
Characteristics:
  Subtlety: 5
  Internal Structure: 1
  Calcification: 4
  Sphericity: 3
  Margin: 5
  Lobulation: 2
  Spiculation: 3
  Texture: 5
  Malignancy: 4
Characteristics:
  Subtlety: 5
  Internal Structure: 1
  Calcification: 6
  Sphericity: 4
  Margin: 2
  Lobulation: 4
  Spiculation: 1
  Texture: 5
  Malignancy: 5
Characteristics:
  Subtlety: 4
  Internal Structure: 1
  Calcification: 6
  Sphericity: 4
  Margin: 2
  Lobulation: 3
  Spiculation: 1
  Texture: 4
  Malignancy: 4
Characteristi