Install the necessary libraries, then download the data and clone the code repository

In [3]:
!apt-get install p7zip-full

!pip install 'SimpleITK==2.0.2'
!pip install 'diskcache==5.2.1'
!pip install 'cassandra-driver==3.25.0'

Reading package lists... Done
Building dependency tree       
Reading state information... Done
p7zip-full is already the newest version (16.02+dfsg-6).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Collecting SimpleITK==2.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/9c/6b/85df5eb3a8059b23a53a9f224476e75473f9bcc0a8583ed1a9c34619f372/SimpleITK-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (47.4MB)
[K     |████████████████████████████████| 47.4MB 92kB/s 
[?25hInstalling collected packages: SimpleITK
Successfully installed SimpleITK-2.0.2
Collecting diskcache==5.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/6a/5e/3deb8f9c83bead2af6f2cda97c4400516488464fede2853875a81e502953/diskcache-5.2.1-py3-none-any.whl (44kB)
[K     |████████████████████████████████| 51kB 4.4MB/s 
[?25hInstalling collected packages: diskcache
Successfully installed diskcache-5.2.1
Collecting cassandra-driver==3.25.0
[?25l  Downloading https://files.pythonhosted.or

In [7]:
#data from: https://luna16.grand-challenge.org/Download/
!wget -c -O zipped_subset_0.zip https://zenodo.org/record/3723295/files/subset0.zip?download=1
!mkdir /content/data
!7z e zipped_subset_0.zip -o/content/data/subset_0 #unzips files 

--2021-06-10 20:24:28--  https://zenodo.org/record/3723295/files/subset0.zip?download=1
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6811924508 (6.3G) [application/octet-stream]
Saving to: ‘zipped_subset_0.zip’


2021-06-10 20:32:13 (14.0 MB/s) - ‘zipped_subset_0.zip’ saved [6811924508/6811924508]



In [4]:
# Clone the GitHub repository to get access to its .py modules and other files.
# Any existing checkout is deleted first, so re-running the cell always starts
# from a fresh clone.
!rm -rf lungCancerSegmentation/
!git clone https://github.com/jeffreyboschman/lungCancerSegmentation.git

Cloning into 'lungCancerSegmentation'...
remote: Enumerating objects: 99, done.[K
remote: Counting objects: 100% (99/99), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 99 (delta 49), reused 62 (delta 19), pack-reused 0[K
Unpacking objects: 100% (99/99), done.


Exploring the data

In [21]:
#import packages and modules
import SimpleITK as sitk
import numpy as np
from lungCancerSegmentation.code.datasets import getCandidateInfoList, getCt, LunaDataset
from lungCancerSegmentation.code.vis import findPositiveSamples, showCandidate


In [29]:
# Load the candidate list (requireOnDisk_bool=True presumably restricts it to
# CT series that are present on disk -- confirm in datasets.py).
candidateInfo_list = getCandidateInfoList(requireOnDisk_bool=True)

# Keep only the candidates flagged as nodules, then collect their diameters.
# Fields are read by name; isNodule_bool and diameter_mm are the first two
# fields of CandidateInfoTuple.
positiveInfo_list = [
    candidate for candidate in candidateInfo_list if candidate.isNodule_bool
]
diameter_list = [candidate.diameter_mm for candidate in positiveInfo_list]

print(len(positiveInfo_list))
# Show one entry to see what a positive candidate tuple contains.
print(positiveInfo_list[0])

122
CandidateInfoTuple(isNodule_bool=True, diameter_mm=25.23320204, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.511347030803753100045216493273', center_xyz=(63.4740118048, 73.9174523314, -213.736128767))


In [30]:
#see the distribution of nodule sizes in the dataset
for i in range(0, len(diameter_list), 100):
    print('{:4}  {:4.1f} mm'.format(i, diameter_list[i]))

   0  25.2 mm
 100   0.0 mm


In [31]:
# Bin the positive-candidate diameters using NumPy's default binning; the cell
# displays the resulting (counts, bin_edges) pair.
np.histogram(diameter_list)

(array([26, 11, 41, 15, 11,  8,  3,  5,  1,  1]),
 array([ 0.        ,  2.5233202 ,  5.04664041,  7.56996061, 10.09328082,
        12.61660102, 15.13992122, 17.66324143, 20.18656163, 22.70988184,
        25.23320204]))

In [32]:
# Similar to positiveInfo_list, but a start index and a limit can be specified
# (defaults: start_ndx=0, limit=100).
positiveSample_list = findPositiveSamples()


2021-06-10 20:54:51,419 INFO     pid:62 lungCancerSegmentation.code.datasets:174:__init__ <lungCancerSegmentation.code.datasets.LunaDataset object at 0x7fbb4256a790>: 56938 training samples


0 CandidateInfoTuple(isNodule_bool=True, diameter_mm=25.23320204, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.511347030803753100045216493273', center_xyz=(63.4740118048, 73.9174523314, -213.736128767))
1 CandidateInfoTuple(isNodule_bool=True, diameter_mm=21.58311204, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.905371958588660410240398317235', center_xyz=(109.142472723, 49.6356928166, -121.183579092))
2 CandidateInfoTuple(isNodule_bool=True, diameter_mm=19.65387738, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.752756872840730509471096155114', center_xyz=(56.1226132601, 67.868268695, -65.6269886453))
3 CandidateInfoTuple(isNodule_bool=True, diameter_mm=18.7832325, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.202811684116768680758082619196', center_xyz=(-82.79150362, -21.43587141, -97.18427459))
4 CandidateInfoTuple(isNodule_bool=True, diameter_mm=17.75323185, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.187451715205085403623595258748', center_xyz=(94.1132711884, -15.8936132585, -2

In [36]:
# Take the positive sample at index 11; element [2] of its tuple is presumably
# the series_uid field (the printed value is a series UID string).
series_uid = positiveSample_list[11][2]
print(series_uid)
# NOTE(review): the recorded run of this cell printed the UID and then ended in
# an IndexError, so the failure presumably originates inside showCandidate --
# TODO investigate before relying on this cell.
showCandidate(series_uid)


2021-06-10 20:56:51,960 INFO     pid:62 lungCancerSegmentation.code.datasets:174:__init__ <lungCancerSegmentation.code.datasets.LunaDataset object at 0x7fbb47543bd0>: 548 training samples


1.3.6.1.4.1.14519.5.2.1.6279.6001.213140617640021803112060161074


IndexError: ignored

In [35]:
# Look at the metadata tuples that the sitk package exposes for one CT scan.
mhd_path = "/content/data/subset_0/1.3.6.1.4.1.14519.5.2.1.6279.6001.105756658031515062000744821260.mhd"
ct_mhd = sitk.ReadImage(mhd_path)

# Printed in this order:
#   GetOrigin()    - the offset of the origin in mm
#   GetSpacing()   - size of each voxel in mm
#   GetDirection() - flattened transformation matrix
for read_metadata in (ct_mhd.GetOrigin, ct_mhd.GetSpacing, ct_mhd.GetDirection):
    print(read_metadata())

(-198.100006, -195.0, -335.209991)
(0.7617189884185791, 0.7617189884185791, 2.5)
(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
