In [1]:
%matplotlib inline
import numpy as np

In [2]:
from p2ch10.datasets import getCandidateInfoList, getCt, LunaDataset, raw_cache

In [3]:
# Full candidate list; requireOnDisk_bool=False presumably also keeps candidates
# whose scans are not present on disk -- confirm against p2ch10.datasets.
candidateInfo_list = getCandidateInfoList(requireOnDisk_bool=False)

# Keep only the candidates flagged as nodules, then collect their diameters.
positiveInfo_list = [
    candidate for candidate in candidateInfo_list if candidate.isNodule_bool
]
diameter_list = [candidate.diameter_mm for candidate in positiveInfo_list]

In [4]:
# Number of positive (nodule) candidates, followed by the first one as an example.
print(len(positiveInfo_list), positiveInfo_list[0], sep="\n")


1351
CandidateInfoTuple(isNodule_bool=True, diameter_mm=32.27003025, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644280690737019247886', center_xyz=(75.7212243102, 92.8873310394, -119.270564052))


In [5]:
# Series UID of the first positive (nodule) candidate.
# NOTE(review): sample_data is not referenced by any later cell visible here.
sample_data = positiveInfo_list[0].series_uid
# Build the dataset with its default arguments; construction logs the sample count.
dataset = LunaDataset()
# Show the first sample as the cell output.
# NOTE(review): this dumps the whole sample; displaying only shapes would keep the output short.
dataset[0]

2024-02-28 20:27:03,611 INFO     pid:52300 p2ch10.datasets:155:__init__ <p2ch10.datasets.LunaDataset object at 0x14de14f10>: 551065 training samples


(tensor([[[[-962., -858., -825.,  ...,   74.,  139.,  -36.],
           [-837., -870., -815.,  ...,  -57.,  -19.,  103.],
           [-745., -746., -793.,  ...,   84.,   54.,   43.],
           ...,
           [-756., -526., -737.,  ...,    8.,  -34.,   29.],
           [-801., -657., -734.,  ...,   78.,  145.,   75.],
           [-834., -905., -920.,  ...,  110.,  101.,  -20.]],
 
          [[-964., -986., -953.,  ...,   69.,   -4.,   50.],
           [-838., -801., -879.,  ...,   74.,   38.,  108.],
           [-769., -779., -875.,  ...,   64.,   20.,   47.],
           ...,
           [-851., -819., -703.,  ...,   52.,   83.,  -13.],
           [-966., -887., -782.,  ...,   80.,  177.,   51.],
           [-944., -912., -797.,  ...,   87.,   42.,  113.]],
 
          [[-913., -934., -889.,  ...,   74.,  -12.,   75.],
           [-892., -879., -869.,  ...,  171.,  -20.,   26.],
           [-853., -876., -897.,  ...,   28.,   57.,   55.],
           ...,
           [-666., -682., -515.

### Exercises

1. Implement a program that iterates through a LunaDataset instance, and time how long it takes to do so. <br> In the interest of time, it might make sense to have an option to limit the iterations to the first N=1000 samples.<br>
    1. How long does it take to run the first time?<br>
        Roughly `2 mins 43 seconds` on an M1 Max MacBook (did not use `mps` as the device). <br><br>
    2. How long does it take to run the second time?<br>
        `2 seconds`! <br><br>
    3. What does clearing the cache do to the runtime?<br>
        As expected, the run is slow again after clearing the cache: roughly `2 mins 30 secs`.<br><br>
        
    4. What does using the last N=1000 samples do to the first/second runtime? <br>
        The first run took `3 mins 17 seconds` after clearing the cache. The second run was much shorter, roughly `3 secs`. <br>
        **TODO:** check this again. <br><br>
        

2. Change the LunaDataset implementation to randomize the sample list during `__init__`. <br> Clear the cache, and run the modified version. What does that do to the runtime of the first and second runs?<br>
      The first run takes `3 mins 16 seconds`, a little longer than the non-randomized version. Later runs take about `2 seconds`. <br><br>

3. Revert the randomization, and comment out the @functools.lru_cache(1, typed=True) decorator to getCt. <br>Clear the cache, and run the modified version. How does the runtime change now?<br>
      The first run takes `3 mins`, but the subsequent runs are still much faster, at `2-3 seconds`.
    


In [13]:
%%time

# Load exactly the first 1000 samples (fewer if the dataset is smaller).
# Index directly rather than enumerate()+break: the break-based form fetches
# one extra (1001st) sample before the limit check runs.
for i in range(min(1000, len(dataset))):
    data = dataset[i]

CPU times: user 1min 6s, sys: 33.2 s, total: 1min 39s
Wall time: 1min 51s


In [14]:
%%timeit

# Repeat the first-1000-samples pass to measure the warm-cache runtime.
# Index directly rather than enumerate()+break: the break-based form fetches
# one extra (1001st) sample before the limit check runs.
for i in range(min(1000, len(dataset))):
    data = dataset[i]

231 ms ± 49.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# Reset every cache layer so the next timing run starts cold.
# cache_clear() is the functools.lru_cache API; getCt is decorated with
# lru_cache, and getCandidateInfoList presumably is as well -- confirm.
getCandidateInfoList.cache_clear()
getCt.cache_clear()
# raw_cache is imported from p2ch10.datasets; presumably the persistent sample
# cache -- confirm. The value returned by clear() is shown as the cell output
# (presumably the number of entries removed).
raw_cache.clear()

1001

In [16]:
%%time

# Load exactly the first 1000 samples again, this time after the caches were cleared.
# Index directly rather than enumerate()+break: the break-based form fetches
# one extra (1001st) sample before the limit check runs.
for i in range(min(1000, len(dataset))):
    data = dataset[i]

CPU times: user 1min 31s, sys: 45.5 s, total: 2min 17s
Wall time: 2min 33s


##### Last 1000 samples

In [25]:
%%time
# Time loading the final 1000 samples of the dataset.
num_samples = len(dataset)
# Clamp at 0 so a dataset shorter than 1000 samples is walked once from the
# start instead of being indexed with negative positions.
start_idx = max(0, num_samples - 1000)

for idx in range(start_idx, num_samples):
    temp = dataset[idx]


CPU times: user 98.5 ms, sys: 76.8 ms, total: 175 ms
Wall time: 425 ms


In [10]:
# Reset every cache layer before re-timing the last-1000-samples pass.
# cache_clear() is the functools.lru_cache API; getCt is decorated with
# lru_cache, and getCandidateInfoList presumably is as well -- confirm.
getCandidateInfoList.cache_clear()
getCt.cache_clear()
# The value returned by clear() is shown as the cell output
# (presumably the number of entries removed -- confirm).
raw_cache.clear()

1001

In [11]:
%%time
# Time loading the final 1000 samples of the dataset after clearing the caches.
num_samples = len(dataset)
# Clamp at 0 so a dataset shorter than 1000 samples is walked once from the
# start instead of being indexed with negative positions.
start_idx = max(0, num_samples - 1000)

for idx in range(start_idx, num_samples):
    temp = dataset[idx]

CPU times: user 2min 5s, sys: 57.1 s, total: 3min 2s
Wall time: 3min 17s


In [12]:
%%timeit
# Repeat the last-1000-samples pass to measure the warm-cache runtime.
num_samples = len(dataset)
# Clamp at 0 so a dataset shorter than 1000 samples is walked once from the
# start instead of being indexed with negative positions.
start_idx = max(0, num_samples - 1000)

for idx in range(start_idx, num_samples):
    temp = dataset[idx]

314 ms ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


##### Randomize

In [31]:
# %load_ext autoreload
# %autoreload 2

# NOTE: the autoreload magics above did not work here (presumably they failed to
# pick up the edited p2ch10.datasets module), so the kernel was restarted
# instead and the magics are left commented out.

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


p2ch10.datasets.LunaDataset

In [8]:
# Instantiate the dataset for the "Randomize" experiment; presumably this picks
# up the LunaDataset variant that shuffles its sample list in __init__ -- confirm.
dataset = LunaDataset()
# Show the first sample as the cell output.
dataset[0]

2024-02-28 20:28:02,886 INFO     pid:52300 p2ch10.datasets:155:__init__ <p2ch10.datasets.LunaDataset object at 0x14de00e10>: 551065 training samples


(tensor([[[[-896., -901., -904.,  ..., -947., -931., -907.],
           [-893., -903., -903.,  ..., -956., -949., -931.],
           [-846., -872., -869.,  ..., -961., -960., -953.],
           ...,
           [-674., -814., -889.,  ..., -830., -836., -843.],
           [-676., -820., -879.,  ..., -849., -835., -841.],
           [-643., -807., -870.,  ..., -848., -851., -844.]],
 
          [[-856., -840., -852.,  ..., -925., -912., -903.],
           [-869., -840., -836.,  ..., -933., -936., -925.],
           [-857., -838., -827.,  ..., -925., -932., -929.],
           ...,
           [-854., -840., -860.,  ..., -875., -860., -861.],
           [-860., -837., -841.,  ..., -860., -860., -869.],
           [-845., -822., -805.,  ..., -877., -884., -870.]],
 
          [[-866., -853., -851.,  ..., -946., -952., -951.],
           [-860., -853., -848.,  ..., -930., -944., -951.],
           [-856., -843., -831.,  ..., -930., -944., -948.],
           ...,
           [-841., -800., -863.

In [12]:
%%time

# Load exactly the first 1000 samples of the randomized dataset.
# Index directly rather than enumerate()+break: the break-based form fetches
# one extra (1001st) sample before the limit check runs.
for i in range(min(1000, len(dataset))):
    data = dataset[i]

# First (cold-cache) run with the randomized sample list took 3m 16.6s.

CPU times: user 126 ms, sys: 67.7 ms, total: 194 ms
Wall time: 406 ms


In [13]:
%%timeit

# Repeat the first-1000-samples pass on the randomized dataset (warm cache).
# Index directly rather than enumerate()+break: the break-based form fetches
# one extra (1001st) sample before the limit check runs.
for i in range(min(1000, len(dataset))):
    data = dataset[i]


236 ms ± 53 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### LRU Cache Commented

In [5]:
# Reset the caches before timing with getCt's lru_cache decorator commented out.
getCandidateInfoList.cache_clear()
# Left disabled: presumably getCt has no cache_clear() once its
# @functools.lru_cache decorator is commented out -- confirm.
# getCt.cache_clear()
# The value returned by clear() is shown as the cell output
# (presumably the number of entries removed -- confirm).
raw_cache.clear()

2004

In [6]:
# Fresh dataset instance for the timing runs of the "LRU Cache Commented" section.
dataset = LunaDataset()

2024-02-28 20:41:16,031 INFO     pid:52863 p2ch10.datasets:155:__init__ <p2ch10.datasets.LunaDataset object at 0x28673e290>: 551065 training samples


In [7]:
%%time

# Load exactly the first 1000 samples with getCt's lru_cache decorator commented out.
# Index directly rather than enumerate()+break: the break-based form fetches
# one extra (1001st) sample before the limit check runs.
for i in range(min(1000, len(dataset))):
    data = dataset[i]

CPU times: user 1min 46s, sys: 51.7 s, total: 2min 38s
Wall time: 3min 2s


In [8]:
%%timeit

# Repeat the first-1000-samples pass to measure subsequent-run time without
# the lru_cache decorator on getCt.
# Index directly rather than enumerate()+break: the break-based form fetches
# one extra (1001st) sample before the limit check runs.
for i in range(min(1000, len(dataset))):
    data = dataset[i]

206 ms ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
