In [1]:
import pandas as pd
import numpy as np

In [2]:
SIZE = 256
SPLITS = ["train", "val", "test"]

In [3]:
split_info = pd.read_csv("/mnt/jbrockma/bachelor-thesis/medmnist_data_split/chestmnist_split_info.csv")

In [4]:
split_info.head()

Unnamed: 0,split,index,image_id
0,train,0,00000901_002
1,train,1,00018227_001
2,train,2,00027736_005
3,train,3,00018904_002
4,train,4,00005233_000


In [5]:
split_value_counts = split_info["split"].value_counts()
samples_of_split = {SPLIT: split_value_counts[SPLIT] for SPLIT in SPLITS}
samples_of_split

{'train': 78468, 'val': 11219, 'test': 22433}

In [6]:
images_of_split = {SPLIT: np.empty((samples_of_split[SPLIT], SIZE, SIZE), dtype=np.uint8) for SPLIT in SPLITS}
[images_of_split[SPLIT].shape for SPLIT in SPLITS]

[(78468, 256, 256), (11219, 256, 256), (22433, 256, 256)]

In [7]:
from tqdm.auto import tqdm
import concurrent.futures
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
with tqdm(total=len(split_info)) as pbar:
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = set()

        def preprocess(_info):
            try:
                _, _split, _index, _image_id = _info

                with Image.open(f"/mnt/jbrockma/CXR8/images/images/{_image_id}.png") as im:
                    im = im.convert("L") if im.mode != "L" else im
                    im = im.resize((SIZE, SIZE), Image.BICUBIC)

                    images_of_split[_split][_index] = np.asarray(im)
            except Exception as e:
                print(e)
                raise e

        for info in split_info.itertuples():
            future = executor.submit(preprocess, info)
            future.add_done_callback(lambda _: pbar.update())
            futures.add(future)

        concurrent.futures.wait(futures)

    exceptions = set()

    for future in futures:
        exception = future.exception()
        if exception:
            exceptions.add(exception)

    for exception in exceptions:
        print(exception)


  0%|                                                                           | 27/112120 [00:02<2:02:05, 15.30it/s]exception calling callback for <Future at 0x7fb15292d060 state=finished returned NoneType>
Traceback (most recent call last):
  File "/root/anaconda3/envs/bachelor-thesis/lib/python3.10/concurrent/futures/_base.py", line 342, in _invoke_callbacks
    callback(self)
  File "/tmp/ipykernel_45821/2840424018.py", line 18, in <lambda>
    future.add_done_callback(lambda _: pbar.update())
  File "/root/anaconda3/envs/bachelor-thesis/lib/python3.10/site-packages/tqdm/std.py", line 1239, in update
    self.refresh(lock_args=self.lock_args)
  File "/root/anaconda3/envs/bachelor-thesis/lib/python3.10/site-packages/tqdm/std.py", line 1344, in refresh
    self.display()
  File "/root/anaconda3/envs/bachelor-thesis/lib/python3.10/site-packages/tqdm/std.py", line 1492, in display
    self.sp(self.__str__() if msg is None else msg)
  File "/root/anaconda3/envs/bachelor-thesis/lib/pyth

KeyboardInterrupt: 

In [15]:
chest_mnist = np.load("/root/.medmnist/chestmnist.npz")
labels_of_split = {SPLIT: chest_mnist[f"{SPLIT}_labels"] for SPLIT in SPLITS}
[labels_of_split[SPLIT].shape for SPLIT in SPLITS]

[(22433, 14), (78468, 14), (11219, 14)]

In [20]:
name_to_array = {}
for container_name, container_of_split in [("images", images_of_split), ("labels", labels_of_split)]:
    for SPLIT in SPLITS:
        name_to_array[f"{SPLIT}_{container_name}"] = container_of_split[SPLIT]
name_to_array.keys()

dict_keys(['train_images', 'val_images', 'test_images', 'train_labels', 'val_labels', 'test_labels'])