<a href="https://colab.research.google.com/github/lhiwi/Parallelize-DL-assignment/blob/main/Data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# install dependencies and import libraries
!pip -q install tensorflow tensorflow-datasets opencv-python tqdm

import tensorflow_datasets as tfds
import numpy as np
import cv2
import struct
from tqdm import tqdm


## Preprocessing settings and export function

In [5]:
IMG_SIZE = 32  # target side length in pixels: 32x32 grayscale -> D = 32*32 = 1024 features per image

def export_split(ds, max_items=None):
    """Convert a TFDS split into a flat float32 feature matrix plus uint8 labels.

    For every example, ``ex["image"]`` is converted from RGB to grayscale,
    resized to ``IMG_SIZE x IMG_SIZE`` with area interpolation, cast to
    float32, divided by 255 and flattened. ``ex["label"]`` is stored as uint8.

    Args:
        ds: Dataset accepted by ``tfds.as_numpy`` whose examples are dicts
            with ``"image"`` and ``"label"`` entries.
        max_items: Optional cap on the number of exported examples. ``None``
            (or any other falsy value, e.g. 0) means "no cap".

    Returns:
        Tuple ``(X, y)`` where ``X`` is float32 of shape
        ``(N, IMG_SIZE * IMG_SIZE)`` and ``y`` is uint8 of shape ``(N,)``.
        An empty dataset yields ``N == 0`` instead of raising.
    """
    features, labels = [], []
    for ex in tqdm(tfds.as_numpy(ds)):
        gray = cv2.cvtColor(ex["image"], cv2.COLOR_RGB2GRAY)
        gray = cv2.resize(gray, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_AREA)
        features.append((gray.astype(np.float32) / 255.0).reshape(-1))
        labels.append(np.uint8(ex["label"]))

        # Stop as soon as the requested number of examples has been collected.
        if max_items and len(features) >= max_items:
            break

    if not features:
        # np.stack raises on an empty list; return correctly shaped empty arrays.
        return (
            np.empty((0, IMG_SIZE * IMG_SIZE), dtype=np.float32),
            np.empty((0,), dtype=np.uint8),
        )

    # Rows are already float32, so copy=False avoids duplicating the matrix.
    X = np.stack(features).astype(np.float32, copy=False)
    y = np.array(labels, dtype=np.uint8)
    return X, y


In [6]:
# Saving to binary format
def save_bin(path, X, y):
    """Write a feature matrix and its labels to a flat binary file.

    File layout, in order:
      1. header: N and D as two little-endian 32-bit signed ints
      2. X: N*D little-endian float32 values in C (row-major) order
      3. y: N uint8 labels

    Args:
        path: Destination file path; an existing file is overwritten.
        X: 2-D array-like of shape (N, D); converted to float32 if needed.
        y: Array-like holding N labels; converted to uint8 and flattened.

    Raises:
        ValueError: If X is not 2-D or y does not hold exactly N labels.
    """
    # Coerce dtypes up front so the bytes on disk always match the layout
    # above, even if the caller passes float64 features or int64 labels.
    X = np.ascontiguousarray(X, dtype="<f4")
    y = np.ascontiguousarray(y, dtype=np.uint8).reshape(-1)

    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (N, D), got shape {X.shape}")
    N, D = X.shape
    if y.size != N:
        raise ValueError(f"y has {y.size} labels but X has {N} rows")

    # Validation happens before open() so a bad call never truncates the file.
    with open(path, "wb") as f:
        f.write(struct.pack("<ii", N, D))  # explicit byte order: same bytes on every host
        f.write(X.tobytes(order="C"))      # X float32
        f.write(y.tobytes(order="C"))      # y uint8
    print(f"Saved {path}: N={N}, D={D}, y_counts={np.bincount(y)}")

# Use the first 80% of the "train" split for training and the remaining 20% for testing.
ds_train, ds_test = tfds.load("malaria", split=["train[:80%]", "train[80%:]"])

X_train, y_train = export_split(ds_train)
X_test,  y_test  = export_split(ds_test)

# Derive the file names from IMG_SIZE so they always match the exported resolution.
save_bin(f"malaria_train_{IMG_SIZE}.bin", X_train, y_train)
save_bin(f"malaria_test_{IMG_SIZE}.bin",  X_test,  y_test)

100%|██████████| 22046/22046 [00:18<00:00, 1183.21it/s]
100%|██████████| 5512/5512 [00:05<00:00, 1071.65it/s]


Saved malaria_train_32.bin: N=22046, D=1024, y_counts=[10999 11047]
Saved malaria_test_32.bin: N=5512, D=1024, y_counts=[2780 2732]
