# 00 – Exploratory Data Analysis (EDA)
*Helmholtz HIDA Hematology Image Classification*

**Goal:** quick sanity‑check of the labelled dataset: class balance, image shapes, sample visuals.

In [None]:

# Basic imports
import sys, random
from pathlib import Path
import matplotlib.pyplot as plt
import torch
import torchvision

# Put the sibling ``src`` directory on the import path so the project package resolves.
# NOTE: this relies on the current working directory sitting one level below the
# directory that contains ``src``.
sys.path.append(str(Path.cwd().parent.joinpath("src")))
from hematol.data import build_datasets


In [None]:
# Root folder of the labelled images -- adjust if you keep data elsewhere.
DATA_ROOT = Path('../Source')

# build_datasets hands back the dataset, the class mapping and the per-sample targets.
ds, class_to_idx, targets = build_datasets(DATA_ROOT)
num_classes = len(class_to_idx)

# Headline numbers first, then the full class mapping on its own line.
print(f"Total images: {len(ds)}   |   Classes: {num_classes}", class_to_idx, sep="\n")


In [None]:

import collections
import numpy as np

# Tally how many samples carry each target value.
counts = collections.Counter(targets)

# Class names ordered by their mapped index, so bar ``k`` lines up with ``counts[k]``.
labels = sorted(class_to_idx, key=lambda name: class_to_idx[name])
label_counts = [counts[class_idx] for class_idx in range(num_classes)]

# One bar per class; tick labels are slanted so long class names stay readable.
fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(labels, label_counts)
ax.set_title('Class distribution')
ax.set_ylabel('image count')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()


In [None]:
from hematol.transforms import val_tfms          # resize → center-crop → tensor
from torchvision.utils import make_grid
import matplotlib.pyplot as plt
import random

# Visual spot check: tile up to 16 randomly chosen images in a 4-wide grid.
# The sample size is capped at the dataset size because random.sample raises
# ValueError when asked for more items than the population holds.
n_show = min(16, len(ds))
if n_show == 0:
    raise ValueError("Dataset is empty – nothing to display")

idxs = random.sample(range(len(ds)), n_show)

# ds[i][0] is taken as the image (first item of each sample). val_tfms brings
# every image to a common size so the tensors can be tiled together
# (3×224×224 according to the original note).
tensor_imgs = [val_tfms(ds[i][0]) for i in idxs]

# NOTE(review): value_range=(0, 1) assumes val_tfms yields pixel values already
# in [0, 1] (i.e. no mean/std normalisation) — confirm; if it does normalise,
# drop value_range so make_grid rescales by the actual min/max instead of clipping.
grid = make_grid(tensor_imgs, nrow=4, normalize=True, value_range=(0, 1))

plt.figure(figsize=(8, 8))
# make_grid returns channels-first; imshow expects channels-last.
plt.imshow(grid.permute(1, 2, 0))
plt.axis("off")
plt.title("Random samples (normalised display)")
plt.show()

## Quick observations
* The dataset is moderately imbalanced – some rare classes have <1k images.
* Images come in varying resolutions; we resize to 256 before cropping 224×224 during training.
* Colours look broadly similar across datasets (Macenko stain‑norm not critical for the baseline).

> **Next steps:** weight the loss or oversample minority classes; experiment with stain normalisation and self‑supervised pre‑training.