In [1]:
import os
import cv2
import numpy as np
from glob import glob
from tqdm import tqdm
import warnings

In [2]:
# Root directory of the dataset. The loading cell expects the layout
#   <base_path>/<tumor_type>/images/*.png
#   <base_path>/<tumor_type>/masks/*.png
# The location can be overridden with the BRAIN_TUMOR_DATA_DIR environment
# variable so the notebook is not tied to one user's home directory; when the
# variable is unset the original cluster path is used.
base_path = os.environ.get(
    "BRAIN_TUMOR_DATA_DIR",
    "/sfs/gpfs/tardis/home/kcm7zp/brain_tumor_data",
)

# Class name -> integer label. Labels are 1-based here and are shifted to
# 0-based just before the model is compiled.
label_map = {'glioma': 1, 'meningioma': 2, 'pituitary': 3}

# Tumor classes to load, derived from label_map so the two cannot drift apart
# (dict order is insertion order, so this is glioma, meningioma, pituitary).
tumor_types = list(label_map)

# print(os.listdir(base_path))

In [3]:
# Load every image/mask pair for each tumor type into `data`, a list of dicts
# with keys 'image', 'mask' (both 256x256 float32 arrays scaled by 1/255) and
# 'label' (the 1-based class id from label_map).
data = []

for tumor in tumor_types:
    img_folder = os.path.join(base_path, tumor, "images")
    mask_folder = os.path.join(base_path, tumor, "masks")

    # Sorted for a deterministic sample order; hidden dot-files are skipped.
    img_files = sorted(
        f for f in os.listdir(img_folder)
        if f.endswith(".png") and not f.startswith(".")
    )

    for img_file in tqdm(img_files, desc=f"Processing {tumor}"):
        # Images are named "<tumor>_<id>.png"; the matching mask is
        # "<tumor>_mask_<id>.png" in the masks folder.
        base_id = img_file.replace(f"{tumor}_", "").replace(".png", "")
        mask_file = f"{tumor}_mask_{base_id}.png"

        img_path = os.path.join(img_folder, img_file)
        mask_path = os.path.join(mask_folder, mask_file)

        # Read in the image and mask (cv2.imread returns None on failure
        # instead of raising, hence the explicit check below).
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

        if img is None or mask is None:
            print(f"Failed to load: {img_path} or {mask_path}")
            continue

        # Resize to common dimensions. The image keeps the default (bilinear)
        # interpolation; the mask uses nearest-neighbour so that its values
        # are copied from the original mask rather than blended into
        # intermediate grey levels along the tumor boundary.
        img = cv2.resize(img, (256, 256))
        mask = cv2.resize(mask, (256, 256), interpolation=cv2.INTER_NEAREST)

        # Normalize to [0, 1]. float32 halves the memory of the float64 that
        # plain `uint8 / 255.0` would produce.
        img = img.astype(np.float32) / 255.0
        mask = mask.astype(np.float32) / 255.0

        # Append to data list
        data.append({
            'image': img,
            'mask': mask,
            'label': label_map[tumor]
        })

print(f"\nTotal samples loaded: {len(data)}")


Processing glioma: 100%|██████████| 1426/1426 [00:09<00:00, 143.14it/s]
Processing meningioma: 100%|██████████| 708/708 [00:08<00:00, 83.79it/s] 
Processing pituitary: 100%|██████████| 930/930 [00:12<00:00, 76.34it/s] 


Total samples loaded: 3064





In [4]:
# Sanity check: total number of samples and how many fall into each class.
print(f"Total samples loaded: {len(data)}")

labels = [sample['label'] for sample in data]
unique_labels, counts = np.unique(labels, return_counts=True)
label_histogram = dict(zip(unique_labels, counts))
print("Labels and counts:", label_histogram)

Total samples loaded: 3064
Labels and counts: {1: 1426, 2: 708, 3: 930}


In [37]:
# Convert to numpy arrays -- re-run this cell whenever `data` is reloaded.
#
# `images` is built directly as float32: it is the largest array in the
# notebook (N x 256 x 256) and float32 needs half the memory of NumPy's
# default float64.
images = np.array([item['image'] for item in data], dtype=np.float32)

# Masks are deliberately not stacked right now to save memory. To enable:
# masks = np.array([item['mask'] for item in data], dtype=np.float32)

# Integer class ids (still 1-based at this point).
labels = np.array([item['label'] for item in data])

# Add channel dimension for Tensorflow/Keras
images = np.expand_dims(images, axis=-1)  # shape: (N, 256, 256, 1)
# masks = np.expand_dims(masks, axis=-1)

In [38]:
# Train/Val/Test Split
from sklearn.model_selection import train_test_split

# Both splits hold out the same fraction and use the same seed.
split_options = dict(test_size=0.15, random_state=42)

# Stage 1: hold out the test set, stratified on the class labels.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    images, labels, stratify=labels, **split_options
)

# Stage 2: split what remains into train and validation, again stratified.
# Note the validation fraction is 15% of the remainder, not of the full set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, stratify=y_trainval, **split_options
)

## Build the CNN in Keras

In [39]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.metrics import Precision, Recall, SparseCategoricalAccuracy

# Small CNN classifier: three Conv(3x3, relu) + MaxPool(2x2) stages with
# 32, 64 and 128 filters, followed by a dense head.
model = models.Sequential()

# First stage also declares the input: 256x256 single-channel images.
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(256, 256, 1)))
model.add(layers.MaxPooling2D((2, 2)))

# Remaining convolutional stages.
for n_filters in (64, 128):
    model.add(layers.Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))

# Classification head.
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))  # 3 tumor classes

## Compile the Model

In [40]:
# Shift labels to 0-based indexing (1 --> 0, etc.) and flatten them to 1-D,
# which is what sparse_categorical_crossentropy expects for a 3-unit softmax.
#
# A bare `y = y - 1` is not idempotent: re-running this cell without
# re-running the split cell would shift the labels a second time (to -1..1)
# and break training. The helper below only shifts labels that are still
# 1-based, so the cell can be re-run safely.
def _to_zero_based(y, num_classes=3):
    """Return `y` as a flat array of class indices in [0, num_classes).

    Labels coming from the split are 1..num_classes. A maximum equal to
    `num_classes` can only occur before the shift, so it is used as the signal
    that the shift is still pending. This assumes every split contains the
    highest class, which holds here because the splits are stratified.

    Raises:
        ValueError: if the resulting labels fall outside [0, num_classes),
            e.g. because they were already shifted twice in this kernel.
    """
    y = y.flatten()
    if y.size and y.max() == num_classes:
        y = y - 1
    if y.size and (y.min() < 0 or y.max() >= num_classes):
        raise ValueError(
            f"Labels must lie in [0, {num_classes}); got range "
            f"[{y.min()}, {y.max()}]. Re-run the train/val/test split cell."
        )
    return y


y_train = _to_zero_based(y_train)
y_val = _to_zero_based(y_val)
y_test = _to_zero_based(y_test)


model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

## Train the Model

In [41]:
# Show the shape of the training inputs and labels, then the class balance.
for split_part in (X_train, y_train):
    print(split_part.shape)
print(np.unique(y_train, return_counts=True))

## Expected output format
# (2213, 256, 256, 1)
# (2213,)
# (array([0, 1, 2]), array([count_0, count_1, count_2]))

(2213, 256, 256, 1)
(2213,)
(array([0, 1, 2]), array([1030,  512,  671]))


In [20]:
# Count NaN and Inf entries in the training inputs, then in the labels;
# each line prints "<nan count> <inf count>" and both should be 0.
for checked in (X_train, y_train):
    print(np.isnan(checked).sum(), np.isinf(checked).sum())

0 0
0 0


In [21]:
# Quick look at the training labels after the 0-based shift.
distinct_classes = np.unique(y_train)
print(distinct_classes)       # Should output [0 1 2]

label_shape = y_train.shape
print(label_shape)            # Should be (N,) — a 1D array

first_labels = y_train[:5]
print(first_labels)           # Check first few labels


[0 1 2]
(2213,)
[2 1 0 2 2]


In [22]:
# Shapes of the full (unsplit) arrays, then the dtype of the labels.
for full_array in (images, labels):
    print(full_array.shape)   # (3064, 256, 256, 1) then (3064,)
print(labels.dtype)           # should be int32 or int64

(3064, 256, 256, 1)
(3064,)
int64


In [35]:
# Suppress warnings

import logging
import os
import warnings

# Python-level warnings (all categories).
warnings.filterwarnings("ignore")

# TensorFlow's Python logger: only errors and above.
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# TensorFlow's native (C++) log level: '3' requests errors only.
# NOTE(review): this variable is normally consulted when TensorFlow sets up
# its native logging, so setting it after the `import tensorflow` cell may
# have no effect — confirm, and consider moving this cell to the top.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


In [None]:
# Hide warnings
warnings.filterwarnings("ignore")

# The labels were already flattened to 1-D in the compile cell, so they are
# passed to fit() as-is.

# Training hyperparameters; the validation split is evaluated after each epoch.
fit_options = dict(
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
)

history = model.fit(X_train, y_train, **fit_options)

## Evaluate the Model

In [26]:
# Score the trained model on the held-out test split. With the single
# 'accuracy' metric configured at compile time, evaluate() returns
# [loss, accuracy].
evaluation = model.evaluate(X_test, y_test)
test_loss, test_acc = evaluation
print(f"\nTest Accuracy: {test_acc:.2f}")


Test Accuracy: 0.93


W0000 00:00:1744904171.686399  237933 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1744904171.686623  237933 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1744904171.686971  237933 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1744904171.687160  237933 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1744904171.687503  237933 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1744904171.687755  237933 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1744904171.688172  237933 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1744904171.688564  237933 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1744904171.689018  237933 gp