
### OBJECTIVE

Detect a patient's sex chromosomes from images of single-cell metaphase chromosomes.

### BACKGROUND

### DATA

**Source:** https://www.cellimagelibrary.org/pages/auto_chromosome_detector

Total metaphase images: <number>

Train: 
- male:
- female:
- monosomy X:

Validation: 
- normal male:
- normal female:
- monosomy X:

Test: 
- normal male:
- normal female:
- monosomy X:

The input data are pictures of human single-cell G-banded metaphase chromosomes in JPG format. All images are named using the scheme image_id_sexchromosomes.jpg. The "image_id" field is the 6-digit unique identifier from the original dataset. The "sexchromosomes" field is the sex chromosome complement (XX, XY or X) as labeled by the author (Kadi Jairus) according to the International System for Human Cytogenetic Nomenclature.

### METHOD

- TensorFlow is used to develop and train a Convolutional Neural Network (CNN) architecture aimed at classifying images.
- Different CNN architectures were tested to identify the most effective structure for our classification task.
- Data augmentation techniques, including random rotations, flips, and brightness adjustments, were applied to increase the robustness of the model and improve generalization.
- A systematic search was conducted for the optimal set of hyperparameters, including learning rate, batch size, and number of epochs.
- Various optimizers were tested such as Adam, SGD, and RMSprop to find the best algorithm for minimizing the loss function.
- A validation strategy was implemented to monitor the model's performance on unseen data during training.

In [1]:
"""Imports modules and creates directory for files."""


import datetime
import os
import shutil
import pathlib
import numpy as np
import matplotlib as mpl
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pytz

# Timestamp in Europe/Tallinn local time, computed once when this cell runs.
# It is embedded in the names of the report directory and of the output files.
tallinn_tz = pytz.timezone('Europe/Tallinn')
TODAY = datetime.now(tallinn_tz).strftime("%Y-%m-%d_%H-%M")

def create_report_dir(selected_model, batch_size, epochs):
    """Create (if needed) the output directory for the current training run.

    The directory name combines the notebook timestamp ``TODAY`` with the
    model name, the batch size and the number of epochs.

    :param selected_model: model name used in the directory name
    :param batch_size: batch size used in the directory name
    :param epochs: number of epochs used in the directory name
    :return: the directory name; it is returned even when creation was
        refused with a ``PermissionError`` (only a message is printed then)
    """
    run_dir_name = f'{TODAY}_{selected_model}_model_training_{batch_size}_batch_{epochs}_epochs'

    try:
        os.makedirs(run_dir_name, exist_ok=True)
    except PermissionError:
        print(f"Permission denied: Could not create directory '{run_dir_name}'.")
    else:
        print(f"Directory '{run_dir_name}' created successfully.")

    return run_dir_name


In [2]:
"""Sets up the environment and imports necessary libraries for TensorFlow and Keras."""


# Suppress tensorflow info messages
%env TF_CPP_MIN_LOG_LEVEL=2
%env TF_NUM_INTEROP_THREADS=8

# Set segmentation_models framework to the correct keras
%env SM_FRAMEWORK=tf.keras

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, Dropout, MaxPooling2D, AveragePooling2D, Flatten, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.regularizers import l2
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Use mixed_float16, which is a good speedup on HPC cluster GPUs
tf.keras.config.set_dtype_policy("mixed_float16")

# Sanity check: print the TensorFlow and Keras versions that were imported,
# or the error raised while reading them.
try: 
    tf_version = tf.__version__
    keras_version = tf.keras.__version__
    print(f"Setup succesful!\nTensorflow version {tf_version}. \nKeras version: {keras_version}")
except Exception as e:
    print(f"Setup failed :( {e}")

env: TF_CPP_MIN_LOG_LEVEL=2
env: TF_NUM_INTEROP_THREADS=8
env: SM_FRAMEWORK=tf.keras
Setup succesful!
Tensorflow version 2.16.1. 
Keras version: 3.7.0


Note: I'm sticking to tensorflow==2.16.1 because newer versions are basically broken on our system

Temp Fix from: https://github.com/tensorflow/tensorflow/issues/62075#issuecomment-2387257399

In [3]:
"""Optional code block. Checks the number of GPUs."""


# Abort early when TensorFlow cannot see any GPU; otherwise report the count.
num_gpus = len(tf.config.list_physical_devices('GPU'))
if not num_gpus:
    raise SystemError('No GPU devices found')
print(f"{num_gpus} gpu(s) found")

1 gpu(s) found


In [4]:
"""Specifies the location of data. Creates directories for models and logs."""


# Uncomment below to download data into localstorage
#!bash download_data.sh "/localstorage/$USER/isic_data"
def create_dirs_for_model_and_logs(data_dir, report_dir):
    """Locate the dataset splits and create the log and model directories.

    :param data_dir: directory expected to hold the ``train``, ``val`` and
        ``test`` sub-directories (only ``data_dir`` itself is checked)
    :param report_dir: report directory; ``<report_dir>/logs`` is created
    :return: tuple ``(train_dir, val_dir, test_dir)`` of ``pathlib.Path``
    :raises ValueError: if ``data_dir`` does not exist
    """
    root = pathlib.Path(data_dir)
    if not root.exists():
        raise ValueError("The specified data directory does not exist.")

    # Logs live inside the report directory, while "models" is relative to
    # the current working directory.
    needed_dirs = (f"{report_dir}/logs", "models")
    try:
        for needed in needed_dirs:
            os.makedirs(needed, exist_ok=True)
    except PermissionError as e:
        print(f"Permission denied: Could not create needed directories. {e}")
    else:
        print(f"Directories for logs and models created successfully in {report_dir}.")

    return tuple(root / split for split in ("train", "val", "test"))

In [5]:
"""Defines how picture data will be generated."""


from tensorflow.keras.preprocessing.image import ImageDataGenerator

def create_data_generators():
    """Build the image data generators for training, validation and testing.

    :return: tuple ``(train_datagen, val_datagen, test_datagen)``; only the
        training generator augments images, the other two just rescale
    """
    rescale_factor = 1./255  # multiply pixel values by 1/255

    augmentation = dict(
        rotation_range=20,        # random rotations, 20 degree range
        width_shift_range=None,   # no horizontal shift configured
        height_shift_range=None,  # no vertical shift configured
        shear_range=0.2,          # random shear (a geometric transform)
        zoom_range=0.05,          # random zoom
        horizontal_flip=True,     # random horizontal flips
        fill_mode='nearest',      # how pixels created by the transforms are filled
    )
    train_datagen = ImageDataGenerator(rescale=rescale_factor, **augmentation)

    # Validation and test images are only rescaled, never augmented.
    val_datagen = ImageDataGenerator(rescale=rescale_factor)
    test_datagen = ImageDataGenerator(rescale=rescale_factor)

    return (train_datagen, val_datagen, test_datagen)

In [6]:
"""Optional code block. Remove checkpoints if 4 classes instead of 3."""


def remove_ipynb_checkpoints(directory):
    """Delete every ``.ipynb_checkpoints`` folder found under ``directory``.

    Jupyter creates these hidden folders; inside a class-per-folder dataset
    they are picked up as an extra class (4 classes instead of 3).

    The function only cleans the directory it is given. The calls for the
    train/val/test directories used to sit inside this body, which made the
    function call itself on every invocation; they belong to the caller.

    :param directory: root directory to clean (walked recursively)
    """
    for root, dirs, _files in os.walk(directory):
        if '.ipynb_checkpoints' in dirs:
            checkpoint_path = os.path.join(root, '.ipynb_checkpoints')
            shutil.rmtree(checkpoint_path)
            # Prune the entry so os.walk does not try to descend into the
            # folder that was just deleted.
            dirs.remove('.ipynb_checkpoints')
            print(f"Removed {checkpoint_path}")


# Usage, once train_dir / val_dir / test_dir are defined:
#     for split_dir in (train_dir, val_dir, test_dir):
#         remove_ipynb_checkpoints(split_dir)

In [7]:
"""Generates training, validation and testing data."""


def get_data_generators(img_size, batch_size):
    """Create the directory iterators for the train, validation and test splits.

    Reads the module-level ``train_datagen``/``val_datagen``/``test_datagen``
    and ``train_dir``/``val_dir``/``test_dir``, which must be defined before
    this function is called.

    :param img_size: target size passed to ``flow_from_directory``
    :param batch_size: number of images per batch
    :return: tuple ``(train_generator, val_generator, test_generator)``
    """
    def flow(datagen, directory, shuffle):
        # All splits share the same image size, batch size and one-hot labels.
        return datagen.flow_from_directory(
            directory,
            target_size=img_size,
            batch_size=batch_size,
            class_mode='categorical',
            shuffle=shuffle
        )

    # Only the training data is shuffled; validation and test keep the
    # directory order.
    train_generator = flow(train_datagen, train_dir, True)
    val_generator = flow(val_datagen, val_dir, False)
    test_generator = flow(test_datagen, test_dir, False)

    splits = (
        ("training", train_generator),
        ("validation", val_generator),
        ("testing", test_generator),
    )
    for split_name, generator in splits:
        print(f"Class indices for {split_name} data:", generator.class_indices)

    return (train_generator, val_generator, test_generator)

In [8]:
"""Option 1. Defines the pre-trained Xception model."""


from tensorflow.keras.applications import Xception
from tensorflow.keras.layers import GlobalAveragePooling2D


def create_xception(input_shape, num_classes, dropout_rate, l2_factor=0.01):
    """
    Build a classifier on top of an ImageNet-initialised Xception base.

    :param input_shape: input shape handed to the Xception base
    :param num_classes: number of units of the softmax output layer
    :param dropout_rate: rate of the two Dropout layers in the head
    :param l2_factor: L2 regularisation factor of the Dense kernels
    :return: the (uncompiled) Sequential model
    """
    # Xception backbone without its classification top.
    backbone = Xception(include_top=False, weights="imagenet", input_shape=input_shape)

    # Fine-tune only the last 10 layers of the backbone; freeze the rest.
    backbone.trainable = True
    for frozen_layer in backbone.layers[:-10]:
        frozen_layer.trainable = False

    # Classification head: pooling, two regularised Dense+Dropout stages and
    # the softmax output.
    head = [GlobalAveragePooling2D()]
    for units in (128, 64):
        head.append(Dense(units, activation='relu', kernel_regularizer=l2(l2_factor)))
        head.append(Dropout(dropout_rate))
    head.append(Dense(num_classes, activation='softmax', kernel_regularizer=l2(l2_factor)))

    return Sequential([backbone, *head])


In [9]:
"""Option 2. Defines pretrained Alexnet model with dropout."""


def create_alexnet(input_shape, num_classes):
    """Build an AlexNet-style CNN with batch normalisation and dropout.

    No pre-trained weights are loaded here; all layers are created fresh.

    :param input_shape: input shape given to the first convolution
    :param num_classes: number of units of the softmax output layer
    :return: the (uncompiled) Sequential model
    """
    def pool():
        return MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')

    def same_conv(filters, kernel_size):
        return Conv2D(filters=filters, kernel_size=kernel_size, strides=(1,1), activation='relu', padding="same")

    # Layers are instantiated strictly in network order.
    stack = [
        Conv2D(filters=96, kernel_size=(11,11), strides=(4,4), activation='relu', input_shape=input_shape),
        BatchNormalization(),
        pool(),
        same_conv(256, (5,5)),
        BatchNormalization(),
        pool(),
    ]
    for filters in (384, 384, 256):
        stack.append(same_conv(filters, (3,3)))
        stack.append(BatchNormalization())
    stack.append(pool())
    stack.append(Flatten())
    for _ in range(2):
        stack.append(Dense(4096, activation='relu'))
        stack.append(Dropout(0.5))
    stack.append(Dense(num_classes, activation='softmax'))

    return Sequential(stack)

In [10]:
"""Specifies variables, compiles model with single hyperparameter set and creates summary."""


# Define class weights in float32 for numerical stability
class_weights = tf.constant([1.0, 1.0, 1.51], dtype=tf.float32)

@tf.function
def weighted_categorical_crossentropy(y_true, y_pred):
    """Categorical cross-entropy scaled per sample by the weight of its class.

    :param y_true: one-hot encoded labels
    :param y_pred: predicted class probabilities
    :return: one weighted loss value per sample
    """
    # Compute in float32 for numerical stability.
    labels = tf.cast(y_true, tf.float32)
    probabilities = tf.cast(y_pred, tf.float32)

    # The one-hot label selects the weight of the true class from the
    # module-level ``class_weights``.
    sample_weights = tf.reduce_sum(class_weights * labels, axis=1)

    plain_loss = tf.keras.losses.categorical_crossentropy(labels, probabilities)
    return plain_loss * sample_weights
    
def compile_model(report_dir, selected_model, input_shape, num_classes, dropout_rate, optimizer_name, learning_rate):
    """Build and compile the selected model and save its description.

    Writes a text summary and an architecture graph into ``report_dir``.

    :param report_dir: existing directory receiving the summary and the graph
    :param selected_model: ``"xception"``, ``"alexnet"``, or anything else for
        the ``create_cnn`` fallback
    :param input_shape: model input shape
    :param num_classes: number of output classes
    :param dropout_rate: dropout rate (used by the Xception head only)
    :param optimizer_name: ``"adam"`` or ``"sgd"``
    :param learning_rate: learning rate of the optimizer
    :return: the compiled model
    :raises ValueError: if ``optimizer_name`` is neither ``"adam"`` nor ``"sgd"``
    """
    if selected_model == "xception":
        model = create_xception(input_shape, num_classes, dropout_rate)
    elif selected_model == "alexnet":
        model = create_alexnet(input_shape, num_classes)
    else:
        # NOTE(review): create_cnn is not defined in the visible part of the
        # notebook - confirm it exists before selecting another model name.
        model = create_cnn(num_classes)

    if optimizer_name == "adam":
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_name == "sgd":
        optimizer = SGD(learning_rate=learning_rate)
    else:
        raise ValueError(f"Unexpected optimizer name: {optimizer_name}")

    # Compile with the class-weighted loss and named metrics, so that the
    # history keys are exactly 'precision' and 'recall'.
    model.compile(optimizer=optimizer,
                  loss=weighted_categorical_crossentropy,
                  metrics=['accuracy', Precision(name='precision'), Recall(name='recall')])

    # Save the model summary in table format. The file name uses the optimizer
    # *name*: interpolating the optimizer object would put its repr
    # ("<... object at 0x...>") into the file name.
    summary_file = f"{report_dir}/{TODAY}_Model_{selected_model}_architecture_summary_optimizer_{optimizer_name}.txt"
    # Explicit UTF-8 so that non-ASCII characters in the summary table can be
    # written whatever the locale's default encoding is.
    with open(summary_file, 'w', encoding='utf-8') as f:
        model.summary(print_fn=lambda x: f.write(x + '\n'))
    # Show the model in the notebook as well
    model.summary()

    # Save the model as a graph
    tf.keras.utils.plot_model(
        model,
        to_file=f"{report_dir}/{TODAY}_Model_{selected_model}_graph_optimizer_{optimizer_name}.png",
        show_shapes=True,
        show_dtype=True,
        show_layer_names=True,
        rankdir="TB",
        expand_nested=True,
        dpi=60,
    )

    return model
    

In [11]:
"""Trains model with single hyperparameter set and writes to file."""

"""
def train_model(report_dir, model, train_generator, val_generator, epochs, batch_size, optimizer):
    # Ensure report directory exists
    import os
    if not os.path.exists(report_dir):
        os.makedirs(report_dir)

    # Callbacks
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ModelCheckpoint(filepath=f"{report_dir}/best_model.keras", save_best_only=True, monitor='val_loss')
    ]

    # Fit the model
    history = model.fit(
        train_generator,
        steps_per_epoch=train_generator.samples // train_generator.batch_size,
        epochs=epochs,
        validation_data=val_generator,
        validation_steps=val_generator.samples // val_generator.batch_size,
        class_weight={0: 1.0, 1: 1.0, 2: 1.51},  # Adjust weights as needed
        callbacks=callbacks,
        verbose=1
    )

    # Print history keys
    keys = history.history.keys()
    print(f"Model history keys are: \n{keys}")

    # Plot metrics
    metrics = ['loss', 'accuracy', 'precision', 'recall']
    titles = ['Loss', 'Accuracy', 'Precision', 'Recall']
    f, ax = plt.subplots(2, 2, figsize=(12, 6))

    for i, metric in enumerate(metrics):
        row, col = divmod(i, 2)
        if metric in history.history:
            ax[row, col].plot(history.history[metric], label=f"Train {metric}", color='b')
            ax[row, col].plot(history.history.get(f"val_{metric}", []), label=f"Validation {metric}", color='r')
            ax[row, col].set_title(titles[i])
            ax[row, col].legend(loc="best")
        else:
            print(f"Metric {metric} not found in history.history. Skipping plot.")
    plt.tight_layout()

    # Save plot
    filename = f"{report_dir}/{epochs}_epochs_{batch_size}_batches_training_plot.png"
    try:
        plt.savefig(filename)
        print(f"Training plot saved to {filename}")
    except Exception as e:
        print(f"Saving plot failed: {e}")

    return model


"""

def train_model(report_dir, model, train_generator, val_generator, epochs, batch_size, optimizer):
    """Train ``model`` and save a 2x2 plot of the training curves.

    :param report_dir: directory that receives the training plot
    :param model: compiled Keras model
    :param train_generator: iterator over the training data
    :param val_generator: iterator over the validation data
    :param epochs: maximum number of epochs
    :param batch_size: batch size, used only in the plot file name
    :param optimizer: optimizer name, used only in the plot file name
    :return: the trained model
    """
    # The callback must be handed to fit(); creating it after training (as
    # before) never stops anything.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )

    history = model.fit(
        train_generator,
        # Number of whole batches per epoch (a trailing partial batch is dropped).
        steps_per_epoch=train_generator.samples // train_generator.batch_size,
        epochs=epochs,
        validation_data=val_generator,
        validation_steps=val_generator.samples // val_generator.batch_size,
        callbacks=[early_stopping]
    )
    metrics_log = history.history

    keys = metrics_log.keys()
    print(f"Model history keys are: \n{keys}")

    # One subplot per metric: training curve in blue, validation curve in red.
    f, ax = plt.subplots(2, 2, figsize=(12, 6))
    panels = (
        (0, 0, 'loss', "upper right"),
        (0, 1, 'accuracy', "lower right"),
        (1, 0, 'precision', "lower right"),
        (1, 1, 'recall', "lower right"),
    )
    for row, col, metric, legend_loc in panels:
        # Plotting is best effort: a missing metric or any plotting problem
        # must not discard a model that has just finished training.
        try:
            ax[row, col].plot(metrics_log[metric], color='b', label=f'train_{metric}')
            ax[row, col].plot(metrics_log[f'val_{metric}'], color='r', label=f'val_{metric}')
            ax[row, col].legend(loc=legend_loc)
        except Exception as e:
            print(f'Creating plot {row},{col} failed: {e}')

    # Save the plots, falling back to the working directory if the report
    # directory cannot be written.
    filename = f"{report_dir}/{TODAY}_CNN_training_plot_{epochs}_epochs_{batch_size}_batches_optimizer_{optimizer}.png"
    try:
        f.savefig(filename)
        print('Saved figure')
    except Exception as e:
        print(f'Saving figure to first location failed: {e}')
        try:
            f.savefig('training_image_bug')
            print('Saved figure to home directory')
        except Exception as e:
            print(f'Saving figure failed: {e}')

    return model


In [12]:
"""Option 2. Trains model with multiple sets of hyperparameters and writes to file."""

import os
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD

# Define a helper to create a log directory
def get_run_logdir(run_name, root_logdir):
    """Return ``<root_logdir>/<run_name>_<timestamp>`` for a new run.

    :param run_name: prefix of the run directory; it is part of the
        ``strftime`` format string, so ``%`` directives in it are expanded
    :param root_logdir: parent directory of all run directories
    :return: the joined path (the directory itself is not created)
    """
    stamp_format = f"{run_name}_%Y_%m_%d-%H_%M_%S"
    return os.path.join(root_logdir, time.strftime(stamp_format))


def compile_train_and_test_with_multiple_hp(
    REPORT_DIR,
    train_generator,
    val_generator,
    SELECTED_MODEL,
    INPUT_SHAPE,
    NUM_CLASSES,
    HP_LEARNING_RATE,
    HP_DROPOUT,
    HP_OPTIMIZER,
    METRIC_ACCURACY,
    EPOCHS,
    BATCH_SIZE,
):
    """Train and test one model per hyperparameter combination (grid search).

    Every trial gets its own sub-directory ``<REPORT_DIR>/trial_NN`` and its
    own model file, so later trials no longer overwrite the info file, plots,
    reports and saved model of earlier ones. The hyperparameters of each trial
    are logged for the TensorBoard HParams dashboard.

    Requires ``hp`` (``tensorboard.plugins.hparams.api``) to be imported in
    the notebook before this function is called.

    :param REPORT_DIR: parent directory of the per-trial report directories
    :param train_generator: iterator over the training data
    :param val_generator: iterator over the validation data
    :param SELECTED_MODEL: model name passed to ``compile_model``
    :param INPUT_SHAPE: model input shape
    :param NUM_CLASSES: number of output classes
    :param HP_LEARNING_RATE: ``hp.HParam`` with the learning rates to try
    :param HP_DROPOUT: ``hp.HParam`` with the dropout rates to try
    :param HP_OPTIMIZER: ``hp.HParam`` with the optimizer names to try
    :param METRIC_ACCURACY: tag under which the accuracy metric is logged
    :param EPOCHS: maximum number of epochs per trial
    :param BATCH_SIZE: batch size (recorded in the info file and plot name)
    """
    root_logdir = os.path.join(os.curdir, "logs_hp_training")
    run_name = "run"

    # Declare the search space and the metric for the HParams dashboard.
    search_space = [HP_LEARNING_RATE, HP_DROPOUT, HP_OPTIMIZER]
    metrics = [hp.Metric(METRIC_ACCURACY, display_name="Accuracy")]
    with tf.summary.create_file_writer(os.path.join(root_logdir, f"{TODAY}_hparam_tuning")).as_default():
        hp.hparams_config(hparams=search_space, metrics=metrics)

    os.makedirs("models", exist_ok=True)

    session_num = 0
    for dropout_rate in HP_DROPOUT.domain.values:
        for optimizer in HP_OPTIMIZER.domain.values:
            for learning_rate in HP_LEARNING_RATE.domain.values:
                session_num += 1
                trial_tag = f"trial_{session_num:02d}"
                hparams = {
                    HP_LEARNING_RATE: learning_rate,
                    HP_DROPOUT: dropout_rate,
                    HP_OPTIMIZER: optimizer,
                }
                run_dir = get_run_logdir(f"{run_name}_{session_num:02d}", root_logdir)
                print(f"--- Starting trial: {run_dir}")
                print({h.name: hparams[h] for h in hparams})

                # Separate output directory per trial.
                trial_report_dir = os.path.join(REPORT_DIR, trial_tag)
                os.makedirs(trial_report_dir, exist_ok=True)

                # Initialize model
                model = compile_model(trial_report_dir, SELECTED_MODEL, INPUT_SHAPE, NUM_CLASSES,
                                      dropout_rate, optimizer, learning_rate)

                # Write training info
                info_file = os.path.join(trial_report_dir, f"{TODAY}_training_info.txt")
                with open(info_file, "w") as f:
                    f.write(f"Training model: {SELECTED_MODEL}\n")
                    f.write(f"Input shape: {INPUT_SHAPE}\n")
                    f.write(f"Epochs: {EPOCHS}\n")
                    f.write(f"Batch size: {BATCH_SIZE}\n")
                    f.write(f"Optimizer: {optimizer}\n")
                    f.write(f"Learning rate: {learning_rate}\n")
                    f.write(f"Dropout rate: {dropout_rate}\n")

                # Train model
                trained_model = train_model(trial_report_dir, model, train_generator, val_generator,
                                            EPOCHS, BATCH_SIZE, optimizer)

                # Log this trial's hyperparameters and, when the training
                # history provides it, the validation accuracy of the last
                # epoch that was run.
                history = getattr(trained_model, "history", None)
                val_accuracy = (getattr(history, "history", None) or {}).get("val_accuracy")
                with tf.summary.create_file_writer(run_dir).as_default():
                    hp.hparams(hparams)
                    if val_accuracy:
                        tf.summary.scalar(METRIC_ACCURACY, val_accuracy[-1], step=1)

                # Save model (one file per trial)
                save_path = os.path.join("models", f"{TODAY}_Cyto_cnn_{SELECTED_MODEL}_{trial_tag}.keras")
                trained_model.save(save_path)

                # Test model
                test_model(save_path, trial_report_dir)


In [13]:
"""Test model."""


import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Accuracy, Precision, Recall


def test_model(model_path, report_dir):
    """Evaluate a saved model on the module-level ``test_generator``.

    Prints accuracy, precision, recall and the confusion matrix, writes them
    to a report file in ``report_dir`` and saves images of up to 10 wrongly
    classified test samples there.

    Precision and recall come from single default-configured Keras
    ``Precision``/``Recall`` metrics applied to the one-hot labels and the
    predicted probabilities, i.e. one aggregated value each, not one value
    per class.

    :param model_path: path of the saved model to load
    :param report_dir: directory receiving the report and the images
    :raises Exception: whatever ``load_model`` raised, after printing it
    """
    try:
        model = tf.keras.models.load_model(model_path, compile=False)
    except Exception as e:
        # Nothing can be evaluated without a model: report and propagate
        # instead of failing later with an unrelated NameError.
        print(f"Loading model failed. {e}")
        raise
    print(f"Loading model successful! Path: {model_path}")

    # Predictions and true labels
    predictions = model.predict(test_generator)
    predicted_classes = np.argmax(predictions, axis=1)
    true_labels = test_generator.classes
    class_labels = list(test_generator.class_indices.keys())
    num_labels = len(class_labels)

    # Keras metrics
    accuracy_metric = Accuracy()
    accuracy_metric.update_state(true_labels, predicted_classes)
    accuracy = accuracy_metric.result().numpy()

    # Confusion matrix: rows are true classes, columns are predicted classes
    conf_matrix = np.zeros((num_labels, num_labels), dtype=int)
    for true, pred in zip(true_labels, predicted_classes):
        conf_matrix[true, pred] += 1

    # Aggregated precision and recall over all classes
    one_hot_labels = to_categorical(true_labels, num_classes=num_labels)
    precision_metric = Precision()
    recall_metric = Recall()
    precision_metric.update_state(one_hot_labels, predictions)
    recall_metric.update_state(one_hot_labels, predictions)
    precision = precision_metric.result().numpy()
    recall = recall_metric.result().numpy()

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)

    # Save the report
    filename = f"{report_dir}/{TODAY}_classification_report_model_testing.txt"
    try:
        os.makedirs(report_dir, exist_ok=True)
        with open(filename, 'w') as f:
            f.write("Classification Report\n")
            f.write(f"Accuracy: {accuracy:.4f}\n")
            f.write(f"Precision: {precision:.4f}\n")
            f.write(f"Recall: {recall:.4f}\n")
            f.write("\nConfusion Matrix:\n")
            for row in conf_matrix:
                f.write(' '.join(map(str, row)) + '\n')
    except OSError as e:
        print(f"Error saving report: {e}")

    # Visualize the first wrongly classified samples. Batches are fetched by
    # index, so only the batches that contain such samples are loaded.
    incorrect_indices = np.where(predicted_classes != true_labels)[0]
    num_samples_to_plot = 10
    samples_per_batch = test_generator.batch_size
    loaded_batch_index = None
    images = None

    for plotted_count, sample_index in enumerate(incorrect_indices[:num_samples_to_plot]):
        batch_index, offset = divmod(int(sample_index), samples_per_batch)
        if batch_index != loaded_batch_index:
            images, _labels = test_generator[batch_index]
            loaded_batch_index = batch_index

        true_class = class_labels[true_labels[sample_index]]
        predicted_class = class_labels[predicted_classes[sample_index]]

        fig = plt.figure(figsize=(4, 4))
        # Images were rescaled by 1/255 by the generator; undo it for display.
        plt.imshow((images[offset] * 255).astype("uint8"))
        plt.title(f"True: {true_class}, Predicted: {predicted_class}")
        plt.axis('off')

        plot_filename = f"{report_dir}/Wrong_prediction_{plotted_count + 1}_{true_class}_predicted_{predicted_class}.png"
        plt.savefig(plot_filename, format='png')
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    # Printed once, after all work is done (it used to run inside the loop).
    print("Testing model succesful!")


In [14]:
"""Main. Common for options 1 and 2."""


"""Adjusts the parameters."""
# IMG_SIZE is the size images are resized to by the generators; INPUT_SHAPE is
# the model input shape (same height/width plus 3 channels).
IMG_SIZE = (300, 300)
INPUT_SHAPE = (300, 300, 3)
# Model to build: compile_model handles "xception" and "alexnet" explicitly.
SELECTED_MODEL = "xception"
# Uses 'adam' or 'sgd' optimizer
OPTIMIZER = "adam"
BATCH_SIZE = 2
EPOCHS = 70
DROPOUT_RATE = 0.3
LEARNING_RATE = 1e-4
NUM_CLASSES = 3

"""Creates directories."""
REPORT_DIR = create_report_dir(SELECTED_MODEL, BATCH_SIZE, EPOCHS)

# The dataset is expected in ./data with train/val/test sub-directories.
dirs = create_dirs_for_model_and_logs("data", REPORT_DIR)
train_dir = dirs[0]
val_dir = dirs[1]
test_dir = dirs[2]

"""Generates data."""
# These module-level names are read by get_data_generators() and test_model().
data_gens = create_data_generators()
train_datagen = data_gens[0]
val_datagen = data_gens[1]
test_datagen = data_gens[2]

generators = get_data_generators(IMG_SIZE, BATCH_SIZE)
train_generator = generators[0]
val_generator = generators[1]
test_generator = generators[2]

"""Validates generator"""
# Print the shapes and labels of a single training batch, then stop.
print("Class indices:", train_generator.class_indices)
for x_batch, y_batch in train_generator:
    print(f"x_batch shape: {x_batch.shape}, y_batch shape: {y_batch.shape}")
    print(f"y_batch: {y_batch}")
    break

"""Validates custom loss function"""
# Neither example belongs to class index 2, so both weights are 1.0 and the
# printed values equal the plain categorical cross-entropy.
y_true = tf.constant([[0, 1, 0], [1, 0, 0]], dtype=tf.float32)  # Example one-hot labels
y_pred = tf.constant([[0.1, 0.7, 0.2], [0.8, 0.1, 0.1]], dtype=tf.float32)  # Example predictions
print(weighted_categorical_crossentropy(y_true, y_pred))



Directory '2024-12-31_11-39_xception_model_training_2_batch_70_epochs' created successfully.
Directories for logs and models created successfully in 2024-12-31_11-39_xception_model_training_2_batch_70_epochs.
Found 1023 images belonging to 3 classes.
Found 288 images belonging to 3 classes.
Found 288 images belonging to 3 classes.
Class indices for training data: {'female': 0, 'male': 1, 'monosomy': 2}
Class indices for validation data: {'female': 0, 'male': 1, 'monosomy': 2}
Class indices for testing data: {'female': 0, 'male': 1, 'monosomy': 2}
Class indices: {'female': 0, 'male': 1, 'monosomy': 2}
x_batch shape: (2, 300, 300, 3), y_batch shape: (2, 3)
y_batch: [[0. 1. 0.]
 [1. 0. 0.]]
tf.Tensor([0.35667497 0.22314353], shape=(2,), dtype=float32)


In [None]:
"""Option 1. Main with single model."""

"""Compiles model.""" 
model = compile_model(REPORT_DIR, SELECTED_MODEL, INPUT_SHAPE, NUM_CLASSES, DROPOUT_RATE, OPTIMIZER, LEARNING_RATE)

"""Validating custom loss function""" 
# Run one real training batch through the untrained model and print its
# per-sample weighted loss.
print("Testing with small batch") 
x_batch, y_batch = next(iter(train_generator)) 
y_pred = model(x_batch) 
loss = weighted_categorical_crossentropy(y_batch, y_pred) 
print("Loss for the batch:", loss.numpy())

"""Writes training details to file.""" 
print(f"Training model {SELECTED_MODEL}:\n epochs: {EPOCHS} \n batch_size: {BATCH_SIZE} \n optimizer: {OPTIMIZER} \n learning rate: {LEARNING_RATE} \n dropout_rate: {DROPOUT_RATE}") 
# NOTE(review): the file name has no separators between its fields (epochs 70
# and batch size 2 read as "702"); underscores were possibly lost - confirm.
info_file = os.path.join(REPORT_DIR, f"{TODAY}training_info{EPOCHS}{BATCH_SIZE}{OPTIMIZER}.txt") 
with open(info_file, "w") as f: 
    f.write(f"Training model {SELECTED_MODEL}:\n") 
    f.write(f"Input shape {str(INPUT_SHAPE)}:\n") 
    f.write(f"epochs: {EPOCHS}\n") 
    f.write(f"batch_size: {BATCH_SIZE}\n") 
    f.write(f"optimizer: {OPTIMIZER}\n") 
    f.write(f"learning rate: {LEARNING_RATE}\n") 
    f.write(f"dropout_rate: {DROPOUT_RATE}\n")

"""Trains model.""" 
trained_model = train_model(REPORT_DIR, model, train_generator, val_generator, EPOCHS, BATCH_SIZE, OPTIMIZER)

"""Saves model.""" 
# NOTE(review): same missing-separator pattern as the info file name above.
save_path = f"models/{TODAY}Cyto_cnn{SELECTED_MODEL}model{EPOCHS}epochs{BATCH_SIZE}batches_optimizer{OPTIMIZER}lr{LEARNING_RATE}.keras" 
trained_model.save(save_path)

"""Tests model.""" 
# test_model reloads the model from save_path and evaluates it on the
# module-level test_generator.
test_model(save_path, REPORT_DIR)


You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.
Testing with small batch
Loss for the batch: [1.5405672 1.4955688]
Training model xception:
 epochs: 70 
 batch_size: 2 
 optimizer: adam 
 learning rate: 0.0001 
 dropout_rate: 0.3
Epoch 1/70


  self._warn_if_super_not_called()
I0000 00:00:1735637975.860544 2378272 service.cc:145] XLA service 0x7f9ff4002ac0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735637975.860592 2378272 service.cc:153]   StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2024-12-31 11:39:41.511928: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Constant folding an instruction is taking > 1s:

  %reduce-window.1695 = f16[2,74,74,128]{3,2,1,0} reduce-window(f16[2,147,147,128]{3,2,1,0} %constant.509, f16[] %constant.1690), window={size=1x3x3x1 stride=1x2x2x1 pad=0_0x1_1x1_1x0_0}, to_apply=%max_F16.1691, metadata={op_type="MaxPool" op_name="sequential_1/xception_1/block2_pool_1/MaxPool2d" source_file="/opt/miniforge/envs/ogun/lib/python3.11/site-packages/tensorflow/python/framework/ops.py" source_line=1177}

This isn't necessarily a bug; constant-folding is inherently a trade-off between compilation time and speed a

[1m  1/511[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6:13:05[0m 44s/step - accuracy: 0.0000e+00 - loss: 4.6791 - precision: 0.0000e+00 - recall: 0.0000e+00

I0000 00:00:1735638014.710753 2378272 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2024-12-31 11:40:23.141916: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Constant folding an instruction is taking > 4s:

  %reduce-window.1689 = f16[2,74,74,128]{3,2,1,0} reduce-window(f16[2,147,147,128]{3,2,1,0} %constant.511, f16[] %constant.1684), window={size=1x3x3x1 stride=1x2x2x1 pad=0_0x1_1x1_1x0_0}, to_apply=%max_F16.1685, metadata={op_type="MaxPool" op_name="sequential_1/xception_1/block2_pool_1/MaxPool2d" source_file="/opt/miniforge/envs/ogun/lib/python3.11/site-packages/tensorflow/python/framework/ops.py" source_line=1177}

This isn't necessarily a bug; constant-folding is inherently a trade-off between compilation time and speed at runtime. XLA has some guards that attempt to keep constant folding from taking too long, but fundamentally you'll always be able to come up with an input program that takes a 

[1m  2/511[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:12:03[0m 37s/step - accuracy: 0.1250 - loss: 4.6600 - precision: 0.0000e+00 - recall: 0.0000e+00    

2024-12-31 11:41:03.897910: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Constant folding an instruction is taking > 8s:

  %reduce-window.1689 = f16[2,74,74,128]{3,2,1,0} reduce-window(f16[2,147,147,128]{3,2,1,0} %constant.511, f16[] %constant.1684), window={size=1x3x3x1 stride=1x2x2x1 pad=0_0x1_1x1_1x0_0}, to_apply=%max_F16.1685, metadata={op_type="MaxPool" op_name="sequential_1/xception_1/block2_pool_1/MaxPool2d" source_file="/opt/miniforge/envs/ogun/lib/python3.11/site-packages/tensorflow/python/framework/ops.py" source_line=1177}

This isn't necessarily a bug; constant-folding is inherently a trade-off between compilation time and speed at runtime. XLA has some guards that attempt to keep constant folding from taking too long, but fundamentally you'll always be able to come up with an input program that takes a long time.

If you'd like to file a bug, run with envvar XLA_FLAGS=--xla_dump_to=/tmp/foo and attach the results.
2024-12-31 11:41:04.366389: E external/loc

[1m  5/511[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:09:44[0m 37s/step - accuracy: 0.2917 - loss: 4.6305 - precision: 0.0000e+00 - recall: 0.0000e+00

In [122]:
"""Option 2. Main with hp."""

from tensorboard.plugins.hparams import api as hp

# Define hyperparameters for tuning
# Each HParam lists the discrete values to try; the search function trains one
# model per combination (here 1 learning rate x 3 dropout rates x 1 optimizer).
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete([1e-05]))
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.4, 0.5, 0.]))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam']))
# Tag of the metric registered for the HParams dashboard.
METRIC_ACCURACY = 'accuracy'

# Reuses REPORT_DIR, the generators and the parameters defined in the common
# main cell.
compile_train_and_test_with_multiple_hp(REPORT_DIR, train_generator, val_generator,  
                                        SELECTED_MODEL, INPUT_SHAPE, 
                                        NUM_CLASSES, HP_LEARNING_RATE, HP_DROPOUT, 
                                        HP_OPTIMIZER, METRIC_ACCURACY,
                                        EPOCHS, BATCH_SIZE)

--- Starting trial: ./logs_hp_training/run_2024_12_31-10_37_37
{'learning_rate': 1e-05, 'dropout': 0.0, 'optimizer': 'adam'}


You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.


  self._warn_if_super_not_called()


Epoch 1/150


ValueError: Could not interpret loss identifier: weighted_categorical_crossentropy