# Learning Rate Range Test
## Experiments

In [1]:
import os
import pathlib
import tensorflow as tf
import pandas as pd
import wandb
import numpy as np

from bcd.model.callback import LRRangeTestCallback
from bcd.model.network.base import NetworkConfig
from bcd.model.network.tmnet import TMNetConfig, TMNetFactory
from bcd.model.store import ExperimentRepo
from bcd.model.pretrained import DenseNet, EfficientNet, Inception, InceptionResNet, MobileNet, ResNet, Xception
from bcd.model.experiment import FeatureExtractionExperiment
from bcd.model.config import ProjectConfig, DatasetConfig, CheckPointConfig, TrainConfig, EarlyStopConfig, LearningRateScheduleConfig, Config, ExperimentConfig
from bcd.model.adapter import LocalAdapter, Adapter

## Parameters

In [2]:
# Run mode; passed to the adapter, the project and dataset configurations, and the experiment repository.
mode = "Development"
# Passed to the experiment as its `force` argument — presumably forces a re-run of an existing experiment; confirm in FeatureExtractionExperiment.
force = True
# Base model handed to the TMNet factory when the network is created.
base_model = DenseNet()

## Platform 
The adapter object encapsulates platform-dependent settings such as the device type, distribution strategy, API keys, and file paths.

In [3]:
# The adapter exposes the platform-dependent settings used throughout this notebook
# (device type, distribution strategy, W&B API key, data and model directories).
adapter = LocalAdapter(mode=mode)
print(f"Running on {adapter.device_type}")

# Obtain the TensorFlow distribution strategy from the adapter; later cells build the callbacks, metrics, and model inside its scope.
strategy  = adapter.get_strategy()

# Log in to Weights & Biases, used for model and metric tracking. The API key is supplied by the adapter.
wandb.login(key=adapter.wandb_api_key)

Running on CPU
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33maistudio[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/john/.netrc


True

## Reproducibility

In [4]:
def seed_everything(seed: int = 42) -> None:
    """Seeds the Python, NumPy, and TensorFlow random number generators.

    A fixed integer seed is used instead of a value derived from ``hash()`` of a
    string. Python salts string hashes per process unless PYTHONHASHSEED is set,
    so a hash-derived seed differs on every kernel restart and the run is not
    reproducible. The previous expression could also evaluate to -1, which
    ``np.random.seed`` rejects.

    Args:
        seed: Non-negative integer below 2**32 applied to every generator.
    """
    import random  # local import: only this function needs it

    # Environment flag read by TensorFlow to prefer deterministic cuDNN ops.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


seed_everything()

## Build Configuration

In [5]:
def build_config(adapter: Adapter, mode: str, strategy: tf.distribute.Strategy) -> Config:
    """Assembles the experiment configuration for the learning rate range test.

    Args:
        adapter: Platform adapter supplying the device type and the model directory.
        mode: Run mode forwarded to the project and dataset configurations.
        strategy: Distribution strategy; its replica count scales the batch size
            and the learning rate when the device type is "TPU".

    Returns:
        An ExperimentConfig bundling the project, dataset, training, network,
        checkpoint, early stop, and learning rate schedule settings.
    """
    # The same epoch count drives both training and the learning rate sweep.
    epochs = 10

    # Off TPU the batch size is 32 and the learning rate 1e-4. On TPU the batch
    # size is 16 per replica and the learning rate is scaled by the replica count.
    if adapter.device_type == "TPU":
        replicas = strategy.num_replicas_in_sync
        batch_size = 16 * replicas
        learning_rate = 1e-4 * replicas
    else:
        batch_size = 32
        learning_rate = 1e-4

    # Parameters that define the project in Weights & Biases.
    project = ProjectConfig(
        name="Breast-Cancer-Detection-Development-Learning-Rate-Range-Test",
        mode=mode,
    )

    # TMNet head: two dense layers of 1048 and 1024 units ahead of a sigmoid output.
    # NOTE(review): 1048 may be a typo for 1024 or 2048 — confirm before changing,
    # since it alters the architecture.
    network = TMNetConfig(
        activation="sigmoid",
        input_shape=(224, 224, 3),
        output_shape=1,
        dense1=1048,
        dense2=1024,
    )

    dataset = DatasetConfig(mode=mode, batch_size=batch_size)

    train = TrainConfig(epochs=epochs, learning_rate=learning_rate)

    # Checkpoint settings: weights only, best validation accuracy, stored in the
    # model directory given by the adapter.
    checkpoint = CheckPointConfig(
        directory=adapter.model_dir,
        monitor="val_accuracy",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode="auto",
    )

    # Early stop settings: watch validation loss with a patience of 10 epochs and
    # restore the best weights.
    early_stop = EarlyStopConfig(
        min_delta=1e-4,
        monitor="val_loss",
        patience=10,
        restore_best_weights=True,
        verbose=1,
    )

    # Bounds and duration of the learning rate range test; these values are later
    # passed to LRRangeTestCallback.
    learning_rate_schedule = LearningRateScheduleConfig(
        min_lr=1e-5,
        max_lr=1e-1,
        epochs=epochs,
    )

    # Everything is bundled into a single experiment configuration object.
    return ExperimentConfig(
        project=project,
        dataset=dataset,
        train=train,
        network=network,
        checkpoint=checkpoint,
        early_stop=early_stop,
        learning_rate_schedule=learning_rate_schedule,
    )


# Build the configuration used by the rest of the notebook.
config = build_config(adapter=adapter, mode=mode, strategy=strategy)

## Build Dataset

In [6]:

def build_dataset(train_dir: str, subset: str, dataset_config: Config, tpu: bool = False) -> tf.data.Dataset:
    """Loads one split of the image directory as a TensorFlow Dataset.

    Args:
        train_dir: Path to the image directory. Any suffix on the final path
            component is removed before loading.
        subset: Split selector forwarded to Keras; this notebook uses
            "training" and "validation".
        dataset_config: Dataset settings (labels, color mode, image size, shuffle,
            validation split, interpolation, seed, and batch size).
        tpu: Not used by this function; kept so existing callers keep working.

    Returns:
        The dataset produced by tf.keras.utils.image_dataset_from_directory.
    """
    image_root = pathlib.Path(train_dir).with_suffix('')
    loader_options = dict(
        labels=dataset_config.labels,
        color_mode=dataset_config.color_mode,
        image_size=dataset_config.image_size,
        shuffle=dataset_config.shuffle,
        validation_split=dataset_config.validation_split,
        subset=subset,
        interpolation=dataset_config.interpolation,
        seed=dataset_config.seed,
        batch_size=dataset_config.batch_size,
    )
    return tf.keras.utils.image_dataset_from_directory(image_root, **loader_options)


# Both splits read the same directory; the shared validation split and seed in the
# dataset configuration determine which files land in each.
train_ds = build_dataset(
    train_dir=adapter.train_dir,
    subset="training",
    dataset_config=config.dataset,
)
val_ds = build_dataset(
    train_dir=adapter.train_dir,
    subset="validation",
    dataset_config=config.dataset,
)

Found 276 files belonging to 2 classes.
Using 221 files for training.
Found 276 files belonging to 2 classes.
Using 55 files for validation.


## Dataset Optimization

In [7]:
# Random horizontal flips and rotations, applied to the training batches only.
augmentation_layers = [
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.2),
]
data_augmentation = tf.keras.Sequential(augmentation_layers)


def augment_batch(images, labels):
    """Applies the augmentation layers to the images and passes the labels through."""
    return data_augmentation(images, training=True), labels


# The dataset was built with a batch size, so its length is the number of batches
# and the shuffle buffer holds every batch.
num_train_batches = len(train_ds)
train_ds = train_ds.cache()
train_ds = train_ds.shuffle(buffer_size=num_train_batches)
train_ds = train_ds.map(augment_batch, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

## Build Callbacks
Only one callback is built in this notebook, the learning rate range test callback. The early stop and model checkpoint settings are defined in the configuration above, but their callbacks are not constructed here.
1. Learning Rate Range Test: The learning rate is raised from the configured minimum to the configured maximum over the designated number of epochs, so that training and validation metrics can be compared across learning rates.
2. Early Stop (configured only): To mitigate overfitting caused by excessive training, training stops once validation loss hasn't improved in a designated number of epochs.
3. Model Checkpoint (configured only): A model checkpoint is taken when the validation accuracy has improved, so that the weights at the best validation accuracy can be restored.


In [8]:
def build_callbacks(config: Config) -> list:
    """Builds the callback list for the learning rate range test.

    Args:
        config: Experiment configuration; only the min_lr, max_lr, and epochs
            values of its learning_rate_schedule section are read.

    Returns:
        A single-element list holding the LRRangeTestCallback.
    """
    schedule = config.learning_rate_schedule
    return [
        LRRangeTestCallback(
            min_lr=schedule.min_lr,
            max_lr=schedule.max_lr,
            epochs=schedule.epochs,
        )
    ]


# The callback is created inside the distribution strategy scope.
with strategy.scope():
    callbacks = build_callbacks(config=config)

## Dependencies

In [9]:
# Factory that creates TMNet networks from the network section of the configuration.
factory = TMNetFactory(config=config.network)
# Experiment repository for this mode and Weights & Biases project name.
repo = ExperimentRepo(mode=mode, project=config.project.name, adapter=adapter)
# The optimizer class itself, not an instance, is handed to the experiment.
optimizer = tf.keras.optimizers.Adam
with strategy.scope():
    # Metric objects are instantiated inside the distribution strategy scope.
    metric_classes = (
        tf.keras.metrics.AUC,
        tf.keras.metrics.Precision,
        tf.keras.metrics.Recall,
    )
    metrics = ['accuracy'] + [metric_cls() for metric_cls in metric_classes]

## Build Model
The model is encapsulated in a Network object that also contains metadata, the base model, and the network configuration.

In [10]:
# The network and the experiment are created, and the experiment is run, inside
# the distribution strategy scope.
with strategy.scope():
    network = factory.create(base_model=base_model)
    # Tags allow models and runs to be searched on Weights and Biases. The device
    # tag is read from the adapter instead of being hard-coded, so runs on other
    # devices are labelled correctly.
    tags = [
        str(adapter.device_type),
        network.name,
        network.architecture,
        base_model.name,
        "lr_range_test",
    ]
    experiment = FeatureExtractionExperiment(
        network=network,
        config=config,
        optimizer=optimizer,
        repo=repo,
        metrics=metrics,
        callbacks=callbacks,
        checkpoint=False,
        tags=tags,
        force=force,
    )
    experiment.run(train_ds=train_ds, val_ds=val_ds)

                                           TMNet_DenseNet                                           
# ------------------------------------------------------------------------------------------------ #
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 TMNet_DenseNet_input_layer   [(None, 224, 224, 3)]    0         
 (InputLayer)                                                    
                                                                 
 tf.math.truediv (TFOpLambda  (None, 224, 224, 3)      0         
 )                                                               
                                                                 
 tf.nn.bias_add (TFOpLambda)  (None, 224, 224, 3)      0         
                                                                 
 tf.math.truediv_1 (TFOpLamb  (None, 224, 224, 3)      0         
 da)                                                     

INFO:LRRangeTestCallback:Setting learning rate to 1e-05.
Epoch 1/10
INFO:LRRangeTestCallback:Setting learning rate to 0.011120000000000001.
Epoch 2/10
INFO:LRRangeTestCallback:Setting learning rate to 0.022230000000000003.
Epoch 3/10
INFO:LRRangeTestCallback:Setting learning rate to 0.03334000000000001.
Epoch 4/10
INFO:LRRangeTestCallback:Setting learning rate to 0.04445000000000001.
Epoch 5/10
INFO:LRRangeTestCallback:Setting learning rate to 0.05556000000000001.
Epoch 6/10
INFO:LRRangeTestCallback:Setting learning rate to 0.06667000000000001.
Epoch 7/10
INFO:LRRangeTestCallback:Setting learning rate to 0.07778.
Epoch 8/10
INFO:LRRangeTestCallback:Setting learning rate to 0.08889000000000001.
Epoch 9/10
INFO:LRRangeTestCallback:Setting learning rate to 0.1.
Epoch 10/10




VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch/accuracy,▁▁█▂▂▇▁▂▃▃
epoch/auc,▄▄█▆▅█▅▁▁▃
epoch/epoch,▁▂▃▃▄▅▆▆▇█
epoch/learning_rate,▁▂▃▃▄▅▆▆▇█
epoch/loss,▁█▁▂▁▁▁▁▁▁
epoch/precision,▆▇█▇▇█▆▃▁▁
epoch/recall,▄▅█▆▄▆▃▁▁▁
epoch/val_accuracy,█▁█▁▅▇▇▇▇▇
epoch/val_auc,█▄▅▁▆▆▆▆▆▆
epoch/val_loss,▁▆▂▂▁█▁▁▁▁

0,1
epoch/accuracy,0.61538
epoch/auc,0.51683
epoch/epoch,9.0
epoch/learning_rate,0.1
epoch/loss,0.66595
epoch/precision,0.0
epoch/recall,0.0
epoch/val_accuracy,0.56364
epoch/val_auc,0.5
epoch/val_loss,0.69316
