In [2]:
import sys
sys.path.insert(0, "../..")

import torch
import torch.nn as nn
from src.data import make_dataset
from pathlib import Path
from loguru import logger

2023-05-16 11:31:54.116158: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Lets start with our good'ol MNIST.

In [16]:
datadir = Path("../../data/raw/")
batch_size = 64
train_dataloader, test_dataloader = make_dataset.get_MNIST(datadir, batch_size=batch_size) 

In [17]:
datadir.resolve().exists()

True

In [4]:
len(train_dataloader), len(test_dataloader)

(938, 157)

We can obtain an item:

In [5]:
x, y = next(iter(train_dataloader))
x.shape, y.shape

(torch.Size([64, 1, 28, 28]), torch.Size([64]))

The image follows the channels-first convention: (channel, width, height). The label is an integer.

Let's re-use the model we had:

In [27]:
import torch
from torch import nn

# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Define model
class CNN(nn.Module):
    def __init__(self, filters, units1, units2, input_size=(32, 1, 28, 28)):
        super().__init__()

        self.convolutions = nn.Sequential(
            nn.Conv2d(1, filters, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(filters, filters, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(filters, filters, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

        activation_map_size = self._conv_test(input_size)
        logger.info(f"Aggregating activationmap with size {activation_map_size}")
        self.agg = nn.AvgPool2d(activation_map_size)

        self.dense = nn.Sequential(
            nn.Flatten(),
            nn.Linear(filters, units1),
            nn.ReLU(),
            nn.Linear(units1, units2),
            nn.ReLU(),
            nn.Linear(units2, 10)
        )

    def _conv_test(self, input_size = (32, 1, 28, 28)):
        x = torch.ones(input_size)
        x = self.convolutions(x)
        return x.shape[-2:]

    def forward(self, x):
        x = self.convolutions(x)
        x = self.agg(x)
        logits = self.dense(x)
        return logits
    
    
# Define model
class CNN_J(nn.Module):
    def __init__(self, filters, units1, units2, input_size=(32, 1, 28, 28)):
        super().__init__()

        self.convolutions = nn.Sequential(
            nn.Conv2d(1, filters, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2),
            
            nn.Conv2d(filters, filters, kernel_size=3, stride=1, padding=0),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2),
            
            nn.Conv2d(filters, filters, kernel_size=3, stride=1, padding=0),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2),            
        )

        activation_map_size = self._conv_test(input_size)
        logger.info(f"Aggregating activationmap with size {activation_map_size}")
        self.agg = nn.AvgPool2d(activation_map_size)

        self.dense = nn.Sequential(
            nn.Flatten(),
            
            nn.Linear(filters, units1),            
            nn.LeakyReLU(),
            
            nn.Linear(units1, units2)            ,
            nn.LeakyReLU(),
            
            nn.Linear(units2, 10)
        )

    def _conv_test(self, input_size = (32, 1, 28, 28)):
        x = torch.ones(input_size)
        x = self.convolutions(x)
        return x.shape[-2:]

    def forward(self, x):
        x = self.convolutions(x)
        x = self.agg(x)
        logits = self.dense(x)
        return logits

model = CNN(filters=32, units1=128, units2=64).to(device)
model_J = CNN_J(filters=32, units1=128, units2=64).to(device)

2023-05-16 11:50:16.281 | INFO     | __main__:__init__:26 - Aggregating activationmap with size torch.Size([2, 2])
2023-05-16 11:50:16.300 | INFO     | __main__:__init__:70 - Aggregating activationmap with size torch.Size([2, 2])


Using cpu device


In [18]:
from torchsummary import summary
summary(model_J, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             320
         LeakyReLU-2           [-1, 32, 28, 28]               0
         MaxPool2d-3           [-1, 32, 14, 14]               0
            Conv2d-4           [-1, 32, 12, 12]           9,248
         LeakyReLU-5           [-1, 32, 12, 12]               0
         MaxPool2d-6             [-1, 32, 6, 6]               0
            Conv2d-7             [-1, 32, 4, 4]           9,248
         LeakyReLU-8             [-1, 32, 4, 4]               0
         MaxPool2d-9             [-1, 32, 2, 2]               0
        AvgPool2d-10             [-1, 32, 1, 1]               0
          Flatten-11                   [-1, 32]               0
           Linear-12                  [-1, 128]           4,224
        LeakyReLU-13                  [-1, 128]               0
           Linear-14                   

And set up the optimizer, loss and accuracy.

In [14]:
import torch.optim as optim
from src.models import metrics
# optimizer = optim.Adam
optimizer = optim.AdamW
# optimizer = optim.SGD
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()

In [35]:
yhat = model(x)
accuracy(y, yhat)

tensor(0.1094)

# MLflow
MLflow is an open-source platform designed to manage the entire Machine Learning (ML) lifecycle, including experimentation, reproducibility, deployment, and governance. It provides a set of APIs and tools to streamline ML workflows, making it easier to track experiments, package code, manage model versions, and deploy models.

Reasons to use MLflow over TensorBoard, gin-config, or Ray:

- End-to-end ML lifecycle management: While TensorBoard focuses on visualizing model training metrics and gin-config on hyperparameter configuration, MLflow covers a broader range of tasks, such as experiment tracking, model packaging, and deployment.

- Framework agnostic: MLflow is not tied to a specific ML framework, making it suitable for projects using different libraries or even multiple libraries.

- Model Registry: MLflow provides a centralized model registry, allowing you to version, track, and manage your models, which is not available in TensorBoard or gin-config.

- Deployment support: MLflow facilitates model deployment to various platforms, such as local, cloud, or Kubernetes environments, whereas TensorBoard and gin-config are not built for deployment tasks.

- Integration with other tools: MLflow integrates with popular tools and platforms like Databricks, AWS, and Azure, making it easy to incorporate into existing workflows.

However, the choice between MLflow and other tools like TensorBoard, gin-config, or Ray depends on your specific use case and the scope of the ML workflow you want to manage.

In [11]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("mnist_convolutions")

<Experiment: artifact_location='/Users/jeremycs/Development/machinelearning/ML22/notebooks/2_convolutions/mlruns/1', creation_time=1683986227006, experiment_id='1', last_update_time=1683986227006, lifecycle_stage='active', name='mnist_convolutions', tags={}>

In the code above, we set the MLflow tracking URI to a local SQLite database file. This is done to configure the storage location for MLflow's experiment tracking data, such as metrics, parameters, and artifacts. By specifying a SQLite database, we enable a lightweight and easy-to-use storage solution for tracking the experiments and their associated information.

The line mlflow.set_experiment("mnist_convolutions") sets the active MLflow experiment to "mnist_convolutions". This is useful for organizing and grouping your runs, as it allows you to associate the upcoming ML training runs with a specific experiment name, making it easier to search, compare, and analyze the results later.

In [4]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

We import functions and classes from the hyperopt library to perform hyperparameter optimization. This library helps us find the best hyperparameter values for our machine learning model by searching through a defined search space and using optimization algorithms like Tree-structured Parzen Estimator (TPE). The goal is to improve our model's performance by tuning its hyperparameters.

Advantages of TPE:

- Model-based approach: TPE is a Bayesian optimization method that models the objective function as a probability distribution. It learns from previous evaluations to decide which points in the search space to explore next, making it more efficient in finding optimal hyperparameters.

- Exploration-exploitation trade-off: TPE balances the trade-off between exploration (searching in new regions of the search space) and exploitation (refining around the current best points). This can lead to better results in problems with complex search spaces.

- Continuous hyperparameter optimization: TPE can handle continuous hyperparameters more naturally, as it builds a probability model to estimate the performance for any given point in the search space.

Lets set up an objective function and start logging some usefull things we might want to track:

In [34]:
import torch.optim as optim
from src.models import metrics
from src.models import train_model
from datetime import datetime
from src.data import make_dataset
modeldir = Path("/Users/jeremycs/Development/machinelearning/ML22/src/models")
datadir = Path("../../data/raw/")

# Define the objective function for hyperparameter optimization
def objective(params):
    # Start a new MLflow run for tracking the experiment
    with mlflow.start_run():
        # Set MLflow tags to record metadata about the model and developer
        mlflow.set_tag("model", "convnet")
        mlflow.set_tag("dev", "linksmith")
        # Log hyperparameters to MLflow
        mlflow.log_params(params)
        mlflow.log_param("datadir", f"{datadir.resolve()}")
        mlflow.log_param("batchsize", f"{batch_size}")

        # Initialize the optimizer, loss function, and accuracy metric      
        # optimizer = optim.Adam
        loss_fn = torch.nn.CrossEntropyLoss()
        accuracy = metrics.Accuracy()
        
        selected_search_space = {
            'filters': params['filters'],
            'units1': params['units1'],
            'units2': params['units2']
        }
        model_j = CNN_J(**selected_search_space)
        
        # Select the optimizer based on the choice in params
        if params['optimizer'] == 'SGD':
            optimizer = torch.optim.SGD
        elif params['optimizer'] == 'Adam':
            optimizer = torch.optim.Adam
        elif params['optimizer'] == 'AdamW':
            optimizer = torch.optim.AdamW
            
        train_dataloader, test_dataloader = make_dataset.get_MNIST(datadir, batch_size=params['batch_size']) 
 
        # Instantiate the CNN model with the given hyperparameters
        # Train the model using a custom train loop
        model_j, test_loss = train_model.trainloop(
            epochs=3,
            model=model_j,
            optimizer=optimizer,
            learning_rate=params['learning_rate'],
            loss_fn=loss_fn,
            metrics=[accuracy],
            train_dataloader=train_dataloader,
            test_dataloader=test_dataloader,
            log_dir="modellog",
            # set the tunewriter to mlflow.
            tunewriter=["mlflow"],
            train_steps=100, #len(train_dataloader),
            eval_steps=100, #len(test_dataloader),
        )

        # Save the trained model with a timestamp   
        tag = datetime.now().strftime("%Y%m%d-%H%M")
        modelpath = modeldir / (tag + "model.pt")
        torch.save(model_j, modelpath)

        # Log the saved model as an artifact in MLflow
        mlflow.log_artifact(local_path=modelpath, artifact_path="pytorch_models")
        return {'loss' : test_loss, 'status': STATUS_OK}

In [20]:
# search_space_1 = {
#     'filters' : scope.int(hp.quniform('filters', 16, 128, 8)),
#     'units1' : scope.int(hp.quniform('units1', 32, 128, 8)),
#     'units2' : scope.int(hp.quniform('units2', 32, 128, 8)),
# }

# search_space_2 = {
#     'filters' : scope.int(hp.quniform('filters', 104, 124, 2)),
#     'units1' : scope.int(hp.quniform('units1', 58, 78, 2)),
#     'units2' : scope.int(hp.quniform('units2', 52, 92, 4)),
# }

search_space = {
    'filters' : scope.int(hp.quniform('filters', 118, 122, 2)),
    'units1' : scope.int(hp.quniform('units1', 64, 68, 2)),
    'units2' : scope.int(hp.quniform('units2', 86, 90, 2)),
    'learning_rate' : scope.int(hp.quniform('learning_rate', 0.0005, 0.0015, 0.0005)),
    'optimizer': hp.choice('optimizer', ['SGD', 'Adam', 'AdamW']),
    'batch_size': scope.int(hp.quniform('batch_size', 32, 96, 32)) 
}

We define a search space for hyperparameter optimization using Hyperopt. The search space specifies the range and distribution of hyperparameters to explore during the optimization process. This is crucial for finding the optimal set of hyperparameters that yield the best performance for the machine learning model. The search space defined here includes the number of filters in the convolutional layers, and the number of units in two fully connected layers, allowing Hyperopt to find the best combination within the given ranges.


Now, finally, let us perform the hyperparameter search using the fmin function from hyperopt. The function takes the following arguments:

- `fn=objective`: The objective function to minimize, which is defined earlier to train the model and return the test loss.
- `space=search_space`: The search space defined earlier, containing the range of hyperparameters to explore.
- `algo=tpe.suggest`: The optimization algorithm to use, in this case, the Tree-structured Parzen Estimator (TPE) method.
- `max_evals=10`: The maximum number of function evaluations, i.e., the maximum number of hyperparameter combinations to try.
- `trials=Trials()`: A Trials object to store the results of each evaluation.

The fmin function searches for the best hyperparameters within the given search space using the TPE algorithm, aiming to minimize the objective function (test loss). Once the optimization process is completed, the best hyperparameters found are stored in the best_result variable.

In [35]:
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials()
)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

2023-05-16 12:00:57.407 | INFO     | __main__:__init__:70 - Aggregating activationmap with size torch.Size([2, 2])
  0%|[38;2;30;71;6m          [0m| 0/3 [00:00<?, ?it/s]
  0%|[38;2;30;71;6m          [0m| 0/100 [00:00<?, ?it/s][A
  1%|[38;2;30;71;6m1         [0m| 1/100 [00:00<00:14,  6.98it/s][A
  3%|[38;2;30;71;6m3         [0m| 3/100 [00:00<00:08, 12.09it/s][A
  5%|[38;2;30;71;6m5         [0m| 5/100 [00:00<00:06, 14.97it/s][A
  7%|[38;2;30;71;6m7         [0m| 7/100 [00:00<00:05, 16.40it/s][A
 10%|[38;2;30;71;6m#         [0m| 10/100 [00:00<00:04, 18.18it/s][A
 12%|[38;2;30;71;6m#2        [0m| 12/100 [00:00<00:07, 11.75it/s][A
 14%|[38;2;30;71;6m#4        [0m| 14/100 [00:01<00:08, 10.58it/s][A
 16%|[38;2;30;71;6m#6        [0m| 16/100 [00:01<00:07, 10.63it/s][A
 18%|[38;2;30;71;6m#8        [0m| 18/100 [00:01<00:07, 10.91it/s][A
 20%|[38;2;30;71;6m##        [0m| 20/100 [00:01<00:07, 11.16it/s][A
 22%|[38;2;30;71;6m##2       [0m| 22/100 [00:01<00:06, 11.

  0%|          | 0/10 [00:30<?, ?trial/s, best loss=?]


RuntimeError: Parent directory /Users/jeremycs/Development/machinelearning/ML22/src/models does not exist.

After running this, you can look at the best_result

In [1]:
model_j = CNN_J(filters=120, units1=66, units2=88, batch_size=64).to(device)
# summary(model, input_size=(1, 28, 28))
trained_model, test_loss =  train_model.trainloop(
    epochs=3,
    model=model_j,
    train_dataloader=train_dataloader,    
    optimizer = optim.SGD,
    loss_fn = torch.nn.CrossEntropyLoss(),    
    metrics=[metrics.Accuracy()],
    test_dataloader=test_dataloader,
    train_steps=len(train_dataloader),
    eval_steps=len(test_dataloader),
    tunewriter=["mlflow"],
    learning_rate=1e-3,
    log_dir="modellog",
    factor=0.5,
)

tag = datetime.now().strftime("%Y%m%d-%H%M")
modelpath = modeldir / (tag + "model.pt")
torch.save(trained_model, modelpath)

NameError: name 'CNN_J' is not defined

In [None]:
tag = datetime.now().strftime("%Y%m%d-%H%M")
modelpath = modeldir / (tag + "model.pt")
torch.save(trained_model, modelpath)

In [None]:
# best_result = {'filters': 104.0, 'units1': 58.0, 'units2': 52.0}
# training_run_1 = {'filters': 112.0, 'units1': 72.0, 'units2': 56.0}
# training_run_2 = {'filters': 114.0, 'units1': 72.0, 'units2': 52.0}
# training_run_3 = {'filters': 122.0, 'units1': 66.0, 'units2': 88.0}

But you can also explore the UI from mlflow. It is pretty nice. The help you out, you can use the makefile by typing `make` in the terminal.