In [None]:
# Instantiate Model, then run every example of both splits through
# preprocess_mnist_tfds and group the results into batches of 128.

model = Model()
ds_train = (
    ds_train
    .map(preprocess_mnist_tfds, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(128)
)
ds_test = (
    ds_test
    .map(preprocess_mnist_tfds, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(128)
)


In [None]:
from kfp.components import InputPath, OutputPath
import kfp

def load_tensorflow_dataset(dataset_str: str, train_file_output: OutputPath("pickle"),  test_file_output: OutputPath("pickle")):
    """Load the 'train' and 'test' splits of a TFDS dataset and pickle each one.

    Args:
        dataset_str: Name passed to ``tfds.load`` to select the dataset.
        train_file_output: Path the pickled 'train' split is written to.
        test_file_output: Path the pickled 'test' split is written to.
    """
    import pickle
    import tensorflow_datasets as tfds

    # with_info=True makes tfds.load return (splits, info); the info object is not used here.
    splits, _dataset_info = tfds.load(
        dataset_str,
        split=['train', 'test'],
        shuffle_files=True,
        as_supervised=True,
        with_info=True,
    )
    train_split, test_split = splits

    # NOTE(review): the splits are pickled as returned by tfds.load; confirm that
    # these objects actually survive a pickle round trip.
    for destination, split in ((train_file_output, train_split),
                               (test_file_output, test_split)):
        with open(destination, "wb") as handle:
            pickle.dump(split, handle)

if __name__ == "__main__":

    # Package load_tensorflow_dataset as a Kubeflow component and write its
    # specification to load_tensorflow_dataset.yaml.
    preprocess_op = kfp.components.create_component_from_func(
        func=load_tensorflow_dataset,
        base_image='python:3.9',
        # tensorflow_datasets does not install TensorFlow itself, but tfds.load
        # needs it at runtime, so it has to be listed explicitly.
        packages_to_install=['tensorflow', 'tensorflow_datasets'],
        output_component_file='load_tensorflow_dataset.yaml')
    print('Loaded dataset from tensorflow datasets')

In [1]:
from kfp.components import InputPath, OutputPath
import kfp

def preprocess(train_file_input: InputPath('pickle'), test_file_input: InputPath('pickle'),
              train_file_output: OutputPath("pickle"),  test_file_output: OutputPath("pickle")):
    """Preprocess and batch the pickled train/test datasets, then pickle the results.

    Every example is converted to a 3-channel float image scaled by 1/255 and
    resized with padding to 224x224; the datasets are then batched in groups of 128.

    Args:
        train_file_input: Path of the pickled training dataset to read.
        test_file_input: Path of the pickled test dataset to read.
        train_file_output: Path the processed training dataset is pickled to.
        test_file_output: Path the processed test dataset is pickled to.
    """
    import numpy as np
    import tensorflow as tf
    import pickle

    def preprocess_mnist(image, label=None):
        """Return ``(image, label)`` with the image as a 224x224, 3-channel float tensor."""
        # reshape and upsample to 3 channel for transfer learning models
        # ... for when no channel information is present
        if len(image.shape) != 3:
            image = np.dstack((image, image, image))
        # ... for when channel is only 1 dimension
        if image.shape[2] == 1:
            image = tf.image.grayscale_to_rgb(image)
        # normalize pixel values
        image = tf.cast(image, tf.float32) / 255.
        # resize with pad for mobilenetv2
        image = tf.image.resize_with_pad(image, target_height=224, target_width=224)
        return image, label

    # NOTE: pickle.load can execute arbitrary code; only feed this component
    # files produced by a trusted upstream step.
    with open(train_file_input, "rb") as file:
        ds_train = pickle.load(file)
    with open(test_file_input, "rb") as file:
        ds_test = pickle.load(file)

    # preprocess and batch
    # The mapped function must be the helper defined above: the component runs
    # in isolation, so no notebook-level preprocessing function is available.
    ds_train = ds_train.map(preprocess_mnist, num_parallel_calls=tf.data.AUTOTUNE)
    ds_train = ds_train.batch(128)
    ds_test = ds_test.map(preprocess_mnist, num_parallel_calls=tf.data.AUTOTUNE)
    ds_test = ds_test.batch(128)

    with open(train_file_output, "wb") as file:
        pickle.dump(ds_train, file)
    with open(test_file_output, "wb") as file:
        pickle.dump(ds_test, file)


if __name__ == "__main__":

    # Turn `preprocess` into a Kubeflow component; its specification is also
    # written to preprocess.yaml.
    preprocess_op = kfp.components.create_component_from_func(
        preprocess,
        output_component_file='preprocess.yaml',
        base_image='python:3.9',
        packages_to_install=['numpy', 'tensorflow'],
    )
    print('Preprocessed dataset from tensorflow datasets')

ModuleNotFoundError: No module named 'kfp'

In [None]:
# Names used for the MLflow experiment and run.
experiment_name = "mnist-classification"
mlflow_run_name = "test_run"

# Training settings: optimizer learning rate, L1/L2 regularisation factors,
# width of the hidden dense layer and number of epochs.
hyperparams = dict(
    learning_rate=0.01,
    l1=0.0,
    l2=0.0,
    num_hidden=16,
    epochs=10,
)

def hyperparameter_train(train_file_input: InputPath('pickle'), test_file_input: InputPath('pickle'), hyperparameters) -> str:
    """Train the MobileNetV2-based MNIST classifier and record the run in MLflow.

    Args:
        train_file_input: Path of the pickled training dataset passed to Keras ``fit``.
        test_file_input: Path of the pickled test dataset, used as validation data.
        hyperparameters: Mapping with the keys 'learning_rate', 'l1', 'l2',
            'num_hidden' and 'epochs'. A JSON-encoded string of such a mapping
            is accepted as well.

    Returns:
        The artifact URI under which the fitted model was logged.
    """
    import json
    import pickle

    import mlflow
    import numpy as np
    import tensorflow as tf
    import tensorflow_hub as hub  # provides hub.KerasLayer used in MNIST.fit

    # The argument may arrive serialised as text; decode it so it can be indexed.
    if isinstance(hyperparameters, str):
        hyperparameters = json.loads(hyperparameters)

    def mlflow_experiment_definition(experiment_name):
        """Return the id of `experiment_name`, creating the experiment if needed."""
        try:
            return mlflow.create_experiment(
                experiment_name,
                tags={"version": "v0.1"},
            )
        except mlflow.exceptions.MlflowException:
            # Only treat the failure as "already exists" when the experiment can
            # actually be found; any other error is propagated unchanged.
            if mlflow.get_experiment_by_name(experiment_name) is None:
                raise
            print(f'Experiment already exists, setting experiment to {experiment_name}')
            experiment_info = mlflow.set_experiment(experiment_name)
            return experiment_info.experiment_id

    def preprocess_mnist(image, label=None):
        """Return ``(image, label)`` with the image as a 224x224, 3-channel float tensor."""
        # reshape and upsample to 3 channel for transfer learning models
        # ... for when no channel information is present
        if len(image.shape) != 3:
            image = np.dstack((image, image, image))
        # ... for when channel is only 1 dimension
        if image.shape[2] == 1:
            image = tf.image.grayscale_to_rgb(image)
        # normalize pixel values
        image = tf.cast(image, tf.float32) / 255.
        # resize with pad for mobilenetv2
        image = tf.image.resize_with_pad(image, target_height=224, target_width=224)
        return image, label

    class MNIST(mlflow.pyfunc.PythonModel):
        """pyfunc wrapper around a frozen MobileNetV2 embedding with a dense head."""

        def fit(self, xy_tuple_train, xy_tuple_test, hyperparameters):
            """Build, compile and fit the model; keeps the Keras history on the instance."""
            ## Build model
            # class names for mnist hardcoded
            class_names = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

            # set layer regularization for DNN
            regularizer = tf.keras.regularizers.l1_l2(hyperparameters['l1'], hyperparameters['l2'])

            # load in mobilenetv2 weights and instantiate dense classification head
            base_model = "https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/4"
            layers = [
                hub.KerasLayer(
                    base_model,
                    input_shape=(224, 224, 3),
                    trainable=False,
                    name='mobilenet_embedding'),
                tf.keras.layers.Dense(hyperparameters['num_hidden'],
                                      kernel_regularizer=regularizer,
                                      activation='relu',
                                      name='dense_hidden'),
                tf.keras.layers.Dense(len(class_names),
                                      kernel_regularizer=regularizer,
                                      activation='softmax',
                                      name='mnist_prob')
            ]

            self._model = tf.keras.Sequential(layers, name='mnist-classification')

            # compile model; the learning rate comes from the argument, not a global
            self._model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hyperparameters['learning_rate']),
                                loss=tf.keras.losses.SparseCategoricalCrossentropy(
                                from_logits=False),
                                metrics=['accuracy'])

            ## Fit model
            # fit model and save history to model store
            self._train_history = self._model.fit(xy_tuple_train, epochs=hyperparameters['epochs'], validation_data=xy_tuple_test)
            self._model_base = base_model

        def predict(self, context, model_input: np.ndarray) -> np.ndarray:
            """Preprocess a single image and return the index of the most probable class."""
            image, _ = preprocess_mnist(model_input)
            image = tf.reshape(image, [1, 224, 224, 3])
            return self._model.predict(image).argmax()

    # load data from pipeline
    # NOTE: pickle.load can execute arbitrary code; inputs must come from a trusted step.
    with open(train_file_input, "rb") as file:
        ds_train = pickle.load(file)
    with open(test_file_input, "rb") as file:
        ds_test = pickle.load(file)

    # instantiate model
    model = MNIST()

    # define experiment name and run name
    experiment_name = "mnist-classification"
    experiment_id = mlflow_experiment_definition(experiment_name)
    mlflow_run_name = "test_run"

    # The model is logged at the root of the run's artifact directory.
    model_artifact_path = ""

    with mlflow.start_run(experiment_id=experiment_id,
                          run_name=mlflow_run_name):
        # You can set autolog for tensorflow model.
        # Note that autolog does not allow logging of any additional params and metrics.
        # We'll choose to do manual logging.
        # mlflow.tensorflow.autolog()

        model.fit(ds_train, ds_test, hyperparameters)

        # MLFlow Tracking parameters
        mlflow.log_params(params=hyperparameters)

        # MLFlow Tracking metrics
        # Logging metrics for each epoch (housed in dictionary)
        training_history = model._train_history.history
        for epoch in range(hyperparameters['epochs']):
            epoch_metrics = {metric: values[epoch]
                             for metric, values in training_history.items()}
            mlflow.log_metrics(metrics=epoch_metrics, step=epoch + 1)

        # MLFlow tracking artifact (e.g. model file)
        # this will log the model and all its details under run_id/artifacts
        # NOTE(review): log_model serialises this instance, including the Keras
        # model held in self._model; confirm that it serialises cleanly.
        mlflow.pyfunc.log_model(python_model=model,
                                artifact_path=model_artifact_path)

        # Resolve the URI while the run is still active; asking for it after the
        # run has ended would refer to a different, implicitly started run.
        model_uri = mlflow.get_artifact_uri(model_artifact_path or None)
        # Leaving the `with` block closes the run with status FINISHED.

    return model_uri


if __name__ == "__main__":

    # Package hyperparameter_train (the function defined in this cell) as a
    # Kubeflow component and write its specification to train_tensorflow_mnist.yaml.
    train_op = kfp.components.create_component_from_func(
        func=hyperparameter_train,
        base_image='python:3.9',
        # tensorflow_hub supplies the hub.KerasLayer used to build the model.
        packages_to_install=['tensorflow', 'tensorflow_hub', 'numpy', 'mlflow'],
        output_component_file='train_tensorflow_mnist.yaml')
    print('Completed transfer learning training on MNIST')



In [None]:
# Names used for the MLflow experiment and run.
experiment_name = "mnist-classification"
mlflow_run_name = "test_run"

# Training settings: optimizer learning rate, L1/L2 regularisation factors,
# width of the hidden dense layer and number of epochs.
hyperparams = dict(
    learning_rate=0.01,
    l1=0.0,
    l2=0.0,
    num_hidden=16,
    epochs=10,
)

def production_train(train_file_input: InputPath('pickle'), test_file_input: InputPath('pickle'), hyperparameters) -> str:
    """Train the MNIST classifier for deployment and record the run in MLflow.

    Args:
        train_file_input: Path of the pickled training dataset passed to Keras ``fit``.
        test_file_input: Path of the pickled test dataset, used as validation data.
        hyperparameters: Mapping (or JSON-encoded string of a mapping) whose
            entries override the built-in defaults for 'learning_rate', 'l1',
            'l2', 'num_hidden' and 'epochs'. ``None`` or an empty value keeps
            the defaults.

    Returns:
        The artifact URI under which the fitted model was logged.
    """
    import json
    import pickle

    import mlflow
    import numpy as np
    import tensorflow as tf
    import tensorflow_hub as hub  # provides hub.KerasLayer used in MNIST.fit

    def mlflow_experiment_definition(experiment_name):
        """Return the id of `experiment_name`, creating the experiment if needed."""
        try:
            return mlflow.create_experiment(
                experiment_name,
                tags={"version": "v0.1"},
            )
        except mlflow.exceptions.MlflowException:
            # Only treat the failure as "already exists" when the experiment can
            # actually be found; any other error is propagated unchanged.
            if mlflow.get_experiment_by_name(experiment_name) is None:
                raise
            print(f'Experiment already exists, setting experiment to {experiment_name}')
            experiment_info = mlflow.set_experiment(experiment_name)
            return experiment_info.experiment_id

    def preprocess_mnist(image, label=None):
        """Return ``(image, label)`` with the image as a 224x224, 3-channel float tensor."""
        # reshape and upsample to 3 channel for transfer learning models
        # ... for when no channel information is present
        if len(image.shape) != 3:
            image = np.dstack((image, image, image))
        # ... for when channel is only 1 dimension
        if image.shape[2] == 1:
            image = tf.image.grayscale_to_rgb(image)
        # normalize pixel values
        image = tf.cast(image, tf.float32) / 255.
        # resize with pad for mobilenetv2
        image = tf.image.resize_with_pad(image, target_height=224, target_width=224)
        return image, label

    class MNIST(mlflow.pyfunc.PythonModel):
        """pyfunc wrapper around a frozen MobileNetV2 embedding with a dense head."""

        def fit(self, xy_tuple_train, xy_tuple_test, hyperparameters):
            """Build, compile and fit the model; keeps the Keras history on the instance."""
            ## Build model
            # class names for mnist hardcoded
            class_names = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

            # set layer regularization for DNN
            regularizer = tf.keras.regularizers.l1_l2(hyperparameters['l1'], hyperparameters['l2'])

            # load in mobilenetv2 weights and instantiate dense classification head
            base_model = "https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/4"
            layers = [
                hub.KerasLayer(
                    base_model,
                    input_shape=(224, 224, 3),
                    trainable=False,
                    name='mobilenet_embedding'),
                tf.keras.layers.Dense(hyperparameters['num_hidden'],
                                      kernel_regularizer=regularizer,
                                      activation='relu',
                                      name='dense_hidden'),
                tf.keras.layers.Dense(len(class_names),
                                      kernel_regularizer=regularizer,
                                      activation='softmax',
                                      name='mnist_prob')
            ]

            self._model = tf.keras.Sequential(layers, name='mnist-classification')

            # compile model; the learning rate comes from the argument, not an outer variable
            self._model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hyperparameters['learning_rate']),
                                loss=tf.keras.losses.SparseCategoricalCrossentropy(
                                from_logits=False),
                                metrics=['accuracy'])

            ## Fit model
            # fit model and save history to model store
            self._train_history = self._model.fit(xy_tuple_train, epochs=hyperparameters['epochs'], validation_data=xy_tuple_test)
            self._model_base = base_model

        def predict(self, context, model_input: np.ndarray) -> np.ndarray:
            """Preprocess a single image and return the index of the most probable class."""
            image, _ = preprocess_mnist(model_input)
            image = tf.reshape(image, [1, 224, 224, 3])
            return self._model.predict(image).argmax()

    # load data from pipeline
    # NOTE: pickle.load can execute arbitrary code; inputs must come from a trusted step.
    with open(train_file_input, "rb") as file:
        ds_train = pickle.load(file)
    with open(test_file_input, "rb") as file:
        ds_test = pickle.load(file)

    # instantiate model
    model = MNIST()

    # define experiment name and run name
    experiment_name = "mnist-classification"
    experiment_id = mlflow_experiment_definition(experiment_name)
    mlflow_run_name = "deployment_run"

    # Default hyperparameters; entries supplied by the caller take precedence,
    # so the `hyperparameters` argument is no longer silently ignored.
    default_hyperparams = {
        'learning_rate': 0.01,
        'l1': 0.0,
        'l2': 0.0,
        'num_hidden': 16,
        'epochs': 10}
    if isinstance(hyperparameters, str):
        # The argument may arrive serialised as text; blank text means "no overrides".
        hyperparameters = json.loads(hyperparameters) if hyperparameters.strip() else None
    hyperparams = {**default_hyperparams, **(hyperparameters or {})}

    # The model is logged at the root of the run's artifact directory.
    model_artifact_path = ""

    with mlflow.start_run(experiment_id=experiment_id,
                          run_name=mlflow_run_name):
        # You can set autolog for tensorflow model.
        # Note that autolog does not allow logging of any additional params and metrics.
        # We'll choose to do manual logging.
        # mlflow.tensorflow.autolog()

        model.fit(ds_train, ds_test, hyperparams)

        # MLFlow Tracking parameters
        mlflow.log_params(params=hyperparams)

        # MLFlow Tracking metrics
        # Logging metrics for each epoch (housed in dictionary)
        training_history = model._train_history.history
        for epoch in range(hyperparams['epochs']):
            epoch_metrics = {metric: values[epoch]
                             for metric, values in training_history.items()}
            mlflow.log_metrics(metrics=epoch_metrics, step=epoch + 1)

        # MLFlow tracking artifact (e.g. model file)
        # this will log the model and all its details under run_id/artifacts
        # NOTE(review): log_model serialises this instance, including the Keras
        # model held in self._model; confirm that it serialises cleanly.
        mlflow.pyfunc.log_model(python_model=model,
                                artifact_path=model_artifact_path)

        # Resolve the URI while the run is still active; asking for it after the
        # run has ended would refer to a different, implicitly started run.
        model_uri = mlflow.get_artifact_uri(model_artifact_path or None)
        # Leaving the `with` block closes the run with status FINISHED.

    return model_uri


if __name__ == "__main__":

    # Package production_train (the function defined in this cell) as a
    # Kubeflow component.
    # NOTE(review): both the variable name `train_op` and the output file
    # 'train_tensorflow_mnist.yaml' are also used by the hyperparameter-training
    # cell, so running both cells overwrites one with the other; confirm whether
    # distinct names are wanted.
    train_op = kfp.components.create_component_from_func(
        func=production_train,
        base_image='python:3.9',
        # tensorflow_hub supplies the hub.KerasLayer used to build the model.
        packages_to_install=['tensorflow', 'tensorflow_hub', 'numpy', 'mlflow'],
        output_component_file='train_tensorflow_mnist.yaml')
    print('Completed transfer learning training on MNIST')



In [None]:
import optuna
from optuna.integration.mlflow import MLflowCallback


def preprocess_mnist(image, label=None):
    """Convert one image to a 224x224, 3-channel float tensor for MobileNetV2.

    Args:
        image: Image whose ``shape`` is inspected to decide how channels are added.
        label: Optional label, passed through untouched.

    Returns:
        A ``(image, label)`` tuple with the processed image.
    """
    # No channel axis at all: stack three copies of the image depth-wise.
    has_channel_axis = len(image.shape) == 3
    if not has_channel_axis:
        image = np.dstack((image, image, image))

    # A single channel: let TensorFlow expand grayscale to RGB.
    if image.shape[2] == 1:
        image = tf.image.grayscale_to_rgb(image)

    # Scale pixel values by 1/255 as float32, then pad/resize to the network's input size.
    scaled = tf.cast(image, tf.float32) / 255.
    resized = tf.image.resize_with_pad(scaled, target_height=224, target_width=224)
    return resized, label

class MNIST(mlflow.pyfunc.PythonModel):
    """pyfunc wrapper around a frozen MobileNetV2 embedding with a dense classification head."""

    def fit(self, xy_tuple_train, xy_tuple_test, hyperparameters):
        """Build, compile and fit the Keras model.

        Args:
            xy_tuple_train: Training data handed to Keras ``fit``.
            xy_tuple_test: Data handed to Keras ``fit`` as ``validation_data``.
            hyperparameters: Mapping with the keys 'learning_rate', 'l1', 'l2',
                'num_hidden' and 'epochs'.

        The fitted model, the Keras history and the base-model URL are stored
        on the instance as ``_model``, ``_train_history`` and ``_model_base``.
        """
        ## Build model
        # class names for mnist hardcoded
        class_names = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

        # set layer regularization for DNN
        regularizer = tf.keras.regularizers.l1_l2(hyperparameters['l1'], hyperparameters['l2'])

        # load in mobilenetv2 weights and instantiate dense classification head
        base_model = "https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/4"
        layers = [
            hub.KerasLayer(
                base_model,
                input_shape=(224, 224, 3),
                trainable=False,
                name='mobilenet_embedding'),
            tf.keras.layers.Dense(hyperparameters['num_hidden'],
                                  kernel_regularizer=regularizer,
                                  activation='relu',
                                  name='dense_hidden'),
            tf.keras.layers.Dense(len(class_names),
                                  kernel_regularizer=regularizer,
                                  activation='softmax',
                                  name='mnist_prob')
        ]

        self._model = tf.keras.Sequential(layers, name='mnist-classification')

        # compile model
        # The learning rate is read from the `hyperparameters` argument so that
        # a value chosen by the caller (e.g. an Optuna trial) is actually used.
        self._model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hyperparameters['learning_rate']),
                            loss=tf.keras.losses.SparseCategoricalCrossentropy(
                            from_logits=False),
                            metrics=['accuracy'])

        ## Fit model
        # fit model and save history to model store
        self._train_history = self._model.fit(xy_tuple_train, epochs=hyperparameters['epochs'], validation_data=xy_tuple_test)
        self._model_base = base_model

    def predict(self, context, model_input: np.ndarray) -> np.ndarray:
        """Preprocess a single image and return the index of the most probable class."""
        # Uses the preprocess_mnist helper defined alongside this class.
        image, _ = preprocess_mnist(model_input)
        image = tf.reshape(image, [1, 224, 224, 3])
        return self._model.predict(image).argmax()

# hyperparameters search using Optuna
# can scale Optuna with Kubeflow https://medium.com/optuna/parallel-hyperparameter-tuning-with-optuna-and-kubeflow-pipelines-4ef05ce614ae
def objective(trial):
    """
    Optuna objective: fit the transfer-learning model with trial-suggested
    hyperparameters and return the validation accuracy of the last epoch.

    Relies on the module-level names `model`, `ds_train` and `ds_test`.
    """
    # Suggestions are requested in a fixed order: learning_rate, l1, l2, num_hidden, epochs.
    suggested = {}
    suggested['learning_rate'] = trial.suggest_float('learning_rate', 0.00001, 0.1, log=True)
    suggested['l1'] = trial.suggest_float('l1', 0.0, 0.1)
    suggested['l2'] = trial.suggest_float('l2', 0.0, 0.1)
    suggested['num_hidden'] = trial.suggest_int('num_hidden', 8, 64)
    suggested['epochs'] = trial.suggest_int('epochs', 2, 10)

    model.fit(ds_train, ds_test, suggested)

    # The Keras history holds one value per epoch; the final one is the score.
    return model._train_history.history['val_accuracy'][-1]
    
    

    
    
if __name__ == "__main__":
    import os  # imported here because this cell reads the Postgres settings from the environment

    # Connection details come from the environment; a missing variable raises
    # KeyError right here instead of being mistaken for "study not found".
    study_name = "mnist-classification-kubernetes"
    storage_url = "postgresql://{}:{}@postgres:5432/{}".format(
        os.environ["POSTGRES_USER"],
        os.environ["POSTGRES_PASSWORD"],
        os.environ["POSTGRES_DB"],
    )

    # load_if_exists=True reuses the stored study when there is one and creates
    # it otherwise, so the same pruner is attached in both cases.
    # NOTE(review): `objective` never calls trial.report(), so the pruner has no
    # intermediate values to act on; confirm whether pruning is intended.
    print('loading study (creating it if it does not exist yet)...')
    study = optuna.create_study(
        study_name=study_name,
        storage=storage_url,
        pruner=optuna.pruners.HyperbandPruner(),
        direction='maximize',
        load_if_exists=True,
    )

    study.optimize(objective, n_trials=50)
    print(study.best_trial)



### Kubernetes YAML manifest for the Optuna jobs

# Job "study-creator": waits until Postgres accepts connections, then creates
# the Optuna study in that database.
apiVersion: batch/v1
kind: Job
metadata:
  name: study-creator
spec:
  template:
    spec:
      # Failed containers are restarted in place.
      restartPolicy: OnFailure
      initContainers:
        # Blocks the main container until pg_isready succeeds against host
        # "postgres" on port 5432, polling every 2 seconds.
        # NOTE(review): POSTGRES_USER is presumably supplied by the
        # postgres-secrets Secret referenced below; confirm.
        - name: wait-for-database
          image: postgres:latest
          imagePullPolicy: IfNotPresent
          command:
          - /bin/sh
          - -c
          - -e
          - -x
          - |
            until pg_isready -U $(POSTGRES_USER) -h postgres -p 5432;
            do echo "waiting for postgres"; sleep 2; done;
          envFrom:
            - secretRef:
                name: postgres-secrets
      containers:
        - name: study-creator
          image: optuna-kubernetes:example
          imagePullPolicy: IfNotPresent
          command:
          # Create the study with the optuna CLI; --skip-if-exists makes a re-run harmless.
          # NOTE(review): the study is created as "kubernetes", whereas the Python
          # driver in this notebook uses "mnist-classification-kubernetes";
          # confirm which name is intended.
          - /bin/sh
          - -c
          - -e
          - -x
          - |
            optuna create-study --skip-if-exists --direction maximize \
            --study-name "kubernetes" --storage \
            "postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB}"
          envFrom:
            - secretRef:
                name: postgres-secrets
---
# Job "worker": runs the optimisation script in several pods at once, each
# starting only after the study check succeeds.
apiVersion: batch/v1
kind: Job
metadata:
  name: worker
spec:
  # Up to 5 worker pods run at the same time.
  parallelism: 5
  template:
    spec:
      # Failed containers are restarted in place.
      restartPolicy: OnFailure
      initContainers:
        # Polls check_study.sh every 2 seconds until it prints 0.
        # NOTE(review): check_study.sh is not part of this file; presumably it
        # reports whether the study exists. Verify against the image contents.
        - name: wait-for-study
          image: optuna-kubernetes:example
          imagePullPolicy: IfNotPresent
          command:
          - /bin/sh
          - -c
          - -e
          - -x
          - |
            until [ `sh check_study.sh` -eq 0 ];
            do echo "waiting for study"; sleep 2; done;
          envFrom:
            - secretRef:
                name: postgres-secrets
      containers:
        - name: worker
          image: optuna-kubernetes:example
          imagePullPolicy: IfNotPresent
          # NOTE(review): the worker runs sklearn_distributed.py, not the MNIST
          # objective defined in this notebook; confirm this is the intended script.
          command:
            - python
            - sklearn_distributed.py
          envFrom:
            - secretRef:
                name: postgres-secrets
---