# ML Flow on Azure ML

The ML ops demo notebook shows how to run ML Flow on a local machine, and the AzureML notebook demonstrates using the Azure ML SDK for experiment tracking. This notebook combines the two: it runs on AzureML but tracks experiments through the ML Flow API, with AzureML providing the backend storage. This lets us make use of the easily scalable infrastructure of AzureML while keeping the code portable, since other backends can easily be swapped in when required.


In [1]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
%load_ext tensorboard

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.layers import Conv1D, concatenate
from tensorflow.keras.layers import ZeroPadding1D, Reshape, Input, Dropout, PReLU
from tensorflow.keras.models import Sequential, Model

from sklearn.metrics import mean_absolute_error, r2_score

In [5]:
import mlflow
# Turn on ML Flow autologging for TensorFlow/Keras before any model is built,
# so the training run later in the notebook is tracked without explicit log calls.
mlflow.tensorflow.autolog()



In [6]:
import prd_pipeline

### Load data

In [7]:
import azureml.core
from azureml.core import Workspace, Datastore, Dataset
from azureml.core import Experiment

# Handle to the Azure ML workspace; used below for the ML Flow tracking URI and dataset lookup.
# NOTE(review): from_config() presumably reads a workspace config file available on this compute — confirm.
prd_ws = Workspace.from_config()

In [8]:
# azure_dataset_name ='sd3'
# azure_experiment_name='prd_mlops_test'
# azure_env_name = 'prd_ml_cluster'
# cluster_name = 'mlops-test'

In [9]:
# prd_model_name = 'azml_mlflow_20220504'

In [10]:
# Target columns: one radar-fraction column per intensity band threshold.
target_parameter = [
    f'radar_fraction_in_band_instant_{band_threshold}'
    for band_threshold in ('0.25', '2.5', '7.0', '10.0')
]
# Input features that vary with height (profile).
profile_features = ['air_temperature', 'relative_humidity']
# Single-level input features; currently none ('air_pressure_at_sea_level' is a disabled candidate).
single_lvl_features = []

In [11]:
# Bundle the three feature groups under the keys passed on to prd_pipeline.preprocess_data.
feature_dict = dict(
    profile=profile_features,
    single_level=single_lvl_features,
    target=target_parameter,
)

In [12]:
# Point ML Flow at the tracking endpoint exposed by the Azure ML workspace.
azureml_tracking_uri = prd_ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(azureml_tracking_uri)

In [13]:
# input_data = prd_pipeline.load_data(
#     prd_ws,
#     dataset_name=azure_dataset_name
# )
# data_splits, data_dims = prd_pipeline.preprocess_data(
#     input_data,
#     test_fraction=0.2,
#     feature_dict={'profile': profile_features, 'single_level': single_lvl_features,'target': target_parameter,},
# )

In [14]:
# Event configuration directory: <parent of the working directory>/data_prep/event_configs.
configs_dir = pathlib.Path.cwd().parent.joinpath('data_prep', 'event_configs')
# Show whether the directory exists, alongside its resolved location.
(configs_dir.is_dir(), configs_dir)

(True,
 PosixPath('/mnt/batch/tasks/shared/LS_root/mounts/clusters/prd-ml-fractions/code/Users/hannah.brown/precip_rediagnosis/data_prep/event_configs'))

In [15]:
# Collect the event config files from configs_dir.
# The filter inspects only the file name and its extension, not the whole path string,
# so a parent directory that happens to contain '20' or 'json' cannot change which
# files are selected. Sorting gives a stable order, since iterdir() order is arbitrary.
# NOTE(review): '20' presumably matches a year in the file name — confirm the naming scheme.
config_path_list = sorted(
    config_path
    for config_path in configs_dir.iterdir()
    if config_path.suffix == '.json' and '20' in config_path.name
)

In [16]:
# File-name pieces used to recognise the merged CSV files inside the dataset.
prd_prefix = 'prd'
merged_prefix = f'{prd_prefix}_merged'
csv_file_suffix = 'csv'

# Look up the registered dataset holding the merged CSV files in the workspace.
prd_merged_file_dataset_name = 'prd_merged_csv_files'
train202208_dataset_all = azureml.core.Dataset.get_by_name(
    prd_ws,
    name=prd_merged_file_dataset_name,
)

In [17]:
def get_file_name(dataset_config):
    """Build the merged-CSV file name for one event configuration.

    Parameters
    ----------
    dataset_config : mapping
        Must provide 'event_start' and 'event_end' as strings in the form
        '%Y-%m-%dT%H:%MZ', and 'target_time_delta' as something float() accepts.

    Returns
    -------
    str
        'prd_merged_<start>_<final>.csv', where <start> is the event start and
        <final> is the latest time returned by drivers.calc_dates_list, both
        formatted as YYYYMMDDTHHMMZ.
    """
    # Imported locally so the function does not depend on a later notebook cell
    # having already imported datetime.
    import datetime

    def _compact_stamp(moment):
        # Only the year/month/day/hour/minute attributes are read.
        return (
            f'{moment.year:04d}{moment.month:02d}{moment.day:02d}'
            f'T{moment.hour:02d}{moment.minute:02d}Z'
        )

    config_time_format = '%Y-%m-%dT%H:%MZ'
    event_start_dt = datetime.datetime.strptime(dataset_config['event_start'], config_time_format)
    event_end_dt = datetime.datetime.strptime(dataset_config['event_end'], config_time_format)

    # NOTE(review): `drivers` is not imported in any notebook cell visible here;
    # it must be brought into scope before this function is called.
    times_list = drivers.calc_dates_list(
        event_start_dt, event_end_dt, float(dataset_config['target_time_delta']))

    # NOTE(review): the original comment said the extract is not inclusive of the
    # final date "so need second last time", yet the maximum is taken — confirm
    # that calc_dates_list already excludes the event end.
    final_timestamp = max(times_list)

    return f'prd_merged_{_compact_stamp(event_start_dt)}_{_compact_stamp(final_timestamp)}.csv'

In [18]:
# Mount the registered file dataset, find every merged CSV beneath the mount point
# and concatenate them into a single DataFrame.
with train202208_dataset_all.mount() as train202208_file_mount:
    print('loading all data')
    mount_root = pathlib.Path(train202208_file_mount.mount_point)
    csv_pattern = f'{merged_prefix}*{csv_file_suffix}'
    prd_path_list = list(mount_root.rglob(csv_pattern))
    merged_df = pd.concat([pd.read_csv(csv_path) for csv_path in prd_path_list])

Not mounting as a volume: ArgumentError(InvalidArgument { argument: "arguments.path", expected: "Glob patterns inside the path are not supported by the volume mount.Path must be a direct path to the file or folder, or end with '/**' or '/**/*' to match the entire content of the volume.", actual: "REDACTED" }). 
Falling back to dataflow mount.
loading all data


In [19]:
test_fraction = 0.2
# Fixed seed so the train/test partition is the same on every run of the notebook;
# an unseeded RandomState would give a different split each time.
split_seed = 42

# Hold out the test set, sampling on the time/latitude/longitude columns.
df_train, df_test = prd_pipeline.random_time_space_sample(
    merged_df,
    test_fraction=test_fraction,
    random_state=np.random.RandomState(split_seed),
    sampling_columns=['time', 'latitude', 'longitude'],
)

# The second split works on the remaining (1 - test_fraction) of the data, so
# 0.2 / 0.8 = 0.25 of it equals 20% of the full dataset.
# NOTE(review): presumably preprocess_data uses this fraction for a validation split — confirm.
data_splits, data_dims_dict = prd_pipeline.preprocess_data(
    df_train, feature_dict, test_fraction=test_fraction / (1 - test_fraction))

target has dims: 4
dropping smallest bin: radar_fraction_in_band_instant_0.25
getting profile columns
{'profile': ['air_temperature', 'relative_humidity'], 'single_level': [], 'target': ['radar_fraction_in_band_instant_0.25', 'radar_fraction_in_band_instant_2.5', 'radar_fraction_in_band_instant_7.0', 'radar_fraction_in_band_instant_10.0']}


In [21]:
# Reuse the experiment when it already exists. mlflow.create_experiment raises if
# the name is taken, which would make this cell fail on every run after the first.
experiment_name = 'prd_fraction_models'
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    exp1 = mlflow.create_experiment(experiment_name)
else:
    exp1 = existing_experiment.experiment_id
# exp1 holds the experiment ID (a string) in both branches.
exp1

'7ccf9441-4968-4c88-9b08-154061256d80'

In [22]:
# Replace the experiment ID with the full Experiment object.
# getattr keeps the cell idempotent: on a re-run exp1 is already an Experiment,
# so its experiment_id is used rather than passing the object itself as an ID.
exp1 = mlflow.get_experiment(getattr(exp1, 'experiment_id', exp1))

In [23]:
# Display the Experiment object (name, ID, lifecycle stage) as the cell output.
exp1

<Experiment: artifact_location='', experiment_id='7ccf9441-4968-4c88-9b08-154061256d80', lifecycle_stage='active', name='prd_fraction_models', tags={}>

In [24]:
import datetime

# Timestamped TensorBoard log directory, e.g. log/fit/20220823-112836.
run_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = f'log/fit/{run_stamp}'

In [25]:
# TensorBoard callback that writes to the timestamped log_dir, with histogram_freq=1.
# NOTE(review): this callback is not passed to prd_pipeline.train_model in the cells
# visible here — confirm whether it is actually used during training.
tensorflow_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# run tensorboard --logdir LOGDIRPATH from command line to launch 

2022-08-23 11:28:36.970461: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session started.
2022-08-23 11:28:36.971857: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1


In [26]:
import tempfile

In [27]:
# Training settings handed to prd_pipeline.train_model.
hyperparameter_dict = dict(
    loss_function=tf.keras.losses.KLDivergence(),
    epochs=20,
    learning_rate=0.001,
    batch_size=500,
)

In [32]:
# Build and train the model inside an ML Flow run attached to the experiment above.
# `current_run` is bound by the with-statement; no visible cell reads it afterwards.
with mlflow.start_run(experiment_id=exp1.experiment_id) as current_run:
    print('starting')
    # data_dims_dict (returned by preprocess_data) is unpacked as keyword arguments.
    model = prd_pipeline.build_model(**data_dims_dict)
    print('model built')
    model.summary()
    print('training_model')
    # train_model's return value replaces the untrained model under the same name.
    model = prd_pipeline.train_model(model, data_splits, hyperparameter_dict)

starting


2022-08-23 12:10:28.808615: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-08-23 12:10:28.808716: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (prd-ml-fractions): /proc/driver/nvidia/version does not exist
2022-08-23 12:10:28.811184: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2022-08-23 12:10:28.878904: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2593905000 Hz
2022-08-23 12:10:28.880543: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fe8e4000b60 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-08-23 12:10:28.880579: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


model built
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
profile_input (InputLayer)      [(None, 33, 2)]      0                                            
__________________________________________________________________________________________________
zero_padding1d (ZeroPadding1D)  (None, 35, 2)        0           profile_input[0][0]              
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 33, 32)       192         zero_padding1d[0][0]             
__________________________________________________________________________________________________
zero_padding1d_1 (ZeroPadding1D (None, 35, 32)       0           conv1d[0][0]                     
__________________________________________________________________________________

2022-08-23 12:10:29.309437: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session started.


Epoch 1/20
   6/1236 [..............................] - ETA: 30s - loss: 0.5842

2022-08-23 12:10:32.582683: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session started.
2022-08-23 12:10:32.632518: I tensorflow/core/profiler/rpc/client/save_profile.cc:168] Creating directory: /tmp/tmpkuym0dhr/train/plugins/profile/2022_08_23_12_10_32
2022-08-23 12:10:32.642345: I tensorflow/core/profiler/rpc/client/save_profile.cc:174] Dumped gzipped tool data for trace.json.gz to /tmp/tmpkuym0dhr/train/plugins/profile/2022_08_23_12_10_32/prd-ml-fractions.trace.json.gz
2022-08-23 12:10:32.656968: I tensorflow/core/profiler/utils/event_span.cc:288] Generation of step-events took 0.024 ms

2022-08-23 12:10:32.674613: I tensorflow/python/profiler/internal/profiler_wrapper.cc:87] Creating directory: /tmp/tmpkuym0dhr/train/plugins/profile/2022_08_23_12_10_32Dumped tool data for overview_page.pb to /tmp/tmpkuym0dhr/train/plugins/profile/2022_08_23_12_10_32/prd-ml-fractions.overview_page.pb
Dumped tool data for input_pipeline.pb to /tmp/tmpkuym0dhr/train/plugins/profi

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Using TensorFlow backend.


If we look at the experiment in the AzureML GUI, we see that all the model parameters have been logged automatically, and that the model has been saved by ML Flow, ready for use in inference.

In [None]:
# training_hist_df = pd.DataFrame(history.history)
# training_hist_df['epoch'] = history.epoch

In [None]:
# plt.figure(figsize=(10, 8))
# plt.plot(training_hist_df.epoch, training_hist_df.loss, label='training')
# plt.plot(training_hist_df.epoch, training_hist_df.val_loss, c='g', label='validation')
# plt.legend()
# plt.ylabel('MAE [mm of precipitation]')
# plt.xlabel('epochs')
# plt.show()

In [None]:
# plt.figure(figsize=(10, 8))
# plt.hist(data_splits['y_val'], alpha=0.5, bins=40, label='Actual')
# plt.hist(y_pred, alpha=0.5, bins=40, label='Predicted')
# plt.legend()
# plt.show()