# Training on GPU cluster
One of the advantages of running on a cluster is that you can have a relatively small instance running the notebook server and then send off the training task to a separate compute cluster. That cluster can run powerful (and expensive) GPUs, which spin up to run the training script and then spin down.

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
import prd_pipeline

## Set up Azure experiment


In [5]:
import azureml.core
from azureml.core import Workspace, Datastore, Dataset, Environment
from azureml.core import Experiment, ComputeTarget, ScriptRunConfig

In [6]:
# Workspace handle built from the saved workspace configuration; every Azure ML
# lookup below (dataset, experiment, environment, compute target) goes through it.
prd_ws = Workspace.from_config()

In [29]:
# Switch between the dataset covering all events and a smaller subset that is
# quicker to work with during development.
load_all = False
azure_dataset_name = (
    'prd_merged_all_events_files'
    if load_all
    else 'prd_merged_202110_nswws_amber_oct_files'  # development subset
)

In [7]:
# Names of existing Azure ML resources that are looked up later in this notebook.
azure_experiment_name='prd_mlops_test'  # experiment the training run is submitted under
azure_env_name = 'prd_ml_cluster'  # registered environment used for the run
cluster_name = 'mlops-gpu-test'  # compute target that executes the training script

In [8]:
# Passed to the training script as --model-name and reused as the prefix when
# downloading the run's model files for evaluation.
prd_model_name = 'azml_cluster_demo_20220414'

In [20]:
# Start and end of the glob pattern used to find merged CSV files in the mounted dataset.
merged_prefix = 'prd_merged'
csv_file_suffix = 'csv'

In [33]:
# Column selection handed both to local preprocessing and to the cluster training script.
target_parameter = 'radar_mean_rain_instant'  # passed as the 'target' entry of the feature dict
profile_features = ['air_temperature', 'relative_humidity']  # passed as the 'profile' entry
single_lvl_features = ['air_pressure_at_sea_level']  # passed as the 'single_level' entry

In [27]:
# Look up the registered dataset by name in the workspace; the bare name on the
# last line displays its registration details.
prd_dataset_all = Dataset.get_by_name(prd_ws, azure_dataset_name)
prd_dataset_all

{
  "source": [
    "('precip_rediagnosis_train202209', 'prd/202110_nswws_amber_oct/prd_merged*csv')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "650bb76f-dc17-4c8d-9c3b-7a8c65397919",
    "name": "prd_merged_202110_nswws_amber_oct_files",
    "version": 1,
    "description": "files Dataset for merged data for 202110_nswws_amber_oct.",
    "workspace": "Workspace.create(name='precip_rediagnosis', subscription_id='07efdc52-cd27-48ed-9443-3aad2b6b777b', resource_group='precip_rediagnosis')"
  }
}

In [22]:
import pandas

In [23]:
# Mount the registered file dataset and read every merged CSV into one DataFrame.
# The dataset handle is `prd_dataset_all` (fetched above); the previous name
# `azml_ds` is not defined anywhere in this notebook and fails on a fresh kernel.
# The files are read inside the `with` block, while the mount is still active.
with prd_dataset_all.mount() as prd_ds_mount:
    # Sort the paths so the row order of the concatenated frame is reproducible.
    prd_path_list = sorted(
        pathlib.Path(prd_ds_mount.mount_point).rglob(f'{merged_prefix}*{csv_file_suffix}')
    )
    merged_df = pd.concat([pd.read_csv(p1) for p1 in prd_path_list])

Downloaded path: /tmp/tmpqr7yo48a/e23cc2ea-41a5-4969-b533-7a7a581aa7ec/prd/202110_nswws_amber_oct/prd_merged_20211019T1800Z_20211021T0600Z.csv is different from target path: /tmp/tmpqr7yo48a/e23cc2ea-41a5-4969-b533-7a7a581aa7ec/prd_merged_20211019T1800Z_20211021T0600Z.csv


In [24]:
# Display the concatenated frame as a quick visual check of the loaded data.
merged_df

Unnamed: 0,realization,latitude,longitude,forecast_period,forecast_reference_time,time,cloud_area_fraction,surface_altitude,air_pressure_at_sea_level,rainfall_rate,...,radar_fraction_in_band_aggregate_3hr_0.0,radar_fraction_in_band_aggregate_3hr_0.25,radar_fraction_in_band_aggregate_3hr_2.5,radar_fraction_in_band_aggregate_3hr_7.0,radar_fraction_in_band_aggregate_3hr_10.0,radar_fraction_in_band_instant_0.0,radar_fraction_in_band_instant_0.25,radar_fraction_in_band_instant_2.5,radar_fraction_in_band_instant_7.0,radar_fraction_in_band_instant_10.0
0,0,49.40625,-5.484375,0 days 06:00:00,2021-10-19 12:00:00,2021-10-19 18:00:00,1.000000,0.0,100931.0,1.991540,...,0.0,0.0,0.0,0.781176,0.218824,0.0,0.082353,0.903529,0.021176,0.0
1,1,49.40625,-5.484375,0 days 06:00:00,2021-10-19 12:00:00,2021-10-19 18:00:00,1.000000,0.0,100937.0,2.967194,...,0.0,0.0,0.0,0.781176,0.218824,0.0,0.082353,0.903529,0.021176,0.0
2,2,49.40625,-5.484375,0 days 06:00:00,2021-10-19 12:00:00,2021-10-19 18:00:00,1.000000,0.0,100959.0,4.050136,...,0.0,0.0,0.0,0.781176,0.218824,0.0,0.082353,0.903529,0.021176,0.0
3,3,49.40625,-5.484375,0 days 06:00:00,2021-10-19 12:00:00,2021-10-19 18:00:00,1.000000,0.0,101007.0,0.865012,...,0.0,0.0,0.0,0.781176,0.218824,0.0,0.082353,0.903529,0.021176,0.0
4,4,49.40625,-5.484375,0 days 06:00:00,2021-10-19 12:00:00,2021-10-19 18:00:00,1.000000,0.0,100912.0,2.182648,...,0.0,0.0,0.0,0.781176,0.218824,0.0,0.082353,0.903529,0.021176,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186979,13,58.78125,1.546875,0 days 06:00:00,2021-10-21,2021-10-21 06:00:00,1.000000,0.0,98909.0,0.000000,...,1.0,0.0,0.0,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.0
186980,14,58.78125,1.546875,0 days 06:00:00,2021-10-21,2021-10-21 06:00:00,1.000000,0.0,99012.0,0.000000,...,1.0,0.0,0.0,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.0
186981,15,58.78125,1.546875,0 days 06:00:00,2021-10-21,2021-10-21 06:00:00,0.484375,0.0,98904.0,0.000000,...,1.0,0.0,0.0,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.0
186982,16,58.78125,1.546875,0 days 06:00:00,2021-10-21,2021-10-21 06:00:00,0.796875,0.0,99010.0,0.000000,...,1.0,0.0,0.0,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.0


In [32]:
# Show every column whose name contains 'rain'.
list(filter(lambda col_name: 'rain' in col_name, merged_df.columns))

['rainfall_rate',
 'convective_rainfall_rate',
 'radar_max_rain_aggregate_3hr',
 'radar_mean_rain_aggregate_3hr',
 'radar_max_rain_instant',
 'radar_mean_rain_instant']

In [10]:
# Experiment handle in the workspace; the training run below is submitted through it.
prd_exp = Experiment(workspace=prd_ws, name=azure_experiment_name)
prd_exp

Name,Workspace,Report Page,Docs Page
prd_mlops_test,precip_rediagnosis,Link to Azure Machine Learning studio,Link to Documentation


Get the AzML environment (basically a conda environment) from the workspace.

In [11]:
# Fetch the registered environment by name; the bare name displays its definition.
prd_env = Environment.get(workspace=prd_ws, name=azure_env_name)
prd_env

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04",
        "baseImageRegistry": {
            "address": "mcr.microsoft.com",
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {},
    "inferencingStackVersion": null,
    "name": "prd_ml_cluster",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "conda-forge"
            ],
            "dependencies": [
                "python=3.8",

### Load data

Load the data using the functions from the script so we're not duplicating code.

In [12]:
# Development aid (left disabled): reload an edited module without restarting the kernel.
# NOTE(review): prd_cluster_train_demo is not imported in this notebook; import it
# first (or reload prd_pipeline instead) before uncommenting.
# import importlib
# importlib.reload(prd_cluster_train_demo)

In [35]:
%%time
input_data = prd_pipeline.load_data(
    prd_ws,
    dataset_name=azure_dataset_name
)
data_splits, data_dims = prd_pipeline.preprocess_data(
    input_data,
    feature_dict={'profile': profile_features, 'single_level': single_lvl_features,'target': target_parameter,},
)


loading all event data
Downloaded path: /tmp/tmp9u09eg6o/5f93f6c7-073b-4e2c-86d2-fcbc5c78b40c/prd/202110_nswws_amber_oct/prd_merged_20211019T1800Z_20211021T0600Z.csv is different from target path: /tmp/tmp9u09eg6o/5f93f6c7-073b-4e2c-86d2-fcbc5c78b40c/prd_merged_20211019T1800Z_20211021T0600Z.csv
target has dims: 23
dropping zeros
getting profile columns
['relative_humidity_5.0', 'relative_humidity_10.0', 'relative_humidity_20.0', 'relative_humidity_30.0', 'relative_humidity_50.0', 'relative_humidity_75.0', 'relative_humidity_100.0', 'relative_humidity_150.0', 'relative_humidity_200.0', 'relative_humidity_250.0', 'relative_humidity_300.0', 'relative_humidity_400.0', 'relative_humidity_500.0', 'relative_humidity_600.0', 'relative_humidity_700.0', 'relative_humidity_800.0', 'relative_humidity_1000.0', 'relative_humidity_1250.0', 'relative_humidity_1500.0', 'relative_humidity_1750.0', 'relative_humidity_2000.0', 'relative_humidity_2250.0', 'relative_humidity_2500.0', 'relative_humidity_2750

In [None]:
# these are example calls to the code for easier debugging than running on a separate cluster
# NOTE(review): prd_cluster_train_demo is never imported in this notebook, so it
# must be imported before these lines are uncommented.
# model = prd_cluster_train_demo.build_model(**data_dims)
# model = prd_cluster_train_demo.train_model(model, data_splits)

In [None]:
import datetime
# Log directory name with a timestamp suffix taken from the current local time.
# NOTE(review): log_dir is not referenced again in this notebook; confirm it is still needed.
run_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = f'log/fit/{run_stamp}'

### Execute our training run on a cluster

In [36]:
# Look up the existing compute target by name; the bare name displays its details.
prd_demo_compute_target = ComputeTarget(workspace=prd_ws, name=cluster_name)
prd_demo_compute_target

AmlCompute(workspace=Workspace.create(name='precip_rediagnosis', subscription_id='07efdc52-cd27-48ed-9443-3aad2b6b777b', resource_group='precip_rediagnosis'), name=mlops-gpu-test, id=/subscriptions/07efdc52-cd27-48ed-9443-3aad2b6b777b/resourceGroups/precip_rediagnosis/providers/Microsoft.MachineLearningServices/workspaces/precip_rediagnosis/computes/mlops-gpu-test, type=AmlCompute, provisioning_state=Succeeded, location=uksouth, tags={})

In [37]:
# Command-line arguments forwarded to the training script on the cluster.
# NOTE(review): '--single-level_features' mixes '-' and '_'; confirm it matches
# the option name declared by the training script.
prd_demo_args = [
    '--dataset-name', azure_dataset_name,
    '--target-parameter', target_parameter,
    '--model-name', prd_model_name,
    # Multi-valued options: the flag followed by one entry per feature.
    '--profile-features', *profile_features,
    '--single-level_features', *single_lvl_features,
]

prd_demo_args

['--dataset-name',
 'prd_merged_202110_nswws_amber_oct_files',
 '--target-parameter',
 'radar_mean_rain_instant',
 '--model-name',
 'azml_cluster_demo_20220414',
 '--profile-features',
 'air_temperature',
 'relative_humidity',
 '--single-level_features',
 'air_pressure_at_sea_level']

In [38]:
# Describe the training job: which script to run, with which arguments, on which
# compute target and in which environment.
# The source directory is the notebook's current working directory. It is taken
# from pathlib (already imported) because `os` is never imported in this
# notebook, so the previous os.getcwd() call fails on a fresh kernel.
prd_run_src = ScriptRunConfig(
    source_directory=str(pathlib.Path.cwd()),
    script='prd_cluster_train_demo.py',
    arguments=prd_demo_args,
    compute_target=prd_demo_compute_target,
    environment=prd_env,
)

In [39]:
# Submit the configured job to the experiment; the bare name displays the run's
# current status and links.
prd_run = prd_exp.submit(prd_run_src)
prd_run

Experiment,Id,Type,Status,Details Page,Docs Page
prd_mlops_test,prd_mlops_test_1663332556_40876e65,azureml.scriptrun,Queued,Link to Azure Machine Learning studio,Link to Documentation


## Evaluation

We now get the trained model to do some evaluation and create some plots.

In [None]:
import tempfile

In [None]:
import tensorflow.keras

We download the model file into a temporary directory (so as not to pollute the local workspace) and load it into memory to do inference.

In [None]:
# The run is only queued when it is submitted, so block until it has finished
# before asking for its output files (raises if the run failed).
prd_run.wait_for_completion(show_output=False)

# Download into a temporary directory and load the model while that directory
# still exists; it is removed on leaving the `with` block.
with tempfile.TemporaryDirectory() as td1:
    td_path = pathlib.Path(td1)
    prd_run.download_files(prefix=prd_model_name, output_directory=td1)
    model_path = td_path / prd_model_name
    # Fail with a clear message if nothing was downloaded under the expected prefix.
    if not model_path.exists():
        raise FileNotFoundError(
            f'no files with prefix {prd_model_name!r} were downloaded from the run'
        )
    trained_model = tensorflow.keras.models.load_model(model_path)

In [None]:
# Display the loaded model object to confirm it was loaded.
trained_model

In [None]:
# List the available splits; 'X_test' and 'y_test' are used for evaluation below.
data_splits.keys()

In [None]:
# Run inference with the downloaded model on the test split inputs.
y_pred = trained_model.predict(data_splits['X_test'])

In [None]:
# Scatter of predicted against actual target values for the test split, with a
# dashed 1:1 reference line from 0 to 300.
# NOTE(review): the axis labels mention a 3hr accumulation while target_parameter
# is set to 'radar_mean_rain_instant' — confirm which quantity is plotted.
fig1, ax1 = plt.subplots(figsize=(10, 8))
ax1.scatter(data_splits['y_test'], y_pred, s=200, c='darkblue')
diagonal = [0, 300]
ax1.plot(diagonal, diagonal, ls="--", c=".3")
ax1.set_xlabel('Actual 3hr precip accumulation value')
ax1.set_ylabel('Predicted 3hr precip_accumulation value')

In [None]:
# Attach the evaluation figure to the run under the name 'actual_vs_pred'.
prd_run.log_image(name='actual_vs_pred', plot=fig1, description='predicted vs actual 3hr accumulations of rainfall')

In [None]:
# Mark the run as completed now that the evaluation figure has been logged.
prd_run.complete()