In [1]:
%matplotlib inline

In [2]:
import tempfile

In [3]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.layers import Conv1D, concatenate
from tensorflow.keras.layers import ZeroPadding1D, Reshape, Input, Dropout, PReLU
from tensorflow.keras.models import Sequential, Model

from sklearn.metrics import mean_absolute_error, r2_score

In [6]:
import mlflow

### Load data

In [7]:
import azureml.core
from azureml.core import Workspace, Datastore, Dataset
from azureml.core import Experiment

# Workspace handle loaded via from_config(); `ws` is what the dataset lookup
# and the experiment creation further down are given.
ws = Workspace.from_config()

In [8]:
# Display the workspace's MLflow tracking URI (the value is not stored).
# NOTE(review): the rendered output embeds the subscription ID and resource
# group name -- consider clearing it before sharing the notebook.
ws.get_mlflow_tracking_uri()

'azureml://uksouth.api.azureml.ms/mlflow/v1.0/subscriptions/07efdc52-cd27-48ed-9443-3aad2b6b777b/resourceGroups/precip_rediagnosis/providers/Microsoft.MachineLearningServices/workspaces/precip_rediagnosis?'

In [9]:
# Look up the dataset registered under the name 'sd3' in the workspace and
# materialise it as a pandas DataFrame.
dataset = Dataset.get_by_name(ws, name='sd3')
data = dataset.to_pandas_dataframe()

In [10]:
# Column the model is trained to predict.
target_parameter = 'rainfall_rate_composite'
# Name prefixes of the profile features: every column of `data` whose name
# starts with one of these is selected as a profile input.
profile_features = ['air_temperature', 'relative_humidity']
# Single-level features, selected by their exact column names.
single_lvl_features = ['air_pressure_at_sea_level'] 

In [11]:
# Name used for the saved-model folder, for the folder uploaded to the run,
# and for the registered model at the end of the notebook.
prd_model_name = 'pipeline_demo_20220414'

## Create an Azure experiment


In [None]:
# Experiment object named 'prd_mlops_test' in the workspace; the logging run
# used later in the notebook is started from it.
prd_exp = Experiment(workspace=ws, name='prd_mlops_test')
# Display the experiment object as the cell output.
prd_exp

### Data pre-processing

In [None]:
# Remove every row that contains a missing value.
data = data.dropna()

# Keep only the rows whose target (radar precipitation) is strictly positive,
# i.e. discard the zero-precip data points.
wet_rows = data[target_parameter] > 0
data = data[wet_rows]

In [None]:
# Get a list of column names for the profile features: every column of `data`
# whose name starts with one of the prefixes in `profile_features`, kept in the
# order the columns appear in `data`.
# str.startswith accepts a tuple of prefixes, so a single test per column is
# enough; this avoids shadowing the builtin `vars` and cannot list a column
# twice if it happened to match more than one prefix.
profile_prefixes = tuple(profile_features)
prof_feature_columns = [col for col in data.columns if col.startswith(profile_prefixes)]

# Model inputs: all profile columns followed by the single-level columns.
features = data[prof_feature_columns + single_lvl_features]

# Double brackets keep the target as a one-column DataFrame (not a Series).
target = data[[target_parameter]]

In [None]:
# data_control uses only the control member

# data_control = data[data['realization']==0]

# # Get a list of columns names for profile features
# prof_feature_columns = [s for s in data.columns for vars in profile_features if s.startswith(vars)]

# features = data_control[prof_feature_columns + single_lvl_features]

# target = data_control[[target_parameter]]

In [None]:
# Inspect the dtype of every selected feature column.
features.dtypes

In [None]:
# Standardise every feature column, then wrap the scaled values back into a
# DataFrame carrying the original column labels and row index.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so test rows contribute to the scaling
# statistics -- confirm this is intended.
standardScaler = StandardScaler()
scaled_values = standardScaler.fit_transform(features)

features = pd.DataFrame(scaled_values, index=features.index, columns=features.columns)

In [None]:
# features.describe().T

In [None]:
# Put the scaled features and the (unscaled) target side by side, column-wise.
processed_data = pd.concat([features, target], axis=1, sort=False)
# Display the combined frame as the cell output.
processed_data

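Split the dataset 60/20/20 into training, validation and test subsets: 20% is held out for testing here, and 25% of the remaining 80% (i.e. 20% of the total) is used for validation via `validation_split=0.25` when the model is fitted.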

In [None]:
# Split the height-profile features, the single-level features and the target
# in ONE call. train_test_split applies the same row partition to every array
# it is given, so the three train/test pairs are aligned by construction
# rather than by relying on separate calls that share a random_state value.
(
    X_train_prof, X_test_prof,
    X_train_singlvl, X_test_singlvl,
    y_train, y_test,
) = train_test_split(
    features[prof_feature_columns],   # height profiles data
    features[single_lvl_features],    # single level data
    target,
    test_size=0.2,    # hold out 20% of the rows for testing
    random_state=1,   # fixed seed so the partition is reproducible
)

In [None]:
# reshape height profile variables
# The flat profile columns, (samples, variables * levels), are viewed as
# (samples, variables, levels) and then transposed to (samples, levels,
# variables), which is the layout of the model's profile input.
# The sizes are derived from the feature lists instead of being hard-coded, so
# changing `profile_features` or the number of profile columns keeps this cell
# working; a column count that does not divide evenly makes reshape fail loudly.
# assumes the profile columns are ordered variable by variable (all levels of
# the first variable, then all levels of the next) -- TODO confirm
n_profile_vars = len(profile_features)
n_profile_levels = len(prof_feature_columns) // n_profile_vars

X_train_prof = np.transpose(
    X_train_prof.to_numpy().reshape(X_train_prof.shape[0], n_profile_vars, n_profile_levels),
    (0, 2, 1),
)
X_test_prof = np.transpose(
    X_test_prof.to_numpy().reshape(X_test_prof.shape[0], n_profile_vars, n_profile_levels),
    (0, 2, 1),
)

In [None]:
def build_model(nprof_features, nheights, nsinglvl_features):
    """Build the (uncompiled) Keras regression model.

    The profile input passes through three zero-padded 1D convolutions; the
    output of the first convolution is added back onto the output of the third
    (skip connection). The result is flattened and projected by a dense layer
    to ``nheights * nprof_features`` units.

    When single-level features are present they are concatenated onto that
    projection, the raw inputs (flattened profiles + single-level values) are
    added element-wise, and two 1024-unit dense layers precede the output.
    Otherwise the projection feeds the output layer directly.

    Args:
        nprof_features: Number of profile variables (last axis of the
            profile input).
        nheights: Number of entries along the profile axis.
        nsinglvl_features: Number of single-level inputs; when 0 the model
            takes the profile input only.

    Returns:
        A ``Model`` with a single linear output unit named ``main_output``.
        Its inputs are ``[profile_input, surf_input]`` when
        ``nsinglvl_features > 0`` and ``[profile_input]`` otherwise.
    """

    def padded_conv(tensor):
        # Pad one step on each side so the kernel-size-3, stride-1 convolution
        # keeps the sequence length, which the skip-connection add relies on.
        padded = ZeroPadding1D(padding=1)(tensor)
        return Conv1D(32, 3, strides=1, activation='relu', use_bias=False,
                      kernel_initializer='glorot_uniform', bias_initializer='zeros')(padded)

    profile_input = Input(shape=(nheights, nprof_features), name='profile_input')
    prof_size = nheights*nprof_features

    # Convolutional stack; `ident` is the branch re-added after the third conv.
    ident = padded_conv(profile_input)
    out = padded_conv(ident)
    out = padded_conv(out)
    x = tf.keras.layers.add([out, ident])

    # Project back to the size of the flattened profile input so it can later
    # be added element-wise to the raw inputs.
    out = Flatten()(x)
    out = Dense(prof_size, use_bias=False, activation='relu')(out)

    if nsinglvl_features > 0:
        surf_input = Input(shape=(nsinglvl_features,), name='surf_input')

        # Raw inputs: flattened profiles followed by the single-level values.
        flat_profs = Flatten()(profile_input)
        raw_in = tf.keras.layers.concatenate([flat_profs, surf_input])

        # Same width as raw_in (prof_size + nsinglvl_features), so the two
        # tensors can be summed.
        out = tf.keras.layers.concatenate([out, surf_input])
        x = tf.keras.layers.add([out, raw_in])
        x = Dense(1024, use_bias=False, activation='relu')(x)
        x = Dense(1024, use_bias=False, activation='relu')(x)

        main_output = Dense(1, use_bias=True, activation='linear', name='main_output')(x)
        model = Model(inputs=[profile_input, surf_input], outputs=[main_output])

    else:
        main_output = Dense(1, use_bias=True, activation='linear', name='main_output')(out)
        model = Model(inputs=[profile_input], outputs=[main_output])

    return model

In [None]:
# Input dimensions handed to build_model, derived from the feature lists.
# Number of profile variables.
nprof_features = len(profile_features)
# Profile columns per variable; assumes every profile variable contributes the
# same number of columns -- TODO confirm.
nheights = len(prof_feature_columns)//len(profile_features)
# Number of single-level inputs (0 selects the profile-only model).
nsinglvl_features = len(single_lvl_features)

In [None]:
# Shape the inputs to match the model: build_model takes
# [profile, single-level] inputs when there are single-level features and the
# profile input alone otherwise.
X_train = [X_train_prof, X_train_singlvl] if nsinglvl_features > 0 else X_train_prof
X_test = [X_test_prof, X_test_singlvl] if nsinglvl_features > 0 else X_test_prof

In [None]:
# Quick check of the shape of the single-level training features.
X_train_singlvl.shape

In [None]:
import datetime
# Timestamped log directory path of the form 'log/fit/YYYYMMDD-HHMMSS'.
# NOTE(review): log_dir is not referenced again in the visible notebook (no
# callback is passed to model.fit) -- confirm whether it is still needed.
log_dir = 'log/fit/' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

In [None]:
# Start a logging run on the experiment, labelled with the current timestamp
# so that successive runs can be told apart.
run_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
prd_run = prd_exp.start_logging(display_name='mlops_demo_' + run_stamp)


In [None]:
# Record the learning rate against the run. The value is a literal, so it must
# be kept in sync by hand with the learning_rate given to the Adam optimiser
# where the model is compiled.
prd_run.log('learning_rate', 0.001)

In [None]:

# Instantiate the network for the current feature configuration and print its
# layer-by-layer summary.
model = build_model(
    nprof_features=nprof_features,
    nheights=nheights,
    nsinglvl_features=nsinglvl_features,
)
model.summary()

# Compile with the Adam optimiser and mean absolute error as the loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mean_absolute_error')




We can save out the model architecture to a JSON file, so that it can be reloaded and trained in a nice reproducible way!
It can be loaded back in by calling `tf.keras.models.model_from_json`.

See [this](https://towardsdatascience.com/saving-and-loading-keras-model-42195b92f57a) article.


In [None]:
# Serialise the model architecture to JSON. The result is only displayed as
# the cell output here; it is not written to a file.
model.to_json()

In [None]:
# Train for a single epoch in batches of 32, holding out 25% of the training
# data for validation; `history` keeps the per-epoch losses used in the
# evaluation section.
history = model.fit(X_train, y_train, epochs=1, batch_size=32, validation_split=0.25, verbose=True)

In [None]:
# Record the batch size against the run. The value is a literal, so it must be
# kept in sync by hand with the batch_size passed to model.fit.
prd_run.log('batch_size', 32)

In [None]:
# Predict the target for the held-out test inputs.
y_pred = model.predict(X_test)

In [None]:
# Score the predictions against the held-out test targets and record both
# metrics on the run.
error = mean_absolute_error(y_test, y_pred)
prd_run.log('MAE', error)
rsqrd = r2_score(y_test, y_pred)
# Plain string literal: the metric name has no placeholders, so no f-string.
prd_run.log('R-squared score', rsqrd)

## Evaluation

In [None]:
# One row per epoch: the metrics recorded during fitting plus, as the last
# column, the epoch index.
training_hist_df = pd.DataFrame({**history.history, 'epoch': history.epoch})

In [None]:
# Training and validation loss per epoch.
plt.figure(figsize=(10, 8))
epochs = training_hist_df['epoch']
plt.plot(epochs, training_hist_df['loss'], label='training')
plt.plot(epochs, training_hist_df['val_loss'], c='g', label='validation')
plt.ylabel('MAE [mm of precipitation]')
plt.legend()
plt.xlabel('epochs')

In [None]:
# Scatter of actual vs predicted test values; `fig1` is kept so the figure
# can be logged to the run afterwards.
fig1, ax1 = plt.subplots(figsize=(10, 8))
ax1.scatter(y_test, y_pred, s=200, c='darkblue')

# Dashed 1:1 reference line from 0 to 300.
one_to_one = [0, 300]
ax1.plot(one_to_one, one_to_one, ls="--", c=".3")

ax1.set_xlabel('Actual 3hr precip accumulation value')
ax1.set_ylabel('Predicted 3hr precip_accumulation value')

In [None]:
# Attach the actual-vs-predicted scatter figure (fig1) to the run.
# NOTE(review): the description speaks of 3hr accumulations while the target
# column is 'rainfall_rate_composite' -- confirm the wording/units.
prd_run.log_image(name='actual_vs_pred', plot=fig1, description='predicted vs actual 3hr accumulations of rainfall')

In [None]:
# Save the trained model into a temporary directory, upload that folder to the
# run under the model name, then register the uploaded folder as a model.
# Everything happens inside the `with` block because the temporary directory
# is cleaned up when the block exits.
with tempfile.TemporaryDirectory() as td1:
    model_save_path = pathlib.Path(td1) / prd_model_name
    model.save(model_save_path)
    # upload_folder is handed the local path as a plain string.
    prd_run.upload_folder(name=prd_model_name, path=str(model_save_path))
    # Second argument: the folder name used for the upload just above, with a
    # trailing slash.
    prd_run.register_model(prd_model_name, prd_model_name + '/')


In [None]:
# Mark the run as finished now that metrics, figure and model are logged.
prd_run.complete()

## Control member only plots

In [None]:
# Display the run object as the cell output.
prd_run

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch.
# NOTE(review): the title says 'control member only', but the cell that
# restricts the data to the control member is commented out earlier in the
# notebook -- confirm the title matches the data actually used.
plt.figure(figsize=(10, 8))
history_epochs = training_hist_df['epoch']
plt.plot(history_epochs, training_hist_df['loss'], label='training')
plt.plot(history_epochs, training_hist_df['val_loss'], c='g', label='validation')

plt.title('control member only')
plt.xlabel('epochs')
plt.ylabel('MAE [mm of precipitation]')
plt.legend()
plt.show()

In [None]:
# Re-run inference on the test inputs and draw actual vs predicted values.
y_pred = model.predict(X_test)

plt.figure(figsize=(10, 8))
plt.scatter(y_test, y_pred, s=200, c='darkblue')

# Dashed 1:1 reference line from 0 to 300 on the current axes.
diagonal = [0, 300]
plt.plot(diagonal, diagonal, ls="--", c=".3")

plt.xlabel('Actual 3hr precip accumulation value')
plt.ylabel('Predicted 3hr precip_accumulation value')
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Compute both test-set scores first, then report them to three decimals.
error = mean_absolute_error(y_test, y_pred)
rsqrd = r2_score(y_test, y_pred)

print(f'MAE: {error:.3f}')
print(f'R-squared score: {rsqrd:.3f}')

In [None]:
# Overlay the distributions of the actual and predicted test values, using
# the same number of bins and transparency for both.
plt.figure(figsize=(10, 8))
for values, series_label in ((y_test, 'Actual'), (y_pred, 'Predicted')):
    plt.hist(values, alpha=0.5, bins=25, label=series_label)
plt.legend()
plt.show()