# AMLD 2021 - Workshop

> ##### Machine Learning in Science: Encoding physical constraints and good development practices


## Example 03 - Tools: Model tracking with MLflow

In this notebook, we introduce MLflow. MLflow makes it easy to keep track of the models you have trained, their hyperparameters, and their performance.

### Workshop Organizers

* Dr. Maria Han Veiga (University of Michigan, USA)

* Dr. Miles Timpe (University of Zurich, Switzerland)

In [1]:
def train(in_target, in_n1, in_n2=0, in_n3=0):
    """Train, evaluate, and log a single MLP regressor with MLflow.

    The function reads ``../datasets/train.csv`` and ``../datasets/test.csv``,
    standardizes features and target, fits an ``MLPRegressor``, prints the
    test metrics, and logs parameters, metrics, and the fitted model to the
    active MLflow tracking location. The fitted scalers are written to
    ``../models/mlflow/``.

    Args:
        in_target: Column to predict. Must be one of ``'lr_mass'``,
            ``'slr_mass'`` or ``'debris_mass'``.
        in_n1: Number of neurons in the first hidden layer. Must convert to
            an integer greater than zero.
        in_n2: Number of neurons in the second hidden layer. Values <= 0
            mean "no second layer".
        in_n3: Number of neurons in the third hidden layer. Values <= 0 mean
            "no third layer". Ignored when there is no second layer.

    Raises:
        ValueError: If ``in_target`` is not a supported target or ``in_n1``
            is not a positive integer.
        Exception: Any error raised while reading the CSV files is logged
            and re-raised unchanged.
    """

    # Validate the arguments first so that a bad call fails immediately,
    # before any heavy import, file access, or MLflow run is started.
    valid_targets = ('lr_mass', 'slr_mass', 'debris_mass')
    if in_target not in valid_targets:
        raise ValueError(
            f"Target name is invalid! Got {in_target!r}, "
            f"expected one of {valid_targets}.")
    target = in_target

    n1 = int(in_n1)
    if n1 <= 0:
        raise ValueError(f"n1 must be greater than zero! Got {in_n1!r}.")

    # Non-positive sizes for the optional layers mean "layer not used"
    n2 = max(int(in_n2), 0)
    n3 = max(int(in_n3), 0)

    import os
    import random
    import warnings

    from joblib import dump
    import pandas as pd
    import numpy as np
    from sklearn import preprocessing
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.neural_network import MLPRegressor

    import mlflow
    import mlflow.sklearn

    import logging
    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    # We can define a global seed value to make our lives easier
    seed = 42

    # Set random seeds
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it
    # here only affects subprocesses launched afterwards.
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    warnings.filterwarnings("ignore")

    def eval_metrics(actual, pred):
        """Return (RMSE, MAE, R2) of ``pred`` against ``actual``."""
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    def neural_network(n1, n2, n3):
        """Build an MLP with one to three hidden layers.

        Layers are taken in order and the stack stops at the first layer of
        size zero, so ``n3`` is only used when ``n2`` is non-zero.
        """
        layer_sizes = [n1]
        if n2 != 0:
            layer_sizes.append(n2)
            if n3 != 0:
                layer_sizes.append(n3)

        # Define model (MLP) with sklearn
        return MLPRegressor(hidden_layer_sizes=layer_sizes,
                            max_iter=1000, early_stopping=True,
                            random_state=seed)

    def load_data(target):
        """Load the train and test datasets and split off the target column.

        Rows with a non-positive ``lr_mass`` are dropped from both splits.
        """
        features = ['mtotal', 'gamma', 'b_inf', 'v_inf',
                    'targ_core_fraction', 'targ_omega', 'targ_theta', 'targ_phi',
                    'proj_core_fraction', 'proj_omega', 'proj_theta', 'proj_phi',
                    'lr_mass']
        # 'lr_mass' is always needed for filtering; avoid listing it twice
        if target not in features:
            features.append(target)

        splits = []
        for split_name in ('train', 'test'):
            try:
                frame = pd.read_csv(f'../datasets/{split_name}.csv',
                                    usecols=features)
            except Exception as e:
                logger.exception(
                    "Unable to load %s.csv! Error: %s", split_name, e)
                # Without the data there is nothing to train on: propagate
                raise

            # Copy so the pops below act on an independent frame, not on a
            # filtered view of the frame returned by read_csv
            frame = frame[frame['lr_mass'] > 0].copy()

            if target != 'lr_mass':
                frame.pop('lr_mass')

            labels = frame.pop(target)
            splits.append((frame, labels))

        (x_train, y_train), (x_test, y_test) = splits
        return x_train, y_train, x_test, y_test

    def scale_data(x_train, y_train, x_test, y_test, target):
        """Standardize features and target using statistics of the train set.

        Both scalers are saved under ``../models/mlflow/``. Returns the four
        scaled datasets followed by the fitted target scaler, which is
        needed to map predictions back to the original units.
        """
        scaler_dir = "../models/mlflow"
        os.makedirs(scaler_dir, exist_ok=True)

        # Scale features: fit on the training data only
        x_scaler = preprocessing.StandardScaler()
        x_scaler.fit(x_train)
        dump(x_scaler, f"{scaler_dir}/x_scaler.joblib")

        # Make sure to apply same scaler to train and test!
        scaled_x_train = pd.DataFrame(x_scaler.transform(x_train),
                                      columns=x_train.columns)
        scaled_x_test = pd.DataFrame(x_scaler.transform(x_test),
                                     columns=x_test.columns)

        # Scale target (StandardScaler expects a 2-D column)
        y_scaler = preprocessing.StandardScaler()
        y_scaler.fit(y_train.values.reshape(-1, 1))
        dump(y_scaler, f"{scaler_dir}/y_scaler_{target}.joblib")

        scaled_y_train = pd.Series(
            y_scaler.transform(y_train.values.reshape(-1, 1)).ravel(),
            name=target)
        scaled_y_test = pd.Series(
            y_scaler.transform(y_test.values.reshape(-1, 1)).ravel(),
            name=target)

        return scaled_x_train, scaled_y_train, scaled_x_test, scaled_y_test, y_scaler

    # Load and scale data
    x_train, y_train, x_test, y_test = load_data(target)

    (scaled_x_train, scaled_y_train,
     scaled_x_test, scaled_y_test, y_scaler) = scale_data(
        x_train, y_train, x_test, y_test, target)

    # Useful for multiple runs (only doing one run in this sample notebook)
    with mlflow.start_run():

        model = neural_network(n1, n2, n3)

        # Fit model
        model.fit(scaled_x_train, scaled_y_train)

        # Make predictions
        scaled_y_pred = model.predict(scaled_x_test)

        # Apply inverse scaling to the model predictions. The scaler works
        # on 2-D input, so reshape to a column and flatten the result.
        y_pred = y_scaler.inverse_transform(
            np.asarray(scaled_y_pred).reshape(-1, 1)).ravel()

        (rmse, mae, r2) = eval_metrics(y_test, y_pred)

        # Print out metrics
        print(f"Target: {target}")
        print(f"MLP (n1={n1}, n2={n2}, n3={n3}):")
        print(f"  RMSE: {rmse:.4f}")
        print(f"  MAE:  {mae:.4f}")
        print(f"  R2:   {r2:.4f}")

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param("target", target)
        mlflow.log_param("n1", n1)
        mlflow.log_param("n2", n2)
        mlflow.log_param("n3", n3)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        mlflow.sklearn.log_model(model, "model")

### Train models and track them with MLflow

The `train` function defined above trains, tests, and logs a model for the given input parameters: the target to predict and the number of neurons in each of the one to three hidden layers. Let's train a few different network configurations and then we'll use the MLflow UI to see the results.

In [2]:
# Predict lr_mass with three hidden layers of 24 neurons each
train('lr_mass', in_n1=24, in_n2=24, in_n3=24)

Target: lr_mass
MLP (n1=24, n2=24, n3=24):
  RMSE: 0.0547
  MAE:  0.033646
  R2:   0.9866


In [3]:
# Predict lr_mass with two hidden layers of 24 neurons each
train('lr_mass', in_n1=24, in_n2=24)

Target: lr_mass
MLP (n1=24, n2=24, n3=0):
  RMSE: 0.0497
  MAE:  0.029545
  R2:   0.9889


In [4]:
# Predict lr_mass with a single hidden layer of 24 neurons
train('lr_mass', in_n1=24)

Target: lr_mass
MLP (n1=24, n2=0, n3=0):
  RMSE: 0.0647
  MAE:  0.041469
  R2:   0.9813


In [5]:
# Predict slr_mass with two hidden layers of 8 and 4 neurons
train('slr_mass', in_n1=8, in_n2=4)

Target: slr_mass
MLP (n1=8, n2=4, n3=0):
  RMSE: 0.0501
  MAE:  0.028197
  R2:   0.9420


In [6]:
# Predict debris_mass with three hidden layers of 2 neurons each
train('debris_mass', in_n1=2, in_n2=2, in_n3=2)

Target: debris_mass
MLP (n1=2, n2=2, n3=2):
  RMSE: 0.2975
  MAE:  0.212318
  R2:   -0.0006


In [7]:
# Predict lr_mass with three hidden layers of 3 neurons each
train('lr_mass', in_n1=3, in_n2=3, in_n3=3)

Target: lr_mass
MLP (n1=3, n2=3, n3=3):
  RMSE: 0.1224
  MAE:  0.088305
  R2:   0.9329


### View runs

Once we have trained a few models, let's see how MLflow can help us keep track of our previous runs. Make sure you're in the `amld-2021-repML/notebooks` directory, then type `mlflow ui` into the terminal and hit enter. Then open http://localhost:5000/ in your browser to pull up the MLflow user interface.