In [None]:
# Reset repo
!git reset
!git clean -fd
!docker stop $(docker ps -a -q)

# Installation

To keep things clean and avoid version conflicts, it is recommended to set up a new Anaconda environment with this command:

In [None]:
# Created with conda env export > environment.yml
!conda env create -f environment.yml

If you just use an existing python environment, you can also easily install all three via pip

In [None]:
! pip install mlflow
! pip install dvc
! pip install sacred

# Preparations

Import the libraries we need (this is actually the default import I load via jupyter-magic commands every time I start something new)

In [None]:
# %load ~/dev/imports.py
import os
from datetime import datetime as dt

import numpy as np
import pandas as pd
import sklearn as skl

# Pandas display options
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_rows', 10)

import IPython.display as ipd

# Set random seed 
RSEED = 42

# Visualizations
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (25, 5)
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 18

import seaborn as sns
cm = sns.light_palette("green", as_cmap=True)


I created some helper functions we will need, but since they do not help in understanding the frameworks we want to learn about, I moved them into a separate Python file. Feel free to check them out if you want to dig deeper.

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
from utils import *

# Baseline Setup

We will use the boston housing prices dataset which can be received directly from scikit-learn

In [None]:
from sklearn.datasets import load_boston

# Load the Boston housing data bundled with scikit-learn and assemble a single
# DataFrame: one column per feature, with the regression target appended last.
boston = load_boston()

data = pd.DataFrame(boston.data,columns=boston.feature_names)
data['target'] = pd.Series(boston.target)

# Seed the preview with the notebook-wide RSEED so the displayed rows are the same on every run.
data.sample(5, random_state=RSEED)

And this is the basic code we would use, if we do not want to track any information

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Features are every column but the last, the target is the last column;
# 10 rows are held out for testing.
x_train, x_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1],
    data.iloc[:, -1],
    test_size=10,
    random_state=42,
)

# Fit a plain linear regression on the training rows
linreg = LinearRegression().fit(x_train, y_train)

# Predict the held-out rows; the mean absolute error is the cell output
test_pred = linreg.predict(x_test)
mean_absolute_error(y_test, test_pred)

# Sacred

Since we want to use some visualization, we will first start a database as the storage backend and a visualization tool, both using Docker.

In [None]:
%%bash 
# Start MongoDB (the storage backend) detached; --rm removes the container once it stops.
docker run -d --rm -p 27017:27017 --name mongodb mongo
# Start Omniboard (the web UI on port 9000), linked to the MongoDB container under the
# alias "mongo" and pointed at the database named "sacred".
docker run -d --rm -p 9000:9000 --name omniboard --link mongodb:mongo vivekratnavel/omniboard -m mongo:27017:sacred
echo "wait a few seconds till containers are up"
# Fixed pause; the containers are not polled for readiness.
sleep 10

In [None]:
# Open the Omniboard UI in the default browser. The stdlib webbrowser module works
# across platforms, whereas the `open` shell command is macOS-specific.
import webbrowser
webbrowser.open("http://127.0.0.1:9000/");

### v1 (the more explicit version)

In [None]:
%%writefile sacred_simple.py
#!/usr/bin/env python

from __future__ import division, print_function, unicode_literals
from sacred import Experiment

ex = Experiment('Boston Housing Prices')
from utils import *
        
@ex.capture
def capturestuff(_seed):
    """Print the ``_seed`` value that sacred injects into captured functions.

    Not called anywhere in this script.
    """
    print(_seed)

def cfg():
    """Return the static hyper-parameter dictionary that logSacred registers as experiment config."""
    return dict(
        objective="regression",
        metric="rmse",
        alpha=0.5,
        min_child_samples=10,
        learning_rate=0.1,
        bagging_fraction=0.5,
        feature_fraction=0.5,
        bagging_frequency=10,
    )
    
def logSacred(run,model,data,param=dict(),metrics=dict(),features=None, tags=dict()):
    """Record data, feature list, model binary and metrics for the current sacred run.

    Everything is first written to the local ``sacred_out`` folder (created if
    missing) and then registered on the module-level experiment ``ex``.

    Args:
        run: The sacred run object handed over by the caller. Not used here; all
            logging goes through ``ex``.
        model: Fitted model. The first component of ``model.__module__`` decides how
            it is serialised (``sklearn`` via joblib, LightGBM via ``save_model``).
        data: Object with a ``to_csv`` method holding the data set; stored as a resource.
        param: Unused here; the tracked config comes from ``cfg()``.
        metrics: Mapping of metric name to value, each logged with ``ex.log_scalar``.
        features: Optional iterable of feature-name strings, written comma-separated
            to ``features.txt``.
        tags: Unused; tags can only be set using the UI.
    """
    # Imports
    import os
    try:
        import joblib
    except ImportError:
        # Fall back to the copy vendored by older scikit-learn releases
        from sklearn.externals import joblib

    # Get some general information
    output_folder = "sacred_out"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # Top-level package of the model class, e.g. "sklearn" or "lightgbm"
    framework = model.__module__.split(".")[0]

    # Track config
    ex.add_config(cfg_or_file=cfg())

    # Track dependencies
    import pkg_resources
    for d in pkg_resources.working_set:
        ex.add_package_dependency(d.project_name,d.version)

    # Track data
    data_path = "{}/data".format(output_folder)
    data.to_csv(data_path)
    ex.add_resource(data_path)

    # Create file about features
    if features is not None:
        features_path = "{}/features.txt".format(output_folder)
        with open(features_path, "w+") as f:
            f.write(",".join(features))
        ex.add_artifact(features_path)

    # plot Feature importances if available
    if plotFeatureImportances(model, features, framework):
        ex.add_artifact("{}/featureimportance.png".format(output_folder))

    # Track Model binary
    if framework == "sklearn":
        model_path = "{}/sklearn".format(output_folder)
        joblib.dump(model, model_path)
        ex.add_artifact(model_path)
    # LightGBM classes live in the "lightgbm" package; "lgb" is kept for compatibility
    if framework in ("lgb", "lightgbm"):
        model_path = "{}/lghtgbm.txt".format(output_folder)
        model.save_model(model_path)
        ex.add_artifact(model_path)

    # Log metrics
    for k,v in metrics.items():
        ex.log_scalar(k,v)

    # Tags can only be set using the UI

@ex.automain
def run(_run):
    """Train a Lasso regression on the data from ``getData()`` and log the run with sacred.

    Holds out 10 rows for testing, fits ``Lasso(alpha=0.5)`` on the rest, evaluates
    the predictions with ``eval_metrics`` and passes everything to ``logSacred``.
    """
    # Setup
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import Lasso

    # Do a train_test_split on my Data
    data = getData()
    x_train, x_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=10, random_state=42)

    # Define my params
    params=dict(alpha=0.5)

    clf = Lasso(**params)
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)
    metrics = eval_metrics(y_test, predictions)

    logSacred(_run,clf,data,param=params,metrics=metrics,features=x_test.columns.values)

In [None]:
!mkdir -p sacred_out
!python sacred_simple.py -m sacred

### v2 (the more sacred-style way)

In [None]:
%%writefile sacred_simple2.py
#!/usr/bin/env python
from __future__ import division, print_function, unicode_literals
from sacred import Experiment

# Imports need to be done in the beginning of the file, since sacred won't recognize them, if they occur within a function
from sklearn.externals import joblib

ex = Experiment('Boston Housing Prices')
from utils import *
        
@ex.capture
def capturestuff(_seed):
    """Print the ``_seed`` value that sacred injects into captured functions.

    Not called anywhere in this script.
    """
    print(_seed)

def getData():
    """Load the Boston housing data from scikit-learn as a DataFrame with a trailing ``target`` column."""
    from sklearn.datasets import load_boston

    bunch = load_boston()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['target'] = pd.Series(bunch.target)
    return frame

# Sacred config scope: `alpha` is a config entry of the experiment, which is why the
# later `with 'alpha=0.2'` call can override it and `run` can receive it as an argument.
@ex.config
def cfg(_log):
    alpha= 0.5
    
def logSacred(run,model,data,output_folder="sacred_out", param=dict(),metrics=dict(),features=None, tags=dict()):
    """Record data, feature list, model binary, metrics and tags for the current sacred run.

    Files are written to ``output_folder`` (created if missing) and then registered
    on the module-level experiment ``ex``.

    Args:
        run: The sacred run object; its ``info`` dict receives the tags.
        model: Fitted model. The first component of ``model.__module__`` decides how
            it is serialised (``sklearn`` via joblib, LightGBM via ``save_model``).
        data: Object with a ``to_csv`` method holding the data set; stored as a resource.
        output_folder: Local directory for the intermediate files.
        param: Unused here; the config is tracked automatically by sacred.
        metrics: Mapping of metric name to value, each logged with ``ex.log_scalar``.
        features: Optional iterable of feature-name strings, written comma-separated
            to ``features.txt``.
        tags: Mapping of tag name to value, stored under ``run.info["tags"]``.
    """
    # Get some general information
    import os
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # Top-level package of the model class, e.g. "sklearn" or "lightgbm"
    framework = model.__module__.split(".")[0]

    # Config will be tracked automatically

    # Dependencies will also be tracked automatically

    # Track data. add_resource only registers the file, so no file handle is left open.
    data_path = "{}/data".format(output_folder)
    data.to_csv(data_path)
    ex.add_resource(data_path)

    # Create file about features
    if features is not None:
        features_path = "{}/features.txt".format(output_folder)
        with open(features_path, "w+") as f:
            f.write(",".join(features))
        ex.add_artifact(features_path)

    # plot Feature importances if available
    if plotFeatureImportances(model, features, framework):
        ex.add_artifact("{}/featureimportance.png".format(output_folder))

    # Track Model binary
    if framework == "sklearn":
        model_path = "{}/sklearn".format(output_folder)
        joblib.dump(model, model_path)
        ex.add_artifact(model_path)
    # LightGBM classes live in the "lightgbm" package; "lgb" is kept for compatibility
    if framework in ("lgb", "lightgbm"):
        model_path = "{}/lghtgbm.txt".format(output_folder)
        model.save_model(model_path)
        ex.add_artifact(model_path)

    # Log metrics
    for k,v in metrics.items():
        ex.log_scalar(k,v)

    # Set some tags to identify the experiment. The experiment object has no tag
    # setter, so the tags go into the run's custom info dict, which is stored with the run.
    if tags:
        run.info.setdefault("tags", {}).update(tags)

@ex.automain
def run(_run, alpha):
    """Train a Lasso regression with the configured ``alpha`` and log the run with sacred.

    Args:
        _run: The current sacred run object, forwarded to ``logSacred``.
        alpha: Regularisation strength, taken from the experiment config.
    """
    # Setup
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import Lasso

    # Do a train_test_split on my Data
    data = getData()
    x_train, x_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=10, random_state=42)

    # Define my params
    params=dict(alpha=alpha)

    clf = Lasso(**params)
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)
    metrics = eval_metrics(y_test, predictions)

    logSacred(_run,clf,data,param=params,metrics=metrics,features=x_test.columns.values)

Let's run the script

In [None]:
!mkdir -p sacred_out
!python sacred_simple2.py -m sacred

And provide some additional parameters to it

In [None]:
!python sacred_simple2.py -m sacred with 'alpha=0.2'

Sacred also offers some functions to print configurations or list dependencies it detected

In [None]:
!python sacred_simple2.py print_config 

In [None]:
!python sacred_simple2.py print_dependencies

You can easily switch the storage backend between runs using --file_storage handler

In [None]:
!python sacred_simple.py --file_storage=BASEDIR

# DVC

DVC works much like git, so we will first need to init the project

## Init

In [None]:
! dvc init

DVC creates a .dvc folder that stores all important information and can be tracked using git (just as mentioned above)

In [None]:
! git status

In [None]:
%%bash
# To re-run the command above, remove the dvc file  as well as the .dvc folder
#rm -rf .dvc
#rm simple.dvc

DVC does not provide a Python API; it is controlled from the command line and executes scripts. Therefore we use the Jupyter `%%writefile` magic to create the file for our training.

In [None]:
%%writefile dvc_simple.py
def logDVC(model,output_folder="dvc_out", param=dict(),metrics=dict(),features=None, tags=dict()):
    """Write model binary, metrics and feature list into ``output_folder`` for DVC to track.

    Args:
        model: Fitted model. The first component of ``model.__module__`` decides how
            it is serialised (``sklearn`` via joblib, LightGBM via ``save_model``).
        output_folder: Target directory; created if it does not exist.
        param: Unused.
        metrics: JSON-serialisable mapping, dumped to ``metrics.txt``.
        features: Optional iterable of feature-name strings, written comma-separated
            to ``features.txt``.
        tags: Unused; there is no option to set tags.
    """
    import json
    import os
    try:
        import joblib
    except ImportError:
        # Fall back to the copy vendored by older scikit-learn releases
        from sklearn.externals import joblib
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Top-level package of the model class, e.g. "sklearn" or "lightgbm"
    framework = model.__module__.split(".")[0]

    # No option to set some tags to identify the experiment

    # Save Model
    model_path = "{}/mymodel".format(output_folder)
    if framework == "sklearn":
        joblib.dump(model, model_path)
    # LightGBM classes live in the "lightgbm" package; "lgb" is kept for compatibility
    if framework in ("lgb", "lightgbm"):
        model.save_model(model_path)

    # Log metrics
    with open('{}/metrics.txt'.format(output_folder), 'w') as f:
        f.write(json.dumps(metrics))

    # plot Feature importances if available
    plotFeatureImportances(model, features, framework)

    # Create file about features
    if features is not None:
        with open("{}/features.txt".format(output_folder), "w+") as f:
            f.write(",".join(features))
            
if __name__ == "__main__":
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import train_test_split
    # utils has to be imported here: this file runs as its own script and cannot
    # see anything from the Jupyter execution environment
    from utils import *

    # Hold out 10 rows for testing; the last column is the target
    data = getData()
    x_train, x_test, y_train, y_test = train_test_split(
        data.iloc[:, :-1],
        data.iloc[:, -1],
        test_size=10,
        random_state=42,
    )

    # Define the details of our run
    params = dict(alpha=0.4)
    clf = Lasso(**params)
    clf.fit(x_train, y_train)

    # Evaluate on the held-out rows and write everything to the DVC output folder
    metrics = eval_metrics(y_test, clf.predict(x_test))
    logDVC(clf, param=params, metrics=metrics, features=x_test.columns.values)

Now we can run the command using DVC. Since it needs several pieces of information, its help function is an important reference.

In [None]:
!dvc run --help

We tell DVC which files the stage depends on (-d), the stage file that keeps this information (-f), the metrics file (-M) and the outputs (-o), followed by the command to run.

In [None]:
%%bash
echo $(pwd)
# -d: dependency, -f: stage file, -M: metrics file (not cached), -o: tracked outputs.
# Every option line ends with a backslash so the training command on the last line
# is passed to `dvc run` instead of being executed as a separate shell command.
# The output paths are the files dvc_simple.py writes into dvc_out.
dvc run  \
  -d dvc_simple.py \
  -f simple.dvc \
  -M dvc_out/metrics.txt \
  -o dvc_out/mymodel \
  -o dvc_out/features.txt \
  python dvc_simple.py

In [None]:
! dvc pipeline show simple.dvc

In [None]:
! dvc repro simple.dvc

In [None]:
!git diff

# MLFlow

MLflow offers a visualization server. Since it is a long-running process that would block the other notebook cells, start it from a terminal using:

```
mlflow ui
```

After a few seconds you can open it in your Browser: 

In [None]:
# Open the MLflow UI in the default browser. The stdlib webbrowser module works
# across platforms, whereas the `open` shell command is macOS-specific.
import webbrowser
webbrowser.open("http://127.0.0.1:5000/");

Create a function to do the logging:

In [None]:
import git
# Locate the enclosing git repository by searching parent directories
repo = git.Repo(search_parent_directories=True)
# Hash of the commit HEAD currently points at
sha = repo.head.object.hexsha
# Displayed as the cell output. NOTE(review): presumably fails when no remote named
# "origin" exists; getGitInfos in the next cell catches AttributeError for this lookup.
repo.remotes.origin

In [None]:
def getGitInfos():
    """Return ``(commit_sha, origin_url)`` for the surrounding git repository.

    The repository is located by searching parent directories. The URL is an empty
    string when looking up the ``origin`` remote raises ``AttributeError``.
    """
    import git

    repository = git.Repo(search_parent_directories=True)
    commit_sha = repository.head.object.hexsha
    try:
        origin_url = repository.remotes.origin.url
    except AttributeError:
        return commit_sha, ""
    return commit_sha, origin_url

def logMlflow(model,data,output_folder="mlflow_out", param=dict(),metrics=dict(),features=None, tags=dict(),run_name=None):
    """Log one trained model as an MLflow run in the "demo" experiment.

    Parameters, installed packages, the data set, the model binary, metrics, an
    optional feature-importance plot and tags are recorded. Intermediate files are
    written to ``output_folder`` (created if missing) before being logged as artifacts.

    Args:
        model: Fitted model. The first component of ``model.__module__`` decides how
            it is serialised (``sklearn`` via joblib, LightGBM via ``save_model``).
        data: Object with a ``to_csv`` method holding the data set.
        output_folder: Local directory for the intermediate files.
        param: Mapping of parameter name to value, logged with ``mlflow.log_param``.
        metrics: Mapping of metric name to value, logged with ``mlflow.log_metric``.
        features: Feature names, forwarded to ``plotFeatureImportances``.
        tags: Mapping of tag name to value, set in addition to the ``model`` tag.
        run_name: Name of the run; defaults to the model's class name.
    """
    # Imports
    try:
        import joblib
    except ImportError:
        # Fall back to the copy vendored by older scikit-learn releases
        from sklearn.externals import joblib
    import mlflow
    import os
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Get some general information
    # Top-level package of the model class, e.g. "sklearn" or "lightgbm"
    framework = model.__module__.split(".")[0]
    modelname = model.__class__.__name__
    sha, remoteurl = getGitInfos()

    # Start actual logging
    mlflow.set_experiment(experiment_name="demo")
    if not run_name:
        run_name = modelname
    # NOTE(review): source_name/source_version are presumably only accepted by the
    # MLflow release pinned in environment.yml — confirm before upgrading.
    with mlflow.start_run(source_name=remoteurl,source_version=sha, run_name=run_name):

        # Log Parameters
        for k,v in param.items():
            mlflow.log_param(k, v)

        # Track dependencies
        import pkg_resources
        dependencies_path = "{}/dependencies.txt".format(output_folder)
        with open(dependencies_path, "w+") as f:
            for d in pkg_resources.working_set:
                f.write("{}={}\n".format(d.project_name,d.version))
        mlflow.log_artifact(dependencies_path)

        # Track data
        data_path = "{}/data".format(output_folder)
        data.to_csv(data_path)
        mlflow.log_artifact(data_path)

        if framework == "sklearn":
            model_path = "{}/sklearn".format(output_folder)
            joblib.dump(model, model_path)
            mlflow.log_artifact(model_path)
        # LightGBM classes live in the "lightgbm" package; "lgb" is kept for compatibility
        if framework in ("lgb", "lightgbm"):
            model_path = "{}/lghtgbm.txt".format(output_folder)
            model.save_model(model_path)
            mlflow.log_artifact(model_path)

        # Log metrics
        for k,v in metrics.items():
            mlflow.log_metric(k,v)

        # plot Feature importances if available
        featurePlot = plotFeatureImportances(model, features, framework)
        if featurePlot:
            mlflow.log_artifact("{}.png".format(featurePlot))

        # Set some tags to identify the experiment
        mlflow.set_tag("model",modelname)
        for tag, v in tags.items():
            mlflow.set_tag(tag,v)

And use it after training

In [None]:
# Setup
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

from utils import *

# Do a train_test_split
# (the resulting x_/y_ variables are notebook globals that run() below reads)
data = getData()
x_train, x_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=10, random_state=42)

params=dict(alpha=0.1)

clf = Lasso(**params)

def run(clf, params, run_name=None):
    """Fit ``clf`` on the global training split, print MAE and R2, and log the run to MLflow.

    Args:
        clf: Unfitted estimator; it is fitted in place.
        params: Parameter mapping that is logged with the run.
        run_name: Optional run name, forwarded to ``logMlflow``.
    """
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)
    metrics = eval_metrics(y_test, predictions)
    print(metrics['mae'], metrics['r2'])

    logMlflow(clf,data,param=params,metrics=metrics,features=x_test.columns.values, run_name=run_name)
    
run(clf,params)

Now it is easy to test different models and parameter combinations:

In [None]:
# Lasso
from sklearn.linear_model import Lasso
params = [
    dict(alpha=0.4),
    dict(alpha=0.3),
    dict(alpha=0.2)
]

# One tracked run per parameter set
for p in params:
    print(p)
    clf = Lasso(**p)
    run(clf,p, run_name =  clf.__class__.__name__)

In [None]:
# Ridge
from sklearn.linear_model import Ridge
params = [
    dict(alpha=0.1),
    dict(alpha=0.5),
    dict(alpha=0.9)
]

# One tracked run per parameter set; run() names the run after the model class
for p in params:
    print(p)
    clf = Ridge(**p)
    run(clf,p)

In [None]:
# ElasticNet
from sklearn.linear_model import ElasticNet
params = [
    dict(alpha=0.1, l1_ratio=0.5),
    dict(alpha=0.5, l1_ratio=0.5),
    dict(alpha=0.9, l1_ratio=0.5),
    dict(alpha=0.9, l1_ratio=0.2),
    dict(alpha=0.9, l1_ratio=0.8)
]

# One tracked run per parameter set; run() names the run after the model class
for p in params:
    print(p)
    clf = ElasticNet(**p)
    run(clf,p)

In [None]:
# Decision tree
from sklearn.tree import DecisionTreeRegressor
params = [
    dict(max_depth=6, min_samples_split=5),
    dict(max_depth=5, min_samples_split=3),
    dict(max_depth=4, min_samples_split=5),
]

# One tracked run per parameter set
for p in params:
    print(p)
    clf = DecisionTreeRegressor(**p)
    run(clf,p, run_name =  clf.__class__.__name__)

## MLFlow Project

To make the code above an MLFlow Project and use its remote-run functionality, we will first need to create a file named MLProject which specifies the environment file as well as the entry point.

In [None]:
%%writefile mlflow/MLproject
# MLflow project definition. MLflow's project file is named "MLproject"; the exact
# casing matters on case-sensitive filesystems.
name: mlflow

conda_env: environment.yml

entry_points:
  main:
    # alpha has to be passed with -P; l1_ratio falls back to its default
    parameters:
      alpha: float
      l1_ratio: {type: float, default: 0.1}
    command: "python train.py {alpha} {l1_ratio}"

We will also need to make a few adaptations to the code above:
 * import the utils functionalities
 * define a [main functionality](https://stackoverflow.com/questions/419163/what-does-if-name-main-do)
 * parse the command line arguments using sys

In [None]:
%%writefile mlflow/train.py
from utils import *
import git
def logMlflow(model,data,param=dict(),metrics=dict(),features=None, tags=dict(),run_name=None):
    """Log one trained model as an MLflow run in the "demo" experiment.

    Parameters, installed packages, the data set, the model binary, metrics, an
    optional feature-importance plot and tags are recorded. Intermediate files are
    written to the local ``mlflow_out`` folder (created if missing).

    Args:
        model: Fitted model. The first component of ``model.__module__`` decides how
            it is serialised (``sklearn`` via joblib, LightGBM via ``save_model``).
        data: Object with a ``to_csv`` method holding the data set.
        param: Mapping of parameter name to value, logged with ``mlflow.log_param``.
        metrics: Mapping of metric name to value, logged with ``mlflow.log_metric``.
        features: Feature names, forwarded to ``plotFeatureImportances``.
        tags: Mapping of tag name to value, set in addition to the ``model`` tag.
        run_name: Name of the run; defaults to the model's class name.
    """
    # Imports
    import mlflow
    import os
    try:
        import joblib
    except ImportError:
        # Fall back to the copy vendored by older scikit-learn releases
        from sklearn.externals import joblib

    # Get some general information
    output_folder = "mlflow_out"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # Top-level package of the model class, e.g. "sklearn" or "lightgbm"
    framework = model.__module__.split(".")[0]
    modelname = model.__class__.__name__

    # Start actual logging

    repo = git.Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha
    try:
        remoteurl = repo.remotes.origin.url
    except AttributeError:
        # No remote named "origin": log the run without a source URL
        remoteurl = ""
    mlflow.set_experiment(experiment_name="demo")
    if not run_name:
        run_name = modelname
    # NOTE(review): source_name/source_version are presumably only accepted by the
    # MLflow release pinned in environment.yml — confirm before upgrading.
    with mlflow.start_run(source_name=remoteurl,source_version=sha, run_name=run_name):

        # Log Parameters
        for k,v in param.items():
            mlflow.log_param(k, v)

        # Track dependencies
        import pkg_resources
        dependencies_path = "{}/dependencies.txt".format(output_folder)
        with open(dependencies_path, "w+") as f:
            for d in pkg_resources.working_set:
                f.write("{}={}\n".format(d.project_name,d.version))
        mlflow.log_artifact(dependencies_path)

        # Track data
        data_path = "{}/data".format(output_folder)
        data.to_csv(data_path)
        mlflow.log_artifact(data_path)

        if framework == "sklearn":
            model_path = "{}/sklearn".format(output_folder)
            joblib.dump(model, model_path)
            mlflow.log_artifact(model_path)
        # LightGBM classes live in the "lightgbm" package; "lgb" is kept for compatibility
        if framework in ("lgb", "lightgbm"):
            model_path = "{}/lghtgbm.txt".format(output_folder)
            model.save_model(model_path)
            mlflow.log_artifact(model_path)

        # Log metrics
        for k,v in metrics.items():
            mlflow.log_metric(k,v)

        # plot Feature importances if available
        featurePlot = plotFeatureImportances(model, features, framework)
        if featurePlot:
            mlflow.log_artifact("{}.png".format(featurePlot))

        # Set some tags to identify the experiment
        mlflow.set_tag("model",modelname)
        for tag, v in tags.items():
            mlflow.set_tag(tag,v)
            
def run(clf, params, run_name=None):
    """Fit ``clf``, print MAE and R2 on the held-out rows and log the run to MLflow.

    Reads ``x_train``, ``y_train``, ``x_test``, ``y_test`` and ``data`` from module
    scope; the ``__main__`` block assigns them before calling this function.
    """
    clf.fit(x_train, y_train)
    scores = eval_metrics(y_test, clf.predict(x_test))
    print(scores['mae'], scores['r2'])

    logMlflow(
        clf,
        data,
        param=params,
        metrics=scores,
        features=x_test.columns.values,
        run_name=run_name,
    )

if __name__ == "__main__":
    import sys
    from sklearn.linear_model import ElasticNet
    from sklearn.model_selection import train_test_split

    # Hold out 10 rows for testing; run() reads these module-level names
    data = getData()
    x_train, x_test, y_train, y_test = train_test_split(
        data.iloc[:, :-1],
        data.iloc[:, -1],
        test_size=10,
        random_state=42,
    )

    # Positional command line arguments: alpha first, then l1_ratio.
    # Each falls back to 0.5 when it is not given.
    cli_args = sys.argv[1:]
    params = dict(
        alpha=float(cli_args[0]) if len(cli_args) > 0 else 0.5,
        l1_ratio=float(cli_args[1]) if len(cli_args) > 1 else 0.5,
    )

    clf = ElasticNet(**params)

    run(clf, params)


Now you should be able to run this:

In [None]:
!mlflow run mlflow -P alpha=0.4

Or run it directly from our github account

In [None]:
# NOTE(review): this is the same local invocation as in the previous cell. A run from
# GitHub would presumably pass the repository URL instead of the local `mlflow`
# directory; the URL is not given in this notebook — confirm and replace.
!mlflow run mlflow -P alpha=0.4