# Installation

To keep things clean and avoid version conflicts, it is recommended to set up a new Anaconda environment with this command:

In [6]:
# Build the conda environment described by environment.yml
# (that file was created with `conda env export > environment.yml`).
# Re-running this cell once the environment exists fails with
# "CondaValueError: prefix already exists".
!conda env create -f environment.yml


CondaValueError: prefix already exists: /Users/nkreiling/miniconda3/envs/data-version-control



If you would rather use an existing Python environment, you can also install all three tools via pip:

In [7]:
# Install the three tracking tools covered in this notebook.
# No versions are pinned here, so pip picks whatever release it resolves.
! pip install mlflow
! pip install dvc
! pip install sacred



# Preparations

Import the libraries we need (this is actually the default import I load via jupyter-magic commands every time I start something new)

In [67]:
# %load ~/dev/imports.py
# Standard library
import os
from datetime import datetime as dt

# Numerical / data-handling stack
import numpy as np
import pandas as pd
import sklearn as skl

# Pandas display options: three decimals for floats, at most 10 rows per frame
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_rows', 10)

import IPython.display as ipd

# Random seed constant.
# NOTE(review): the cells visible in this notebook pass a literal random_state=42
# rather than referencing RSEED.
RSEED = 42

# Visualizations
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (25, 5)
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 18

import seaborn as sns
# Light green colormap object
cm = sns.light_palette("green", as_cmap=True)


I created some helper functions that we will need. Since they do not help in understanding the frameworks we want to learn about, I moved them into a separate Python file (`utils.py`). Feel free to check them out if you want to dig deeper.

In [68]:
%load_ext autoreload

In [69]:
%autoreload 2

In [70]:
from utils import *

# Baseline Setup

We will use the Boston housing prices dataset, which can be loaded directly from scikit-learn:

In [71]:
# Fetch the Boston housing data that ships with scikit-learn
from sklearn.datasets import load_boston
boston = load_boston()

# One column per feature, with the regression target appended as the last column
data = (
    pd.DataFrame(boston.data, columns=boston.feature_names)
    .assign(target=pd.Series(boston.target))
)

# Show five random rows as the cell output
data.sample(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
104,0.14,0.0,8.56,0.0,0.52,6.167,90.0,2.421,5.0,384.0,20.9,392.69,12.33,20.1
445,10.672,0.0,18.1,0.0,0.74,6.459,94.8,1.988,24.0,666.0,20.2,43.06,23.98,11.8
491,0.106,0.0,27.74,0.0,0.609,5.983,98.8,1.868,4.0,711.0,20.1,390.11,18.07,13.6
468,15.576,0.0,18.1,0.0,0.58,5.926,71.0,2.908,24.0,666.0,20.2,368.74,18.13,19.1
356,8.983,0.0,18.1,1.0,0.77,6.212,97.4,2.122,24.0,666.0,20.2,377.73,17.6,17.8


And this is the basic code we would use if we did not want to track any information:

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Hold out 10 rows for testing: every column but the last is a feature, the last one is the target
x_train, x_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1],
    data.iloc[:, -1],
    test_size=10,
    random_state=42,
)

# Create the regression and fit it on the training rows (fit returns the estimator itself)
linreg = LinearRegression().fit(x_train, y_train)

# Predict the held-out rows; the mean absolute error is the cell's displayed result
test_pred = linreg.predict(x_test)
mean_absolute_error(y_test, test_pred)

2.4968624733238176

# Sacred

Since we want to use some visualization, we will first start a database as the storage backend and a visualization tool, both using Docker:

In [73]:
%%bash 
# Start a detached MongoDB container named "mongodb", publishing port 27017 on the host
docker run -d --rm -p 27017:27017 --name mongodb mongo
# Start a detached Omniboard container on port 9000, linked to the mongodb container
# (reachable inside as "mongo"); presumably "-m mongo:27017:sacred" means host:port:database -- confirm
docker run -d --rm -p 9000:9000 --name omniboard --link mongodb:mongo vivekratnavel/omniboard -m mongo:27017:sacred
# Re-running this cell while the containers still exist fails with a name-conflict error
echo "wait a few seconds till containers are up"
sleep 10

wait a few seconds till containers are up


docker: Error response from daemon: Conflict. The container name "/mongodb" is already in use by container "8e7ecabadc3964c51094f18ffbbe8650bda8ad548ca622c554749a9ceed9496c". You have to remove (or rename) that container to be able to reuse that name.
See 'docker run --help'.
docker: Error response from daemon: Conflict. The container name "/omniboard" is already in use by container "aff3580cc674be66237554cf5a242b6afe8ed8c250a2f5e3e9dc0e6d6d821f7a". You have to remove (or rename) that container to be able to reuse that name.
See 'docker run --help'.


In [74]:
!open http://127.0.0.1:9000/

### v1 (the more explicit version)

In [75]:
%%writefile sacred_simple.py
#!/usr/bin/env python

from __future__ import division, print_function, unicode_literals
from sacred import Experiment

ex = Experiment('Boston Housing Prices')
from utils import *
        
@ex.capture
def capturestuff(_seed):
    """Print the ``_seed`` value this function receives.

    Registered on the experiment through ``ex.capture``; nothing in this
    script calls it.
    """
    print(_seed)

def cfg():
    """Return a fresh dictionary holding the hand-maintained configuration values."""
    return dict(
        objective="regression",
        metric="rmse",
        alpha=0.5,
        min_child_samples=10,
        learning_rate=0.1,
        bagging_fraction=0.5,
        feature_fraction=0.5,
        bagging_frequency=10,
    )
    
def logSacred(run,model,data,param=dict(),metrics=dict(),features=None, tags=dict()):
    """Record config, dependencies, data, artifacts and metrics on the experiment ``ex``.

    Everything is written below ``sacred_out/`` first and then registered with
    the module-level experiment. ``run``, ``param`` and ``tags`` are accepted
    but not used by this version.

    Args:
        run: Unused here.
        model: Trained model; the first component of its defining module's
            name ("sklearn" or "lgb") selects how the model file is saved.
        data: Object offering ``to_csv(path)``.
        param: Unused here.
        metrics: Mapping of metric name -> value, logged as scalars.
        features: Iterable of feature names, or None to skip the features file.
        tags: Unused here.
    """
    # Imports
    from sklearn.externals import joblib

    out_dir = "sacred_out"
    library = model.__module__.split(".")[0]

    def in_out_dir(name):
        # Path of a file inside the output folder
        return "{}/{}".format(out_dir, name)

    # Track config
    ex.add_config(cfg_or_file=cfg())

    # Track dependencies: every installed distribution is registered and echoed
    import pkg_resources
    for dist in pkg_resources.working_set:
        ex.add_package_dependency(dist.project_name, dist.version)
        print(dist.project_name, dist.version)

    # Track data: dump it to CSV and register that file as a resource
    data_path = in_out_dir("data")
    data.to_csv(data_path)
    ex.add_resource(data_path)

    # Create file about features
    if features is not None:
        features_path = in_out_dir("features.txt")
        with open(features_path, "w+") as handle:
            handle.write(",".join(features))
        ex.add_artifact(features_path)

    # Plot feature importances if available
    if plotFeatureImportances(model, features, library):
        ex.add_artifact(in_out_dir("featureimportance.png"))

    # Track model binary
    if library == "sklearn":
        model_path = in_out_dir("sklearn")
        joblib.dump(model, model_path)
        ex.add_artifact(model_path)
    if library == "lgb":
        model_path = in_out_dir("lghtgbm.txt")
        model.save_model(model_path)
        ex.add_artifact(model_path)

    # Log metrics
    for metric_name, metric_value in metrics.items():
        ex.log_scalar(metric_name, metric_value)

    # Tags can only be set using the UI

@ex.automain
def run(_run):
    """Train a Lasso model on the data from ``getData()`` and log the result.

    Ten rows are held out for evaluation, ``eval_metrics`` scores the
    predictions, and everything is handed to ``logSacred``.
    """
    # Setup
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import Lasso
    from sklearn.metrics import mean_absolute_error

    # Split the data: all columns but the last are features, the last is the target
    data = getData()
    inputs, target = data.iloc[:, :-1], data.iloc[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(inputs, target, test_size=10, random_state=42)

    # Fixed parameters for this explicit version
    params = dict(alpha=0.5)

    # Fit, predict on the held-out rows and score
    clf = Lasso(**params)
    clf.fit(x_train, y_train)
    metrics = eval_metrics(y_test, clf.predict(x_test))

    logSacred(_run, clf, data, param=params, metrics=metrics, features=x_test.columns.values)

Writing sacred_simple.py


### v2 (the more sacred-style way)

In [76]:
%%writefile sacred_simple2.py
#!/usr/bin/env python
from __future__ import division, print_function, unicode_literals
from sacred import Experiment

# Imports need to be done in the beginning of the file, since sacred won't recognize them, if they occur within a function
from sklearn.externals import joblib

ex = Experiment('Boston Housing Prices')
from utils import *
        
@ex.capture
def capturestuff(_seed):
    """Print the ``_seed`` value this function receives.

    Registered on the experiment through ``ex.capture``; nothing in this
    script calls it.
    """
    print(_seed)

def getData():
    """Return scikit-learn's Boston housing data as a DataFrame.

    The frame has one column per feature name, followed by a 'target' column.
    """
    from sklearn.datasets import load_boston
    bunch = load_boston()

    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['target'] = pd.Series(bunch.target)
    return frame

# Sacred config function: the value assigned inside (alpha) shows up as the
# experiment's configuration entry in the print_config output and can be
# overridden on the command line with `with 'alpha=...'`.
@ex.config
def cfg(_log):
    alpha= 0.5
    
def logSacred(run,model,data,param=dict(),metrics=dict(),features=None, tags=dict()):
    """Record data, artifacts, metrics and tags of one training run.

    Files are written below ``sacred_out/`` and then registered on the
    module-level experiment ``ex``. Config and dependencies are not handled
    here. ``param`` is accepted but not used.

    Args:
        run: The run object of the current execution; only used to store tags.
        model: Trained model; the first component of its defining module's
            name ("sklearn" or "lgb") selects how the model file is saved.
        data: Object offering ``to_csv(path)``.
        param: Unused here.
        metrics: Mapping of metric name -> value, logged as scalars.
        features: Iterable of feature names, or None to skip the features file.
        tags: Mapping of tag name -> value, stored under ``run.info["tags"]``.
    """
    # Get some general information
    output_folder = "sacred_out"
    framework = model.__module__.split(".")[0]
    print(framework)

    # Config will be tracked automatically

    # Dependencies will also be tracked automatically

    # Track data: register the CSV that was just written (previously an unrelated
    # "data/" path was opened as a resource and the handle was never closed)
    data_path = "{}/data".format(output_folder)
    data.to_csv(data_path)
    ex.add_resource(data_path)

    # Create file about features
    if features is not None:
        with open("{}/features.txt".format(output_folder), "w+") as f:
            f.write(",".join(features))
        ex.add_artifact("{}/features.txt".format(output_folder))

    # Plot feature importances if available
    if plotFeatureImportances(model, features, framework):
        ex.add_artifact("{}/featureimportance.png".format(output_folder))

    # Track model binary
    if framework=="sklearn":
        _ = joblib.dump(model,"{}/sklearn".format(output_folder))
        ex.add_artifact("{}/sklearn".format(output_folder))
    if framework=="lgb":
        model.save_model("{}/lghtgbm.txt".format(output_folder))
        ex.add_artifact("{}/lghtgbm.txt".format(output_folder))

    # Log metrics
    for k,v in metrics.items():
        ex.log_scalar(k,v)

    # Set some tags to identify the experiment.
    # NOTE(review): the previous call (ex.add.set_tag with an undefined name) could
    # never work; tags are now kept in the run's info dict -- confirm that this is
    # where you want them to show up.
    for tag, v in tags.items():
        run.info.setdefault("tags", {})[tag] = v

@ex.automain
def run(_run, alpha):
    """Train a Lasso model with the configured ``alpha`` and log the result.

    Ten rows of the data from ``getData()`` are held out for evaluation,
    ``eval_metrics`` scores the predictions, and everything is handed to
    ``logSacred``.
    """
    # Setup
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import Lasso
    from sklearn.metrics import mean_absolute_error

    # Split the data: all columns but the last are features, the last is the target
    data = getData()
    inputs, target = data.iloc[:, :-1], data.iloc[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(inputs, target, test_size=10, random_state=42)

    # The only model parameter comes from the experiment config
    params = dict(alpha=alpha)

    # Fit, predict on the held-out rows and score
    clf = Lasso(**params)
    clf.fit(x_train, y_train)
    metrics = eval_metrics(y_test, clf.predict(x_test))

    logSacred(_run, clf, data, param=params, metrics=metrics, features=x_test.columns.values)

Writing sacred_simple2.py


Let's run the script

In [95]:
!mkdir -p sacred_out
!python sacred_simple2.py -m sacred

  import imp
INFO - Boston Housing Prices - Running command 'run'
INFO - Boston Housing Prices - Started run with ID "11"
sklearn
INFO - Boston Housing Prices - Completed after 0:00:01


And provide some additional parameters to it

In [96]:
!python sacred_simple2.py -m sacred with 'alpha=0.2'

  import imp
INFO - Boston Housing Prices - Running command 'run'
INFO - Boston Housing Prices - Started run with ID "12"
sklearn
INFO - Boston Housing Prices - Completed after 0:00:01


Sacred also offers some functions to print configurations or list dependencies it detected

In [97]:
!python sacred_simple2.py print_config 

  import imp
INFO - Boston Housing Prices - Running command 'print_config'
INFO - Boston Housing Prices - Started
Configuration ([94mmodified[0m, [92madded[0m, [91mtypechanged[0m, [90mdoc[0m):
  alpha = 0.5
  seed = 488314257                   [90m# the random seed for this experiment[0m
INFO - Boston Housing Prices - Completed after 0:00:00


In [98]:
!python sacred_simple2.py print_dependencies

  import imp
INFO - Boston Housing Prices - Running command 'print_dependencies'
INFO - Boston Housing Prices - Started
Dependencies:
  numpy                == 1.15.4
  sacred               == 0.7.4
  scikit-learn         == 0.20.0

Sources:
  sacred_simple2.py                            a71db735ea8c5d608dafd9b55031944b

Version Control:
[91mM git://Users/nkreiling/playground/data-version-control  885c9cea8db20e7fa41dfd64930d9f5e9c98d1b3[0m

INFO - Boston Housing Prices - Completed after 0:00:00


You can easily switch the storage backend between runs, for example with the --file_storage option:

In [99]:
!python sacred_simple.py --file_storage=BASEDIR

INFO - Boston Housing Prices - Running command 'run'
INFO - Boston Housing Prices - Started run with ID "2"
  import imp
zc.lockfile 1.3.0
wrapt 1.10.11
wheel 0.32.2
Werkzeug 0.14.1
webencodings 0.5.1
wcwidth 0.1.7
urllib3 1.24.1
traitlets 4.3.2
tornado 5.1.1
testpath 0.4.2
terminado 0.8.1
tabulate 0.8.2
statsmodels 0.9.0
smmap2 2.0.5
six 1.11.0
simplejson 3.16.0
setuptools 40.5.0
Send2Trash 1.5.0
seaborn 0.9.0
scipy 1.1.0
scikit-learn 0.20.0
schema 0.6.8
sacred 0.7.4
s3transfer 0.1.13
requests 2.20.1
querystring-parser 1.2.3
pyzmq 17.1.2
PyYAML 3.13
pytz 2018.7
python-dateutil 2.7.5
pyrsistent 0.14.5
pyparsing 2.3.0
pymongo 3.7.2
Pygments 2.2.0
pyfiglet 0.7.6
pyasn1 0.4.4
py-cpuinfo 4.0.0
ptyprocess 0.6.0
protobuf 3.6.1
prompt-toolkit 2.0.7
prometheus-client 0.4.2
ply 3.11
pip 18.1
Pillow 5.3.0
pickleshare 0.7.5
pexpect 4.6.0
patsy 0.5.1
parso 0.3.1
pandocfilters 1.4.2
pandas 0.23.4
numpy 1.15.4
notebook 5.7.0
nose 1.3.7
nose-exclude 0.5.0
networkx 2.2
nbformat 4.4.0
nbconvert 5.3.1
n

# DVC

DVC works much like git, so we will first need to init the project

## Init

In [100]:
! dvc init

[31mError: [0mFailed to initiate dvc: '.dvc' exists. Use '-f' to force.

Having any troubles? Hit us up at dvc.org/support, we are always happy to help!
[0m

DVC creates a .dvc folder that stores all important information and can be tracked using git (just as mentioned above)

In [101]:
! git status

On branch master
Changes to be committed:
  (use "git reset HEAD <file>..." to unstage)

	[32mnew file:   .dvc/.gitignore[m
	[32mnew file:   .dvc/config[m

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   simple.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mBASEDIR/[m
	[31mdvc_simple.py[m
	[31msacred_out/[m
	[31msacred_simple.py[m
	[31msacred_simple2.py[m
	[31msimple.dvc[m



In [102]:
%%bash
# To re-run the command above, remove the dvc file  as well as the .dvc folder
#rm -rf .dvc
#rm simple.dvc

DVC does not provide a Python API; it is controlled from the command line and executes scripts. Therefore we use the Jupyter %%writefile magic to create the file for our training:

In [103]:
%%writefile dvc_simple.py
def logDVC(model,param=dict(),metrics=dict(),features=None, tags=dict()):
    """Write the model, metrics and feature list of one run into ``tmp/``.

    DVC offers no logging calls, so the results are plain files that the
    ``dvc run`` stage declares as outputs/metrics. The ``tmp`` folder is created
    when it is missing. ``param`` and ``tags`` are accepted but not used.

    Args:
        model: Trained model; the first component of its defining module's
            name ("sklearn" or "lgb") selects how ``tmp/mymodel`` is written.
        param: Unused here.
        metrics: JSON-serialisable mapping written to ``tmp/metrics.txt``.
        features: Iterable of feature names written comma-separated to
            ``tmp/features.txt``; None skips that file.
        tags: Unused here.
    """
    # Imports (mlflow is deliberately not imported: nothing here uses it)
    import json
    import os
    from sklearn.externals import joblib

    # Make sure the output folder exists before anything is written into it
    output_folder = "tmp"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Get some general information
    framework = model.__module__.split(".")[0]

    # No option to set some tags to identify the experiment

    # Save model
    if framework=="sklearn":
        _ = joblib.dump(model,"tmp/mymodel")
    if framework=="lgb":
        model.save_model("tmp/mymodel")

    # Log metrics
    with open('tmp/metrics.txt', 'w') as f:
        json.dump(metrics, f)

    # Plot feature importances if available
    plotFeatureImportances(model, features, framework)

    # Create file about features
    if features is not None:
        with open("tmp/features.txt", "w+") as f:
            f.write(",".join(features))
            

# Setup
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
# utils has to be imported here: this file runs as its own script and cannot see
# anything that was defined in the notebook
from utils import *

# Hold out 10 rows: all columns but the last are features, the last is the target
data = getData()
x_train, x_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1],
    data.iloc[:, -1],
    test_size=10,
    random_state=42,
)

# Define the details of our run, then fit and predict
params = dict(alpha=0.4)
clf = Lasso(**params)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)

# Score the predictions and write everything out for DVC
metrics = eval_metrics(y_test, predictions)
logDVC(clf, param=params, metrics=metrics, features=x_test.columns.values)

Overwriting dvc_simple.py


Now we can run the command using DVC. Since it needs several pieces of information, its help function is an important reference:

In [104]:
!dvc run --help

usage: dvc run [-h] [-q | -v] [-d DEPS] [-o OUTS] [-O OUTS_NO_CACHE]
               [-M METRICS_NO_CACHE] [-f FILE] [-c CWD] [--no-exec] [-y]
               ...

Generate a stage file from a given command and execute the command.

positional arguments:
  command               Command or command file to execute.

optional arguments:
  -h, --help            show this help message and exit
  -q, --quiet           Be quiet.
  -v, --verbose         Be verbose.
  -d DEPS, --deps DEPS  Declare dependencies for reproducible cmd.
  -o OUTS, --outs OUTS  Declare output data file or data directory.
  -O OUTS_NO_CACHE, --outs-no-cache OUTS_NO_CACHE
                        Declare output regular file or directory (sync to Git,
                        not DVC cache).
  -M METRICS_NO_CACHE, --metrics-no-cache METRICS_NO_CACHE
                        Declare output metric file or directory (not cached by
                        DVC).
  -f FILE, --file FILE  Specify name of the stage file. It should be

We tell DVC which files the stage depends on (-d), the stage file that keeps the information (-f), the metrics file (-M) and the output files (-o), followed by the command to execute.

In [105]:
%%bash
echo $(pwd)
# Every line of the dvc invocation except the last must end in a backslash.
# Without it the shell ends `dvc run` before the command argument and then
# starts the training script on its own, outside of DVC.
dvc run  \
  -d dvc_simple.py \
  -f simple.dvc \
  -o tmp/featureimportance.png \
  -M tmp/metrics.txt \
  -o tmp/features.txt \
  -o tmp/mymodel \
  python dvc_simple.py

/Users/nkreiling/playground/data-version-control


Error: Failed to run command: 'simple.dvc' already exists

Having any troubles? Hit us up at dvc.org/support, we are always happy to help!
  import imp


In [106]:
! dvc pipeline show simple.dvc

simple.dvc
[0m

In [107]:
! dvc repro simple.dvc

[32mStage 'simple.dvc' didn't change.[0m
Pipeline is up to date. Nothing to reproduce.
[0m

In [108]:
!git diff

[1mdiff --git a/simple.ipynb b/simple.ipynb[m
[1mindex 64b7cdd..8f97fc1 100644[m
[1m--- a/simple.ipynb[m
[1m+++ b/simple.ipynb[m
[36m@@ -117,7 +117,7 @@[m
   },[m
   {[m
    "cell_type": "code",[m
[31m-   "execution_count": 1,[m
[32m+[m[32m   "execution_count": 67,[m
    "metadata": {},[m
    "outputs": [],[m
    "source": [[m
[36m@@ -158,7 +158,7 @@[m
   },[m
   {[m
    "cell_type": "code",[m
[31m-   "execution_count": 2,[m
[32m+[m[32m   "execution_count": 68,[m
    "metadata": {},[m
    "outputs": [],[m
    "source": [[m
[36m@@ -167,7 +167,7 @@[m
   },[m
   {[m
    "cell_type": "code",[m
[31m-   "execution_count": 3,[m
[32m+[m[32m   "execution_count": 69,[m
    "metadata": {},[m
    "outputs": [],[m
    "source": [[m
[36m@@ -176,7 +176,7 @@[m
   },[m
   {[m
    "cell_type": "code",[m
[31m-   "execution_count": 4,[m
[32m+[m[32m   "execution_count": 70,[m
    "metadata": {},[m
    "outputs": [],[m
    "source": [[m
[36m@@

# MLFlow

MLFlow offers a visualization server, but since this is a long-running process that would block the other notebook cells, start it from a terminal using:

```
mlflow ui
```

After a few seconds you can open it in your Browser: 

In [None]:
!open http://127.0.0.1:5000/

Create a function to do the logging:

In [None]:
import git
def logMlflow(model,data,param=dict(),metrics=dict(),features=None, tags=dict(),run_name=None):
    """Record one trained model as a run in the MLflow experiment "demo".

    Inside a single ``mlflow.start_run`` context this logs the parameters, a
    ``dependencies.txt`` listing of the installed distributions, the data as
    CSV, the model file (for "sklearn" / "lgb" models), the metrics, the
    feature-importance plot when ``plotFeatureImportances`` returns a truthy
    value, and the tags. Intermediate files go to ``mlflow_out/``, which is
    created when missing. The run is attributed to the surrounding git
    repository's ``origin`` URL and current HEAD commit.

    Args:
        model: Trained model; the first component of its defining module's
            name ("sklearn" or "lgb") selects how the model file is saved.
        data: Object offering ``to_csv(path)``.
        param: Mapping of parameter name -> value.
        metrics: Mapping of metric name -> value.
        features: Passed on to ``plotFeatureImportances``.
        tags: Additional tag name -> value pairs; a "model" tag holding the
            model's class name is always set.
        run_name: Name of the run; defaults to the model's class name.
    """
    # Imports
    import mlflow
    import os
    from sklearn.externals import joblib

    # Get some general information
    output_folder = "mlflow_out"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    framework = model.__module__.split(".")[0]
    modelname = model.__class__.__name__

    # Start actual logging

    repo = git.Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha
    mlflow.set_experiment(experiment_name="demo")
    if not run_name:
        run_name = modelname
    with mlflow.start_run(source_name=repo.remotes.origin.url,source_version=sha, run_name=run_name):

        # Log parameters
        for k,v in param.items():
            mlflow.log_param(k, v)

        # Track dependencies
        import pkg_resources
        with open("{}/dependencies.txt".format(output_folder), "w+") as f:
            for d in pkg_resources.working_set:
                f.write("{}={}\n".format(d.project_name,d.version))
        mlflow.log_artifact("{}/dependencies.txt".format(output_folder))

        # Track data
        data.to_csv("{}/data".format(output_folder))
        mlflow.log_artifact("{}/data".format(output_folder))

        # Track model binary
        if framework=="sklearn":
            _ = joblib.dump(model,"{}/sklearn".format(output_folder))
            mlflow.log_artifact("{}/sklearn".format(output_folder))
        if framework=="lgb":
            model.save_model("{}/lghtgbm.txt".format(output_folder))
            mlflow.log_artifact("{}/lghtgbm.txt".format(output_folder))

        # Log metrics
        for k,v in metrics.items():
            mlflow.log_metric(k,v)

        # Plot feature importances if available
        featurePlot = plotFeatureImportances(model, features, framework)
        if featurePlot:
            mlflow.log_artifact("{}.png".format(featurePlot))

        # Set some tags to identify the experiment
        mlflow.set_tag("model",modelname)
        for tag, v in tags.items():
            # The loop variable is `tag`; the former `t` was undefined and raised NameError
            mlflow.set_tag(tag, v)
            
        

And use it after training

In [None]:
# Setup
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

from utils import *

# Hold out 10 rows: all columns but the last are features, the last is the target
data = getData()
x_train, x_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1],
    data.iloc[:, -1],
    test_size=10,
    random_state=42,
)

params = dict(alpha=0.1)

clf = Lasso(**params)

def run(clf, params, run_name=None):
    """Fit ``clf`` on the notebook's training split, print MAE and R2, and log the run.

    Reads ``x_train``, ``y_train``, ``x_test``, ``y_test`` and ``data`` from the
    notebook namespace.
    """
    clf.fit(x_train, y_train)
    scores = eval_metrics(y_test, clf.predict(x_test))
    print(scores['mae'], scores['r2'])

    logMlflow(
        clf,
        data,
        param=params,
        metrics=scores,
        features=x_test.columns.values,
        run_name=run_name,
    )

run(clf, params)

Now it is easy to test different models and parameter combinations:

In [None]:
# Lasso: one tracked run per regularisation strength
from sklearn.linear_model import Lasso
params = [dict(alpha=strength) for strength in (0.4, 0.3, 0.2)]

for p in params:
    print(p)
    clf = Lasso(**p)
    run(clf, p, run_name=clf.__class__.__name__)

In [None]:
# Ridge: one tracked run per regularisation strength
from sklearn.linear_model import Ridge
params = [dict(alpha=strength) for strength in (0.1, 0.5, 0.9)]

for p in params:
    print(p)
    clf = Ridge(**p)
    run(clf, p)

In [None]:
# ElasticNet: one tracked run per (alpha, l1_ratio) combination
from sklearn.linear_model import ElasticNet
params = [
    dict(alpha=strength, l1_ratio=ratio)
    for strength, ratio in (
        (0.1, 0.5),
        (0.5, 0.5),
        (0.9, 0.5),
        (0.9, 0.2),
        (0.9, 0.8),
    )
]

for p in params:
    print(p)
    clf = ElasticNet(**p)
    run(clf, p)

In [None]:
# Decision tree: one tracked run per (max_depth, min_samples_split) combination
from sklearn.tree import DecisionTreeRegressor
params = [
    dict(max_depth=depth, min_samples_split=split)
    for depth, split in ((6, 5), (5, 3), (4, 5))
]

for p in params:
    print(p)
    clf = DecisionTreeRegressor(**p)
    run(clf, p, run_name=clf.__class__.__name__)

## MLFlow Project

To turn the code above into an MLflow Project and use its remote-run functionality, we first need to create a file named `MLproject`, which specifies the environment file as well as the entry point:

In [None]:
%%writefile mlflow/MLproject
# MLflow project definition. The file is named "MLproject" (lower-case p), the
# name MLflow documents for project files; a differently-cased name is not
# picked up on case-sensitive filesystems.
name: mlflow

# NOTE(review): presumably resolved relative to this project folder -- confirm
# that environment.yml is available inside mlflow/.
conda_env: environment.yml

entry_points:
  main:
    parameters:
      # alpha has no default and must be passed with -P alpha=...
      alpha: float
      l1_ratio: {type: float, default: 0.1}
    command: "python train.py {alpha} {l1_ratio}"

We will also need to make a few adaptations to the code above:
 * import the utils functionalities
 * define a [main functionality](https://stackoverflow.com/questions/419163/what-does-if-name-main-do)
 * parse the command line arguments using sys

In [None]:
%%writefile mlflow/train.py
from utils import *
import git
def logMlflow(model,data,param=dict(),metrics=dict(),features=None, tags=dict(),run_name=None):
    """Record one trained model as a run in the MLflow experiment "demo".

    Inside a single ``mlflow.start_run`` context this logs the parameters, a
    ``dependencies.txt`` listing of the installed distributions, the data as
    CSV, the model file (for "sklearn" / "lgb" models), the metrics, the
    feature-importance plot when ``plotFeatureImportances`` returns a truthy
    value, and the tags. Intermediate files go to ``mlflow_out/``, which is
    created when missing. The run is attributed to the surrounding git
    repository's ``origin`` URL and current HEAD commit.

    Args:
        model: Trained model; the first component of its defining module's
            name ("sklearn" or "lgb") selects how the model file is saved.
        data: Object offering ``to_csv(path)``.
        param: Mapping of parameter name -> value.
        metrics: Mapping of metric name -> value.
        features: Passed on to ``plotFeatureImportances``.
        tags: Additional tag name -> value pairs; a "model" tag holding the
            model's class name is always set.
        run_name: Name of the run; defaults to the model's class name.
    """
    # Imports
    import mlflow
    import os
    from sklearn.externals import joblib

    # Get some general information
    output_folder = "mlflow_out"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    framework = model.__module__.split(".")[0]
    modelname = model.__class__.__name__

    # Start actual logging

    repo = git.Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha
    mlflow.set_experiment(experiment_name="demo")
    if not run_name:
        run_name = modelname
    with mlflow.start_run(source_name=repo.remotes.origin.url,source_version=sha, run_name=run_name):

        # Log parameters
        for k,v in param.items():
            mlflow.log_param(k, v)

        # Track dependencies
        import pkg_resources
        with open("{}/dependencies.txt".format(output_folder), "w+") as f:
            for d in pkg_resources.working_set:
                f.write("{}={}\n".format(d.project_name,d.version))
        mlflow.log_artifact("{}/dependencies.txt".format(output_folder))

        # Track data
        data.to_csv("{}/data".format(output_folder))
        mlflow.log_artifact("{}/data".format(output_folder))

        # Track model binary
        if framework=="sklearn":
            _ = joblib.dump(model,"{}/sklearn".format(output_folder))
            mlflow.log_artifact("{}/sklearn".format(output_folder))
        if framework=="lgb":
            model.save_model("{}/lghtgbm.txt".format(output_folder))
            mlflow.log_artifact("{}/lghtgbm.txt".format(output_folder))

        # Log metrics
        for k,v in metrics.items():
            mlflow.log_metric(k,v)

        # Plot feature importances if available
        featurePlot = plotFeatureImportances(model, features, framework)
        if featurePlot:
            mlflow.log_artifact("{}.png".format(featurePlot))

        # Set some tags to identify the experiment
        mlflow.set_tag("model",modelname)
        for tag, v in tags.items():
            # The loop variable is `tag`; the former `t` was undefined and raised NameError
            mlflow.set_tag(tag, v)
            
def run(clf, params, run_name=None):
    """Fit ``clf`` on the module-level training split, print MAE and R2, and log the run.

    Reads ``x_train``, ``y_train``, ``x_test``, ``y_test`` and ``data`` from the
    module namespace.
    """
    clf.fit(x_train, y_train)
    scores = eval_metrics(y_test, clf.predict(x_test))
    print(scores['mae'], scores['r2'])

    logMlflow(
        clf,
        data,
        param=params,
        metrics=scores,
        features=x_test.columns.values,
        run_name=run_name,
    )

if __name__ == "__main__":
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import ElasticNet
    import sys

    # Hold out 10 rows: all columns but the last are features, the last is the target
    data = getData()
    x_train, x_test, y_train, y_test = train_test_split(
        data.iloc[:, :-1],
        data.iloc[:, -1],
        test_size=10,
        random_state=42,
    )

    # Positional command line arguments: alpha first, l1_ratio second.
    # Each one falls back to 0.5 when it is not given.
    alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
    l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5
    params = dict(alpha=alpha, l1_ratio=l1_ratio)

    clf = ElasticNet(**params)

    run(clf, params)


Now you should be able to run this:

In [None]:
!mlflow run mlflow -P alpha=0.4

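Or run it directly from a GitHub repository by passing the repository URL instead of the local `mlflow` folder (note: the cell below still points at the local folder):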

In [None]:
!mlflow run mlflow -P alpha=0.4