# mlflow Reproducibility

This notebook demonstrates mlflow's capability to reproduce results from experimental runs.

Given the unique identifier of a run, this notebook retrieves the model and the training and test data sets logged for that run, retrains the model on the training data, calculates the metric score on the test data, and compares it with the metric saved during the original run.

In [1]:
from __future__ import print_function

In [2]:
import os
import os.path
import socket
import pandas as pd
import tempfile
import pickle
import shutil
import scipy.stats
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.linear_model import ElasticNet
import mlflow
import mlflow.sklearn
import mlflow.tracking


## Set up connection to mlflow tracking server

In [3]:
# Assumes the MLFLOW_TRACKING_URI environment variable is set; MlflowClient()
# is created here without an explicit tracking URI.

client = mlflow.tracking.MlflowClient()
# Last expression of the cell: displays the tracking URI currently in effect.
mlflow.get_tracking_uri()

'http://mlflow_tracker:5000'

In [4]:
# Print the name of every experiment known to the tracking server.
# NOTE(review): MlflowClient.list_experiments() was removed in MLflow 2.x in
# favour of search_experiments() -- confirm against the installed version.
experiments = client.list_experiments()
for e in experiments:
    print('experiement:', e.name)

experiement: mlflow_demo1
experiement: mlflow_demo2


## Specify Run identifier to retrieve model and data

In [5]:
# Identifier of the MLflow run whose data and model artifacts are retrieved below.
RUN_UUID = 'aa9478ceea7e4cfbad36b01f14fcd769'

## Retrieve training/test data sets from run

In [6]:
# Download the run's 'data_sets' artifact directory (it holds the pickled
# data file) and show where it landed and what it contains.
data_dir = client.download_artifacts(RUN_UUID, 'data_sets')
data_files = os.listdir(data_dir)
print("contents of data download directory", data_dir, data_files)


contents of data download directory /tmp/tmpjeq3cv5j/data_sets ['data.pkl']


In [7]:
# Display the full path of the first entry in the download directory; this is
# the file that gets unpickled in the next cell.
os.path.join(data_dir,os.listdir(data_dir)[0])

'/tmp/tmpjeq3cv5j/data_sets/data.pkl'

In [8]:
# Unpickle the training and test data saved with the run.
# NOTE(security): pickle.load() can execute arbitrary code while loading; only
# load artifacts that come from a tracking server you trust.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the choice of file deterministic should the directory ever hold more
# than one entry.
data_file = os.path.join(data_dir, sorted(os.listdir(data_dir))[0])
with open(data_file, 'rb') as f:
    data = pickle.load(f)

In [9]:
# Preview the first five rows of the training features.
data['train_x'][:5]

array([[ 0.39624274, -0.37825504, -0.80060088, -0.9399674 , -0.02735814,
        -1.04506461,  0.54772967, -0.70723951,  0.41334757,  0.34770965,
         0.5554772 ],
       [ 0.87268093,  0.53201882,  0.62459925,  1.46345779, -0.02735814,
         0.52710899,  1.40504061,  1.68361317, -0.04858051,  1.31198399,
        -1.15740449],
       [ 0.15802364, -0.07483042, -0.12991847,  0.08148831,  0.06784456,
         0.70851364,  1.30978384, -0.0733535 ,  0.28136812, -0.87954861,
         0.71860879],
       [ 0.39624274, -0.7828212 ,  0.03775213, -0.73968196, -0.64617568,
         0.82945007,  0.30958775, -1.38246592, -0.18055996, -0.87954861,
         1.77896413],
       [-1.39040049, -0.22654273,  2.63664649, -1.04011011,  1.59108772,
        -1.10553282, -0.02381095, -0.32484088,  0.08339894, -0.6165647 ,
        -0.91270711]])

## Retrieve Model

In [10]:
# Download the run's 'best_estimator' artifact directory (the saved model)
# and show where it landed and what it contains.
model_dir = client.download_artifacts(RUN_UUID, 'best_estimator')
model_files = os.listdir(model_dir)
print("contents of model download directory", model_dir, model_files)


contents of model download directory /tmp/tmp3fp2s7qc/best_estimator ['MLmodel', 'conda.yaml', 'model.pkl']


In [11]:

# Load the scikit-learn model from the downloaded model directory. The bare
# expression on the last line displays the estimator and its hyperparameters.
sklearn_model = mlflow.sklearn.load_model(model_dir)
sklearn_model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=279, n_jobs=None,
           oob_score=False, random_state=13, verbose=0, warm_start=False)

## Train model on training data set

In [12]:
# Refit the retrieved estimator on the run's training features, using the
# `quality` attribute of train_y as the target.
sklearn_model.fit(data['train_x'],data['train_y'].quality)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=279, n_jobs=None,
           oob_score=False, random_state=13, verbose=0, warm_start=False)

## Calculate score on test data set

In [13]:
# Score the refit model on the test data. For a scikit-learn regressor, score()
# returns R^2, the value to compare with the run's saved `r2` metric.
sklearn_model.score(data['test_x'],data['test_y'].quality)

0.37756045253228665

## Retrieve saved metric from training run

In [14]:
# Fetch the run record and print each metric logged by the original training
# run, for comparison with the score computed from the refit model.
# NOTE(review): later MLflow releases expose run.data.metrics as a dict of
# key -> value, where this attribute access would fail -- confirm against the
# installed version.
run_info = client.get_run(RUN_UUID)
logged_metrics = run_info.data.metrics
for m in logged_metrics:
    print(m.key, m.value)

r2 0.377560452532


## Clean up

In [15]:
# Delete both downloaded artifact directories, data first, then model.
# NOTE(review): the paths printed at download time show each directory sitting
# inside its own /tmp/tmp* parent; that parent is not removed here -- confirm
# whether it should be cleaned up as well.
for download_dir in (data_dir, model_dir):
    shutil.rmtree(download_dir)