# Using Trava for parameters & metrics autotracking with MLFlow

### Note: MLFlow is only one of the possible implementations of the `trava.tracker.Tracker` interface.

In [1]:
import sys
# Add the parent directory to the module search path. Presumably this makes the
# local `trava` checkout importable from this folder — verify against the repo
# layout. The relative path depends on the kernel's working directory.
sys.path.append('../')

import pandas as pd
import numpy as np

### Get data

In [2]:
from sklearn import datasets

# Load the breast-cancer dataset and assemble the feature columns together
# with the label column ('target') into a single frame.
dataset = datasets.load_breast_cancer()
df = pd.DataFrame(
    data=dataset['data'],
    columns=dataset['feature_names'],
).assign(target=dataset['target'])

### Configure which metrics to track

Below we list metrics we want `MLFlow` to track. 

`sk` is a wrapper around scikit-learn's `make_scorer` function.

`sk_proba` is the same, but for metrics that require probabilities instead of labels.

You can put any metric in the list, but it must be wrapped in `trava.scorer.Scorer`; see the class implementation for details.

`FitTimeScorer` and `PredictTimeScorer` are examples of custom metrics that are unrelated to prediction quality. We can track anything that we find useful for our case.

In [3]:
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score
from trava.ext.sklearn.scorers import sk, sk_proba
from trava.ext.tracker.mlflow import MLFlowTracker
from trava.ext.scorers.model_scorers import FitTimeScorer, PredictTimeScorer

# Scorers handed to the tracker, in order:
#   1. metrics wrapped with `sk_proba` (they need probabilities),
#   2. metrics wrapped with `sk` (they need labels),
#   3. the fit-time and predict-time scorers.
tracker_scorers = [
    *(sk_proba(metric) for metric in (log_loss, roc_auc_score)),
    *(sk(metric) for metric in (recall_score, precision_score)),
    FitTimeScorer(),
    PredictTimeScorer(),
]

tracker = MLFlowTracker(scorers=tracker_scorers)
# Select the tracking group (experiment) that the runs are recorded under.
tracker.track_set_tracking_group(group='mlflow_demo')

INFO: 'mlflow_demo' does not exist. Creating a new experiment


### Split the data

This is just Trava's way of making train/test splits.

In [4]:
from trava.ext.split_logic.basic import BasicSplitLogic
from trava.split.config import DataSplitConfig
from trava.split.result import SplitResult
from trava.split.splitter import Splitter

# BasicSplitLogic is one of Trava's built-in splitting options; subclass
# trava.split.SplitLogic when you need something tailored to your data.
# NOTE(review): shuffle=True with no seed visible here — the split (and hence
# the metrics) may differ between runs; confirm whether a seed can be passed.
shuffling_logic = BasicSplitLogic(shuffle=True)

split_config = DataSplitConfig(
    split_logic=shuffling_logic,
    target_col_name='target',
    test_size=0.3,
)

split_result = Splitter.split(df=df, config=split_config)

### Initialize Trava

We use a `TravaSV` instance to train and assess our models.

In [5]:
from trava.trava_sv import TravaSV

# Create the TravaSV instance and hand it the tracker configured earlier.
trava = TravaSV(tracker=tracker)

### Fit a model

##### Both default and user-provided parameters of the model will be tracked automatically

In [6]:
from sklearn.linear_model import LogisticRegression

# Trava creates the model from `model_type` and `model_init_params`.
# Done by hand, this would look roughly like:
#   log_reg = LogisticRegression(C=1)
#   log_reg.fit(split_result.X_train, split_result.y_train)
#   log_reg.predict(split_result.X_test)
#
# NOTE(review): the recorded output of this cell shows the solver reaching its
# iteration limit; consider scaling the features or raising `max_iter`.
trava.fit_predict(
    raw_split_data=split_result,
    model_id='log_reg',
    model_type=LogisticRegression,
    model_init_params=dict(C=1),
)

  from collections import (
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[]

### Actually, that's all you need to do!

---

### Making sure that everything is tracked

In [7]:
import mlflow

In [8]:
# Look up the demo experiment by name, then list the runs recorded under it.
demo_experiment = mlflow.get_experiment_by_name('mlflow_demo')
experiment_id = demo_experiment.experiment_id

runs = mlflow.search_runs(experiment_ids=[experiment_id])
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.recall_score,metrics.predict_time,metrics.fit_time,metrics.precision_score,...,params.verbose,params.fit_intercept,params.penalty,params.dual,params.intercept_scaling,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.source.name,tags.mlflow.source.type,tags.model_type
0,2551943ef876419c9bedbc4f79a27d1d,1,FINISHED,file:///Users/ilya.tyutin/Projects/trava/examp...,2020-05-24 13:59:52.052000+00:00,2020-05-24 13:59:52.123000+00:00,0.916667,0.001057,0.019371,0.942857,...,0,True,l2,False,1,ilya.tyutin,log_reg,/Users/ilya.tyutin/anaconda3/lib/python3.7/sit...,LOCAL,LogisticRegression


### Examining the run

#### As you can see, all the metrics as well as the parameters are tracked.

In [9]:
# Fetch the full record of the first run listed in `runs`.
first_run_id = runs['run_id'].iloc[0]
mlflow.get_run(first_run_id)

<Run: data=<RunData: metrics={'fit_time': 0.01937103271484375,
 'log_loss': 0.17052001191715066,
 'precision_score': 0.9428571428571428,
 'predict_time': 0.0010571479797363281,
 'recall_score': 0.9166666666666666,
 'roc_auc_score': 0.983392122281011}, params={'C': '1',
 'dual': 'False',
 'fit_intercept': 'True',
 'intercept_scaling': '1',
 'max_iter': '100',
 'multi_class': 'auto',
 'penalty': 'l2',
 'solver': 'lbfgs',
 'tol': '0.0001',
 'verbose': '0',
 'warm_start': 'False'}, tags={'mlflow.runName': 'log_reg',
 'mlflow.source.name': '/Users/ilya.tyutin/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'ilya.tyutin',
 'model_type': 'LogisticRegression'}>, info=<RunInfo: artifact_uri='file:///Users/ilya.tyutin/Projects/trava/examples/mlruns/1/2551943ef876419c9bedbc4f79a27d1d/artifacts', end_time=1590328792123, experiment_id='1', lifecycle_stage='active', run_id='2551943ef876419c9bedbc4f79a27d1d', run_uuid='2551943ef876419c9bed