In [None]:
import os
import logging
from pathlib import Path
from datetime import datetime

import pandas as pd
from devtools import pprint

# Root logging setup for the notebook: INFO level, records go to a stream handler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)

### API Configuration

In [None]:
import mlopus

# Build the MLflow API object that the remaining cells of this notebook work with.
mlflow_api = mlopus.mlflow.get_api(
    # plugin="mlflow",  # Default API plugin, based on open source MLflow.
    #                   # List the installed API plugins with `mlopus.mlflow.list_api_plugins()`

    # cls="...",        # Instead of a plugin, an API class or a fully qualified class name may be given.
    #                   # The API class just needs to implement `mlopus.mlflow.api.base.BaseMlflowApi`

    # Settings for the selected API plugin.
    # `mlopus.mlflow.api_conf_schema()` shows all available config params.
    conf=dict(
        # Defaults to env var MLFLOW_TRACKING_URI or `~/.cache/mlflow`
        tracking_uri=None,
        # Defaults to ~/.cache/mlopus/mlflow-providers/mlflow/<hashed_tracking_uri>
        cache_dir=None,
        # Disables any features that require contacting the MLflow server (defaults to False)
        offline_mode=False,
        # Allow caching artifacts when artifacts repo is local (will cause duplication!) (default is False)
        cache_local_artifacts=True,
        # If using the `mlflow-sandbox` Docker setup, replace `s3://...` artifact URLs with `mlflow_sandbox://...`
        file_transfer=dict(use_scheme="mlflow_sandbox"),
    ),
)

pprint(mlflow_api)

### 1. Basic experiment tracking

In [None]:
# Fetch the experiment with this name (it is created if it does not exist yet) and show its URL
exp = mlflow_api.get_or_create_exp("mlopus_introduction")
print(exp.url)

In [None]:
# Open a run inside the experiment fetched above; the run object is bound to `run`
with exp.start_run(name="1_basics") as run:
    print(run.url)

    # Nested dicts are accepted for tags, params and metrics alike
    (
        run
        .set_tags({"namespace": {"key": datetime.today()}})
        .log_params({"namespace": {"key": True}})
        .log_metrics({"namespace": {"key": 42}})
    )

    # Artifacts may be single files or whole directories, and caching options are available.
    # See `help(run.log_artifact)` or `help(mlflow_api.log_run_artifact)`
    run.log_artifact("data/report.csv", use_cache=False)

    # Show the metadata of the run and of the experiment it belongs to
    pprint(run)
    pprint(run.exp)

In [None]:
# The run is ended automatically when exiting the `with` block,
# so at this point it is expected to carry an end time and the FINISHED status.
assert run.end_time is not None
assert run.status == mlopus.mlflow.RunStatus.FINISHED

In [None]:
# Inspect the logged artifact: resolve its path, list the directory that contains it
# with the `tree` shell command (the trailing `echo` prints an empty line after the listing)
# and finally read the CSV into a DataFrame, which becomes the cell output.
# NOTE(review): `tree` must be installed on the machine running the notebook.
path = run.get_artifact("report.csv")  # If the artifact storage is remote (e.g: S3/GCS), then this will be a path to the
!tree {path.parent} && echo            # local cache. Sync between remote and cache happens automatically (unless in offline mode).
pd.read_csv(path)

### 2. Artifact dumpers and loaders

In [None]:
# An artifact dumper can be any callback that receives a path and writes data to that path.
# This could be, for example, a writer method of a model or dataset class.
def dumper(path: Path):
    """Create directory `path` and fill it with the files `1.txt`, `2.txt` and `3.txt`.

    File `<i>.txt` contains the letter "a" repeated `i` times.

    Args:
        path: Directory to create. Only the last path component is created,
            so its parent must already exist and `path` itself must not.

    Returns:
        The number of characters written to each file, in file order (`[1, 2, 3]`).

    Raises:
        FileExistsError: If `path` already exists.
        FileNotFoundError: If the parent directory of `path` is missing.
    """
    path.mkdir()
    return [path.joinpath(f"{i}.txt").write_text("a" * i) for i in range(1, 4)]

In [None]:
# An artifact loader can be any callback that receives a path and loads data from that path.
# This could be, for example, a loader method of a model or dataset class.
def loader(path: Path):
    """Read every entry of directory `path` as text.

    Args:
        path: Directory whose entries are read. Every entry is opened with
            `read_text()`, so all of them are expected to be text files.

    Returns:
        A dict mapping each entry name to its text content, with keys
        inserted in sorted name order.
    """
    return {name: path.joinpath(name).read_text() for name in sorted(os.listdir(path))}

In [None]:
# Fetch the experiment and open a new run in it
with (
    mlflow_api
    .get_or_create_exp("mlopus_introduction")
    .start_run("2_dumpers_and_loaders")
) as run:

    # The artifact content is produced by the dumper callback.
    # If the logging fails, the temporary files are kept.
    run.log_artifact(dumper, path_in_run="my_files")

In [None]:
# Inspect the logged artifact by listing it with the `tree` shell command.
# NOTE(review): `tree` must be installed on the machine running the notebook.
!tree {run.get_artifact("my_files")}  # If the artifact storage is remote (e.g: S3/GCS), then this will be a path to the
                                      # local cache. Sync between remote and cache happens automatically (unless in offline mode).

In [None]:
# Use a loader on the artifact that was logged under "my_files".
# Just like before, the cache sync happens automatically,
# if required, unless the API is in offline mode.
# NOTE(review): presumably this returns whatever `loader` returns - confirm in `help(run.load_artifact)`.
run.load_artifact(loader, path_in_run="my_files")

### 3. Model Registry

In [None]:
# Time to publish a model version.
# The model artifact `source` is the very same artifact dumper used in the previous example.
# A path to a local directory holding all model files would work as `source` too.

with (
    mlflow_api
    .get_or_create_exp("mlopus_introduction")
    .start_run("3_model_registry")
) as run:

    model_version = (
        mlflow_api
        .get_or_create_model("mlopus_example")
        .log_version(run, dumper)  # `path_in_run` defaults to model name
    )

    print(model_version.url)

    # Keep the version number around: the caching example looks this version up again
    version_number = model_version.version
    print(version_number)

In [None]:
# Inspect the metadata, walking from the model version to the objects reachable from it:
# the version itself, its registered model, the run it was logged in and that run's experiment.
pprint(model_version)
pprint(model_version.model)
pprint(model_version.run)
pprint(model_version.run.exp)

In [None]:
# Inspect the logged model artifact by listing it with the `tree` shell command.
# NOTE(review): `tree` must be installed on the machine running the notebook.
!tree {model_version.get_artifact()}  # If the artifact storage is remote (e.g: S3/GCS), then this will be a path to the
                                      # local cache. Sync between remote and cache happens automatically (unless in offline mode).

In [None]:
# Use the loader function from the previous example on the model artifact.
# Just like before, the cache sync happens automatically,
# if required, unless the API is in offline mode.
model_version.load_artifact(loader)

### 4. Model Caching and Offline Mode

In [None]:
# Wipe all cache first, so that this example starts from a clean slate.
mlflow_api.clean_all_cache()

# Next, cache the model version that was published in the previous example.
# This caches both the metadata and the artifact of that model version.
(
    mlflow_api
    .get_model("mlopus_example")
    .get_version(version_number)
    .cache()
)

In [None]:
# Take an offline copy of the API and use it to load the model version metadata from cache only.
# Offline mode can also be set when loading the API with `mlopus.mlflow.get_api(conf={"offline_mode": True})`
offline_model_version = (
    mlflow_api
    .in_offline_mode
    .get_model("mlopus_example")
    .get_version(version_number)
)

# Show the metadata that came out of the cache.
pprint(offline_model_version)

In [None]:
# Use the loader function from the previous example on the model artifact,
# this time through the model version obtained from the offline copy of the API.
offline_model_version.load_artifact(loader)