Merge pull request mlflow#24 from criteo-forks/master-sync
Master sync
Djailla committed Oct 30, 2019
2 parents 1184634 + 424ed02 commit c49d10c
Showing 36 changed files with 537 additions and 170 deletions.
2 changes: 2 additions & 0 deletions docs/source/search-syntax.rst
@@ -156,6 +156,7 @@ with 10 layers and had a prediction accuracy of 94.5% or higher, use:
.. code-block:: py
from mlflow.tracking.client import MlflowClient
from mlflow.entities import ViewType
query = "params.model = 'CNN' and params.layers = '10' and metrics.'prediction accuracy' >= 0.945"
runs = MlflowClient().search_runs(["3", "4", "17"], query, ViewType.ACTIVE_ONLY)
@@ -165,6 +166,7 @@ To search all known experiments for any MLflow runs created using the Inception
.. code-block:: py
from mlflow.tracking.client import MlflowClient
from mlflow.entities import ViewType
all_experiments = [exp.experiment_id for exp in MlflowClient().list_experiments()]
runs = MlflowClient().search_runs(all_experiments, "params.model = 'Inception'", ViewType.ALL)
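Both hunks above add the missing ``from mlflow.entities import ViewType`` import so the documented snippets run as written. A combined, hedged sketch (assumes a reachable tracking server with at least one experiment):

    from mlflow.tracking.client import MlflowClient
    from mlflow.entities import ViewType

    client = MlflowClient()
    # Collect every known experiment id, then search across all of them.
    all_experiments = [exp.experiment_id for exp in client.list_experiments()]
    runs = client.search_runs(all_experiments, "params.model = 'Inception'", ViewType.ALL)
    print("matching runs:", len(runs))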
2 changes: 1 addition & 1 deletion examples/sklearn_elasticnet_wine/MLproject
@@ -5,6 +5,6 @@ conda_env: conda.yaml
entry_points:
main:
parameters:
alpha: float
alpha: {type: float, default: 0.5}
l1_ratio: {type: float, default: 0.1}
command: "python train.py {alpha} {l1_ratio}"
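Giving ``alpha`` a default means the example can now be launched without passing that parameter explicitly. A hedged sketch via the fluent projects API (the relative path assumes the repository checkout as the working directory):

    import mlflow

    # alpha is omitted here and falls back to its new default of 0.5;
    # l1_ratio is overridden from its default of 0.1.
    mlflow.run("examples/sklearn_elasticnet_wine", parameters={"l1_ratio": 0.2})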
3 changes: 3 additions & 0 deletions mlflow/__init__.py
@@ -52,11 +52,14 @@
log_artifacts = mlflow.tracking.fluent.log_artifacts
log_artifact = mlflow.tracking.fluent.log_artifact
active_run = mlflow.tracking.fluent.active_run
get_run = mlflow.tracking.fluent.get_run
start_run = mlflow.tracking.fluent.start_run
end_run = mlflow.tracking.fluent.end_run
search_runs = mlflow.tracking.fluent.search_runs
get_artifact_uri = mlflow.tracking.fluent.get_artifact_uri
set_tracking_uri = tracking.set_tracking_uri
get_experiment = mlflow.tracking.fluent.get_experiment
get_experiment_by_name = mlflow.tracking.fluent.get_experiment_by_name
get_tracking_uri = tracking.get_tracking_uri
create_experiment = mlflow.tracking.fluent.create_experiment
set_experiment = mlflow.tracking.fluent.set_experiment
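Reading the hunk header (three additions), the new module-level aliases appear to be ``get_run``, ``get_experiment``, and ``get_experiment_by_name``, each mirroring an existing ``MlflowClient`` method. A small sketch of the resulting ergonomics (the experiment name and run ID are placeholders):

    import mlflow

    exp = mlflow.get_experiment_by_name("Default")
    if exp is not None:
        print(exp.experiment_id)
    # mlflow.get_run("<run-id>") returns the run's params, metrics, and tags.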
6 changes: 6 additions & 0 deletions mlflow/entities/model_registry/model_version.py
@@ -4,6 +4,12 @@


class ModelVersion(_ModelRegistryEntity):
"""
Note:: Experimental: This entity may change or be removed in a future release without warning.
MLflow entity for Model Version.
A model version is uniquely identified using underlying
:py:class:`mlflow.entities.model_registry.RegisteredModel` and version number.
"""
def __init__(self, registered_model, version):
"""
Construct a :py:class:`mlflow.entities.model_registry.RegisteredModel` instance
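Per the constructor signature above, a version is keyed by its parent registered model plus a version number. A hedged construction sketch (assumes the ``mlflow.entities.model_registry`` package re-exports these experimental entities):

    from mlflow.entities.model_registry import ModelVersion, RegisteredModel

    mv = ModelVersion(RegisteredModel("my-model"), 1)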
7 changes: 7 additions & 0 deletions mlflow/entities/model_registry/model_version_detailed.py
@@ -5,6 +5,13 @@


class ModelVersionDetailed(ModelVersion):
"""
Note:: Experimental: This entity may change or be removed in a future release without warning.
MLflow entity for Model Version Detailed.
Provides additional metadata for a model version in addition to the information in
:py:class:`mlflow.entities.model_registry.ModelVersion`.
"""

def __init__(self, registered_model, version, creation_timestamp, last_updated_timestamp=None,
description=None, user_id=None, current_stage=None, source=None, run_id=None,
status=None, status_message=None):
6 changes: 6 additions & 0 deletions mlflow/entities/model_registry/registered_model.py
@@ -3,6 +3,12 @@


class RegisteredModel(_ModelRegistryEntity):
"""
Note:: Experimental: This entity may change or be removed in a future release without warning.
MLflow entity for Registered Model.
A registered model entity is uniquely identified by its name.
"""

def __init__(self, name):
"""
Construct a :py:class:`mlflow.entities.model_registry.RegisteredModel`
9 changes: 7 additions & 2 deletions mlflow/entities/model_registry/registered_model_detailed.py
@@ -4,12 +4,17 @@


class RegisteredModelDetailed(RegisteredModel):
# __init__ method to initialize fields
"""
Note:: Experimental: This entity may change or be removed in a future release without warning.
MLflow entity for Registered Model Detailed.
Provides additional metadata for a registered model in addition to the information in
:py:class:`mlflow.entities.model_registry.RegisteredModel`.
"""

def __init__(self, name, creation_timestamp, last_updated_timestamp=None, description=None,
latest_versions=None):
# Constructor is called only from within the system by various backend stores.
super(RegisteredModelDetailed, self).__init__(name)
self._name = name
self._creation_time = creation_timestamp
self._last_updated_timestamp = last_updated_timestamp
self._description = description
7 changes: 4 additions & 3 deletions mlflow/h2o.py
@@ -126,9 +126,10 @@ def log_model(h2o_model, artifact_path, conda_env=None, registered_model_name=No
]
]
}
:param registered_model_name: If given, create a model version under ``registered_model_name``,
also creating a registered model if one with the given name does
not exist.
:param registered_model_name: Note:: Experimental: This argument may change or be removed in a
future release without warning. If given, create a model
version under ``registered_model_name``, also creating a
registered model if one with the given name does not exist.
:param kwargs: kwargs to pass to ``h2o.save_model`` method.
"""
Model.log(artifact_path=artifact_path, flavor=mlflow.h2o,
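The same experimental ``registered_model_name`` note is repeated for the ``keras``, ``mleap``, ``models``, and ``onnx`` flavors below. A hedged usage sketch (assumes a trained H2O estimator ``model`` and a backend that supports the experimental Model Registry):

    import mlflow.h2o

    # ``model`` is assumed to be a trained h2o estimator from earlier in the
    # script. Logs it as a run artifact and, because a name is given, also
    # creates the registered model (if absent) plus a new model version.
    mlflow.h2o.log_model(model, "model", registered_model_name="WineQualityH2O")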


7 changes: 4 additions & 3 deletions mlflow/keras.py
@@ -212,9 +212,10 @@ def log_model(keras_model, artifact_path, conda_env=None, custom_objects=None, k
:param keras_module: Keras module to be used to save / load the model
(``keras`` or ``tf.keras``). If not provided, MLflow will
attempt to infer the Keras module based on the given model.
:param registered_model_name: If given, create a model version under ``registered_model_name``,
also creating a registered model if one with the given name does
not exist.
:param registered_model_name: Note:: Experimental: This argument may change or be removed in a
future release without warning. If given, create a model
version under ``registered_model_name``, also creating a
registered model if one with the given name does not exist.
:param kwargs: kwargs to pass to ``keras_model.save`` method.
>>> from keras import Dense, layers
7 changes: 4 additions & 3 deletions mlflow/mleap.py
@@ -36,9 +36,10 @@ def log_model(spark_model, sample_input, artifact_path, registered_model_name=No
:param sample_input: Sample PySpark DataFrame input that the model can evaluate. This is
required by MLeap for data schema inference.
:param artifact_path: Run-relative artifact path.
:param registered_model_name: If given, create a model version under ``registered_model_name``,
also creating a registered model if one with the given name does
not exist.
:param registered_model_name: Note:: Experimental: This argument may change or be removed in a
future release without warning. If given, create a model
version under ``registered_model_name``, also creating a
registered model if one with the given name does not exist.
>>> import mlflow
>>> import mlflow.mleap
7 changes: 4 additions & 3 deletions mlflow/models/__init__.py
@@ -68,9 +68,10 @@ def log(cls, artifact_path, flavor, registered_model_name=None, **kwargs):
:param flavor: Flavor module to save the model with. The module must have
the ``save_model`` function that will persist the model as a valid
MLflow model.
:param registered_model_name: If given, create a model version under
``registered_model_name``, also creating a registered model
if one with the given name does not exist.
:param registered_model_name: Note:: Experimental: This argument may change or be removed
in a future release without warning. If given, create a model
version under ``registered_model_name``, also creating a
registered model if one with the given name does not exist.
:param kwargs: Extra args passed to the model flavor.
"""
with TempDir() as tmp:
7 changes: 4 additions & 3 deletions mlflow/onnx.py
@@ -232,9 +232,10 @@ def log_model(onnx_model, artifact_path, conda_env=None, registered_model_name=N
'onnxruntime=0.3.0'
]
}
:param registered_model_name: If given, create a model version under ``registered_model_name``,
also creating a registered model if one with the given name does
not exist.
:param registered_model_name: Note:: Experimental: This argument may change or be removed in a
future release without warning. If given, create a model
version under ``registered_model_name``, also creating a
registered model if one with the given name does not exist.
"""
Model.log(artifact_path=artifact_path, flavor=mlflow.onnx,
onnx_model=onnx_model, conda_env=conda_env,
42 changes: 28 additions & 14 deletions mlflow/projects/_project_spec.py
@@ -11,44 +11,61 @@
from mlflow.utils.file_utils import get_local_path_or_none


MLPROJECT_FILE_NAME = "MLproject"
MLPROJECT_FILE_NAME = "mlproject"
DEFAULT_CONDA_FILE_NAME = "conda.yaml"


def _find_mlproject(directory):
filenames = os.listdir(directory)
for filename in filenames:
if filename.lower() == MLPROJECT_FILE_NAME:
return os.path.join(directory, filename)
return None


def load_project(directory):
mlproject_path = os.path.join(directory, MLPROJECT_FILE_NAME)
mlproject_path = _find_mlproject(directory)

# TODO: Validate structure of YAML loaded from the file
if os.path.exists(mlproject_path):
yaml_obj = {}
if mlproject_path is not None:
with open(mlproject_path) as mlproject_file:
yaml_obj = yaml.safe_load(mlproject_file.read())
else:
yaml_obj = {}
yaml_obj = yaml.safe_load(mlproject_file)

project_name = yaml_obj.get("name")
if not project_name:
project_name = None
conda_path = yaml_obj.get("conda_env")

# Validate config if docker_env parameter is present
docker_env = yaml_obj.get("docker_env")
if docker_env and not docker_env.get("image"):
raise ExecutionException("Docker environment specified but no image "
"attribute found.")

# Validate config if conda_env parameter is present
conda_path = yaml_obj.get("conda_env")
if conda_path and docker_env:
raise ExecutionException("Project cannot contain both a docker and conda environment.")
raise ExecutionException("Project cannot contain both a docker and "
"conda environment.")

# Parse entry points
entry_points = {}
for name, entry_point_yaml in yaml_obj.get("entry_points", {}).items():
parameters = entry_point_yaml.get("parameters", {})
command = entry_point_yaml.get("command")
entry_points[name] = EntryPoint(name, parameters, command)

if conda_path:
conda_env_path = os.path.join(directory, conda_path)
if not os.path.exists(conda_env_path):
raise ExecutionException("Project specified conda environment file %s, but no such "
"file was found." % conda_env_path)
return Project(conda_env_path=conda_env_path, entry_points=entry_points,
docker_env=docker_env, name=project_name,)

default_conda_path = os.path.join(directory, DEFAULT_CONDA_FILE_NAME)
if os.path.exists(default_conda_path):
return Project(conda_env_path=default_conda_path, entry_points=entry_points,
docker_env=docker_env, name=project_name)

return Project(conda_env_path=None, entry_points=entry_points,
docker_env=docker_env, name=project_name)
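A quick sketch of the new case-insensitive lookup, grounded in the ``_find_mlproject`` helper above (the temporary directory and mixed-case filename are illustrative):

    import os
    import tempfile

    from mlflow.projects._project_spec import _find_mlproject

    d = tempfile.mkdtemp()
    open(os.path.join(d, "MLProject"), "w").close()
    # Any casing of "mlproject" is now discovered:
    assert _find_mlproject(d) == os.path.join(d, "MLProject")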

@@ -92,10 +109,7 @@ def _validate_parameters(self, user_parameters):
for name in self.parameters:
if (name not in user_parameters and self.parameters[name].default is None):
missing_params.append(name)
if len(missing_params) == 1:
raise ExecutionException(
"No value given for missing parameter: '%s'" % missing_params[0])
elif len(missing_params) > 1:
if missing_params:
raise ExecutionException(
"No value given for missing parameters: %s" %
", ".join(["'%s'" % name for name in missing_params]))
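The singular-message branch is dropped because the plural formatting already reads cleanly for a single name:

    missing_params = ["alpha"]
    msg = "No value given for missing parameters: %s" % ", ".join(
        ["'%s'" % name for name in missing_params])
    # -> "No value given for missing parameters: 'alpha'"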
19 changes: 10 additions & 9 deletions mlflow/projects/databricks.py
@@ -6,6 +6,7 @@
import textwrap
import time
import logging
import posixpath

from six.moves import shlex_quote

@@ -23,9 +24,9 @@
# Base directory within driver container for storing files related to MLflow
DB_CONTAINER_BASE = "/databricks/mlflow"
# Base directory within driver container for storing project archives
DB_TARFILE_BASE = os.path.join(DB_CONTAINER_BASE, "project-tars")
DB_TARFILE_BASE = posixpath.join(DB_CONTAINER_BASE, "project-tars")
# Base directory within driver container for storing extracted project directories
DB_PROJECTS_BASE = os.path.join(DB_CONTAINER_BASE, "projects")
DB_PROJECTS_BASE = posixpath.join(DB_CONTAINER_BASE, "projects")
# Name to use for project directory when archiving it for upload to DBFS; the TAR will contain
# a single directory with this name
DB_TARFILE_ARCHIVE_NAME = "mlflow-project"
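``posixpath`` replaces ``os.path`` here because these are paths inside the Linux driver container and on DBFS: on a Windows client, ``os.path`` resolves to ``ntpath`` and joins with backslashes. A small illustration:

    import ntpath
    import posixpath

    ntpath.join("/databricks/mlflow", "project-tars")     # '/databricks/mlflow\\project-tars'
    posixpath.join("/databricks/mlflow", "project-tars")  # '/databricks/mlflow/project-tars'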
@@ -131,9 +132,9 @@ def custom_filter(x):
with open(temp_tar_filename, "rb") as tarred_project:
tarfile_hash = hashlib.sha256(tarred_project.read()).hexdigest()
# TODO: Get subdirectory for experiment from the tracking server
dbfs_path = os.path.join(DBFS_EXPERIMENT_DIR_BASE, str(experiment_id),
"projects-code", "%s.tar.gz" % tarfile_hash)
dbfs_fuse_uri = os.path.join("/dbfs", dbfs_path)
dbfs_path = posixpath.join(DBFS_EXPERIMENT_DIR_BASE, str(experiment_id),
"projects-code", "%s.tar.gz" % tarfile_hash)
dbfs_fuse_uri = posixpath.join("/dbfs", dbfs_path)
if not self._dbfs_path_exists(dbfs_path):
self._upload_to_dbfs(temp_tar_filename, dbfs_fuse_uri)
_logger.info("=== Finished uploading project to %s ===", dbfs_fuse_uri)
@@ -233,10 +234,10 @@ def _get_databricks_run_cmd(dbfs_fuse_tar_uri, run_id, entry_point, parameters):
Generate MLflow CLI command to run on Databricks cluster in order to launch a run on Databricks.
"""
# Strip ".gz" and ".tar" file extensions from base filename of the tarfile
tar_hash = os.path.splitext(os.path.splitext(os.path.basename(dbfs_fuse_tar_uri))[0])[0]
container_tar_path = os.path.abspath(os.path.join(DB_TARFILE_BASE,
os.path.basename(dbfs_fuse_tar_uri)))
project_dir = os.path.join(DB_PROJECTS_BASE, tar_hash)
tar_hash = posixpath.splitext(posixpath.splitext(posixpath.basename(dbfs_fuse_tar_uri))[0])[0]
container_tar_path = posixpath.abspath(posixpath.join(DB_TARFILE_BASE,
posixpath.basename(dbfs_fuse_tar_uri)))
project_dir = posixpath.join(DB_PROJECTS_BASE, tar_hash)
mlflow_run_arr = list(map(shlex_quote, ["mlflow", "run", project_dir,
"--entry-point", entry_point]))
if run_id:
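The nested ``splitext`` calls above strip the compound ``.tar.gz`` suffix one extension at a time:

    import posixpath

    name = posixpath.basename("/dbfs/some/dir/abc123.tar.gz")  # 'abc123.tar.gz'
    posixpath.splitext(posixpath.splitext(name)[0])[0]         # 'abc123'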
