# [Learning Spark Second Edition](https://github.com/databricks/LearningSparkV2)


 Use the _docker-compose.yml_ file located at the root directory.

### [Chapter 11](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch11.html)
> MLflow: Track

In [1]:
# Start the Spark session, load the SF Airbnb data set and assemble the
# (unfitted) random-forest pipeline that the MLflow cell fits and logs.
import os

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql.functions import log

# Parent of the current working directory; the data set path is resolved
# against it. Computed with os.path instead of spawning a shell, so it does
# not depend on `dirname`/`$PWD` being available and leaves no open pipe.
PARENT_DIR = os.path.dirname(os.getcwd())

# Create a SparkSession
spark = SparkSession.builder.appName("Chapter 11. ML Flows") \
    .getOrCreate()

# filepath
filePath = 'databricks-datasets/learning-spark-v2/sf-airbnb/' + \
           'sf-airbnb-clean.parquet/'
filePath = os.path.join(PARENT_DIR, filePath)

# Load the data and derive the regression label: log_price = log(price)
airbnbDF = spark.read.parquet(filePath).withColumn('log_price', log('price'))

# train-test split (fixed seed so the split is reproducible)
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)

# Every string column is treated as categorical and index-encoded
categoricalCols = [field for (field, dataType) in trainDF.dtypes
                   if dataType == "string"]

indexOutputCols = [x + "Index" for x in categoricalCols]

# Numeric features: every double column except the label and the raw price
# it is derived from. Keeping `price` as a feature while predicting
# log(price) would leak the target into the model.
targetCols = ("price", "log_price")
numericCols = [field for (field, dataType) in trainDF.dtypes
               if dataType == "double" and field not in targetCols]

# Rows with categories unseen during fitting are skipped at transform time
stringIndexer = StringIndexer(inputCols=categoricalCols,
                              outputCols=indexOutputCols,
                              handleInvalid="skip")

# Combine the StringIndexer output columns with the numeric columns
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# Random Forest trained on the log-scale label
rf = RandomForestRegressor(labelCol="log_price", maxBins=40, seed=42)

# pipeline: index categoricals -> assemble feature vector -> random forest
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])


In [2]:
# In Python
# Fit the pipeline inside an MLflow run and record its parameters, the fitted
# model, evaluation metrics and a feature-importance artifact.
import mlflow
import mlflow.spark
import pandas as pd

from pyspark.ml.evaluation import RegressionEvaluator

with mlflow.start_run(run_name="random-forest") as run:

    # Log params: num_trees and max_depth
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())

    # Log model
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, "model")

    # Log metrics: RMSE and R2.
    # The evaluator must score predictions against the column the forest was
    # trained on, so the label is taken from `rf` instead of being hard-coded.
    # With the label set on `rf` above ("log_price") both metrics are
    # therefore reported on the log-price scale.
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                              labelCol=rf.getLabelCol())
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # Log artifact: feature importance scores, most important first.
    # The fitted forest is the last stage of the pipeline model.
    rfModel = pipelineModel.stages[-1]
    pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(),
                                      rfModel.featureImportances)),
                             columns=["feature", "importance"])
                .sort_values(by="importance", ascending=False))

    # First write to local filesystem, then tell MLflow where to find that file
    pandasDF.to_csv("feature-importance.csv", index=False)
    mlflow.log_artifact("feature-importance.csv")

In [6]:
# Kept disabled. The captured output below shows port 5000 was already in use
# when this ran — presumably a tracking server was already running; TODO confirm.
#!pip install mlflow > /dev/null
#!mlflow server --host 0.0.0.0

[2020-07-28 21:09:59 +0000] [1538] [INFO] Starting gunicorn 20.0.4
[2020-07-28 21:09:59 +0000] [1538] [ERROR] Connection in use: ('0.0.0.0', 5000)
[2020-07-28 21:09:59 +0000] [1538] [ERROR] Retrying in 1 second.
[2020-07-28 21:10:00 +0000] [1538] [ERROR] Connection in use: ('0.0.0.0', 5000)
[2020-07-28 21:10:00 +0000] [1538] [ERROR] Retrying in 1 second.
[2020-07-28 21:10:01 +0000] [1538] [ERROR] Connection in use: ('0.0.0.0', 5000)
[2020-07-28 21:10:01 +0000] [1538] [ERROR] Retrying in 1 second.
[2020-07-28 21:10:02 +0000] [1538] [ERROR] Connection in use: ('0.0.0.0', 5000)
[2020-07-28 21:10:02 +0000] [1538] [ERROR] Retrying in 1 second.
[2020-07-28 21:10:03 +0000] [1538] [ERROR] Connection in use: ('0.0.0.0', 5000)
[2020-07-28 21:10:03 +0000] [1538] [ERROR] Retrying in 1 second.
[2020-07-28 21:10:04 +0000] [1538] [ERROR] Can't connect to ('0.0.0.0', 5000)
Running the mlflow server failed. Please see the logs above for details.


In [10]:
# Print the command-line options of `mlflow server` (shell command via `!`).
!mlflow server --help

Usage: mlflow server [OPTIONS]

  Run the MLflow tracking server.

  The server which listen on http://localhost:5000 by default, and only
  accept connections from the local machine. To let the server accept
  connections from other machines, you will need to pass --host 0.0.0.0 to
  listen on all network interfaces (or a specific interface address).

Options:
  --backend-store-uri PATH     URI to which to persist experiment and run
                               data. Acceptable URIs are SQLAlchemy-compatible
                               database connection strings (e.g.
                               'sqlite:///path/to/file.db') or local
                               filesystem URIs (e.g.
                               'file:///absolute/path/to/directory'). By
                               default, data will be logged to the ./mlruns
                               directory.

  --default-artifact-root URI  Local or S3 URI to store artifacts, for new
         

In [5]:
# Kept disabled. NOTE(review): the captured output below reports
# "no such option: --host", presumably from an earlier run of this cell with
# that flag — confirm before re-enabling.
#!mlflow ui 

Usage: mlflow ui [OPTIONS]
Try 'mlflow ui --help' for help.

Error: no such option: --host


In [None]:
# In Python
from mlflow.tracking import MlflowClient

# Ask the tracking store for the most recently started run of the experiment
# that `run` belongs to (newest first, limited to a single result).
client = MlflowClient()
runs = client.search_runs(
    run.info.experiment_id,
    order_by=["attributes.start_time desc"],
    max_results=1,
)

# Keep the run id in `run_id` and show the logged metrics as the cell output.
latest_run = runs[0]
run_id = latest_run.info.run_id
latest_run.data.metrics