# [Learning Spark Second Edition](https://github.com/databricks/LearningSparkV2)


 use _docker-compose.yml_ file located at he root directory

### [Chapter 11](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch11.html)
> MLflow: Track

In [1]:
# Start Spark Session
import os
from pyspark.sql import SparkSession

# Create a SparkSession
PARENT_DIR = os.popen('dirname $PWD').read().strip()

spark = SparkSession.builder.appName("Chapter 11. ML Flows") \
    .getOrCreate()

# filepath
filePath = 'databricks-datasets/learning-spark-v2/sf-airbnb/'+ \
          'sf-airbnb-clean.parquet/'
filePath = os.path.join(PARENT_DIR,filePath)

#load and create log price
from pyspark.sql.functions import log
airbnbDF = spark.read.parquet(filePath).withColumn('log_price', log('price'))

from pyspark.ml import Pipeline
#train-test split
trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=42)

# Combine output of StringIndexer defined above and numeric columns
from pyspark.ml.feature import VectorAssembler,StringIndexer

categoricalCols = [field for (field, dataType) in trainDF.dtypes 
                   if dataType == "string"]

indexOutputCols = [x + "Index" for x in categoricalCols]

numericCols = [field for (field, dataType) in trainDF.dtypes 
               if ((dataType == "double") & (field != "log_price"))]

stringIndexer = StringIndexer(inputCols=categoricalCols, 
                              outputCols=indexOutputCols, 
                              handleInvalid="skip")

assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

from pyspark.ml.tuning import CrossValidator
#Random Forest
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(labelCol="log_price", maxBins=40, seed=42)


# pipeline

from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [stringIndexer, vecAssembler, rf])


In [2]:
# In Python 
import mlflow
import mlflow.spark
import pandas as pd

from pyspark.ml.evaluation import RegressionEvaluator

#from pyspark.ml.tuning import CrossValidator

with mlflow.start_run(run_name="random-forest") as run:
    
    # Log params: num_trees and max_depth
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())

    # Log model
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, "model")

    # Log metrics: RMSE and R2
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction", 
                                            labelCol="price")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # Log artifact: feature importance scores
    rfModel = pipelineModel.stages[-1]
    pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(), 
                                    rfModel.featureImportances)), 
                           columns=["feature", "importance"])
              .sort_values(by="importance", ascending=False))

    # First write to local filesystem, then tell MLflow where to find that file
    pandasDF.to_csv("feature-importance.csv", index=False)
    mlflow.log_artifact("feature-importance.csv")

In [3]:
#!pip install mlflow > /dev/null
#!mlflow server --host 0.0.0.0

In [5]:
#port http://127.0.0.1:5000/
!mlflow ui

[2020-07-28 21:14:17 +0000] [923] [INFO] Starting gunicorn 20.0.4
[2020-07-28 21:14:17 +0000] [923] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2020-07-28 21:14:17 +0000] [923] [ERROR] Retrying in 1 second.
[2020-07-28 21:14:18 +0000] [923] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2020-07-28 21:14:18 +0000] [923] [ERROR] Retrying in 1 second.
[2020-07-28 21:14:19 +0000] [923] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2020-07-28 21:14:19 +0000] [923] [ERROR] Retrying in 1 second.
[2020-07-28 21:14:20 +0000] [923] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2020-07-28 21:14:20 +0000] [923] [ERROR] Retrying in 1 second.
[2020-07-28 21:14:21 +0000] [923] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2020-07-28 21:14:21 +0000] [923] [ERROR] Retrying in 1 second.
[2020-07-28 21:14:22 +0000] [923] [ERROR] Can't connect to ('127.0.0.1', 5000)
Running the mlflow server failed. Please see the logs above for details.


In [6]:
# In Python
from mlflow.tracking import MlflowClient

client = MlflowClient()
runs = client.search_runs(run.info.experiment_id, 
                          order_by=["attributes.start_time desc"], 
                          max_results=1)

run_id = runs[0].info.run_id
runs[0].data.metrics

TypeError: search_runs() got an unexpected keyword argument 'order_by'