In [19]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# MLflow Regression Recipe Notebook

This notebook runs the MLflow Regression Recipe on Databricks and inspects its results. For more information about the MLflow Regression Recipe, including usage examples, see the [Regression Recipe overview documentation](https://mlflow.org/docs/latest/recipes.html#regression-recipe) and the [Regression Recipe API documentation](https://mlflow.org/docs/latest/python_api/mlflow.recipes.html#module-mlflow.recipes.regression.v1.recipe).

In [20]:
from mlflow.recipes import Recipe

# Instantiate the recipe with the "local" profile; the log line emitted by
# this cell reports which recipe (here 'regression') the project resolves to.
# `r` is the handle every later cell uses to run and inspect steps.
r = Recipe(profile="local")


2022/11/15 19:56:29 INFO mlflow.recipes.recipe: Creating MLflow Recipe 'regression' with profile: 'local'


In [21]:
# Start from a clean slate: presumably this drops cached step outputs so the
# following `run` calls re-execute instead of being skipped -- confirm against
# the MLflow `Recipe.clean` documentation.
r.clean()

In [22]:
# Inspect the recipe as a whole (no step name given). No output was captured
# for this cell in the saved notebook -- presumably it renders an overview of
# the recipe's steps; verify against the MLflow `Recipe.inspect` docs.
r.inspect()

In [23]:
# Run the "ingest" step. The captured output shows the ingested dataset's
# schema (column name / type) followed by a few sample rows.
r.run("ingest")

2022/11/15 19:56:31 INFO mlflow.recipes.step: Running step ingest...


name,type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,number
fare_amount,number
pickup_zip,integer
dropoff_zip,integer

tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,pickup_zip,dropoff_zip
2016-02-13 21:47:53,2016-02-13 21:57:15,1.4,8.0,10103,10110
2016-02-13 18:29:09,2016-02-13 18:37:23,1.31,7.5,10023,10023
2016-02-06 19:40:58,2016-02-06 19:52:32,1.8,9.5,10001,10018
2016-02-12 19:06:43,2016-02-12 19:20:54,2.3,11.5,10044,10111
2016-02-23 10:27:56,2016-02-23 10:58:33,2.6,18.5,10199,10022


In [24]:
# Run the "split" step. Per the log, the already-completed "ingest" step is
# detected as unchanged and skipped, so only "split" executes.
r.run("split")

2022/11/15 19:56:32 INFO mlflow.recipes.utils.execution: ingest: No changes. Skipping.


2022/11/15 19:56:33 INFO mlflow.recipes.step: Running step split...


In [25]:
# Run the "transform" step (upstream "ingest" and "split" are skipped as
# unchanged). The captured output lists the input schema, the transformed
# feature columns, and a preview of transformed rows.
r.run("transform")

2022/11/15 19:56:37 INFO mlflow.recipes.utils.execution: ingest, split: No changes. Skipping.


2022/11/15 19:56:38 INFO mlflow.recipes.step: Running step transform...


Name,Type
tpep_pickup_datetime,datetime64[ns]
tpep_dropoff_datetime,datetime64[ns]
trip_distance,float64
fare_amount,float64
pickup_zip,int32
dropoff_zip,int32

Name,Type
hour_encoder__pickup_hour_0,float64
hour_encoder__pickup_hour_1,float64
hour_encoder__pickup_hour_2,float64
hour_encoder__pickup_hour_3,float64
hour_encoder__pickup_hour_4,float64
hour_encoder__pickup_hour_5,float64
hour_encoder__pickup_hour_6,float64
hour_encoder__pickup_hour_7,float64
hour_encoder__pickup_hour_8,float64
hour_encoder__pickup_hour_9,float64

hour_encoder__pickup_hour_0,hour_encoder__pickup_hour_1,hour_encoder__pickup_hour_2,hour_encoder__pickup_hour_3,hour_encoder__pickup_hour_4,hour_encoder__pickup_hour_5,hour_encoder__pickup_hour_6,hour_encoder__pickup_hour_7,hour_encoder__pickup_hour_8,hour_encoder__pickup_hour_9,hour_encoder__pickup_hour_10,hour_encoder__pickup_hour_11,hour_encoder__pickup_hour_12,hour_encoder__pickup_hour_13,hour_encoder__pickup_hour_14,hour_encoder__pickup_hour_15,hour_encoder__pickup_hour_16,hour_encoder__pickup_hour_17,hour_encoder__pickup_hour_18,hour_encoder__pickup_hour_19,hour_encoder__pickup_hour_20,hour_encoder__pickup_hour_21,hour_encoder__pickup_hour_22,hour_encoder__pickup_hour_23,day_encoder__pickup_dow_0,day_encoder__pickup_dow_1,day_encoder__pickup_dow_2,day_encoder__pickup_dow_3,day_encoder__pickup_dow_4,day_encoder__pickup_dow_5,day_encoder__pickup_dow_6,std_scaler__trip_distance,std_scaler__trip_duration,fare_amount
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.422905,-0.11199,8.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.166606,-0.016302,11.5
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.081173,0.310163,18.5
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.068893,0.03237,28.5
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.109628,0.084684,15.0


In [33]:
# Run the "train" step. The captured output reports training vs. validation
# metrics, the model's input/output schema, the worst-predicted rows, and a
# leaderboard entry with the MLflow run ID.
# NOTE(review): the saved output of this cell comes from a later execution
# than the evaluate/register/inspect cells below (later log timestamps and a
# different run ID), so its metrics do not match what those cells show.
# Restart the kernel and run all cells to regenerate consistent outputs.
r.run("train")

2022/11/15 20:02:51 INFO mlflow.recipes.utils.execution: ingest: No changes. Skipping.


2022/11/15 20:02:52 INFO mlflow.recipes.step: Running step split...
2022/11/15 20:02:54 INFO mlflow.recipes.step: Running step transform...
2022/11/15 20:02:57 INFO mlflow.recipes.step: Running step train...
2022/11/15 20:03:11 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2022/11/15 20:03:11 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


Metric,training,validation
root_mean_squared_error,0.374813,2.44022
weighted_mean_squared_error,0.108214,9.30917
example_count,8247.0,858.0
max_error,6.28702,55.6992
mean_absolute_error,0.249954,0.525139
mean_absolute_percentage_error,498977000000.0,0.0458731
mean_on_target,12.4346,12.8304
mean_squared_error,0.140485,5.95469
r2_score,0.998808,0.943577
score,0.998808,0.943577

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,double
pickup_zip,integer
dropoff_zip,integer

Name,Type
-,"Tensor('float32', (-1,))"

absolute_error,prediction,fare_amount,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_zip,dropoff_zip
6.287022,38.712978,45.0,2016-02-06 23:18:10,2016-02-06 23:35:18,3.51,10013,7302
6.241135,53.758865,60.0,2016-01-31 01:05:06,2016-01-31 01:19:26,5.25,10017,7030
5.131754,1.131754,-4.0,2016-02-02 10:48:08,2016-02-02 10:51:28,0.46,10065,10021
4.287354,47.712646,52.0,2016-01-16 00:12:20,2016-01-16 00:27:06,6.1,11378,10012
4.210205,47.789795,52.0,2016-02-26 16:54:41,2016-02-26 17:06:30,4.02,11371,11367
4.043003,45.956997,50.0,2016-01-19 00:48:47,2016-01-19 01:14:02,9.0,10119,11371
3.282337,51.717663,55.0,2016-02-28 04:50:41,2016-02-28 04:52:32,0.18,10115,10027
3.169811,59.330189,62.5,2016-02-01 10:30:45,2016-02-01 10:56:12,16.1,10019,7114
2.973267,61.526733,64.5,2016-01-03 03:25:40,2016-01-03 03:57:12,17.35,10023,7114
2.892513,24.607487,27.5,2016-02-17 19:00:52,2016-02-17 19:23:37,7.1,11422,11096

Unnamed: 0,Latest
Model Rank,> 0
root_mean_squared_error,2.44022
weighted_mean_squared_error,9.30917
max_error,55.6992
mean_absolute_error,0.525139
mean_absolute_percentage_error,0.0458731
mean_squared_error,5.95469
Run Time,2022-11-15 20:02:59
Run ID,4c2cb4397fc54173925943e90c5b74c8


In [27]:
# Run the "evaluate" step. The captured output compares validation and test
# metrics and then checks selected metrics against configured thresholds
# (the `validated` column marks each check as passed or not).
r.run("evaluate")

2022/11/15 19:56:56 INFO mlflow.recipes.utils.execution: ingest, split, transform, train: No changes. Skipping.


2022/11/15 19:56:58 INFO mlflow.recipes.step: Running step evaluate...
2022/11/15 19:57:00 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2022/11/15 19:57:03 INFO mlflow.models.evaluation.default_evaluator: Shap explainer _PatchedKernelExplainer is used.

  0%|          | 0/10 [00:00<?, ?it/s]
 30%|███       | 3/10 [00:00<00:00, 27.68it/s]
 60%|██████    | 6/10 [00:00<00:00, 20.36it/s]
 90%|█████████ | 9/10 [00:00<00:00, 17.79it/s]
100%|██████████| 10/10 [00:00<00:00, 18.18it/s]
elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
2022/11/15 19:57:16 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


Metric,validation,test
root_mean_squared_error,5.83279,2.304773
weighted_mean_squared_error,11.2742,4.967055
example_count,1482.0,1495.0
max_error,180.634,55.815173
mean_absolute_error,0.842825,0.515834
mean_absolute_percentage_error,0.0782529,0.28548
mean_on_target,12.5405,12.284622
mean_squared_error,34.0214,5.311978
r2_score,0.675054,0.947019
score,0.675054,0.947019

metric,greater_is_better,value,threshold,validated
root_mean_squared_error,False,2.30477,10,✅
mean_absolute_error,False,0.515834,50,✅
weighted_mean_squared_error,False,4.96705,50,✅


In [28]:
# Run the "register" step. Per the log, this creates a new version of the
# registered model 'taxi_fare_regressor' in the MLflow Model Registry.
r.run("register")

2022/11/15 19:57:21 INFO mlflow.recipes.utils.execution: ingest, split, transform, train, evaluate: No changes. Skipping.


2022/11/15 19:57:22 INFO mlflow.recipes.step: Running step register...
Registered model 'taxi_fare_regressor' already exists. Creating a new version of this model...
2022/11/15 19:57:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: taxi_fare_regressor, version 2
Created version '2' of model 'taxi_fare_regressor'.


In [32]:
# Re-display the results of the "train" step without running it again:
# metrics, model schema, worst predictions, and the leaderboard entry.
r.inspect("train")

Metric,training,validation
root_mean_squared_error,0.349786,5.83279
weighted_mean_squared_error,0.00642794,11.2742
example_count,7023.0,1482.0
max_error,8.47379,180.634
mean_absolute_error,0.236171,0.842825
mean_absolute_percentage_error,721085000000.0,0.0782529
mean_on_target,12.4564,12.5405
mean_squared_error,0.12235,34.0214
r2_score,0.998984,0.675054
score,0.998984,0.675054

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,double
pickup_zip,integer
dropoff_zip,integer

Name,Type
-,"Tensor('float32', (-1,))"

absolute_error,prediction,fare_amount,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_zip,dropoff_zip
8.473793,36.526207,45.0,2016-02-06 23:18:10,2016-02-06 23:35:18,3.51,10013,7302
5.364044,46.635956,52.0,2016-02-26 16:54:41,2016-02-26 17:06:30,4.02,11371,11367
3.751804,-0.248196,-4.0,2016-02-02 10:48:08,2016-02-02 10:51:28,0.46,10065,10021
2.834633,10.334633,7.5,2016-02-05 13:50:35,2016-02-05 14:06:00,1.24,10002,10013
2.54055,59.95945,62.5,2016-02-01 10:30:45,2016-02-01 10:56:12,16.1,10019,7114
2.479317,62.020683,64.5,2016-01-03 03:25:40,2016-01-03 03:57:12,17.35,10023,7114
2.081164,14.581164,12.5,2016-02-20 10:56:53,2016-02-20 11:19:39,2.7,10110,10009
2.065601,49.934399,52.0,2016-01-20 13:26:07,2016-01-20 14:06:10,15.0,10009,11430
1.899132,25.899132,24.0,2016-02-02 09:30:00,2016-02-02 10:08:12,5.89,11102,10016
1.891636,54.108364,56.0,2016-02-21 07:02:59,2016-02-21 07:26:25,13.68,10278,7114

Unnamed: 0,Latest
Model Rank,> 0
root_mean_squared_error,5.83279
weighted_mean_squared_error,11.2742
max_error,180.634
mean_absolute_error,0.842825
mean_absolute_percentage_error,0.0782529
mean_squared_error,34.0214
Run Time,2022-11-15 19:56:44
Run ID,2daecb47b88e4a63be4ffc68130f70bf


In [30]:
# Fetch the recipe's "training_data" artifact and show summary statistics
# (count, mean, std, quartiles, min/max) for its numeric columns.
training_data = r.get_artifact("training_data")
training_data.describe()

Unnamed: 0,trip_distance,fare_amount,pickup_zip,dropoff_zip
count,7023.0,7023.0,7023.0,7023.0
mean,2.885041,12.4564,10138.497793,10173.090275
std,3.511771,10.974005,342.291008,413.752749
min,0.0,-8.0,7002.0,7002.0
25%,1.0,6.5,10012.0,10013.0
50%,1.7,9.0,10022.0,10023.0
75%,3.095,14.0,10110.0,10119.0
max,30.6,275.0,11436.0,11691.0


In [31]:
# Fetch the recipe's "model" artifact and print its summary; the output shows
# a loaded MLflow pyfunc model with its artifact path, flavor, and run ID.
trained_model = r.get_artifact("model")
print(trained_model)

mlflow.pyfunc.loaded_model:
  artifact_path: train/model
  flavor: mlflow.sklearn
  run_id: 2daecb47b88e4a63be4ffc68130f70bf

