In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# MLflow Regression Recipe Notebook

This notebook runs the MLflow Regression Recipe on Databricks and inspects its results. For more information about the MLflow Regression Recipe, including usage examples, see the [Regression Recipe overview documentation](https://mlflow.org/docs/latest/recipes.html#regression-recipe) and the [Regression Recipe API documentation](https://mlflow.org/docs/latest/python_api/mlflow.recipes.html#module-mlflow.recipes.regression.v1.recipe).

In [38]:
from mlflow.recipes import Recipe

# Create the recipe object using the "profile_2" profile. The log line below
# reports that this resolves to the 'regression' recipe. Every later cell
# drives the recipe through this `r` handle.
r = Recipe(profile="profile_2")


2022/11/21 19:28:55 INFO mlflow.recipes.recipe: Creating MLflow Recipe 'regression' with profile: 'profile_2'


In [39]:
# Reset the recipe before running it. Presumably this clears cached step
# outputs so the steps below execute from scratch (the ingest log that follows
# shows the step actually running rather than being skipped) -- confirm against
# the MLflow Recipe.clean documentation.
r.clean()

In [40]:
# Inspect the recipe as a whole (no step name is passed). No output was saved
# for this cell; presumably it renders an overview of the recipe's steps --
# TODO confirm and re-run so the output is captured.
r.inspect()

In [41]:
# Run the "ingest" step. The output below shows the ingested dataset's schema
# (column name/type) followed by a preview of its rows.
r.run("ingest")

2022/11/21 19:29:12 INFO mlflow.recipes.step: Running step ingest...


name,type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,number
fare_amount,number
pickup_zip,integer
dropoff_zip,integer

tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,pickup_zip,dropoff_zip
2016-02-13 21:47:53,2016-02-13 21:57:15,1.4,8.0,10103,10110
2016-02-13 18:29:09,2016-02-13 18:37:23,1.31,7.5,10023,10023
2016-02-06 19:40:58,2016-02-06 19:52:32,1.8,9.5,10001,10018
2016-02-12 19:06:43,2016-02-12 19:20:54,2.3,11.5,10044,10111
2016-02-23 10:27:56,2016-02-23 10:58:33,2.6,18.5,10199,10022


In [42]:
# Run the "split" step. As the log below shows, the upstream ingest step is
# reported as unchanged and skipped, so only split executes here.
r.run("split")

2022/11/21 19:29:18 INFO mlflow.recipes.utils.execution: ingest: No changes. Skipping.


2022/11/21 19:29:19 INFO mlflow.recipes.step: Running step split...


In [43]:
# Run the "transform" step. The output below shows the schema going into the
# step, then the transformed schema and a row preview: 0/1 indicator columns
# for pickup hour and pickup day-of-week, plus the fare_amount column.
r.run("transform")

2022/11/21 19:29:27 INFO mlflow.recipes.utils.execution: ingest, split: No changes. Skipping.


2022/11/21 19:29:28 INFO mlflow.recipes.step: Running step transform...


Name,Type
tpep_pickup_datetime,datetime64[ns]
tpep_dropoff_datetime,datetime64[ns]
trip_distance,float64
fare_amount,float64
pickup_zip,int32
dropoff_zip,int32

Name,Type
hour_encoder__pickup_hour_0,float64
hour_encoder__pickup_hour_1,float64
hour_encoder__pickup_hour_2,float64
hour_encoder__pickup_hour_3,float64
hour_encoder__pickup_hour_4,float64
hour_encoder__pickup_hour_5,float64
hour_encoder__pickup_hour_6,float64
hour_encoder__pickup_hour_7,float64
hour_encoder__pickup_hour_8,float64
hour_encoder__pickup_hour_9,float64

hour_encoder__pickup_hour_0,hour_encoder__pickup_hour_1,hour_encoder__pickup_hour_2,hour_encoder__pickup_hour_3,hour_encoder__pickup_hour_4,hour_encoder__pickup_hour_5,hour_encoder__pickup_hour_6,hour_encoder__pickup_hour_7,hour_encoder__pickup_hour_8,hour_encoder__pickup_hour_9,hour_encoder__pickup_hour_10,hour_encoder__pickup_hour_11,hour_encoder__pickup_hour_12,hour_encoder__pickup_hour_13,hour_encoder__pickup_hour_14,hour_encoder__pickup_hour_15,hour_encoder__pickup_hour_16,hour_encoder__pickup_hour_17,hour_encoder__pickup_hour_18,hour_encoder__pickup_hour_19,hour_encoder__pickup_hour_20,hour_encoder__pickup_hour_21,hour_encoder__pickup_hour_22,hour_encoder__pickup_hour_23,day_encoder__pickup_dow_0,day_encoder__pickup_dow_1,day_encoder__pickup_dow_2,day_encoder__pickup_dow_3,day_encoder__pickup_dow_4,day_encoder__pickup_dow_5,day_encoder__pickup_dow_6,fare_amount
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,11.5
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,18.5
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,28.5
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,15.0


In [44]:
# Run the "train" step. The log shows a FLAML AutoML search minimizing MAE; the
# output then lists training/validation metrics, the model input/output schema,
# the worst predictions, and a summary of the latest run.
# NOTE(review): in the saved output r2_score is negative on both training and
# validation, and nearly every row in the worst-predictions table has the same
# prediction (9.69602), so the model looks close to a constant predictor --
# confirm the feature set and search budget before relying on these numbers.
r.run("train")

2022/11/21 19:29:36 INFO mlflow.recipes.utils.execution: ingest, split, transform: No changes. Skipping.


2022/11/21 19:29:37 INFO mlflow.recipes.step: Running step train...
[flaml.automl: 11-21 19:29:42] {2599} INFO - task = regression
[flaml.automl: 11-21 19:29:42] {2601} INFO - Data split method: uniform
[flaml.automl: 11-21 19:29:42] {2604} INFO - Evaluation method: holdout
[flaml.automl: 11-21 19:29:42] {2726} INFO - Minimizing error metric: mae
[flaml.automl: 11-21 19:29:42] {2870} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl: 11-21 19:29:42] {3166} INFO - iteration 0, current learner lgbm
[flaml.automl: 11-21 19:29:42] {3296} INFO - Estimated sufficient time budget=1510s. Estimated necessary time budget=11s.
[flaml.automl: 11-21 19:29:42] {3343} INFO -  at 0.2s,	estimator lgbm's best error=7.0653,	best estimator lgbm's best error=7.0653
[flaml.automl: 11-21 19:29:42] {3166} INFO - iteration 1, current learner lgbm
[flaml.automl: 11-21 19:29:42] {3343} INFO -  at 0.8s,	estimator lgbm's best error=7.0653,	best estima

Metric,training,validation
mean_absolute_error,6.00449,6.46913
weighted_mean_squared_error,124.555,125.675
example_count,7625.0,1198.0
max_error,265.304,57.6242
mean_absolute_percentage_error,17255800000000.0,0.493451
mean_on_target,12.4256,12.7788
mean_squared_error,125.843,125.74
r2_score,-0.0686153,-0.0956389
root_mean_squared_error,11.218,11.2134
score,-0.0686153,-0.0956389

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,double
pickup_zip,integer
dropoff_zip,integer

Name,Type
-,"Tensor('float32', (-1,))"

absolute_error,prediction,fare_amount,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_zip,dropoff_zip
265.30398,9.69602,275.0,2016-02-12 20:55:19,2016-02-12 21:52:38,20.85,10013,7008
250.30398,9.69602,260.0,2016-02-29 12:16:16,2016-02-29 12:16:53,0.0,8876,8876
95.610615,9.389385,105.0,2016-01-16 18:09:15,2016-01-16 18:09:23,0.0,7310,7310
85.30398,9.69602,95.0,2016-02-22 21:17:27,2016-02-22 22:00:58,30.6,11371,7114
84.80398,9.69602,94.5,2016-01-21 00:58:34,2016-01-21 01:33:10,24.5,11371,10601
78.30398,9.69602,88.0,2016-02-11 12:47:12,2016-02-11 13:16:59,19.02,10119,10710
75.30398,9.69602,85.0,2016-02-11 17:52:13,2016-02-11 18:38:17,14.46,10282,7114
72.80398,9.69602,82.5,2016-01-05 16:07:58,2016-01-05 17:48:29,24.5,11422,11213
68.80398,9.69602,78.5,2016-01-11 17:21:51,2016-01-11 18:17:18,19.3,10021,7114
68.80398,9.69602,78.5,2016-01-22 10:09:42,2016-01-22 11:04:16,19.8,10028,7114

Unnamed: 0,Latest
Model Rank,> 0
mean_absolute_error,6.46913
weighted_mean_squared_error,125.675
max_error,57.6242
mean_absolute_percentage_error,0.493451
mean_squared_error,125.74
root_mean_squared_error,11.2134
Run Time,2022-11-21 19:29:41
Run ID,68076e42fc804cf29a4d6df7dd5e4ecf


In [36]:
# Run the "evaluate" step. The output reports validation/test metrics and a
# table checking each metric against its threshold.
# NOTE(review): the saved output below is stale -- its execution count
# (In [36]) and timestamps (19:10) precede the train cell above (In [44],
# 19:29), and its validation example_count (959) differs from the train
# output (1198). Re-run this cell after train to refresh it.
r.run("evaluate")

2022/11/21 19:10:34 INFO mlflow.recipes.utils.execution: ingest, split, transform, train: No changes. Skipping.


2022/11/21 19:10:35 INFO mlflow.recipes.step: Running step evaluate...
2022/11/21 19:10:37 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2022/11/21 19:10:48 INFO mlflow.models.evaluation.default_evaluator: Shap explainer _PatchedKernelExplainer is used.

  0%|          | 0/10 [00:00<?, ?it/s]
 30%|███       | 3/10 [00:00<00:00, 27.16it/s]
 60%|██████    | 6/10 [00:00<00:00, 27.19it/s]
 90%|█████████ | 9/10 [00:00<00:00, 25.72it/s]
100%|██████████| 10/10 [00:00<00:00, 26.39it/s]
elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
2022/11/21 19:11:08 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


Metric,validation,test
mean_absolute_error,2.12044,1.839527
weighted_mean_squared_error,16.9148,12.96306
example_count,959.0,990.0
max_error,22.0322,22.822661
mean_absolute_percentage_error,32780800000000.0,22987908447992.07
mean_on_target,3.13932,2.83601
mean_squared_error,16.6471,13.198155
r2_score,-0.133983,-0.092211
root_mean_squared_error,4.08008,3.632927
score,-0.133983,-0.092211

metric,greater_is_better,value,threshold,validated
root_mean_squared_error,False,3.63293,10,✅
mean_absolute_error,False,1.83953,50,✅
weighted_mean_squared_error,False,12.9631,50,✅
max_error,False,22.8227,100,✅


In [10]:
# Run the "register" step. In the saved log the model name
# 'taxi_fare_regressor' already exists, so a new version is created.
# NOTE(review): the saved output is from an earlier session (In [10], 16:45),
# not from the run shown in the cells above -- re-run to refresh it.
r.run("register")

2022/11/21 16:45:48 INFO mlflow.recipes.utils.execution: ingest, split, transform, train, evaluate: No changes. Skipping.


2022/11/21 16:45:49 INFO mlflow.recipes.step: Running step register...
Registered model 'taxi_fare_regressor' already exists. Creating a new version of this model...
2022/11/21 16:45:50 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: taxi_fare_regressor, version 2
Created version '2' of model 'taxi_fare_regressor'.


In [11]:
# Re-display the results of the "train" step (metrics, schema, worst
# predictions, latest-run summary) without running anything.
# NOTE(review): the saved output is from an earlier session (In [11], run time
# 16:40:39). Its schema lists fare_amount as an input and its error table is
# computed against trip_distance, which does not match the train cell above --
# re-run to refresh it.
r.inspect("train")

Metric,training,validation
root_mean_squared_error,3.43511,3.84283
weighted_mean_squared_error,11.3514,14.9506
example_count,8051.0,959.0
max_error,27.5632,20.6269
mean_absolute_error,2.14914,2.34305
mean_absolute_percentage_error,46749500000000.0,54114500000000.0
mean_on_target,2.86689,3.13932
mean_squared_error,11.8,14.7674
r2_score,0.0274036,-0.00593847
score,0.0274036,-0.00593847

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
fare_amount,double
pickup_zip,integer
dropoff_zip,integer

Name,Type
-,"Tensor('float32', (-1,))"

absolute_error,prediction,trip_distance,tpep_pickup_datetime,tpep_dropoff_datetime,fare_amount,pickup_zip,dropoff_zip
27.56321,3.03679,30.6,2016-02-22 21:17:27,2016-02-22 22:00:58,95.0,11371,7114
23.34977,2.45023,25.8,2016-02-06 12:18:51,2016-02-06 13:03:53,71.0,11422,11218
22.291881,3.258119,25.55,2016-02-16 22:30:58,2016-02-16 23:24:47,73.5,10009,11050
21.729148,2.770852,24.5,2016-01-05 16:07:58,2016-01-05 17:48:29,82.5,11422,11213
21.578039,4.721961,26.3,2016-01-07 04:07:58,2016-01-07 04:43:08,0.0,10018,10606
21.439961,3.060039,24.5,2016-01-16 23:05:32,2016-01-16 23:43:37,65.0,11422,10463
21.042173,3.457827,24.5,2016-01-21 00:58:34,2016-01-21 01:33:10,94.5,11371,10601
20.886557,4.113443,25.0,2016-02-21 03:31:58,2016-02-21 03:32:28,0.0,10502,10502
20.481508,3.718492,24.2,2016-02-01 01:01:57,2016-02-01 01:31:40,63.0,11422,10468
19.661621,2.638379,22.3,2016-01-19 12:46:55,2016-01-19 13:15:28,58.5,11422,10475

Unnamed: 0,Latest
Model Rank,> 0
root_mean_squared_error,3.84283
weighted_mean_squared_error,14.9506
max_error,20.6269
mean_absolute_error,2.34305
mean_absolute_percentage_error,5.41145e+13
mean_squared_error,14.7674
Run Time,2022-11-21 16:40:39
Run ID,fc3c6782d2a040708b8795cc22b35503


In [None]:
# Fetch the recipe artifact named "training_data" and keep it in a variable for
# further exploration.
training_data = r.get_artifact("training_data")
# Show summary statistics as the cell's output (presumably a pandas DataFrame,
# given the .describe() call -- confirm). This cell has not been executed in
# the saved notebook (In [None]).
training_data.describe()

In [None]:
# Fetch the recipe artifact named "model" (the output of the train step,
# presumably -- confirm) and keep it in a variable for ad-hoc use.
trained_model = r.get_artifact("model")
# Print the model's text representation. This cell has not been executed in
# the saved notebook (In [None]).
print(trained_model)