In [8]:
import numpy as np 
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [25]:
import mlflow

# Point MLflow at the SQLite database file `mlflow.db` for run tracking.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Make "nyc-taxi-experiment" the active experiment for the runs started below.
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

In [6]:
def read_dataframe(filename):
    """Load a green-taxi trip file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file that contains the
        ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``, ``PULocationID``
        and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        Only the trips lasting between 1 and 60 minutes (inclusive), with an
        added ``duration`` column (minutes) and both location-ID columns
        converted to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # The CSV branch converts the timestamp columns explicitly.
        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Fail loudly instead of hitting an UnboundLocalError further down.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Vectorised trip duration in minutes (no per-row Python lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than to a slice of the original.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [9]:
# January 2021 is the training month; February 2021 is held out for validation.
train_path = './data/green_tripdata_2021-01.parquet'
val_path = './data/green_tripdata_2021-02.parquet'

df_train = read_dataframe(train_path)
df_val = read_dataframe(val_path)

In [10]:
# Row counts of the prepared training and validation frames.
len(df_train), len(df_val)

(73908, 61921)

In [11]:
# Combine the pickup and drop-off zone IDs into one "<pickup>_<dropoff>" feature.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [13]:
# Feature columns: the combined pickup/drop-off pair plus the trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# Fit the vectorizer on the training records only, then reuse it on validation.
dv = DictVectorizer()
train_dict = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [14]:
# Regression target: the trip duration column computed by read_dataframe.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [30]:
# Baseline model: plain linear regression on the vectorized features.
lr = LinearRegression()
lr.fit(X_train,y_train)

In [31]:
# Score the baseline on the validation month; squared=False requests the
# root of the mean squared error (the value logged as "rmse" in later cells).
y_pred  = lr.predict(X_val)
mean_squared_error(y_val, y_pred, squared=False)

7.758715199477344

In [34]:
# Serialize the fitted vectorizer together with the model as one (dv, lr) tuple.
with open('models/lin_reg.bin', 'wb') as model_file:
    pickle.dump((dv, lr), model_file)

In [36]:
# Track one Lasso experiment: tags, data provenance, hyperparameter, metric,
# and the pickled (vectorizer, model) pair produced by this run.
with mlflow.start_run():

    mlflow.set_tag("developer", 'avijit')  # useful for keeping track when working in a big team

    mlflow.log_param("train-data-path", './data/green_tripdata_2021-01.parquet')
    mlflow.log_param("val-data-path", './data/green_tripdata_2021-02.parquet')

    alpha = 0.0001
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    # Persist the model trained in THIS run before logging it. Logging the
    # older models/lin_reg.bin would attach the LinearRegression pickle to a
    # run whose params and metric describe the Lasso model.
    with open('models/lasso.bin', 'wb') as f_out:
        pickle.dump((dv, lr), f_out)

    mlflow.log_artifact(local_path="models/lasso.bin", artifact_path="mlflow/models_pickle")


In [27]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [28]:
# Wrap the feature matrices and targets in XGBoost's DMatrix container.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [49]:
def objective(params):
    """Train one XGBoost candidate in its own MLflow run and report its RMSE.

    Parameters
    ----------
    params : dict
        XGBoost training parameters; logged to MLflow as-is and passed
        straight to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)

        # Up to 1000 rounds; stop early after 50 rounds without improvement
        # on the validation set.
        watchlist = [(valid, 'validation')]
        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=watchlist,
            early_stopping_rounds=50,
        )

        validation_rmse = mean_squared_error(y_val, model.predict(valid), squared=False)
        mlflow.log_metric('rmse', validation_rmse)

    return {'loss': validation_rmse, 'status': STATUS_OK}


In [50]:
# Hyperopt search space for the XGBoost regressor.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # loguniform samples exp(U(low, high)): here exp(-3)..exp(0), roughly [0.05, 1]
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is the deprecated alias of squared-error regression.
    'objective': 'reg:squarederror',
    'seed': 42,
}

# Minimise the validation RMSE returned by `objective` over 50 TPE-suggested trials.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

[0]	validation-rmse:10.03420                          
[1]	validation-rmse:7.36745                           
[2]	validation-rmse:6.79707                           
[3]	validation-rmse:6.64471                           
[4]	validation-rmse:6.58709                           
[5]	validation-rmse:6.56422                           
[6]	validation-rmse:6.55230                           
[7]	validation-rmse:6.54504                           
[8]	validation-rmse:6.54319                           
[9]	validation-rmse:6.54103                           
[10]	validation-rmse:6.53919                          
[11]	validation-rmse:6.53568                          
[12]	validation-rmse:6.53149                          
[13]	validation-rmse:6.52723                          
[14]	validation-rmse:6.52292                          
[15]	validation-rmse:6.52146                          
[16]	validation-rmse:6.52113                          
[17]	validation-rmse:6.52155                          
[18]	valid

Autolog: retrain using the best model's parameters (the run with the lowest RMSE, taken from MLflow tracking)

In [29]:
# Hyperparameters of the best tuning run.
best_params = {
    'learning_rate': 0.11384066976621526,
    'max_depth': 23,
    'min_child_weight': 1.051707019693116,
    # 'reg:linear' is the deprecated alias of squared-error regression.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.044043330327184854,
    'reg_lambda': 0.0077173132817309215,
    'seed': 42,
}

# Enable autologging before training so MLflow captures this xgboost run.
mlflow.xgboost.autolog()

booster = xgb.train(
    params=best_params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, 'validation')],
    early_stopping_rounds=50,
)

2022/05/30 15:28:19 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '12cc854c3fe040988ae4c946512ab31d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:19.16818
[1]	validation-rmse:17.39664
[2]	validation-rmse:15.85221
[3]	validation-rmse:14.50706
[4]	validation-rmse:13.34016
[5]	validation-rmse:12.33083
[6]	validation-rmse:11.46108
[7]	validation-rmse:10.71469
[8]	validation-rmse:10.07676
[9]	validation-rmse:9.53373
[10]	validation-rmse:9.07219
[11]	validation-rmse:8.68249
[12]	validation-rmse:8.35199
[13]	validation-rmse:8.07441
[14]	validation-rmse:7.83951
[15]	validation-rmse:7.64093
[16]	validation-rmse:7.47472
[17]	validation-rmse:7.33536
[18]	validation-rmse:7.21765
[19]	validation-rmse:7.11789
[20]	validation-rmse:7.03372
[21]	validation-rmse:6.96243
[22]	validation-rmse:6.90145
[23]	validation-rmse:6.84980
[24]	validation-rmse:6.80631
[25]	validation-rmse:6.76892
[26]	validation-rmse:6.73601
[27]	validation-rmse:6.70724
[28]	validation-rmse:6.68360
[29]	validation-rmse:6.66320
[30]	validation-rmse:6.64514
[31]	validation-rmse:6.62901
[32]	validation-rmse:6.61435
[33]	validation-rmse:6.60196
[34]	validation



In [39]:
# Turn xgboost autologging back off; the next run logs everything explicitly.
mlflow.xgboost.autolog(disable=True)

Logging the model

In [42]:
# Manually tracked run: log the params, metric, fitted preprocessor and booster.
with mlflow.start_run():
    best_params = {
        'learning_rate': 0.11384066976621526,
        'max_depth': 23,
        'min_child_weight': 1.051707019693116,
        # 'reg:linear' is the deprecated alias of squared-error regression.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.044043330327184854,
        'reg_lambda': 0.0077173132817309215,
        'seed': 42,
    }
    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric('rmse', rmse)

    # The DictVectorizer is stored next to the model so the same feature
    # encoding can be applied to new records.
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(local_path='models/preprocessor.b', artifact_path='preprocessor')

    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')

[0]	validation-rmse:19.16818
[1]	validation-rmse:17.39664
[2]	validation-rmse:15.85221
[3]	validation-rmse:14.50706
[4]	validation-rmse:13.34016
[5]	validation-rmse:12.33083
[6]	validation-rmse:11.46108
[7]	validation-rmse:10.71469
[8]	validation-rmse:10.07676
[9]	validation-rmse:9.53373
[10]	validation-rmse:9.07219
[11]	validation-rmse:8.68249
[12]	validation-rmse:8.35199
[13]	validation-rmse:8.07441
[14]	validation-rmse:7.83951
[15]	validation-rmse:7.64093
[16]	validation-rmse:7.47472
[17]	validation-rmse:7.33536
[18]	validation-rmse:7.21765
[19]	validation-rmse:7.11789
[20]	validation-rmse:7.03372
[21]	validation-rmse:6.96243
[22]	validation-rmse:6.90145
[23]	validation-rmse:6.84980
[24]	validation-rmse:6.80631
[25]	validation-rmse:6.76892
[26]	validation-rmse:6.73601
[27]	validation-rmse:6.70724
[28]	validation-rmse:6.68360
[29]	validation-rmse:6.66320
[30]	validation-rmse:6.64514
[31]	validation-rmse:6.62901
[32]	validation-rmse:6.61435
[33]	validation-rmse:6.60196
[34]	validation

Make predictions using the logged model

In [43]:
# `runs:/<run_id>/<artifact_path>` URI of the logged model. The run ID is
# hard-coded (presumably the run that logged `models_mlflow` earlier; verify
# it against your own tracking store).
logged_model = 'runs:/62c205fffd7046dbb6be19d3600272bd/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)



In [44]:
# Display the pyfunc wrapper's metadata (artifact path, flavor, run ID).
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 62c205fffd7046dbb6be19d3600272bd

MLflow saves the model and makes it possible to load it in two different flavors.
The first is as a Python function (pyfunc), which we did above.
Now we will load it using the XGBoost flavor.

In [45]:
# Load the same logged model through the xgboost flavor instead of pyfunc.
xgboost_model = mlflow.xgboost.load_model(logged_model)



In [46]:
# Display the object returned by the xgboost flavor loader.
xgboost_model

<xgboost.core.Booster at 0x7f471611a0d0>

In [48]:
# Predict on the validation DMatrix with the reloaded model.
y_pred = xgboost_model.predict(valid)

In [49]:
# Preview the first ten predictions.
y_pred[:10]

array([15.123217,  7.184564, 13.548071, 24.295067,  9.101829, 17.166336,
       11.854638,  9.04875 ,  8.945308, 19.988482], dtype=float32)

In [50]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

# Enable scikit-learn autologging for the estimators fitted below.
mlflow.sklearn.autolog()

# Train several scikit-learn regressors with default settings, one MLflow run each.
for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        # Log the files that were actually loaded (parquet, not csv), under the
        # same param keys as the earlier Lasso run so runs are comparable.
        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
        mlflow.log_param("val-data-path", "./data/green_tripdata_2021-02.parquet")
        # Relies on models/preprocessor.b written by the earlier model-logging cell.
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

