# 2.5 Model registry

In [1]:
from sklearn.metrics import root_mean_squared_error
import pandas as pd

In [2]:
from mlflow import MlflowClient
import mlflow

# Tracking store location. The same URI is handed to both the explicit
# MlflowClient and the module-level mlflow API so that the two stay in sync.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [3]:
# Registered-model name; interpolated into the `models:/...` URIs further down.
model_name = 'nyc-taxi regressor'

---

## Utility functions

In [4]:
def read_dataframe(filename):
    """Load a green-taxi trip file and prepare it for duration prediction.

    Steps performed:
      1. read the file (CSV or Parquet, chosen by extension);
      2. add a ``duration`` column: drop-off minus pick-up time, in minutes;
      3. keep only trips whose duration is between 1 and 60 minutes inclusive;
      4. cast ``PULocationID`` and ``DOLocationID`` to ``str``.

    Args:
        filename: path or URL of the file. Must end in ``.csv`` or
            ``.parquet`` (matched case-insensitively). Path-like objects
            are accepted as well as plain strings.

    Returns:
        A new ``pandas.DataFrame`` containing the filtered trips.

    Raises:
        ValueError: if the extension is neither ``.csv`` nor ``.parquet``.
    """
    # Normalise only for the extension check; the reader gets the original value.
    name = str(filename).lower()

    if name.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV stores timestamps as text, so they have to be parsed explicitly.
        df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
        df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    elif name.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound and the failure opaque.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Vectorised equivalent of applying `td.total_seconds() / 60` row by row.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the frame that was read, so the
    # column assignment below does not raise SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """Turn a trips frame into the feature matrix produced by ``dv``.

    A combined ``PU_DO`` key (pick-up id, underscore, drop-off id) is written
    onto ``df`` itself, so the caller's frame gains that column. The
    ``PU_DO`` and ``trip_distance`` columns are then converted to one dict
    per row and passed to ``dv.transform``.

    Args:
        df: frame with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns.
        dv: object exposing ``transform(list_of_dicts)``; presumably an
            already-fitted vectorizer (it is never fitted here).

    Returns:
        Whatever ``dv.transform`` returns for the row dicts.
    """
    pickup_ids = df['PULocationID']
    dropoff_ids = df['DOLocationID']
    df['PU_DO'] = pickup_ids + '_' + dropoff_ids

    feature_columns = ['PU_DO', 'trip_distance']
    row_dicts = df[feature_columns].to_dict(orient='records')

    # Transform only: the vectorizer is used as-is, not refitted.
    features = dv.transform(row_dicts)
    return features

def test_model(model, X_test, y_test):
    """Score ``model`` on a test set.

    Args:
        model: object exposing ``predict(X)``.
        X_test: features passed unchanged to ``model.predict``.
        y_test: true target values to compare the predictions against.

    Returns:
        A dict with a single key, ``"rmse"``, holding the root mean squared
        error between ``y_test`` and the predictions.
    """
    predictions = model.predict(X_test)
    rmse = root_mean_squared_error(y_test, predictions)
    return {"rmse": rmse}

---

## Read datasets

### Production

In [5]:
# Green-taxi trips for 2024-03, fetched over HTTP; this frame plays the role of
# "production" data that the registered model is evaluated on below.
df_prod = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet')

In [6]:
df_prod.head(3)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2024-03-01 00:10:52,2024-03-01 00:26:12,N,1.0,129,226,1.0,1.72,12.8,...,0.5,3.06,0.0,,1.0,18.36,1.0,1.0,0.0,15.333333
1,2,2024-03-01 00:22:21,2024-03-01 00:35:15,N,1.0,130,218,1.0,3.25,17.7,...,0.5,0.0,0.0,,1.0,20.2,2.0,1.0,0.0,12.9
2,2,2024-03-01 00:45:27,2024-03-01 01:04:32,N,1.0,255,107,2.0,4.58,23.3,...,0.5,3.5,0.0,,1.0,32.05,1.0,1.0,2.75,19.083333


---

## Retrieve preprocessor from MLflow

### Download

In [7]:
# Run whose artifacts are used; the same id is reused later to load the model.
run_id = '863a8c7e9d354033ae7a733dbdc0c6bd'
# Fetch the run's 'preprocessor' artifact into the current directory, so that
# it can be opened as 'preprocessor/preprocessor.b' afterwards.
client.download_artifacts(
    run_id=run_id,
    path='preprocessor',
    dst_path='.'
)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/workspaces/mlops-zoomcamp/02-experiment-tracking/preprocessor'

### Load

In [8]:
import pickle

In [9]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only unpickle artifacts that come from a tracking server / run you trust.
with open('preprocessor/preprocessor.b', 'rb') as f_in:
    dv = pickle.load(f_in)

---

## Transform data

In [10]:
# Build the feature matrix with the downloaded preprocessor.
# Side effect: preprocess() also adds a 'PU_DO' column to df_prod in place.
X_test = preprocess(
    df=df_prod,
    dv=dv
)

In [11]:
X_test

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 107739 stored elements and shape (55139, 5093)>

## Test model

In [12]:
# Ground truth: trip duration in minutes, taken from the same frame (and hence
# the same rows) that X_test was built from.
target = 'duration'
y_test = df_prod[target].values

### 1. load via run id

In [13]:
run_id = '863a8c7e9d354033ae7a733dbdc0c6bd'
model_uri = f'runs:/{run_id}/model'

model = mlflow.pyfunc.load_model(model_uri)

%time
test_model(model, X_test, y_test)

CPU times: user 2 μs, sys: 0 ns, total: 2 μs
Wall time: 3.81 μs


{'rmse': np.float64(5.51659537395944)}

### 2. load via name and version

In [14]:
model_version = 3

model = mlflow.sklearn.load_model(f"models:/{model_name}/{model_version}")

%time
test_model(model, X_test, y_test)

CPU times: user 1 μs, sys: 0 ns, total: 1 μs
Wall time: 3.58 μs


{'rmse': np.float64(5.51659537395944)}

### 3. load via name and alias

In [15]:
alias = 'champion'
model_uri = f"models:/{model_name}@{alias}"

model = mlflow.sklearn.load_model(model_uri)

%time
test_model(model, X_test, y_test)

CPU times: user 1 μs, sys: 0 ns, total: 1 μs
Wall time: 3.34 μs


{'rmse': np.float64(5.51659537395944)}