In [1]:
import os
import uuid
import pickle

import pandas as pd



from mlflow.tracking import MlflowClient
import mlflow

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

## Variables

In [2]:
# Batch to score: which taxi dataset and which month.
year = 2024
month = 1
taxi_type = 'green'

# Source trip data (remote parquet) and destination for the scored results.
input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet'
output_file = f'output/{taxi_type}/{year:04d}-{month:02d}.parquet'

# MLflow run whose logged model is used for prediction.
RUN_ID = '40da67942f0740d7b27c002eb79f755e'

# NOTE(review): MLFLOW_TRACKING_URI is not referenced anywhere in this notebook,
# and the tracking URI actually configured on the next line is the local SQLite
# store, not this HTTP address - confirm which backend is intended.
MLFLOW_TRACKING_URI = 'http://localhost:5000'
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('nyc-taxi-experiment')

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/04-deployment/03_batch_service/mlruns/1', creation_time=1741971924024, experiment_id='1', last_update_time=1741971924024, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

## Functions

### Generate unique uuid  
Generate a UUID for each ride so that input records can be linked to their output records.

In [3]:
def generate_uuids(n):
    """Return a list of ``n`` freshly generated UUID4 strings.

    Args:
        n: Number of identifiers to generate.

    Returns:
        A list of length ``n`` whose items are the string form of
        ``uuid.uuid4()`` values.
    """
    print('generating uuids ...')
    return [str(uuid.uuid4()) for _ in range(n)]

### Read dataframe  
Read the trip data, compute `duration` (in minutes), keep only trips lasting between 1 and 60 minutes, and add a `ride_id` column.

In [4]:
def read_dataframe(filename: str) -> pd.DataFrame:
    """Load trip records from a parquet file and derive the scoring columns.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps only
    rows whose duration is between 1 and 60 minutes inclusive, and attaches a
    ``ride_id`` UUID string to every remaining row.

    Args:
        filename: Path or URL of the parquet file. It must contain the
            ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime`` columns.

    Returns:
        The filtered DataFrame with the extra ``duration`` and ``ride_id``
        columns. The original row index of the surviving rows is kept.
    """
    print('reading dataframe ...')

    df = pd.read_parquet(filename)

    # Trip length in minutes.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: assigning a new column to a
    # boolean-mask selection is the chained-assignment pattern that triggers
    # pandas' SettingWithCopyWarning.
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    df = df[in_range].copy()

    df['ride_id'] = generate_uuids(len(df))

    return df

### Prepare dictionaries  
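Build the combined `PU_DO` feature and convert the `PU_DO` and `trip_distance` columns into a list of per-ride feature dictionaries.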

In [5]:
def prepare_dictionaries(df: pd.DataFrame):
    """Turn trip rows into a list of feature dictionaries.

    Note that ``df`` is modified in place: ``PULocationID`` and
    ``DOLocationID`` are cast to strings and a ``PU_DO`` column
    (``"<pickup>_<dropoff>"``) is added.

    Args:
        df: Trip data with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns.

    Returns:
        One dict per row with the keys ``PU_DO`` and ``trip_distance``.
    """
    print('preparing dictionaries ...')

    location_cols = ['PULocationID', 'DOLocationID']
    df[location_cols] = df[location_cols].astype(str)

    pickup = df['PULocationID']
    dropoff = df['DOLocationID']
    df['PU_DO'] = pickup + '_' + dropoff

    feature_cols = ['PU_DO', 'trip_distance']
    return df[feature_cols].to_dict(orient='records')

### Load model  
Load the model logged under the given MLflow run (`runs:/<run_id>/models_mlflow`).

In [6]:
def load_model(run_id):
    """Load the model logged under an MLflow run as a pyfunc model.

    Args:
        run_id: ID of the MLflow run; the model is read from the run's
            ``models_mlflow`` artifact path.

    Returns:
        The object returned by ``mlflow.pyfunc.load_model``.
    """
    print('loading models ...')

    model_uri = 'runs:/{}/models_mlflow'.format(run_id)
    return mlflow.pyfunc.load_model(model_uri)

### Apply model  
Apply the model to the trip data and save the results (actual vs. predicted duration per ride) to a parquet file.

In [7]:
def apply_model(input_file, run_id, output_file):
    """Score one file of trips with a logged MLflow model and save the results.

    Reads and filters the trips, builds the feature dictionaries, loads the
    model for ``run_id``, predicts a duration for every ride and writes one row
    per ride to ``output_file`` as parquet.

    Args:
        input_file: Path or URL of the input parquet file.
        run_id: MLflow run ID of the model; also stored in the output as
            ``model_version``.
        output_file: Destination parquet path. For a local path the parent
            directory is created if it does not exist yet.

    Returns:
        None. The results are written to ``output_file``.
    """
    df = read_dataframe(input_file)
    dicts = prepare_dictionaries(df)

    model = load_model(run_id)

    print('predicting ...')

    # NOTE(review): the encoder is fitted on the batch being scored, so the
    # integer code assigned to each PU_DO value depends on this batch alone.
    # Confirm this matches how the model's training features were encoded.
    le = LabelEncoder()
    df_input = pd.DataFrame(dicts)
    df_input['PU_DO'] = le.fit_transform(df_input['PU_DO'])
    y_pred = model.predict(df_input)

    # PULocationID / DOLocationID are strings here because
    # prepare_dictionaries casts them in place on df.
    df_result = pd.DataFrame()
    df_result['ride_id'] = df['ride_id']
    df_result['lpep_pickup_datetime'] = df['lpep_pickup_datetime']
    df_result['PULocationID'] = df['PULocationID']
    df_result['DOLocationID'] = df['DOLocationID']
    df_result['actual_duration'] = df['duration']
    df_result['predicted_duration'] = y_pred
    df_result['diff'] = df_result['actual_duration'] - df_result['predicted_duration']
    df_result['model_version'] = run_id

    # to_parquet does not create missing directories; make sure the parent
    # folder exists. Skip non-path targets and URLs (e.g. "s3://...").
    if isinstance(output_file, (str, os.PathLike)) and '://' not in str(output_file):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    df_result.to_parquet(output_file, index=False)
    print(f'results saved to {output_file}')

## Run model

In [8]:
# Score the configured month of trips with the model from RUN_ID and write the
# predictions to output_file.
apply_model(input_file=input_file, run_id=RUN_ID, output_file=output_file)

reading dataframe ...
generating uuids ...
preparing dictionaries ...
loading models ...
predicting ...
results saved to output/green/2024-01.parquet


## Results

In [9]:
# Reload the saved predictions from disk to check what was actually written.
# NOTE(review): the rows displayed below all share the same predicted_duration;
# worth checking the feature encoding used in apply_model.
results = pd.read_parquet(output_file)
results.head(5)

Unnamed: 0,ride_id,lpep_pickup_datetime,PULocationID,DOLocationID,actual_duration,predicted_duration,diff,model_version
0,edbedf71-0c7c-47f8-887b-85699f4c5586,2024-01-01 00:46:55,236,239,11.5,37.701515,-26.201515,40da67942f0740d7b27c002eb79f755e
1,b267fc37-481e-4cd2-b20e-05d4c2c355e0,2024-01-01 00:31:42,65,170,20.866667,37.701515,-16.834849,40da67942f0740d7b27c002eb79f755e
2,43ee7846-9e9a-4a2e-8b63-acd9d707d3a0,2024-01-01 00:30:21,74,262,19.033333,37.701515,-18.668182,40da67942f0740d7b27c002eb79f755e
3,e15fbdab-c21a-4f5b-b3a4-35f86ea5adfb,2024-01-01 00:30:20,74,116,11.866667,37.701515,-25.834849,40da67942f0740d7b27c002eb79f755e
4,82fbe8ee-b3d1-4fc3-8100-57972a1bf382,2024-01-01 00:32:38,74,243,10.983333,37.701515,-26.718182,40da67942f0740d7b27c002eb79f755e
