In [1]:
import pickle

import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [2]:
from sklearn.pipeline import make_pipeline

In [3]:
import mlflow

logged_model = '/home/kantundpeterpan/projects/zoomcamp/zcmlops/mlartifacts/2/models/m-dbbe1bfd8e8c44e88f459ca9df82f691/artifacts'
# Load model as a PyFuncModel.
model = mlflow.pyfunc.load_model(logged_model)

In [4]:
model

mlflow.pyfunc.loaded_model:
  artifact_path: mlflow-artifacts:/2/models/m-dbbe1bfd8e8c44e88f459ca9df82f691/artifacts
  flavor: mlflow.sklearn
  run_id: ecf5c933c8d04c0cb32ac726b561ed13

In [5]:
def read_dataframe(filename: str):
    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df


def prepare_dictionaries(df: pd.DataFrame):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    dicts = df[categorical + numerical].to_dict(orient='records')
    return dicts

In [6]:
df = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')

dicts = prepare_dictionaries(df)
y_pred = model.predict(dicts)

In [7]:
df_results = pd.DataFrame()

In [8]:
import uuid

In [9]:
df_results['ride_id'] = [str(uuid.uuid4()) for i in range(df.shape[0])]

In [11]:
for col in ('lpep_pickup_datetime', 'lpep_dropoff_datetime',
            'PULocationID', 'DOLocationID'):
    df_results[col] = df[col]

In [12]:
df_results['actual_duration'] = df['duration']
df_results['predicted_duration'] = y_pred

In [13]:
df['diff'] = df_results['actual_duration'] - df_results['predicted_duration']

In [14]:
df_results

Unnamed: 0,ride_id,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,actual_duration,predicted_duration
0,cdd5e36f-77dd-4533-9c5f-e5f285ac2b38,2021-01-01 00:15:56,2021-01-01 00:19:52,43,151,3.933333,6.862711
1,e3c6c080-f71c-4100-bfb9-c2dd30a79140,2021-01-01 00:25:59,2021-01-01 00:34:44,166,239,8.750000,13.368721
2,7c752bf7-b518-45db-b634-2e1ca66493e9,2021-01-01 00:45:57,2021-01-01 00:51:55,41,42,5.966667,6.360871
3,0656c0b7-e894-435f-8da9-051d8c6e2327,2020-12-31 23:57:51,2021-01-01 00:04:56,168,75,7.083333,11.824423
4,0456522a-e834-4787-84b0-90cc8c512e2a,NaT,NaT,,,,3.389290
...,...,...,...,...,...,...,...
73903,2e108d0a-6fcf-4e4f-977d-8dd1279d3c98,2021-01-28 20:29:00,2021-01-28 20:42:00,259,174,13.000000,41.526829
73904,589b4728-a313-4905-900a-eab0d4db70e1,2021-01-28 20:05:00,2021-01-28 20:34:00,42,3,29.000000,43.858974
73905,aff85fb3-5092-4fe7-bc03-8fb1646b8c6a,2021-01-28 20:47:00,2021-01-28 21:20:00,66,76,33.000000,14.436509
73906,f64f316e-b535-4a39-8b1b-7a1dcb3fdb05,2021-01-28 20:11:00,2021-01-28 20:25:00,181,61,14.000000,37.092622
