https://github.com/DataTalksClub/mlops-zoomcamp/tree/main/04-deployment

https://www.youtube.com/watch?v=aewOpHSCkqI&list=PL3MmuxUbc_hIUISrluw_A7wDSmfOhErJK

In [1]:
import pandas as pd
import mlflow
import joblib
import uuid

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# run whose logged model artifact is used for scoring
run_id = 'd3b600b0d89e4e95b27b5e333e4d3c01'

# period and taxi type that select both the input and the output file
taxi_type = 'green'
year = 2021
month = 3

# source parquet (remote) and destination parquet (local)
input_file = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet"
output_file = f"./output/{taxi_type}/{year:04d}-{month:02d}.parquet"


def load_dataset(path: str):
    """Read the parquet file at ``path`` and return it as a DataFrame."""
    return pd.read_parquet(path)

def generate_uuids(length):
    """Return a list of ``length`` freshly generated UUID4 strings."""
    # the loop variable is never read; only the number of iterations matters
    return [str(uuid.uuid4()) for _ in range(length)]

def prepare_features(df: pd.DataFrame):
    """Derive the model features and keep only trips of 1 to 60 minutes.

    Adds two columns to a copy of ``df``:

    * ``duration`` - minutes between ``lpep_pickup_datetime`` and
      ``lpep_dropoff_datetime``;
    * ``PU_DO`` - ``PULocationID`` and ``DOLocationID`` joined with ``_``.

    ``PULocationID`` and ``DOLocationID`` are converted to strings in the
    returned frame.

    Parameters:
        df: trips with the columns ``lpep_pickup_datetime``,
            ``lpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.
            It is not modified.

    Returns:
        A new DataFrame holding the rows whose ``duration`` lies in the
        inclusive range [1, 60]; the original index labels are preserved.
    """
    # Work on a copy: the caller's frame must not silently gain columns or
    # have its location ids turned into strings as a side effect.
    df = df.copy()

    # duration
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # pickup and dropoff location
    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)
    df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]

    # every filter must be at the end of the function
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    return df[in_range]

def load_model(run_id, artifact_root_dir='../web-service-mlflow', experiment_id='1'):
    """Load the model logged under ``run_id`` as an MLflow pyfunc model.

    The model is read from
    ``{artifact_root_dir}/mlruns/{experiment_id}/{run_id}/artifacts/model``.

    Parameters:
        run_id: id of the run whose model should be loaded.
        artifact_root_dir: directory that contains the ``mlruns`` folder.
            Defaults to the location this job was originally written for.
        experiment_id: name of the experiment sub-folder inside ``mlruns``.

    Returns:
        Whatever ``mlflow.pyfunc.load_model`` returns for that location.
    """
    # select the model to use by choosing run_id
    logged_model = f'{artifact_root_dir}/mlruns/{experiment_id}/{run_id}/artifacts/model'

    # load model as a PyFuncModel and predict the dicts
    return mlflow.pyfunc.load_model(logged_model)

def main(input_file, run_id, output_file):
    """Score the trips in ``input_file`` with a logged model and save the result.

    Steps: load the parquet dataset, attach an artificial ``ride_id`` to every
    row, build the features, predict the duration with the model selected by
    ``run_id``, then write one row per scored trip to ``output_file``.

    Parameters:
        input_file: location of the parquet file to score.
        run_id: id of the run whose model is loaded; it is also stored in the
            ``model_version`` column of the result.
        output_file: destination of the result parquet file. When it is a
            local path, missing parent directories are created.

    Returns:
        None. The result is written to ``output_file`` and a small random
        sample of it is displayed.
    """
    import os  # only needed here, to prepare the output directory

    # load the dataset
    df = load_dataset(input_file)

    # generate artificial ride_id
    df['ride_id'] = generate_uuids(length=len(df))

    # preprocess the dataframe
    features = ['PU_DO', 'trip_distance']
    df = prepare_features(df)
    dicts = df[features].to_dict(orient='records')

    # predict
    model = load_model(run_id)
    y_pred = model.predict(dicts)

    # create the result for scoring job
    df_result = df[
        ['ride_id', 'lpep_pickup_datetime', 'PULocationID', 'DOLocationID']
    ].copy()
    df_result['actual_duration'] = df['duration']
    df_result['predicted_duration'] = y_pred
    df_result['diff'] = df_result['actual_duration'] - df_result['predicted_duration']
    df_result['model_version'] = run_id

    # Writing into a directory that does not exist yet fails, so create it
    # first. Only done for local paths; URLs such as "s3://..." are left alone.
    if isinstance(output_file, (str, os.PathLike)) and '://' not in str(output_file):
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # save the result as parquet file
    df_result.to_parquet(output_file, index=False)

    # sample() raises when asked for more rows than exist, so cap the size.
    # NOTE(review): `display` is not defined in this file; presumably the
    # builtin provided by the IPython/Jupyter kernel - confirm before running
    # this as a plain script.
    display(df_result.sample(min(5, len(df_result))))
    
# run the scoring job for the input/output files and run_id configured above
main(input_file=input_file, run_id=run_id, output_file=output_file)

Unnamed: 0,ride_id,lpep_pickup_datetime,PULocationID,DOLocationID,actual_duration,predicted_duration,diff,model_version
70207,c8161954-4098-451f-a9e0-628f6cb82cb7,2021-03-21 16:30:00,218,218,5.0,5.566185,-0.566185,d3b600b0d89e4e95b27b5e333e4d3c01
25767,5274d457-621c-43f6-b856-4c10f53a0f8a,2021-03-19 11:35:39,196,100,22.416667,24.862382,-2.445716,d3b600b0d89e4e95b27b5e333e4d3c01
72681,0c9dedc4-9cc7-4222-8976-56041fa9da87,2021-03-23 12:43:00,130,15,20.0,28.461629,-8.461629,d3b600b0d89e4e95b27b5e333e4d3c01
30505,edc239fe-44f7-4fdb-a65a-b21602798c58,2021-03-22 19:12:34,97,239,22.333333,24.111712,-1.778378,d3b600b0d89e4e95b27b5e333e4d3c01
19870,42b69f92-0b40-46d4-9a6e-5e5c44e6ef67,2021-03-15 09:45:21,74,75,2.533333,6.814733,-4.2814,d3b600b0d89e4e95b27b5e333e4d3c01
