https://github.com/DataTalksClub/mlops-zoomcamp/tree/main/04-deployment

https://www.youtube.com/watch?v=aewOpHSCkqI&list=PL3MmuxUbc_hIUISrluw_A7wDSmfOhErJK

In [13]:
import pandas as pd
import joblib

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import mlflow

# Point MLflow at the tracking URI below, then select the experiment under
# which the runs started later in this file (see main()) are recorded.
# The URI must be set first so the experiment is resolved against it.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("green-taxi-duration")

def load_dataset(path: str):
    """Read the parquet file at ``path`` and return its contents as a DataFrame."""
    return pd.read_parquet(path)

def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build model features from raw green-taxi trip records.

    Adds two columns:

    * ``duration`` - trip length in minutes, computed from
      ``lpep_dropoff_datetime - lpep_pickup_datetime``.
    * ``PU_DO`` - ``"<PULocationID>_<DOLocationID>"`` as a single string.

    ``PULocationID`` and ``DOLocationID`` are converted to strings in the
    result. Only trips lasting between 1 and 60 minutes (inclusive) are kept.

    The input frame is NOT modified; a new DataFrame is returned.

    Args:
        df: Frame containing ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
            ``PULocationID`` and ``DOLocationID`` columns.

    Returns:
        A new DataFrame with the added columns and out-of-range trips removed.
        The original index labels of the surviving rows are preserved.
    """
    # Work on a copy so the caller's frame is not silently mutated.
    df = df.copy()

    # duration (minutes)
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # pickup and dropoff location
    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)
    df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]

    # every filter must be at the end of the function, after all columns exist
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    # .copy() makes the result an independent frame rather than a slice.
    return df.loc[in_range].copy()
    

def main():
    """Train a trip-duration model and log it to MLflow.

    Reads the 2021-01 green-taxi parquet file for training and the 2021-02
    file for validation, derives features with ``prepare_features``, fits a
    ``DictVectorizer`` + ``RandomForestRegressor`` pipeline, and records the
    hyper-parameters, the validation RMSE and the fitted pipeline (under the
    ``model`` artifact path) in a single MLflow run. Also prints the
    parameters and RMSE.
    """
    # load the dataset
    train_path = "~/ml-ops/dataset/green_tripdata_2021-01.parquet"
    val_path = "~/ml-ops/dataset/green_tripdata_2021-02.parquet"
    train = load_dataset(train_path)
    val = load_dataset(val_path)

    # preprocess features
    df_train = prepare_features(train)
    df_val = prepare_features(val)

    # one dict per row: this is the input format DictVectorizer consumes
    features = ['PU_DO', 'trip_distance']
    dict_train = df_train[features].to_dict(orient='records')
    dict_val = df_val[features].to_dict(orient='records')

    # target
    target = 'duration'
    y_train = df_train[target]
    y_val = df_val[target]

    # tracking and training
    with mlflow.start_run():
        params = dict(max_depth=20, n_estimators=100, min_samples_leaf=10, random_state=0)
        mlflow.log_params(params)

        pipeline = make_pipeline(
            DictVectorizer(),
            RandomForestRegressor(**params, n_jobs=-1)
        )

        pipeline.fit(dict_train, y_train)
        y_pred = pipeline.predict(dict_val)

        # RMSE = sqrt(MSE). The `squared=False` keyword is deprecated and has
        # been removed from newer scikit-learn releases, so take the root
        # explicitly. Arguments follow the documented (y_true, y_pred) order.
        rmse = float(mean_squared_error(y_val, y_pred) ** 0.5)
        print(params, rmse)
        mlflow.log_metric('rmse', rmse)

        # Logged under "model" so the pickled pipeline can later be fetched
        # from the run's artifacts.
        mlflow.sklearn.log_model(pipeline, artifact_path="model")

from tqdm import tqdm
#tqdm(main())

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 6.7558229919200725




In [18]:
# to add to predict.py
# Download the pickled model file stored under the "model" artifact directory
# of one specific MLflow run, so it can be loaded without retraining.
from mlflow.tracking import MlflowClient

# Same tracking URI value that is passed to mlflow.set_tracking_uri above.
MLFLOW_TRACKING_URI = 'http://127.0.0.1:5000'
# Run whose artifacts are fetched; presumably a run produced by main() -- confirm.
RUN_ID = 'd3b600b0d89e4e95b27b5e333e4d3c01'

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
# The call yields a local filesystem path to the requested artifact
# (see the cell output below).
path = client.download_artifacts(run_id=RUN_ID, path="model/model.pkl")
path  # bare expression: displayed as the notebook cell's output

'/home/irfanfadh43/ml-ops/04-deployment/web-service-mlflow/mlruns/1/d3b600b0d89e4e95b27b5e333e4d3c01/artifacts/model/model.pkl'

In [19]:
# Resolve the local path of the run's pickled pipeline and deserialize it.
# NOTE(review): joblib/pickle loading can execute arbitrary code -- only load
# artifacts from a tracking server you trust.
path = client.download_artifacts(run_id=RUN_ID, path="model/model.pkl")
with open(path, 'rb') as f_in:  # binary read handle
    model = joblib.load(f_in)

# bare expression: displays the loaded pipeline as the notebook cell's output
model