## Batch Deployment

https://github.com/DataTalksClub/mlops-zoomcamp/tree/main/04-deployment

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import inflection
import joblib

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Module-level seed constant. NOTE(review): nothing in the visible code reads
# SEED (LinearRegression below is fitted without it) -- confirm whether it is
# still needed.
SEED = 0
def major_library_version():
    """Print the installed pandas, numpy and sklearn versions, one per line."""
    import pandas
    import numpy
    import sklearn

    # Collect the lines first, then emit them in a single print call.
    report = [
        f"pandas: {pandas.__version__}",
        f"numpy: {numpy.__version__}",
        f"sklearn: {sklearn.__version__}",
    ]
    print("\n".join(report))
    
def load_dataset(path):
    """Read the parquet file at ``path`` and return it as a DataFrame."""
    return pd.read_parquet(path)

def prepare_features(df, min_duration=1, max_duration=60):
    """Add the model features to a trip frame and filter by trip duration.

    Two columns are derived:

    * ``duration``: ``lpep_dropoff_datetime - lpep_pickup_datetime``
      expressed in minutes.
    * ``PU_DO``: ``PULocationID`` and ``DOLocationID`` joined as
      ``"<PULocationID>_<DOLocationID>"``.

    ``PULocationID`` and ``DOLocationID`` are converted to strings in the
    returned frame so they are treated as categorical values.

    The input frame is not modified; all changes are made on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.
    min_duration, max_duration : number, optional
        Inclusive bounds (in minutes) on ``duration``; rows outside the
        range are dropped. Defaults are 1 and 60.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns, restricted to rows whose
        duration lies in ``[min_duration, max_duration]``.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # duration (minutes)
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # pickup and dropoff location
    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)
    df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]

    # every filter must be at the end of the function
    in_range = (df['duration'] >= min_duration) & (df['duration'] <= max_duration)
    return df[in_range]
     


def train_the_model(train, features=None, target='duration'):
    """Fit a DictVectorizer and a LinearRegression on the training frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data containing the feature columns and ``target``.
    features : sequence of str, optional
        Column names used as model inputs. Defaults to
        ``["PU_DO", "trip_distance"]``.
    target : str, optional
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(dv, model, rmse)``: the fitted DictVectorizer, the fitted
        LinearRegression, and the RMSE measured on the training data itself.
    """
    # `is None` rather than `== None`: an array-like `features` would turn
    # `==` into an elementwise comparison and make the `if` ambiguous.
    if features is None:
        features = ["PU_DO", "trip_distance"]

    # preprocess the one-hot encode outside pipeline in order
    # to take advantage of dictionary input later on web service
    dv = DictVectorizer()

    # parse to dictionaries for using DictVectorizer
    train_dicts = train[list(features)].to_dict(orient="records")
    X_train = dv.fit_transform(train_dicts)
    y_train = train[target]

    # model
    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_train)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in newer
    # scikit-learn releases, while this form works on old and new versions.
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))

    return dv, model, rmse

def run_the_model(val, dv, model, features=None, target='duration'):
    """Score a fitted vectorizer/model pair on a held-out frame.

    Parameters
    ----------
    val : pandas.DataFrame
        Data to score; must contain the feature columns and ``target``.
    dv : fitted vectorizer
        Object whose ``transform`` turns a list of record dicts into the
        model's input matrix.
    model : fitted estimator
        Object whose ``predict`` is applied to the transformed records.
    features : sequence of str, optional
        Column names used as model inputs. Defaults to
        ``["PU_DO", "trip_distance"]``.
    target : str, optional
        Name of the column holding the true values.

    Returns
    -------
    tuple
        ``(y_pred, rmse)``: the predictions for ``val`` and their RMSE
        against ``val[target]``.
    """
    # `is None` rather than `== None`: an array-like `features` would turn
    # `==` into an elementwise comparison and make the `if` ambiguous.
    if features is None:
        features = ["PU_DO", "trip_distance"]

    val_dicts = val[list(features)].to_dict(orient="records")
    X_val = dv.transform(val_dicts)
    y_val = val[target]

    y_pred = model.predict(X_val)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in newer
    # scikit-learn releases, while this form works on old and new versions.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    return y_pred, rmse

def save_the_model(model, path="./models/linear_regression.bin"):
    """Serialize ``model`` to ``path`` with joblib.

    The parent directory of ``path`` is created when it does not exist, so
    saving into a fresh ``./models`` folder does not fail.

    Parameters
    ----------
    model : object
        Any object joblib can dump (the caller passes both the vectorizer
        and the regressor through this function).
    path : str or os.PathLike, optional
        Destination file; an existing file is overwritten.
    """
    import os

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'wb') as file_out:
        joblib.dump(model, file_out)

def load_the_model(path="./models/linear_regression.bin"):
    """Read back an object previously written to ``path`` with joblib.

    NOTE: joblib deserialization can execute arbitrary code, so only load
    files from a trusted source.
    """
    with open(path, mode='rb') as stream:
        restored = joblib.load(stream)
    return restored
    
def main():
    """
    Train on the 2021-01 trip file, persist the fitted artifacts, then
    reload them and evaluate on the 2021-02 trip file.

    Prints the training RMSE, the validation RMSE and the library versions.
    """
    from datetime import datetime

    # datasets: 2021-01 for training, 2021-02 for validation
    train = prepare_features(
        load_dataset("../dataset/green_tripdata_2021-01.parquet")
    )
    val = prepare_features(
        load_dataset("../dataset/green_tripdata_2021-02.parquet")
    )

    # fit and report the in-sample error
    dv, model, rmse = train_the_model(train, features=None, target='duration')
    print(f"RMSE of training data: {rmse:.3f}")

    # artifact file names are stamped with today's date
    saved_time = datetime.today().strftime("%Y-%m-%d")
    dv_path = f"./models/{saved_time}_preprocessor.b"
    model_path = f"./models/{saved_time}_lin_reg.bin"

    # persist the vectorizer first, then the regressor
    for artifact, location in ((dv, dv_path), (model, model_path)):
        save_the_model(artifact, path=location)

    # round-trip through disk so validation uses the saved artifacts
    dv = load_the_model(path=dv_path)
    model = load_the_model(path=model_path)
    _, rmse = run_the_model(val, dv, model, features=None, target='duration')
    print(f"RMSE of validation data: {rmse:.3f}")

    # library version
    major_library_version()

# Run the pipeline only when executed directly (script or notebook cell),
# so importing this module does not trigger training as a side effect.
if __name__ == "__main__":
    main()

RMSE of training data: 5.700
RMSE of validation data: 7.759
pandas: 1.4.2
numpy: 1.22.3
sklearn: 1.1.0
