In [1]:
# Print the version of the `python3` executable found on the shell PATH.
# NOTE(review): this is a shell call, so it may not be the interpreter the
# kernel itself is running on — confirm if the exact version matters.
!python3 -V

Python 3.11.2


In [2]:
import pandas as pd

In [3]:
import os
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import mean_squared_error

In [6]:
import mlflow

# Store run metadata in a local SQLite database next to the notebook.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")

# Select the experiment that every run started below is recorded under;
# the returned Experiment object is this cell's displayed value.
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

<Experiment: artifact_location='/Users/Jake/Olvin/dev/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1682585772380, experiment_id='1', last_update_time=1682585772380, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [7]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    A ``duration`` column (trip length in minutes) is derived from the
    ``lpep_pickup_datetime`` / ``lpep_dropoff_datetime`` columns, trips
    shorter than 1 minute or longer than 60 minutes are dropped, and the
    ``PULocationID`` / ``DOLocationID`` columns are cast to strings.

    Args:
        filename: Location of the parquet file, passed straight to
            ``pd.read_parquet``.

    Returns:
        A new ``DataFrame`` holding only the retained trips. It is an
        independent copy, so callers can add columns to it safely.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Keep trips of 1..60 minutes (both ends inclusive). The explicit copy
    # detaches the result from the unfiltered frame, so the column assignment
    # below cannot trigger pandas' SettingWithCopyWarning.
    df = df[df['duration'].between(1, 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
# Load the 2022-01 file as the training split and the 2022-02 file as the
# validation split, both through the same preprocessing function.
df_train, df_val = (
    read_dataframe(path)
    for path in (
        './data/green_tripdata_2022-01.parquet',
        './data/green_tripdata_2022-02.parquet',
    )
)

In [9]:
# Build one combined "<pickup>_<dropoff>" feature per trip, applying the
# exact same transformation to the training and validation frames.
for split in (df_train, df_val):
    split['PU_DO'] = split['PULocationID'] + '_' + split['DOLocationID']

In [10]:
# Model inputs: the combined PU_DO pair is used as the only categorical
# feature (instead of the separate PULocationID / DOLocationID columns),
# together with the numeric trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']
target = 'duration'
feature_columns = categorical + numerical

# One {column: value} dict per trip, which is the input DictVectorizer expects.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse the fitted
# vocabulary to encode the validation rows.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

y_train = df_train[target].values
y_val = df_val[target].values

In [11]:
# Baseline: ordinary least squares fitted on the training matrix.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_val)

# Validation RMSE; left as the last expression so the cell displays it.
mean_squared_error(y_true=y_val, y_pred=y_pred, squared=False)

6.928931233573194

In [12]:
# Train a Lasso model and record inputs, result and pickled model in MLflow.
with mlflow.start_run():
    mlflow.set_tag("developer", "Jake")

    # Inputs of the run: data locations and the regularisation strength.
    mlflow.log_param("train-data-path", "./data/green_tripdata_2022-01.parquet")
    mlflow.log_param("val-data-path", "./data/green_tripdata_2022-02.parquet")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)

    rmse = mean_squared_error(y_val, y_pred, squared=False)
    # RMSE is a result of the run, not an input, so it is logged as a metric
    # (as the XGBoost objective in this notebook does). Logged as a param it
    # would be stored as a string and could not be compared across runs.
    mlflow.log_metric("rmse", rmse)

    # Make sure the output directory exists so the cell works on a fresh clone.
    os.makedirs("models", exist_ok=True)
    # The vectorizer is pickled together with the model: both are needed to
    # score raw records later.
    with open('models/lin_reg.bin', 'wb') as f_out:
        pickle.dump((dv, lr), f_out)

    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")

In [13]:
import xgboost as xgb

In [14]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [15]:
# Wrap the feature matrices and targets in DMatrix objects, the data
# container that xgb.train consumes.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [16]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and report its loss.

    Every call opens its own MLflow run, tags it ``model=xgboost``, logs the
    sampled ``params`` and the resulting validation RMSE.

    Args:
        params: Dictionary of XGBoost training parameters for this trial.

    Returns:
        ``{"loss": <validation RMSE>, "status": STATUS_OK}``, the result
        format handed back to hyperopt.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Up to 1000 rounds, stopping once the validation score has not
        # improved for 50 consecutive rounds.
        watchlist = [(valid, "validation")]
        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=watchlist,
            early_stopping_rounds=50,
        )

        validation_rmse = mean_squared_error(
            y_val, model.predict(valid), squared=False
        )
        mlflow.log_metric("rmse", validation_rmse)

        return {"loss": validation_rmse, "status": STATUS_OK}

In [17]:
# Hyperparameter search space sampled by hyperopt for the XGBoost objective.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the
# bounds below are natural-log values of the actual parameter range.
search_space = {
    # Integer tree depth between 4 and 100 (quniform yields floats, hence the cast).
    "max_depth": scope.int(hp.quniform("max_depth", 4, 100, 1)),
    # exp(-3) ~ 0.05 up to exp(0) = 1
    "learning_rate": hp.loguniform("learning_rate", -3, 0),
    # exp(-5) ~ 0.0067 up to exp(-1) ~ 0.37
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),
    # exp(-6) ~ 0.0025 up to exp(-1) ~ 0.37
    "reg_lambda": hp.loguniform("reg_lambda", -6, -1),
    # exp(-1) ~ 0.37 up to exp(3) ~ 20.1
    "min_child_weight": hp.loguniform("min_child_weight", -1, 3),
    # "reg:linear" is the deprecated alias of this squared-error objective.
    "objective": "reg:squarederror",
    "seed": 42,
}

In [18]:
# Run 50 TPE-guided trials of the objective over the search space; each
# trial is recorded as its own MLflow run by `objective`.
trials = Trials()
best_result = fmin(
    objective,
    search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

[0]	validation-rmse:7.17429                           
[1]	validation-rmse:6.16635                           
[2]	validation-rmse:6.04943                           
[3]	validation-rmse:6.03067                           
[4]	validation-rmse:6.01761                           
[5]	validation-rmse:6.00955                           
[6]	validation-rmse:6.00376                           
[7]	validation-rmse:5.99584                           
[8]	validation-rmse:5.98991                           
[9]	validation-rmse:5.98391                           
[10]	validation-rmse:5.97887                          
[11]	validation-rmse:5.97120                          
[12]	validation-rmse:5.95619                          
[13]	validation-rmse:5.95138                          
[14]	validation-rmse:5.94941                          
[15]	validation-rmse:5.94344                          
[16]	validation-rmse:5.94011                          
[17]	validation-rmse:5.93562                          
[18]	valid

KeyboardInterrupt: 

In [None]:
# Retrain XGBoost with one fixed set of hyperparameters and let MLflow
# autologging record the run.
# NOTE(review): the values are presumably copied from a tuning run — confirm.
# They are written with their native numeric types rather than as strings,
# and use "reg:squarederror" instead of its deprecated alias "reg:linear".
params = {
    "learning_rate": 0.15226675007511403,
    "max_depth": 9,
    "min_child_weight": 2.9648379129685623,
    "objective": "reg:squarederror",
    "reg_alpha": 0.023421186858445464,
    "reg_lambda": 0.0027874361952537935,
    "seed": 42,
}

# From here on, xgb.train calls are tracked by MLflow automatically.
mlflow.xgboost.autolog()

booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, "validation")],
    early_stopping_rounds=50,
)

2023/04/27 12:01:11 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '05aaa2ca5dad46d38a4c25547aaa3235', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:15.25584
[1]	validation-rmse:13.44500
[2]	validation-rmse:11.95637
[3]	validation-rmse:10.73915
[4]	validation-rmse:9.75652
[5]	validation-rmse:8.96536
[6]	validation-rmse:8.33520
[7]	validation-rmse:7.83517
[8]	validation-rmse:7.44452
[9]	validation-rmse:7.13943
[10]	validation-rmse:6.89981
[11]	validation-rmse:6.71412
[12]	validation-rmse:6.56828
[13]	validation-rmse:6.45431
[14]	validation-rmse:6.36505
[15]	validation-rmse:6.29673
[16]	validation-rmse:6.24050
[17]	validation-rmse:6.19700
[18]	validation-rmse:6.16006
[19]	validation-rmse:6.13061
[20]	validation-rmse:6.10764
[21]	validation-rmse:6.08844
[22]	validation-rmse:6.07373
[23]	validation-rmse:6.06043
[24]	validation-rmse:6.05023
[25]	validation-rmse:6.04064
[26]	validation-rmse:6.03502
[27]	validation-rmse:6.02858
[28]	validation-rmse:6.02284
[29]	validation-rmse:6.01875
[30]	validation-rmse:6.01554
[31]	validation-rmse:6.01154
[32]	validation-rmse:6.00822
[33]	validation-rmse:6.00591
[34]	validation-rmse



In [21]:
# Autologging is switched on earlier in this notebook; turn it off so this
# run contains only what is logged explicitly below (otherwise the training
# call would also be captured by autologging and recorded a second time).
mlflow.xgboost.autolog(disable=True)

# Train the final XGBoost model and log params, metric, preprocessor and
# model by hand.
with mlflow.start_run():

    # Hyperparameters use native numeric types rather than strings, and
    # "reg:squarederror" instead of its deprecated alias "reg:linear".
    params = {
        "learning_rate": 0.15226675007511403,
        "max_depth": 9,
        "min_child_weight": 2.9648379129685623,
        "objective": "reg:squarederror",
        "reg_alpha": 0.023421186858445464,
        "reg_lambda": 0.0027874361952537935,
        "seed": 42,
    }

    mlflow.log_params(params)

    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, "validation")],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)

    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    # Make sure the output directory exists so the cell works on a fresh clone.
    os.makedirs("models", exist_ok=True)
    # The fitted DictVectorizer is stored next to the model because raw
    # records must be encoded with it before they can be scored.
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	validation-rmse:15.25584
[1]	validation-rmse:13.44500
[2]	validation-rmse:11.95637
[3]	validation-rmse:10.73915
[4]	validation-rmse:9.75652
[5]	validation-rmse:8.96536
[6]	validation-rmse:8.33520
[7]	validation-rmse:7.83517
[8]	validation-rmse:7.44452
[9]	validation-rmse:7.13943
[10]	validation-rmse:6.89981
[11]	validation-rmse:6.71412
[12]	validation-rmse:6.56828
[13]	validation-rmse:6.45431
[14]	validation-rmse:6.36505
[15]	validation-rmse:6.29673
[16]	validation-rmse:6.24050
[17]	validation-rmse:6.19700
[18]	validation-rmse:6.16006
[19]	validation-rmse:6.13061
[20]	validation-rmse:6.10764
[21]	validation-rmse:6.08844
[22]	validation-rmse:6.07373
[23]	validation-rmse:6.06043
[24]	validation-rmse:6.05023
[25]	validation-rmse:6.04064
[26]	validation-rmse:6.03502
[27]	validation-rmse:6.02858
[28]	validation-rmse:6.02284
[29]	validation-rmse:6.01875
[30]	validation-rmse:6.01554
[31]	validation-rmse:6.01154
[32]	validation-rmse:6.00822
[33]	validation-rmse:6.00591
[34]	validation-rmse

In [22]:
# URI of a model stored under the "models_mlflow" artifact path of one
# specific run. NOTE(review): the run ID is hardcoded — it presumably refers
# to the manual-logging run in this notebook; update it after re-running.
logged_model = 'runs:/8cd4bd90c153414996e8f26aea4a875d/models_mlflow'

# Load it through MLflow's generic python_function (pyfunc) interface.
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)



In [24]:
# Load the same logged model again, this time through MLflow's XGBoost
# flavour so that the XGBoost object itself is returned.
xgboost_model = mlflow.xgboost.load_model(model_uri=logged_model)



In [25]:
# Score the validation DMatrix with the reloaded model.
y_pred = xgboost_model.predict(data=valid)

In [26]:
# Show the first ten predictions from the reloaded model as a quick check.
y_pred[:10]

array([ 6.529334 ,  4.1258154, 25.884104 , 36.3634   , 27.879772 ,
        9.5935135, 18.0543   ,  4.179074 , 15.718515 ,  5.6397686],
      dtype=float32)