In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

In [6]:
import mlflow

# Tracking server and experiment used by every run logged from this notebook.
MLFLOW_TRACKING_URI = "http://localhost:5000"
MLFLOW_EXPERIMENT_NAME = "nyc-taxi-experiment-1"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)


<Experiment: artifact_location='/workspaces/mlops-zoomcamp/03-orchestration/mlruns/1', creation_time=1750798590842, experiment_id='1', last_update_time=1750798590842, lifecycle_stage='active', name='nyc-taxi-experiment-1', tags={}>

In [7]:
def read_dataframe(filename, nrows=100_000):
    """Load taxi trip records and prepare them for duration modelling.

    Reads only the pickup/dropoff timestamps, the pickup/dropoff location IDs
    and the trip distance from a parquet file, optionally down-samples the
    rows, derives the trip duration in minutes, keeps trips lasting between
    1 and 60 minutes (inclusive) and casts the location IDs to strings.

    Parameters
    ----------
    filename : str
        Location of the parquet file; passed unchanged to ``pd.read_parquet``
        (the notebook passes URLs).
    nrows : int or None, default 100_000
        Maximum number of rows to sample (with ``random_state=42``) before
        filtering. When the file holds fewer rows than requested, all rows
        are used instead of raising. ``None`` skips sampling entirely.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with an added ``duration`` column (minutes) and
        ``PULocationID`` / ``DOLocationID`` converted to ``str``.
    """
    # Load only the needed columns to limit memory use.
    cols = ["tpep_pickup_datetime", "tpep_dropoff_datetime", "PULocationID", "DOLocationID", "trip_distance"]
    df = pd.read_parquet(filename, columns=cols)

    # Sample a subset of rows; cap at the population size because
    # DataFrame.sample raises when n exceeds it (replace=False).
    if nrows is not None:
        df = df.sample(n=min(nrows, len(df)), random_state=42)

    # Trip duration in minutes.
    df["duration"] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60

    # Copy after filtering so the column assignment below writes to an
    # independent frame rather than a slice (avoids SettingWithCopyWarning).
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df


In [8]:
# Source files: the 2025-01 file feeds training, the 2025-02 file feeds validation.
TRAIN_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"
VAL_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-02.parquet"

df_train = read_dataframe(TRAIN_DATA_URL)
df_val = read_dataframe(VAL_DATA_URL)

In [10]:
# Feature groups fed to the vectorizer.
categorical = ["PULocationID", "DOLocationID"]
numerical = ["trip_distance"]
feature_columns = categorical + numerical

# One dict per row, for both splits.
train_dicts = df_train[feature_columns].to_dict(orient="records")
val_dicts = df_val[feature_columns].to_dict(orient="records")

# Fit the vectorizer on the training split only, then reuse it for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)


In [11]:
# Regression target: the duration column, extracted as arrays for both splits.
target = "duration"
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [15]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [17]:
# Turn MLflow's XGBoost autologging off (disable=True); the training cell
# below logs its params, metric, preprocessor and model explicitly.
mlflow.xgboost.autolog(disable=True)

In [20]:
from pathlib import Path

In [21]:
# Local directory for saved artifacts; the training cell writes
# models/preprocessor.b into it.
models_folder = Path("models")
# exist_ok=True: do not fail when the directory is already present.
models_folder.mkdir(exist_ok=True)

In [22]:
# Train one XGBoost model with fixed hyperparameters and log the parameters,
# the validation RMSE, the fitted DictVectorizer and the booster to MLflow.
with mlflow.start_run():

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # Fixed hyperparameters (presumably the winner of an earlier search —
    # TODO confirm provenance).
    best_params = {
        "learning_rate": 0.21791931703206877,
        "max_depth": 11,
        "min_child_weight": 17.72218118291325,
        # "reg:linear" is a deprecated alias; "reg:squarederror" is the
        # current name for the same squared-error objective.
        "objective": "reg:squarederror",
        "reg_alpha": 0.35833596417752367,
        "reg_lambda": 0.09446440043442567,
        "seed": 42,
    }

    mlflow.log_params(best_params)

    # NOTE(review): with num_boost_round=30 a patience of 50 rounds can never
    # be exhausted, so early stopping is effectively inactive here. Raise
    # num_boost_round (or lower early_stopping_rounds) if it is meant to apply.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=30,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    # root_mean_squared_error already returns the RMSE; taking another
    # square root (as the earlier version did) logged sqrt(RMSE) instead.
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted vectorizer next to the model so the same feature
    # encoding can be reapplied when the model is used.
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	validation-rmse:8.27394
[1]	validation-rmse:7.24974
[2]	validation-rmse:6.54049
[3]	validation-rmse:6.05847
[4]	validation-rmse:5.73505


  self.starting_round = model.num_boosted_rounds()


[5]	validation-rmse:5.52083
[6]	validation-rmse:5.38063
[7]	validation-rmse:5.28875
[8]	validation-rmse:5.22851
[9]	validation-rmse:5.18532
[10]	validation-rmse:5.15658
[11]	validation-rmse:5.13639
[12]	validation-rmse:5.12152
[13]	validation-rmse:5.10902
[14]	validation-rmse:5.09809
[15]	validation-rmse:5.08991
[16]	validation-rmse:5.08398
[17]	validation-rmse:5.07992
[18]	validation-rmse:5.07651
[19]	validation-rmse:5.07014
[20]	validation-rmse:5.06835
[21]	validation-rmse:5.06401
[22]	validation-rmse:5.05858
[23]	validation-rmse:5.05612
[24]	validation-rmse:5.05126
[25]	validation-rmse:5.04989
[26]	validation-rmse:5.04598
[27]	validation-rmse:5.04454
[28]	validation-rmse:5.04204
[29]	validation-rmse:5.03840


  xgb_model.save_model(model_data_path)


🏃 View run honorable-horse-764 at: http://localhost:5000/#/experiments/1/runs/54b8a9b42d874f3698c7b5fd2c9ecc11
🧪 View experiment at: http://localhost:5000/#/experiments/1
