In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and
# SettingWithCopy warnings raised by the libraries used below; consider
# narrowing it to specific categories/modules once the notebook is stable.
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd

In [4]:
from sklearn.feature_extraction import DictVectorizer

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [6]:
import numpy as np

In [7]:
from sklearn.linear_model import Lasso

In [8]:
from sklearn.linear_model import Ridge

In [9]:
import pickle

In [10]:
import mlflow
# Tracking configuration: every run started in this notebook is recorded on
# the tracking server at MLFLOW_TRACKING_URI, under EXPERIMENT_NAME.
MLFLOW_TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Left as the final bare expression so the cell displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

2025/09/03 18:00:33 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1756922433871, experiment_id='1', last_update_time=1756922433871, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [13]:
def read_data(filename):
    """Load a trip-record parquet file and derive the modelling columns.

    Parameters
    ----------
    filename : str or path-like
        Location handed straight to ``pd.read_parquet``. The file must
        provide ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The rows whose trip duration lies in [1, 60] minutes (inclusive),
        with these changes:

        * ``duration`` - drop-off minus pick-up time, in minutes;
        * ``PULocationID`` / ``DOLocationID`` - converted to strings;
        * ``PU_DO`` - ``"<PULocationID>_<DOLocationID>"``.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes (no per-row Python lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df["duration"] = trip_time.dt.total_seconds() / 60

    # Explicit copy: the filtered frame is assigned into below, and writing
    # into a slice of the original is what triggers SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ["PULocationID", "DOLocationID"]
    # Replace the columns outright. ``df.loc[:, cols] = ...`` would try to
    # write strings into the existing integer columns in place, an upcast
    # that pandas has deprecated.
    df[categorical] = df[categorical].astype(str)
    df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]

    return df

In [14]:
# Green-taxi trip records: the 2021-01 file is the training set and the
# 2021-02 file is the validation set.
train_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet"
val_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet"

df_train = read_data(train_url)
df_val = read_data(val_url)

In [17]:
# Model inputs and target column.
categorical = ["PU_DO"]
numerical = ["trip_distance"]
target = "duration"

feature_columns = categorical + numerical

# Turn each row into a {column: value} record for the vectorizer.
train_dicts = df_train[feature_columns].to_dict(orient="records")
val_dicts = df_val[feature_columns].to_dict(orient="records")

# The vectorizer is fitted on the training records only; the validation
# records are transformed with the already-fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

y_train = df_train[target].values
y_val = df_val[target].values



In [15]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope

In [19]:
from pathlib import Path
# Directory (relative to the current working directory) for files written
# by this notebook.
models_dir = Path("models")
# exist_ok=True makes the cell safe to re-run once the directory exists.
models_dir.mkdir(exist_ok=True)

In [20]:
# Train one XGBoost model inside a single MLflow run and log its parameters,
# validation RMSE, the fitted DictVectorizer and the booster itself.
with mlflow.start_run():
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    params = {
        "learning_rate": 0.33707810007380146,
        "max_depth": 39,
        "min_child_weight": 1.3396380858101118,
        # "reg:linear" is the deprecated alias for squared-error regression;
        # "reg:squarederror" selects the same objective under its current name.
        "objective": "reg:squarederror",
        "reg_alpha": 0.21705039846156954,
        "reg_lambda": 0.06884936106500567,
        "seed": 42,
    }
    mlflow.log_params(params)

    # Up to 300 boosting rounds; training stops early when the validation
    # metric has not improved for 50 consecutive rounds.
    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=300,
        evals=[(valid, "validation")],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted vectorizer next to the model so the same feature
    # encoding can be reapplied later. The path is built from `models_dir`
    # (the directory created earlier in the notebook), not a repeated literal.
    preprocessor_path = models_dir / "preprocessor.b"
    with open(preprocessor_path, "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")
    
    

[0]	validation-rmse:9.62534
[1]	validation-rmse:8.17704
[2]	validation-rmse:7.40890
[3]	validation-rmse:7.00429
[4]	validation-rmse:6.78999
[5]	validation-rmse:6.67317
[6]	validation-rmse:6.60557
[7]	validation-rmse:6.56339
[8]	validation-rmse:6.53581
[9]	validation-rmse:6.51646
[10]	validation-rmse:6.50541
[11]	validation-rmse:6.49706
[12]	validation-rmse:6.49188
[13]	validation-rmse:6.49033
[14]	validation-rmse:6.48728
[15]	validation-rmse:6.48529
[16]	validation-rmse:6.48091
[17]	validation-rmse:6.47749
[18]	validation-rmse:6.47291
[19]	validation-rmse:6.46890
[20]	validation-rmse:6.46656
[21]	validation-rmse:6.46464
[22]	validation-rmse:6.46113
[23]	validation-rmse:6.45724
[24]	validation-rmse:6.45554
[25]	validation-rmse:6.45482
[26]	validation-rmse:6.45238
[27]	validation-rmse:6.45174
[28]	validation-rmse:6.44799
[29]	validation-rmse:6.44379
[30]	validation-rmse:6.44145
[31]	validation-rmse:6.43966
[32]	validation-rmse:6.43722
[33]	validation-rmse:6.43584
[34]	validation-rmse:6.4



🏃 View run sneaky-hound-293 at: http://localhost:5000/#/experiments/1/runs/02dc7938993a44eaad8b5869501800c8
🧪 View experiment at: http://localhost:5000/#/experiments/1
