In [1]:
import os
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [3]:
import mlflow
# Use a local SQLite file (mlflow.db, relative to the working directory) as the tracking backend.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Make "nyc-taxi-experiment" the active experiment for the runs started below.
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/home/user/GITs/Zoomcamps/MLOps/mlops-material/2-Experiment-Tracking/mlruns/1', creation_time=1747927057005, experiment_id='1', last_update_time=1747927057005, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

**Download the data**

- [NYC taxi data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) (very large datasets, > 1,000,000 rows per month)
    - 2022, Yellow, January
    - 2022, Yellow, February
- OR (much smaller datasets, < 100,000 rows per month)
    - 2021, Green, January
    - 2021, Green, February

In [None]:
!mkdir -p data

# Green Data
!wget -c https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet -O data/green_tripdata_2021-01.parquet
!wget -c https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet -O data/green_tripdata_2021-02.parquet

# Yellow Data
!wget -c https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet -O data/yellow_tripdata_2021-01.parquet
!wget -c https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet -O data/yellow_tripdata_2021-02.parquet


In [5]:
def read_dataframe(filename):
    """Load one parquet file of taxi trips and prepare it for duration modelling.

    Parameters
    ----------
    filename : str
        Path of the parquet file to read. The file must contain either the
        ``lpep_*`` or the ``tpep_*`` pickup/dropoff timestamp columns, plus
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        Trips whose duration lies in [1, 60] minutes, with an added
        ``duration`` column (minutes) and ``PULocationID`` / ``DOLocationID``
        converted to strings.

    Raises
    ------
    ValueError
        If neither the ``lpep_*`` nor the ``tpep_*`` timestamp pair is present.
    """
    df = pd.read_parquet(filename)

    # Pick the timestamp columns from the schema rather than from the year in
    # the file name: a year substring says nothing about which column prefix a
    # file uses (a "yellow_..._2021" file would otherwise be read with lpep_*
    # names and fail with a KeyError).
    for prefix in ("lpep", "tpep"):
        pickup_datetime = f"{prefix}_pickup_datetime"
        dropoff_datetime = f"{prefix}_dropoff_datetime"
        if pickup_datetime in df.columns and dropoff_datetime in df.columns:
            break
    else:
        raise ValueError("Invalid dataset")

    df[dropoff_datetime] = pd.to_datetime(df[dropoff_datetime])
    df[pickup_datetime] = pd.to_datetime(df[pickup_datetime])

    # Trip duration in minutes.
    df["duration"] = (df[dropoff_datetime] - df[pickup_datetime]).dt.total_seconds() / 60

    # Remove outliers: keep trips between 1 and 60 minutes (inclusive).
    # .copy() so the assignments below act on an independent frame, not a view.
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()

    # Convert the location ids to strings (to get 1-hot encoding later on).
    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

# Optional alternative: the (much larger) yellow-taxi files instead of the green ones.
# train_file = "data/yellow_tripdata_2022-01.parquet"
# val_file = "data/yellow_tripdata_2022-02.parquet"
train_file = "data/green_tripdata_2021-01.parquet"
val_file = "data/green_tripdata_2021-02.parquet"


# The January file is used for fitting, the February file for validation.
df_train = read_dataframe(train_file)
print("Train: ", df_train.shape)

df_val = read_dataframe(val_file)
print("Val: ", df_val.shape)

Train:  (73908, 21)
Val:  (61921, 21)


**Create new features**

In [6]:
# Join pickup and dropoff zone ids into a single "<pickup>_<dropoff>" route feature.
df_train["PU_DO"] = df_train["PULocationID"].str.cat(df_train["DOLocationID"], sep="_")
df_val["PU_DO"] = df_val["PULocationID"].str.cat(df_val["DOLocationID"], sep="_")

In [7]:
# Feature columns: the combined route id (string) and the trip distance (numeric).
categorical = ["PU_DO"]
numerical = ["trip_distance"]

dv = DictVectorizer()

# Fit the vectorizer on the training records only ...
train_dicts = df_train[categorical + numerical].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)

# ... and reuse the fitted vectorizer for validation so both matrices share the same feature columns.
val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [8]:
# Regression target: the trip duration in minutes computed in read_dataframe.
target = "duration"
y_train = df_train[target].values
y_val = df_val[target].values

**Linear Regression**

In [9]:
# Baseline model: plain linear regression on the vectorized features.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_val)

# Validation RMSE (in minutes); as the last expression it becomes the cell output.
root_mean_squared_error(y_val, y_pred)

7.758715212021978

In [10]:
# Persist the fitted vectorizer together with the baseline model so both can be
# restored as one (dv, lr) pair.
models_dir = "./models/"
# exist_ok=True makes the cell re-runnable without a separate existence check
# (and without the check-then-create race of os.path.exists + os.mkdir).
os.makedirs(models_dir, exist_ok=True)

# Build the path from models_dir so the directory name is defined in one place only;
# this resolves to "./models/lin_reg.bin".
with open(os.path.join(models_dir, "lin_reg.bin"), "wb") as f_out:
    pickle.dump((dv, lr), f_out)

## Experiment Tracking with MLflow

In [11]:
# Everything inside the `with` block is recorded as one MLflow run.
with mlflow.start_run():

    # Tags / metadata of the run
    mlflow.set_tag("developer", "johannes")
    # Record which data files the run was trained and validated on
    mlflow.log_param("train-data-path", train_file)
    mlflow.log_param("valid-data-path", val_file)

    alpha = 0.1
    # Log the regularization strength used for this run
    mlflow.log_param("alpha", alpha)

    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    # Log the validation metric
    mlflow.log_metric("rmse", rmse)

    # Pickle the model fitted in THIS run (together with its vectorizer) and log that file.
    # Logging "./models/lin_reg.bin" here would attach the earlier LinearRegression pickle
    # to a Lasso run, so the artifact would not match the logged alpha / rmse.
    os.makedirs("./models", exist_ok=True)
    lasso_path = "./models/lasso.bin"
    with open(lasso_path, "wb") as f_out:
        pickle.dump((dv, lr), f_out)

    # Copies the local file into the run's artifact store under "models_pickle"
    mlflow.log_artifact(local_path=lasso_path, artifact_path="models_pickle")

In [12]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Wrap features and labels in DMatrix objects; these are what xgb.train and
# Booster.predict receive in the cells below.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)


def objective(params):
    """Objective function for hyperopt: train one XGBoost candidate and score it.

    Every call is recorded as its own MLflow run (model tag, the given
    hyperparameters and the resulting validation RMSE).

    Parameters
    ----------
    params : dict
        Hyperparameters; logged to MLflow and passed on to ``xgb.train``.

    Returns
    -------
    dict
        ``"loss"`` holds the validation RMSE and ``"status"`` is ``STATUS_OK``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Validation set monitored during boosting (drives early stopping).
        watchlist = [(valid, 'validation')]
        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=watchlist,
            early_stopping_rounds=50,
        )

        rmse = root_mean_squared_error(y_val, model.predict(valid))
        mlflow.log_metric("rmse", rmse)

    return dict(loss=rmse, status=STATUS_OK)

In [14]:
# Search space sampled by hyperopt; loguniform(label, a, b) draws from [exp(a), exp(b)].
search_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 4, 100, 1)),  # integers in [4, 100]
    "learning_rate": hp.loguniform("learning_rate", -3, 0),       # [exp(-3),  exp(0)] <~> [0.05, 1]
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),              # [exp(-5), exp(-1)] <~> [0.0067, 0.367]
    "reg_lambda": hp.loguniform("reg_lambda", -6, -1),            # [exp(-6), exp(-1)] <~> [0.0025, 0.367]
    "min_child_weight": hp.loguniform("min_child_weight", -1, 3), # [exp(-1),  exp(3)] <~> [0.367, 20.09]
    "objective": "reg:squarederror",  # current name; the deprecated alias is "reg:linear"
    "seed": 42
}

# Minimize the validation RMSE returned by `objective` over 50 TPE-suggested trials.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-rmse:11.64054                          
[1]	validation-rmse:11.12064                          
[2]	validation-rmse:10.64906                          
[3]	validation-rmse:10.22220                          
[4]	validation-rmse:9.83643                           
[5]	validation-rmse:9.48816                           
[6]	validation-rmse:9.17453                           
[7]	validation-rmse:8.89268                           
[8]	validation-rmse:8.64069                           
[9]	validation-rmse:8.41475                           
[10]	validation-rmse:8.21354                          
[11]	validation-rmse:8.03311                          
[12]	validation-rmse:7.87285                          
[13]	validation-rmse:7.72941                          
[14]	validation-rmse:7.60190                          
[15]	validation-rmse:7.48903                          
  0%|          | 0/50 [00:43<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

### Train and save the model with the obtained parameters

In [15]:
params = {
    "learning_rate": 0.2907320120988654,
    "max_depth": 21,
    "min_child_weight": 1.0853986931084512,
    "objective": "reg:squarederror",
    "reg_alpha": 0.07070755159719935,
    "reg_lambda": 0.015498739698424788,
    "seed": 42,
}

# Using autolog for logging with just 1 line of code
mlflow.xgboost.autolog()

booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, "validation")],
    early_stopping_rounds=50
)

2025/05/22 17:37:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '91014a8da7a648c1981ffbaf73c6ab9b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:9.98384
[1]	validation-rmse:8.59973
[2]	validation-rmse:7.77738
[3]	validation-rmse:7.29835
[4]	validation-rmse:7.02157
[5]	validation-rmse:6.86064
[6]	validation-rmse:6.76045
[7]	validation-rmse:6.69581
[8]	validation-rmse:6.65603
[9]	validation-rmse:6.62923
[10]	validation-rmse:6.61052
[11]	validation-rmse:6.59834
[12]	validation-rmse:6.58928
[13]	validation-rmse:6.58207
[14]	validation-rmse:6.57870
[15]	validation-rmse:6.57642
[16]	validation-rmse:6.57186
[17]	validation-rmse:6.56986
[18]	validation-rmse:6.56782
[19]	validation-rmse:6.56571
[20]	validation-rmse:6.56246
[21]	validation-rmse:6.55887
[22]	validation-rmse:6.55519
[23]	validation-rmse:6.55288
[24]	validation-rmse:6.55021
[25]	validation-rmse:6.54809
[26]	validation-rmse:6.54390
[27]	validation-rmse:6.54160
[28]	validation-rmse:6.53942
[29]	validation-rmse:6.53736
[30]	validation-rmse:6.53429
[31]	validation-rmse:6.53146
[32]	validation-rmse:6.52961
[33]	validation-rmse:6.52798
[34]	validation-rmse:6.5



### Logging of Models

In [16]:
# Manually tracked run: parameters, validation RMSE and the trained booster are logged explicitly.
# NOTE(review): autologging enabled in the previous cell appears to still be active here
# (it is only disabled further down), so this run may also receive autologged entries — confirm.
with mlflow.start_run():
    best_params = {
        "learning_rate": 0.2907320120988654,
        "max_depth": 21,
        "min_child_weight": 1.0853986931084512,
        "objective": "reg:squarederror",
        "reg_alpha": 0.07070755159719935,
        "reg_lambda": 0.015498739698424788,
        "seed": 42,
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    # Score the trained booster on the validation set and log the RMSE.
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Log the booster with the MLflow xgboost flavor under "models_mlflow" in this run.
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	validation-rmse:9.98384
[1]	validation-rmse:8.59973
[2]	validation-rmse:7.77738
[3]	validation-rmse:7.29835
[4]	validation-rmse:7.02157
[5]	validation-rmse:6.86064
[6]	validation-rmse:6.76045
[7]	validation-rmse:6.69581
[8]	validation-rmse:6.65603
[9]	validation-rmse:6.62923
[10]	validation-rmse:6.61052
[11]	validation-rmse:6.59834
[12]	validation-rmse:6.58928
[13]	validation-rmse:6.58207
[14]	validation-rmse:6.57870
[15]	validation-rmse:6.57642
[16]	validation-rmse:6.57186
[17]	validation-rmse:6.56986
[18]	validation-rmse:6.56782
[19]	validation-rmse:6.56571
[20]	validation-rmse:6.56246
[21]	validation-rmse:6.55887
[22]	validation-rmse:6.55519
[23]	validation-rmse:6.55288
[24]	validation-rmse:6.55021
[25]	validation-rmse:6.54809
[26]	validation-rmse:6.54390
[27]	validation-rmse:6.54160
[28]	validation-rmse:6.53942
[29]	validation-rmse:6.53736
[30]	validation-rmse:6.53429
[31]	validation-rmse:6.53146
[32]	validation-rmse:6.52961
[33]	validation-rmse:6.52798
[34]	validation-rmse:6.5

KeyboardInterrupt: 

### Logging the preprocessor (dv)

In [36]:
# Disable xgboost autologging so the run below contains only what is logged explicitly.
mlflow.xgboost.autolog(disable=True)

In [37]:
# Same training run as before, but additionally logging the fitted DictVectorizer (dv).
with mlflow.start_run():

    best_params = {
        "learning_rate": 0.2907320120988654,
        "max_depth": 21,
        "min_child_weight": 1.0853986931084512,
        "objective": "reg:squarederror",
        "reg_alpha": 0.07070755159719935,
        "reg_lambda": 0.015498739698424788,
        "seed": 42,
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Pickle the preprocessor (the DictVectorizer that turned the records into X_train / X_val).
    # The "models" directory is expected to exist already (created by the earlier save cell).
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    # Attach the pickled preprocessor to the run under "preprocessor" ...
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    # ... and the booster itself under "models_mlflow".
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")



[0]	validation-rmse:9.98384
[1]	validation-rmse:8.59973
[2]	validation-rmse:7.77738
[3]	validation-rmse:7.29835
[4]	validation-rmse:7.02157
[5]	validation-rmse:6.86064
[6]	validation-rmse:6.76045
[7]	validation-rmse:6.69581
[8]	validation-rmse:6.65603
[9]	validation-rmse:6.62923
[10]	validation-rmse:6.61052
[11]	validation-rmse:6.59834
[12]	validation-rmse:6.58928
[13]	validation-rmse:6.58207
[14]	validation-rmse:6.57870
[15]	validation-rmse:6.57642
[16]	validation-rmse:6.57186
[17]	validation-rmse:6.56986
[18]	validation-rmse:6.56782
[19]	validation-rmse:6.56571
[20]	validation-rmse:6.56246
[21]	validation-rmse:6.55887
[22]	validation-rmse:6.55519
[23]	validation-rmse:6.55288
[24]	validation-rmse:6.55021
[25]	validation-rmse:6.54809
[26]	validation-rmse:6.54390
[27]	validation-rmse:6.54160
[28]	validation-rmse:6.53942
[29]	validation-rmse:6.53736
[30]	validation-rmse:6.53429
[31]	validation-rmse:6.53146
[32]	validation-rmse:6.52961
[33]	validation-rmse:6.52798
[34]	validation-rmse:6.5



### Prediction with saved model

In [38]:
# URI of the model logged under "models_mlflow" in one specific run.
# NOTE(review): the run id is hard-coded; it presumably only exists in the author's
# tracking database and must be replaced with a run id from your own mlflow.db — confirm.
logged_model = "runs:/2f44b77204084128800d161da3f01b86/models_mlflow"

# Load model as PyFuncModel
loaded_model = mlflow.pyfunc.load_model(logged_model)



In [40]:
# Display the loaded pyfunc wrapper (artifact path, flavor and run id).
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 2f44b77204084128800d161da3f01b86

In [41]:
# Load the same logged model through the xgboost flavor, which returns the xgboost object
# itself instead of the pyfunc wrapper.
xgboost_model = mlflow.xgboost.load_model(logged_model)



In [42]:
# Display the loaded object (an xgboost Booster).
xgboost_model

<xgboost.core.Booster at 0x7b0fbc90f490>

In [43]:
# Predict durations for the validation DMatrix with the reloaded booster and show the first ten.
y_pred = xgboost_model.predict(valid)
print(y_pred[:10])

[14.395272   7.0069036 15.285045  24.44286    9.461245  17.213272
 10.945483   7.834591   9.07624   19.193548 ]
