In [2]:
import warnings
# NOTE(review): with no category/module filter this suppresses every Python
# warning for the whole session, DeprecationWarning and FutureWarning included.
warnings.filterwarnings("ignore")

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction import DictVectorizer

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [5]:
import numpy as np

In [6]:
from sklearn.linear_model import Lasso

In [7]:
from sklearn.linear_model import Ridge

In [8]:
import pickle

In [9]:
import mlflow

In [17]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope

In [10]:
# Tracking backend (local SQLite file) and the experiment used by every run
# started in this notebook.
tracking_uri = "sqlite:///mlflow.db"
experiment_name = "nyc-taxi-experiment"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

2025/09/02 11:23:57 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/09/02 11:23:57 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/home/ubuntu/notebooks/03-expriments-tracking/mlruns/1', creation_time=1756745664232, experiment_id='1', last_update_time=1756745664232, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [11]:
def read_data(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column holding the trip length in minutes
    (``lpep_dropoff_datetime - lpep_pickup_datetime``), keeps only trips that
    lasted between 1 and 60 minutes inclusive, and converts the
    ``PULocationID`` / ``DOLocationID`` columns to strings.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed unchanged to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows, so callers can add columns
        to it without writing through to (or being warned about) a slice of
        the unfiltered frame.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    trip_time = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
    df["duration"] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame; without it
    # the assignment below is a chained assignment on a slice.
    keep = (df["duration"] >= 1) & (df["duration"] <= 60)
    df = df[keep].copy()

    categorical = ["PULocationID", "DOLocationID"]
    # Replace the columns outright. `df.loc[:, cols] = ...` writes into the
    # existing integer columns and only works through an implicit upcast to
    # object, which pandas has deprecated.
    df[categorical] = df[categorical].astype(str)
    return df

In [12]:
# January 2021 is the training split, February 2021 the validation split.
train_path = "data/green_tripdata_2021-01.parquet"
val_path = "data/green_tripdata_2021-02.parquet"

df_train = read_data(train_path)
df_val = read_data(val_path)

In [13]:
# Row counts of the training and validation frames.
df_train.shape[0], df_val.shape[0]


(73908, 61921)

In [14]:
# Combine pickup and drop-off location IDs into one "<PU>_<DO>" pair feature
# on both splits.
for trips in (df_train, df_val):
    trips["PU_DO"] = trips["PULocationID"] + "_" + trips["DOLocationID"]

In [15]:
# Model inputs: the pickup/drop-off pair as the only categorical feature and
# the trip distance as the only numerical one.
categorical = ["PU_DO"]
numerical = ["trip_distance"]
feature_columns = categorical + numerical

# The vectorizer is fitted on the training records only and then reused,
# unchanged, to transform the validation records.
dv = DictVectorizer()

train_dicts = df_train[feature_columns].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[feature_columns].to_dict(orient="records")
X_val = dv.transform(val_dicts)

# Regression target for both splits.
target = "duration"
y_train = df_train[target].values
y_val = df_val[target].values




In [None]:

# lr = LinearRegression()
# lr.fit(X_train, y_train)

In [None]:
# y_pred = lr.predict(X_val)
# np.sqrt(mean_squared_error(y_val, y_pred))

np.float64(7.7587152133919135)

In [None]:
# lasso = Lasso(alpha=0.002)
# lasso.fit(X_train, y_train)
# y_pred_lasso = lasso.predict(X_val)
# np.sqrt(mean_squared_error(y_val, y_pred_lasso))

np.float64(9.91133386905489)

In [20]:
with mlflow.start_run():
    alpha = 0.02

    # Run metadata: who ran it, which data files were used, which model.
    mlflow.set_tag("developer", "Koomi")
    mlflow.log_param("train-data-path", "data/green_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "data/green_tripdata_2021-02.parquet")
    mlflow.log_param("model", "Ridge")
    mlflow.log_param("alpha", alpha)

    # Fit on the training matrix, score on the validation matrix.
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    y_pred_ridge = ridge.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_ridge))

    mlflow.log_metric("rmse", rmse)

In [None]:
# with mlflow.start_run():
    
#     mlflow.set_tag("developer", "Koomi")
    
#     mlflow.log_param("train-data-path", "data/green_tripdata_2021-01.parquet")
#     mlflow.log_param("valid-data-path", "data/green_tripdata_2021-02.parquet")
#     mlflow.log_param("model", "Ridge")
    
#     alpha = 0.01
#     mlflow.log_param("alpha", alpha)
    
#     ridge = Ridge(alpha=alpha)
#     ridge.fit(X_train, y_train)
#     y_pred_ridge = ridge.predict(X_val)
#     rmse = np.sqrt(mean_squared_error(y_val, y_pred_ridge))
    
#     mlflow.log_metric("rmse", rmse)
    
#     with open("../models/ridge_reg.bin", "wb") as f_out:
#         pickle.dump((dv, ridge), f_out)
#     mlflow.log_artifact("models/ridge_reg.bin")


np.float64(7.509752258177714)

In [19]:
# Wrap both splits in XGBoost's DMatrix container (features + labels).
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [34]:


def objective(params):
    """Train one XGBoost booster for a hyperopt trial and score it.

    Every call opens its own MLflow run, tags it ``model=xgboost``, logs
    ``params`` and logs the validation RMSE under the metric name ``rmse``.
    Training uses the notebook-level ``train`` / ``valid`` DMatrix objects
    and scoring uses the notebook-level ``y_val`` array.

    Parameters
    ----------
    params : dict
        Booster parameters; forwarded to ``mlflow.log_params`` and
        ``xgb.train``.

    Returns
    -------
    dict
        ``{"loss": <validation RMSE>, "status": STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Up to 1000 rounds, stopping once the validation metric has not
        # improved for 50 consecutive rounds.
        watchlist = [(valid, "validation")]
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=watchlist,
            early_stopping_rounds=50,
        )

        validation_rmse = np.sqrt(mean_squared_error(y_val, booster.predict(valid)))
        mlflow.log_metric("rmse", validation_rmse)

    return {"loss": validation_rmse, "status": STATUS_OK}

In [None]:
# Hyperopt search space for the XGBoost booster parameters.
# For hp.loguniform the two bounds are exponents: values are drawn as
# exp(uniform(low, high)).
search_space = {
    # quniform yields floats; scope.int casts them to the integer XGBoost expects.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, 1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, 1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is the deprecated alias of this objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

In [None]:
# Minimise `objective` over `search_space` with the TPE algorithm, 50 trials.
trials = Trials()
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

[0]	validation-rmse:11.26818                          
[1]	validation-rmse:10.47232                          
[2]	validation-rmse:9.80595                           
[3]	validation-rmse:9.25179                           
[4]	validation-rmse:8.79313                           
[5]	validation-rmse:8.41579                           
[6]	validation-rmse:8.10746                           
[7]	validation-rmse:7.85594                           
[8]	validation-rmse:7.65133                           
[9]	validation-rmse:7.48556                           
[10]	validation-rmse:7.35080                          
[11]	validation-rmse:7.24089                          
[12]	validation-rmse:7.15090                          
[13]	validation-rmse:7.07830                          
[14]	validation-rmse:7.01916                          
[15]	validation-rmse:6.97137                          
[16]	validation-rmse:6.93244                          
[17]	validation-rmse:6.89968                          
[18]	valid

In [None]:
# Turn on MLflow's XGBoost autologging for the training call below.
mlflow.xgboost.autolog()

train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

# Fixed hyperparameters for this run (presumably the best trial of the
# hyperopt search -- verify against the MLflow UI).
params = {
    "learning_rate": 0.33707810007380146,
    "max_depth": 39,
    "min_child_weight": 1.3396380858101118,
    # "reg:linear" is the deprecated alias of this objective.
    "objective": "reg:squarederror",
    "reg_alpha": 0.21705039846156954,
    "reg_lambda": 0.06884936106500567,
    "seed": 42,
}

# Up to 1000 rounds, stopping once the validation metric has not improved
# for 50 consecutive rounds.
booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, "validation")],
    early_stopping_rounds=50,
)

2025/09/02 10:30:16 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ac98c82915454af993926708b29a5b74', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:9.62534
[1]	validation-rmse:8.17704
[2]	validation-rmse:7.40890
[3]	validation-rmse:7.00429
[4]	validation-rmse:6.78999
[5]	validation-rmse:6.67317
[6]	validation-rmse:6.60557
[7]	validation-rmse:6.56339
[8]	validation-rmse:6.53581
[9]	validation-rmse:6.51646
[10]	validation-rmse:6.50541
[11]	validation-rmse:6.49706
[12]	validation-rmse:6.49188
[13]	validation-rmse:6.49033
[14]	validation-rmse:6.48728
[15]	validation-rmse:6.48529
[16]	validation-rmse:6.48091
[17]	validation-rmse:6.47749
[18]	validation-rmse:6.47291
[19]	validation-rmse:6.46890
[20]	validation-rmse:6.46656
[21]	validation-rmse:6.46464
[22]	validation-rmse:6.46113
[23]	validation-rmse:6.45724
[24]	validation-rmse:6.45554
[25]	validation-rmse:6.45482
[26]	validation-rmse:6.45238
[27]	validation-rmse:6.45174
[28]	validation-rmse:6.44799
[29]	validation-rmse:6.44379
[30]	validation-rmse:6.44145
[31]	validation-rmse:6.43966
[32]	validation-rmse:6.43722
[33]	validation-rmse:6.43584
[34]	validation-rmse:6.4



# Saving the model manually

In [42]:
# Fit a Ridge model outside of MLflow and report its validation RMSE.
ridge = Ridge(alpha=0.01).fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_val)
squared_error = mean_squared_error(y_val, y_pred_ridge)
rmse = np.sqrt(squared_error)
print(rmse)

7.509752258177714


In [None]:


# Store the fitted vectorizer and model together as one pickled tuple.
model_bundle = (dv, ridge)
with open("models/ridge_reg.bin", "wb") as model_file:
    pickle.dump(model_bundle, model_file)

In [45]:
with mlflow.start_run():
    alpha = 0.01

    # Run metadata: who ran it, which data files were used, which model.
    mlflow.set_tag("developer", "Koomi")
    mlflow.log_param("train-data-path", "data/green_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "data/green_tripdata_2021-02.parquet")
    mlflow.log_param("model", "Ridge")
    mlflow.log_param("alpha", alpha)

    # Fit on the training matrix, score on the validation matrix.
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    y_pred_ridge = ridge.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_ridge))

    # Pickle (vectorizer, model) locally first; the file is attached to the
    # run at the end of this block.
    with open("models/ridge_reg.bin", "wb") as f_out:
        pickle.dump((dv, ridge), f_out)

    mlflow.log_metric("rmse", rmse)

    mlflow.log_artifact(
        local_path="models/ridge_reg.bin",
        artifact_path="models_pickle",
    )

## XGBoost

In [24]:
from mlflow.models.signature import infer_signature

In [29]:
# Keep autologging on but without automatic model logging; the model is
# logged explicitly at the end of this cell so a signature can be attached.
# (A preceding autolog(disable=True) would be undone by this call anyway.)
mlflow.xgboost.autolog(log_models=False)

with mlflow.start_run():

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        "learning_rate": 0.33707810007380146,
        "max_depth": 39,
        "min_child_weight": 1.3396380858101118,
        # "reg:linear" is the deprecated alias of this objective.
        "objective": "reg:squarederror",
        "reg_alpha": 0.21705039846156954,
        "reg_lambda": 0.06884936106500567,
        "seed": 42,
    }
    mlflow.log_params(best_params)

    # Up to 1000 rounds, stopping once the validation metric has not improved
    # for 50 consecutive rounds.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, "validation")],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

    # Save the fitted DictVectorizer and attach it to the run as an artifact.
    with open("models/preprosessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprosessor.b", artifact_path="preprocessor")

    # Signature inferred from the validation features and their predictions.
    signature = infer_signature(X_val, y_pred)
    mlflow.xgboost.log_model(booster, name="xgboost-model", signature=signature)


[0]	validation-rmse:9.62534
[1]	validation-rmse:8.17704
[2]	validation-rmse:7.40890
[3]	validation-rmse:7.00429
[4]	validation-rmse:6.78999
[5]	validation-rmse:6.67317
[6]	validation-rmse:6.60557
[7]	validation-rmse:6.56339
[8]	validation-rmse:6.53581
[9]	validation-rmse:6.51646
[10]	validation-rmse:6.50541
[11]	validation-rmse:6.49706
[12]	validation-rmse:6.49188
[13]	validation-rmse:6.49033
[14]	validation-rmse:6.48728
[15]	validation-rmse:6.48529
[16]	validation-rmse:6.48091
[17]	validation-rmse:6.47749
[18]	validation-rmse:6.47291
[19]	validation-rmse:6.46890
[20]	validation-rmse:6.46656
[21]	validation-rmse:6.46464
[22]	validation-rmse:6.46113
[23]	validation-rmse:6.45724
[24]	validation-rmse:6.45554
[25]	validation-rmse:6.45482
[26]	validation-rmse:6.45238
[27]	validation-rmse:6.45174
[28]	validation-rmse:6.44799
[29]	validation-rmse:6.44379
[30]	validation-rmse:6.44145
[31]	validation-rmse:6.43966
[32]	validation-rmse:6.43722
[33]	validation-rmse:6.43584
[34]	validation-rmse:6.4

In [None]:
          
# mlflow.set_active_model(model_id="m-8ff74fb9dd1e4d489a6987f365713231")
# The run ID is hard-coded; replace it with the ID of the run whose model
# should be loaded.
run_id = "85f4610e1d8f40a9bb6292ebbe34f010"
logged_model = f"runs:/{run_id}/xgboost-model"
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 919.24it/s]  


In [None]:
# Display the object returned by mlflow.pyfunc.load_model.
loaded_model

In [38]:
# Load the same logged model through the XGBoost flavour instead of pyfunc.
xgboost_loading = mlflow.xgboost.load_model(model_uri=logged_model)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 2381.50it/s] 


In [39]:
# Display the object returned by mlflow.xgboost.load_model.
xgboost_loading

<xgboost.core.Booster at 0x7509632f1710>

In [None]:
# Score the validation DMatrix with the reloaded booster.
y_pred = xgboost_loading.predict(data=valid)

array([14.016934 ,  7.150633 , 14.79729  , ..., 13.579181 ,  5.7700477,
        8.021618 ], shape=(61921,), dtype=float32)

In [None]:
import mlflow
import mlflow.xgboost
from mlflow.models.signature import infer_signature
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

with mlflow.start_run():
    # Autologging stays on, but automatic model logging is switched off; the
    # model is logged by hand at the end of this block.
    mlflow.xgboost.autolog(log_models=False)

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # "reg:squarederror" replaces the deprecated "reg:linear" objective name.
    best_params = dict(
        learning_rate=0.33707810007380146,
        max_depth=39,
        min_child_weight=1.3396380858101118,
        objective="reg:squarederror",
        reg_alpha=0.21705039846156954,
        reg_lambda=0.06884936106500567,
        seed=42,
    )
    mlflow.log_params(best_params)

    # Up to 1000 rounds, stopping once the validation metric has not improved
    # for 50 consecutive rounds.
    watchlist = [(valid, "validation")]
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=watchlist,
        early_stopping_rounds=50,
    )

    # Validation score.
    y_pred = booster.predict(valid)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

    # Log the booster with a signature inferred from the validation features
    # and predictions, plus the first five validation rows as input example.
    signature = infer_signature(X_val, y_pred)
    mlflow.xgboost.log_model(
        xgb_model=booster,
        name="models_xgboost",  # `name=` is used here rather than `artifact_path=`
        signature=signature,
        input_example=X_val[:5],
    )


[0]	validation-rmse:9.62534
[1]	validation-rmse:8.17704
[2]	validation-rmse:7.40890
[3]	validation-rmse:7.00429
[4]	validation-rmse:6.78999
[5]	validation-rmse:6.67317
[6]	validation-rmse:6.60557
[7]	validation-rmse:6.56339
[8]	validation-rmse:6.53581
[9]	validation-rmse:6.51646
[10]	validation-rmse:6.50541
[11]	validation-rmse:6.49706
[12]	validation-rmse:6.49188
[13]	validation-rmse:6.49033
[14]	validation-rmse:6.48728
[15]	validation-rmse:6.48529
[16]	validation-rmse:6.48091
[17]	validation-rmse:6.47749
[18]	validation-rmse:6.47291
[19]	validation-rmse:6.46890
[20]	validation-rmse:6.46656
[21]	validation-rmse:6.46464
[22]	validation-rmse:6.46113
[23]	validation-rmse:6.45724
[24]	validation-rmse:6.45554
[25]	validation-rmse:6.45482
[26]	validation-rmse:6.45238
[27]	validation-rmse:6.45174
[28]	validation-rmse:6.44799
[29]	validation-rmse:6.44379
[30]	validation-rmse:6.44145
[31]	validation-rmse:6.43966
[32]	validation-rmse:6.43722
[33]	validation-rmse:6.43584
[34]	validation-rmse:6.4

In [3]:
# sns.distplot(y_pred, label = "predicted")
# sns.distplot(y_train, label = "actual")

# plt.legend()