In [1]:
!pip install scikit-learn --upgrade




In [2]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [3]:
import mlflow
# Tracking backend is a SQLite database file addressed relative to the
# working directory; every run below is recorded under this experiment name.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
MLFLOW_EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)


2025/06/16 16:37:57 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/06/16 16:37:57 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1750085022940, experiment_id='1', last_update_time=1750085022940, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [4]:
def read_dataframe(filename, nrows=100_000):
    """Load a yellow-taxi trip parquet file and prepare it for modelling.

    Reads only the pickup/dropoff timestamps, the pickup/dropoff location IDs
    and the trip distance, draws a reproducible random sample, derives the
    trip ``duration`` in minutes, keeps trips lasting between 1 and 60
    minutes (inclusive) and casts the location IDs to strings.

    Parameters
    ----------
    filename : str or path-like
        Parquet file to read.
    nrows : int or None, default 100_000
        Maximum number of rows to sample (before the duration filter).
        Files with fewer rows are used in full instead of raising, and
        ``None`` disables sampling altogether.

    Returns
    -------
    pandas.DataFrame
        The sampled, filtered rows with an added ``duration`` column and
        string-typed ``PULocationID`` / ``DOLocationID`` columns.
    """
    # Load only needed columns to avoid memory issues
    cols = ["tpep_pickup_datetime", "tpep_dropoff_datetime", "PULocationID", "DOLocationID", "trip_distance"]
    df = pd.read_parquet(filename, columns=cols)

    # Sample only a subset of rows. DataFrame.sample raises when asked for
    # more rows than exist, so cap the request at the population size.
    if nrows is not None:
        df = df.sample(n=min(nrows, len(df)), random_state=42)

    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])
    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])

    # Trip length in minutes.
    df["duration"] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60

    # Copy after filtering so the column assignment below writes to an
    # independent frame rather than to a slice of the sampled one.
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()

    # Location IDs are cast to strings so they are handled as categories
    # rather than as numbers.
    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df


In [5]:
# January 2025 trips are the training set, February 2025 the validation set.
train_path = "data/yellow_tripdata_2025-01.parquet"
val_path = "data/yellow_tripdata_2025-02.parquet"

df_train, df_val = (read_dataframe(path) for path in (train_path, val_path))

In [6]:
# Row counts left after sampling and the duration filter (train, validation).
df_train.shape[0], df_val.shape[0]

(98022, 97914)

In [7]:
categorical = ["PULocationID", "DOLocationID"]
numerical = ["trip_distance"]
feature_cols = categorical + numerical

# One dict per trip, holding the string location IDs and the numeric distance.
train_dicts = df_train[feature_cols].to_dict(orient="records")
val_dicts = df_val[feature_cols].to_dict(orient="records")

# Fit the vectorizer on the training records only; the validation records
# are transformed with that same fitted mapping.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)


In [8]:
# Regression target: the trip duration in minutes computed by read_dataframe.
target = "duration"
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [9]:
import inspect
# Show which file the installed mean_squared_error resolves to, then its docstring.
# NOTE(review): the recorded output points at sklearn's _param_validation module
# rather than a metrics module - presumably the function is wrapped; confirm.
mse_source_file = inspect.getfile(mean_squared_error)
mse_docstring = mean_squared_error.__doc__
print(mse_source_file)
print(mse_docstring)


/home/codespace/anaconda3/envs/exp-tracking-env/lib/python3.11/site-packages/sklearn/utils/_param_validation.py
Mean squared error regression loss.

    Read more in the :ref:`User Guide <mean_squared_error>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    multioutput : {'raw_values', 'uniform_average'} or array-like of shape             (n_outputs,), default='uniform_average'
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.

        'raw_values' :
            Returns a full set of errors in case of multioutput input.

        'uniform_average' :
            Errors of all outputs are averaged with uniform weight.

  

In [10]:
# Baseline model: plain linear regression on the vectorized features
# (fit returns the fitted estimator, so construction and fitting are chained).
lr = LinearRegression().fit(X_train, y_train)

# Score on the held-out validation month.
y_pred = lr.predict(X_val)

# RMSE is taken as the square root of the validation MSE.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

RMSE: 8.022805511023254


In [11]:
# open() does not create missing parent directories, so make sure the output
# folder exists first (a fresh checkout may not contain it).
model_path = Path("models/lin_reg.bin")
model_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted vectorizer together with the model so that both can be
# loaded back as one (dv, lr) pair.
with open(model_path, "wb") as f_out:
    pickle.dump((dv, lr), f_out)

In [12]:
# Train a Lasso model inside one MLflow run, recording the data files, the
# regularisation strength and the validation RMSE.
with mlflow.start_run():
    mlflow.set_tag("developer", "master")

    alpha = 0.01
    run_params = {
        "train_data": "data/yellow_tripdata_2025-01.parquet",
        "valid_data": "data/yellow_tripdata_2025-02.parquet",
        "alpha": alpha,
    }
    # Logged one by one, in the order listed above.
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    lr = Lasso(alpha).fit(X_train, y_train)
    y_pred = lr.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    mlflow.log_metric("rmse", rmse)

In [13]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Wrap the vectorized features and duration targets in XGBoost's DMatrix
# container; `train` and `valid` are read by the objective function below.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [14]:
def objective(params):
    """Train one XGBoost model for a hyperparameter set and report its RMSE.

    Each call opens its own MLflow run, tags it as an xgboost model, logs
    ``params`` and the validation RMSE. Training uses the module-level
    ``train`` / ``valid`` DMatrix objects and ``y_val``.

    Parameters
    ----------
    params : dict
        Booster parameters passed straight to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': <validation rmse>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Up to 1000 boosting rounds, stopping early once the validation
        # metric has not improved for 50 consecutive rounds.
        watchlist = [(valid, 'validation')]
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=watchlist,
            early_stopping_rounds=50,
        )

        val_predictions = booster.predict(valid)
        rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [15]:
# Hyperopt search space for the XGBoost booster parameters.
# loguniform bounds are given in log space (e.g. -3..0 samples exp(-3)..exp(0)).
search_space = {
    # quniform yields floats; scope.int casts them to the integer depth XGBoost expects.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is the deprecated alias of this squared-error objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

In [None]:
# Run 50 TPE-guided trials; every trial trains one booster via `objective`
# and logs it as its own MLflow run.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
    # Seed the sampler so a re-run explores the same sequence of
    # hyperparameters (recent hyperopt releases expect a numpy Generator here).
    rstate=np.random.default_rng(42),
)

[0]	validation-rmse:8.27394                                                                        
[1]	validation-rmse:7.24974                                                                        
[2]	validation-rmse:6.54049                                                                        
  0%|                                                       | 0/50 [00:00<?, ?trial/s, best loss=?]

  self.starting_round = model.num_boosted_rounds()



[3]	validation-rmse:6.05847                                                                        
[4]	validation-rmse:5.73505                                                                        
[5]	validation-rmse:5.52083                                                                        
[6]	validation-rmse:5.38063                                                                        
[7]	validation-rmse:5.28875                                                                        
[8]	validation-rmse:5.22851                                                                        
[9]	validation-rmse:5.18532                                                                        
[10]	validation-rmse:5.15658                                                                       
[11]	validation-rmse:5.13639                                                                       
[12]	validation-rmse:5.12152                                                                       


  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:5.66459                                                                        
[1]	validation-rmse:5.33028                                                                        
[2]	validation-rmse:5.33006                                                                        
[3]	validation-rmse:5.34030                                                                        
[4]	validation-rmse:5.35614                                                                        
[5]	validation-rmse:5.36876                                                                        
[6]	validation-rmse:5.38361                                                                        
[7]	validation-rmse:5.40504                                                                        
[8]	validation-rmse:5.41422                                                                        
[9]	validation-rmse:5.42233                                                                        


  self.starting_round = model.num_boosted_rounds()



[9]	validation-rmse:5.87295                                                                        
[10]	validation-rmse:5.74995                                                                       
[11]	validation-rmse:5.64747                                                                       
[12]	validation-rmse:5.56378                                                                       
[13]	validation-rmse:5.49565                                                                       
[14]	validation-rmse:5.43852                                                                       
[15]	validation-rmse:5.39222                                                                       
[16]	validation-rmse:5.35295                                                                       
[17]	validation-rmse:5.32087                                                                       
[18]	validation-rmse:5.29410                                                                       


  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:5.71695                                                                        
[1]	validation-rmse:5.79143                                                                        
[2]	validation-rmse:5.86393                                                                        
[3]	validation-rmse:5.87875                                                                        
[4]	validation-rmse:5.90948                                                                        
[5]	validation-rmse:5.91666                                                                        
[6]	validation-rmse:5.92634                                                                        
[7]	validation-rmse:5.93396                                                                        
[8]	validation-rmse:5.94361                                                                        
[9]	validation-rmse:5.94565                                                                        


In [None]:
# Compare the distribution of predicted durations with the actual ones.
# y_pred holds predictions for X_val, so the matching ground truth is y_val
# (not y_train). histplot replaces the deprecated distplot; stat="density"
# plus kde=True keeps the normalised histogram with a KDE overlay.
fig, ax = plt.subplots()
sns.histplot(y_pred, label="Prediction", kde=True, stat="density", alpha=0.4, ax=ax)
sns.histplot(y_val, label="Actual", kde=True, stat="density", alpha=0.4, ax=ax)
ax.legend()
ax.set_title("Prediction vs Actual Distribution")
ax.set_xlabel("Target Value")
plt.show();