In [6]:
import numpy as np
import pandas as pd

class BaselineModelPreviousHour:
    """
    Naive baseline: predict each row's target as the value one step earlier.

    The model has no parameters. It simply echoes the ``target_lag_1`` column
    of the feature frame it is asked to predict on.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "BaselineModelPreviousHour":
        """
        No-op training step kept for API parity with scikit-learn estimators.

        Parameters:
        - X_train: Training features (ignored).
        - y_train: Training target (ignored).

        Returns:
        - self, so calls can be chained as ``Model().fit(X, y).predict(X)``.
        """
        # Nothing to learn; return self to follow the estimator convention.
        return self

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """
        Return the ``target_lag_1`` column of ``X_test`` as a NumPy array.

        Raises:
        - ValueError: if ``X_test`` has no ``target_lag_1`` column.
        """
        if "target_lag_1" not in X_test.columns:
            raise ValueError("X_test must contain 'target_lag_1' column.")
        # to_numpy() is the recommended accessor and always yields an ndarray.
        return X_test["target_lag_1"].to_numpy()

In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load transformed data
df = pd.read_parquet("transformeddata2024.parquet")

# Define features and target: everything except the listed key columns and the label.
features = [col for col in df.columns if col not in ['pickup_hour', 'target', 'location_id']]
X = df[features]
y = df['target']

# Simulate train/test split (use last 20% of rows as test).
# NOTE(review): this is only a chronological hold-out if the parquet rows are
# already ordered by time — confirm the ordering of the transformed data.
split_idx = int(0.8 * len(df))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# Baseline model
baseline = BaselineModelPreviousHour()
baseline.fit(X_train, y_train)
y_pred = baseline.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# MAPE is undefined where the actual value is 0 (division by zero would turn
# the whole metric into inf), so it is averaged over non-zero actuals only.
y_test_values = y_test.to_numpy()
nonzero_mask = y_test_values != 0
if nonzero_mask.any():
    mape = np.mean(
        np.abs((y_test_values[nonzero_mask] - y_pred[nonzero_mask]) / y_test_values[nonzero_mask])
    ) * 100
else:
    # Every actual is zero: no meaningful percentage error exists.
    mape = float("nan")

print("📊 BaselineModelPreviousHour Evaluation:")
print(f"➡️ MAE:  {mae:.2f}")
print(f"➡️ MAPE: {mape:.2f}%")
print(f"➡️ RMSE: {rmse:.2f}")
print(f"➡️ R²:   {r2:.2f}")


📊 BaselineModelPreviousHour Evaluation:
➡️ MAE:  103.48
➡️ MAPE: 485.11%
➡️ RMSE: 115.55
➡️ R²:   -0.75


In [8]:
import logging
import os

import mlflow
from mlflow.models import infer_signature

# Module-level logger for the MLflow helpers; INFO and above are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def set_mlflow_tracking():
    """
    Point MLflow at the tracking server named by ``MLFLOW_TRACKING_URI``.

    The URI is read from the environment (a missing variable raises
    ``KeyError``), printed, and passed to ``mlflow.set_tracking_uri``.
    Only the URI is configured here; this function does not set any
    credentials itself, despite what the log message says.

    Returns:
    - The ``mlflow`` module, for convenience.
    """
    tracking_uri = os.environ["MLFLOW_TRACKING_URI"]
    print(tracking_uri)

    mlflow.set_tracking_uri(tracking_uri)
    logger.info("MLflow tracking URI and credentials set.")
    return mlflow


def log_model_to_mlflow(
    model,
    input_data,
    experiment_name,
    metric_name="metric",
    model_name=None,
    params=None,
    mae=None,
    mape=None,
    rmse=None,
    r2=None
):
    """
    Log a trained model, parameters, and metrics to MLflow.

    Parameters:
    - model: Trained model object exposing ``predict`` (e.g., sklearn model).
    - input_data: Input data passed to ``model.predict`` for signature inference;
      a small sample of it is stored as the model's input example.
    - experiment_name: Name of the MLflow experiment.
    - metric_name: Key under which ``mae`` is logged (e.g., "mean_absolute_error").
    - model_name: Optional name for the registered model; defaults to the
      model's class name.
    - params: Optional dictionary of hyperparameters to log.
    - mae: Optional value logged under ``metric_name``.
    - mape: Optional value logged as "mape".
    - rmse: Optional value logged as "rmse".
    - r2: Optional value logged as "r2".

    Returns:
    - Whatever ``mlflow.sklearn.log_model`` returns for the logged model.

    Raises:
    - Any exception raised while talking to MLflow is logged and re-raised.
    """
    try:
        # Set the experiment
        mlflow.set_experiment(experiment_name)
        logger.info(f"Experiment set to: {experiment_name}")

        # Start an MLflow run
        with mlflow.start_run():
            # Log hyperparameters if provided
            if params:
                mlflow.log_params(params)
                logger.info(f"Logged parameters: {params}")

            # Log each metric independently: a metric left as None is skipped
            # instead of being sent to MLflow (which rejects None), and a
            # metric supplied without `mae` is no longer silently dropped.
            # A list of pairs (not a dict) keeps both entries even when
            # `metric_name` equals one of the fixed keys.
            metric_pairs = [
                (metric_name, mae),
                ("mape", mape),
                ("rmse", rmse),
                ("r2", r2),
            ]
            for key, value in metric_pairs:
                if value is not None:
                    mlflow.log_metric(key, value)
                    logger.info(f"Logged {key}: {value}")

            # Infer the model signature
            signature = infer_signature(input_data, model.predict(input_data))
            logger.info("Model signature inferred.")

            # Determine the model name
            if not model_name:
                model_name = model.__class__.__name__

            # Store only a few rows as the input example; persisting the whole
            # dataset bloats the artifact. Objects without `.head` pass through.
            if hasattr(input_data, "head"):
                input_example = input_data.head(5)
            else:
                input_example = input_data

            # Log the model
            model_info = mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="model_artifact",
                signature=signature,
                input_example=input_example,
                registered_model_name=model_name,
            )
            logger.info(f"Model logged with name: {model_name}")
            return model_info

    except Exception as e:
        logger.error(f"An error occurred while logging to MLflow: {e}")
        raise


In [9]:
from dotenv import load_dotenv

# load_dotenv() (python-dotenv) reads variables from a .env file into the
# process environment so the lookup below can find the tracking URI.
load_dotenv()

# Indexing os.environ raises KeyError if the variable is not set.
uri = os.environ["MLFLOW_TRACKING_URI"]
print(uri)

https://dagshub.com/jaathavan18/citi_bike_pred.mlflow


In [10]:

from dotenv import load_dotenv

# Make sure the .env settings are in the environment before configuring MLflow.
load_dotenv()

# set_mlflow_tracking() returns the mlflow module, so this re-binds the same name.
mlflow = set_mlflow_tracking()

# Log the previous-hour baseline with its evaluation metrics. The call is the
# cell's last expression, so its return value is displayed as the cell output.
log_model_to_mlflow(
    model=baseline,
    input_data=X_test,
    experiment_name="BaselineModel",
    metric_name="mean_absolute_error",
    mae=mae,
    mape=mape,
    rmse=rmse,
    r2=r2,
)

INFO:__main__:MLflow tracking URI and credentials set.


https://dagshub.com/jaathavan18/citi_bike_pred.mlflow


2025/05/11 11:56:56 INFO mlflow.tracking.fluent: Experiment with name 'BaselineModel' does not exist. Creating a new experiment.
INFO:__main__:Experiment set to: BaselineModel
INFO:__main__:Logged mean_absolute_error: 103.47672955974842
INFO:__main__:Model signature inferred.
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1984.87it/s]
2025/05/11 11:57:02 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'BaselineModelPreviousHour' already exists. Creating a new version of this model...
2025/05/11 11:57:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: BaselineModelPreviousHour, version 2
Created version '2' of model 'BaselineModelPreviousHour'.

🏃 View run inquisitive-slug-926 at: https://dagshub.com/jaathavan18/citi_bike_pred.mlflow/#/experiments/11/runs/bc88da64c80c43398fed2a33694958fe
🧪 View experiment at: https://dagshub.com/jaathavan18/citi_bike_pred.mlflow/#/experiments/11


<mlflow.models.model.ModelInfo at 0x19f116ac810>