In [11]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings

In [12]:


def train_lgb_regression_model(df, n_splits=5):
    """
    Cross-validate a LightGBM regressor on ``df`` using ``TimeSeriesSplit``.

    Every column except 'pickup_hour', 'target' and 'location_id' is used as a
    feature, and 'target' is the value being predicted. One ``LGBMRegressor``
    instance is re-fitted on each fold's training rows and scored on that
    fold's test rows; the averaged metrics are printed at the end.

    Parameters:
    - df: DataFrame containing the feature columns and a 'target' column.
      NOTE(review): TimeSeriesSplit splits by row position, so the rows are
      assumed to already be in chronological order - confirm upstream.
    - n_splits: Number of TimeSeriesSplit folds (default 5).

    Returns:
    A tuple ``(model, features, X_test, rmse_scores, mae_scores, r2_scores,
    mape_scores)`` where
    - model: the regressor as fitted on the LAST fold's training rows (it is
      not refitted on the full frame, so it never saw the last test split).
    - features: list of the feature column names, in ``df`` column order.
    - X_test: feature rows of the last fold's test split.
    - rmse_scores, mae_scores, r2_scores, mape_scores: per-fold metric lists.
      MAPE is in percent and is computed only over rows whose target is
      non-zero; a fold whose targets are all zero contributes NaN.
    """
    features = [col for col in df.columns if col not in ['pickup_hour', 'target', 'location_id']]
    X = df[features]
    y = df['target']

    model = lgb.LGBMRegressor(random_state=42)
    tscv = TimeSeriesSplit(n_splits=n_splits)

    rmse_scores, mae_scores, r2_scores, mape_scores = [], [], [], []

    print("🚀 Training LightGBM regression model...")
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

        # Percentage error is undefined where the true value is 0; dividing
        # anyway would turn the whole fold's MAPE into inf/NaN, so those rows
        # are left out of the average.
        actual = np.asarray(y_test, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)
        nonzero = actual != 0
        if nonzero.any():
            pct_errors = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])
            mape_scores.append(np.mean(pct_errors) * 100)
        else:
            mape_scores.append(np.nan)

    # Ignore folds that had no usable rows for MAPE when averaging.
    finite_mape = [score for score in mape_scores if np.isfinite(score)]
    mean_mape = np.mean(finite_mape) if finite_mape else np.nan

    print(f"\n📊 Evaluation Metrics ({n_splits}-fold CV):")
    print(f"➡️  Mean Absolute Error (MAE): {np.mean(mae_scores):.2f}")
    print(f"➡️  Mean Absolute Percentage Error (MAPE): {mean_mape:.2f}%")
    print(f"➡️  Root Mean Squared Error (RMSE): {np.mean(rmse_scores):.2f}")
    print(f"➡️  R-squared (R²): {np.mean(r2_scores):.2f}")

    return model, features, X_test, rmse_scores, mae_scores, r2_scores, mape_scores

In [13]:
# Load the prepared dataset and run the cross-validated training.
print("📈 Training regression model...")
df_transformed = pd.read_parquet("transformeddata2024.parquet")
(
    reg_model,
    reg_features,
    X_test,
    rmse_scores,
    mae_scores,
    r2_scores,
    mape_scores,
) = train_lgb_regression_model(df_transformed)

# Pair each feature name with the importance reported by the fitted model,
# most important first.
print("📌 Calculating feature importance...")
feature_importance = pd.DataFrame(
    {'feature': reg_features, 'importance': reg_model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("\n🏆 Top 5 Important Features:")
print(feature_importance.head())

📈 Training regression model...
🚀 Training LightGBM regression model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15237
[LightGBM] [Info] Number of data points in the train set: 664, number of used features: 116
[LightGBM] [Info] Start training from score 99.081325
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002551 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27808
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 116
[LightGBM] [Info] Start training from score 100.471342
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28586
[LightGBM] [Info] Number of data points in t

In [14]:
import logging
import os

import mlflow
from mlflow.models import infer_signature

# Logger for this notebook's own messages, named after the running module.
logger = logging.getLogger(__name__)
# Root logging configuration at INFO level so the info messages below are emitted.
logging.basicConfig(level=logging.INFO)


def set_mlflow_tracking():
    """
    Point MLflow at the server named by the MLFLOW_TRACKING_URI environment variable.

    The URI is printed, handed to ``mlflow.set_tracking_uri`` and the ``mlflow``
    module is returned. This function itself only sets the URI.
    NOTE(review): credentials are presumably read by MLflow from other
    environment variables - confirm.

    Returns:
    - The ``mlflow`` module, after the tracking URI has been set.

    Raises:
    - KeyError: if MLFLOW_TRACKING_URI is not set.
    - ValueError: if MLFLOW_TRACKING_URI is set but blank.
    """
    try:
        uri = os.environ["MLFLOW_TRACKING_URI"]
    except KeyError:
        # Same exception type as before, but with a message that says what to do.
        raise KeyError(
            "MLFLOW_TRACKING_URI is not set; export it (or load it from a .env file) "
            "before calling set_mlflow_tracking()."
        ) from None

    # A blank value cannot name a tracking server, so fail here rather than
    # later inside MLflow.
    if not uri.strip():
        raise ValueError("MLFLOW_TRACKING_URI is set but empty.")

    print(uri)
    mlflow.set_tracking_uri(uri)
    logger.info("MLflow tracking URI and credentials set.")

    return mlflow


def log_model_to_mlflow(
    model,
    input_data,
    experiment_name,
    metric_name="metric",
    model_name=None,
    params=None,
    mae=None,
    mape=None,
    rmse=None,
    r2=None
):
    """
    Log a trained model, its parameters and its metrics to MLflow in one run.

    Parameters:
    - model: Trained model object exposing ``predict`` (logged with the
      ``mlflow.sklearn`` flavor).
    - input_data: Data passed to ``model.predict`` to infer the signature;
      it is also stored as the model's input example.
    - experiment_name: Name of the MLflow experiment to log under.
    - metric_name: Name under which ``mae`` is logged (default "metric").
    - model_name: Optional registered-model name; defaults to the model's
      class name.
    - params: Optional dictionary of hyperparameters to log.
    - mae: Optional value logged under ``metric_name``.
    - mape: Optional value logged as "mape".
    - rmse: Optional value logged as "rmse".
    - r2: Optional value logged as "r2".

    Each metric is logged independently: any that is ``None`` is skipped.

    Returns:
    - The object returned by ``mlflow.sklearn.log_model``.

    Raises:
    - Any exception raised while logging is written to the logger and re-raised.
    """
    try:
        # Set the experiment
        mlflow.set_experiment(experiment_name)
        logger.info(f"Experiment set to: {experiment_name}")

        # Start an MLflow run
        with mlflow.start_run():
            # Log hyperparameters if provided
            if params:
                mlflow.log_params(params)
                logger.info(f"Logged parameters: {params}")

            # Log each metric that was actually supplied. A list of pairs
            # (not a dict) is used so a caller-chosen metric_name that equals
            # one of the fixed names does not overwrite another entry.
            metrics = [
                (metric_name, mae),
                ("mape", mape),
                ("rmse", rmse),
                ("r2", r2),
            ]
            for name, value in metrics:
                if value is not None:
                    mlflow.log_metric(name, value)
                    logger.info(f"Logged {name}: {value}")

            # Infer the model signature
            signature = infer_signature(input_data, model.predict(input_data))
            logger.info("Model signature inferred.")

            # Determine the model name
            if not model_name:
                model_name = model.__class__.__name__

            # Log the model
            # NOTE(review): the whole of input_data is stored as the input
            # example; consider passing a few rows if it is large.
            model_info = mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="model_artifact",
                signature=signature,
                input_example=input_data,
                registered_model_name=model_name,
            )
            logger.info(f"Model logged with name: {model_name}")
            return model_info

    except Exception as e:
        logger.error(f"An error occurred while logging to MLflow: {e}")
        raise

In [15]:
from dotenv import load_dotenv

# Load variables from a .env file into the environment before MLflow is configured.
load_dotenv()

mlflow = set_mlflow_tracking()

# Cross-validation averages, rounded to two decimals, keyed by the keyword
# argument each one is passed as.
cv_averages = {
    "mae": round(np.mean(mae_scores), 2),
    "mape": round(np.mean(mape_scores), 2),
    "rmse": round(np.mean(rmse_scores), 2),
    "r2": round(np.mean(r2_scores), 2),
}

log_model_to_mlflow(
    model=reg_model,
    input_data=X_test,
    experiment_name="LightGbmWith5foldModel",
    metric_name="mean_absolute_error",
    **cv_averages,
)

INFO:__main__:MLflow tracking URI and credentials set.


https://dagshub.com/jaathavan18/citi_bike_pred.mlflow


2025/05/11 11:55:13 INFO mlflow.tracking.fluent: Experiment with name 'LightGbmWith5foldModel' does not exist. Creating a new experiment.
INFO:__main__:Experiment set to: LightGbmWith5foldModel
INFO:__main__:Logged mean_absolute_error: 22.67
INFO:__main__:Model signature inferred.
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 2339.27it/s]
2025/05/11 11:55:20 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/11 11:55:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 5
Created version '5' of model 'LGB

🏃 View run able-gnat-78 at: https://dagshub.com/jaathavan18/citi_bike_pred.mlflow/#/experiments/9/runs/fd0e8c4dbba748279fdab0f1ab91cd0a
🧪 View experiment at: https://dagshub.com/jaathavan18/citi_bike_pred.mlflow/#/experiments/9


<mlflow.models.model.ModelInfo at 0x1dc1bb624d0>