In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class LightGBMRegressorModel:
    """
    A wrapper around LightGBM Regressor for consistent interface.
    """
    def __init__(self, **kwargs):
        self.model = lgb.LGBMRegressor(random_state=42, **kwargs)

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        self.model.fit(X_train, y_train)

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        return self.model.predict(X_test)

    def feature_importance(self, feature_names: list) -> pd.DataFrame:
        return pd.DataFrame({
            "feature": feature_names,
            "importance": self.model.feature_importances_
        }).sort_values(by="importance", ascending=False)

# Function to evaluate model performance
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.where(y_true != 0, y_true, 1))) * 100  # Avoid division by zero
    return {"MAE": mae, "MAPE": mape, "RMSE": rmse, "R²": r2}

# Function to preprocess datetime columns
def preprocess_datetime_columns(df, columns):
    """
    Convert datetime columns to numerical features (hour, day_of_week, month, etc.).
    Returns a new DataFrame with transformed columns.
    """
    df_copy = df.copy()
    for col in columns:
        if col in df_copy.columns:
            df_copy[f"{col}_hour"] = df_copy[col].dt.hour
            df_copy[f"{col}_day_of_week"] = df_copy[col].dt.dayofweek
            df_copy[f"{col}_month"] = df_copy[col].dt.month
            df_copy[f"{col}_is_weekend"] = df_copy[col].dt.dayofweek.isin([5, 6]).astype(int)
            df_copy = df_copy.drop(columns=[col])  # Drop original datetime column
    return df_copy

# Load transformed data
df = pd.read_parquet("../transformeddata2024.parquet")

# Debug: Inspect column dtypes
print("🔍 Column dtypes before preprocessing:")
print(df.dtypes)

# Define features and target
exclude_cols = ['Pickup_hour', 'target', 'location_id']
features = [col for col in df.columns if col not in exclude_cols]
X = df[features]
y = df['target']

# Identify datetime columns
datetime_cols = X.select_dtypes(include=['datetime64', 'datetime64[ns]', 'datetime64[ns, UTC]']).columns.tolist()
if datetime_cols:
    print(f"⚠️ Found datetime columns: {datetime_cols}")
    X = preprocess_datetime_columns(X, datetime_cols)
    features = [col for col in X.columns]  # Update feature list after preprocessing
else:
    print("✅ No datetime columns found.")

# Debug: Verify dtypes after preprocessing
print("\n🔍 Column dtypes after preprocessing:")
print(X.dtypes)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 1: Train initial LightGBM model to get feature importance
initial_model = LightGBMRegressorModel()
initial_model.fit(X_train, y_train)

# Get feature importance and select top 10 features
feature_importance = initial_model.feature_importance(features)
top_10_features = feature_importance.head(10)["feature"].tolist()

print("\n🏆 Top 10 Feature Importances:")
print(feature_importance.head(10))

# Step 2: Filter data to top 10 features
X_train_top10 = X_train[top_10_features]
X_test_top10 = X_test[top_10_features]

# Step 3: Standardize data for PCA
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_top10)
X_test_scaled = scaler.transform(X_test_top10)

# Step 4: Apply PCA (retain 95% of variance)
pca = PCA(n_components=0.95)  # Keep components explaining 95% of variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"\n🌟 PCA Results:")
print(f"Number of components selected: {pca.n_components_}")
print(f"Explained variance ratio: {np.sum(pca.explained_variance_ratio_):.4f}")

# Step 5: Train LightGBM model on PCA-transformed data
pca_model = LightGBMRegressorModel()
pca_model.fit(X_train_pca, y_train)
y_pred_pca = pca_model.predict(X_test_pca)

# Evaluate PCA model
pca_metrics = evaluate_model(y_test, y_pred_pca)

print("\n📊 LightGBM Model Evaluation (Top 10 Features + PCA):")
print(f"➡️ MAE:  {pca_metrics['MAE']:.2f}")
print(f"➡️ MAPE: {pca_metrics['MAPE']:.2f}%")
print(f"➡️ RMSE: {pca_metrics['RMSE']:.2f}")
print(f"➡️ R²:   {pca_metrics['R²']:.2f}")

# Optional: Train and evaluate model on top 10 features without PCA for comparison
top10_model = LightGBMRegressorModel()
top10_model.fit(X_train_top10, y_train)
y_pred_top10 = top10_model.predict(X_test_top10)
top10_metrics = evaluate_model(y_test, y_pred_top10)

print("\n📊 LightGBM Model Evaluation (Top 10 Features, No PCA):")
print(f"➡️ MAE:  {top10_metrics['MAE']:.2f}")
print(f"➡️ MAPE: {top10_metrics['MAPE']:.2f}%")
print(f"➡️ RMSE: {top10_metrics['RMSE']:.2f}")
print(f"➡️ R²:   {top10_metrics['R²']:.2f}")

# Save the models and preprocessing objects (optional)
import joblib
joblib.dump(pca_model.model, "lightgbm_pca_model.joblib")
joblib.dump(top10_model.model, "lightgbm_top10_model.joblib")
joblib.dump(scaler, "scaler.joblib")
joblib.dump(pca, "pca.joblib")
print("\n📦 Models and preprocessing objects saved.")

🔍 Column dtypes before preprocessing:
pickup_hour       datetime64[ns]
location_id               object
target                     int64
target_lag_1             float64
target_lag_2             float64
                       ...      
target_lag_112           float64
hour                       int32
day_of_week                int32
month                      int32
is_weekend                 int32
Length: 119, dtype: object
⚠️ Found datetime columns: ['pickup_hour']

🔍 Column dtypes after preprocessing:
target_lag_1               float64
target_lag_2               float64
target_lag_3               float64
target_lag_4               float64
target_lag_5               float64
                            ...   
is_weekend                   int32
pickup_hour_hour             int32
pickup_hour_day_of_week      int32
pickup_hour_month            int32
pickup_hour_is_weekend       int32
Length: 120, dtype: object
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testi

In [4]:
import logging
import os

import mlflow
from mlflow.models import infer_signature

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def set_mlflow_tracking():
    """
    Set up MLflow tracking server credentials and URI.
    """
    uri = os.environ["MLFLOW_TRACKING_URI"]
    print(uri)
    mlflow.set_tracking_uri(uri)
    logger.info("MLflow tracking URI and credentials set.")

    return mlflow


def log_model_to_mlflow(
     model,
    input_data,
    experiment_name,
    metric_name="metric",
    model_name=None,
    params=None,
    mae=None,
    mape=None,
    rmse=None,
    r2=None
):
    """
    Log a trained model, parameters, and metrics to MLflow.

    Parameters:
    - model: Trained model object (e.g., sklearn model).
    - input_data: Input data used for training (for signature inference).
    - experiment_name: Name of the MLflow experiment.
    - metric_name: Name of the metric to log (e.g., "RMSE", "accuracy").
    - model_name: Optional name for the registered model.
    - params: Optional dictionary of hyperparameters to log.
    - score: Optional evaluation metric to log.
    """
    try:
        # Set the experiment
        mlflow.set_experiment(experiment_name)
        logger.info(f"Experiment set to: {experiment_name}")

        # Start an MLflow run
        with mlflow.start_run():
            # Log hyperparameters if provided
            if params:
                mlflow.log_params(params)
                logger.info(f"Logged parameters: {params}")

            # Log metrics if provided
            if mae is not None:
                mlflow.log_metric(metric_name, mae)
                mlflow.log_metric("mape", mape)
                mlflow.log_metric("rmse", rmse)
                mlflow.log_metric("r2", r2)
                logger.info(f"Logged {metric_name}: {mae}")

            # Infer the model signature
            signature = infer_signature(input_data, model.predict(input_data))
            logger.info("Model signature inferred.")

            # Determine the model name
            if not model_name:
                model_name = model.__class__.__name__

            # Log the model
            model_info = mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="model_artifact",
                signature=signature,
                input_example=input_data,
                registered_model_name=model_name,
            )
            logger.info(f"Model logged with name: {model_name}")
            return model_info

    except Exception as e:
        logger.error(f"An error occurred while logging to MLflow: {e}")
        raise

In [5]:
from dotenv import load_dotenv
load_dotenv() 

mlflow = set_mlflow_tracking()
log_model_to_mlflow(model=top10_model,
    input_data=X_test_top10,
    experiment_name="LightGbmModelWithOutPCA",
    metric_name="mean_absolute_error",
    mae=top10_metrics['MAE'],      
    mape=top10_metrics['MAPE'],
    rmse=top10_metrics['RMSE'],
    r2=top10_metrics['R²'])

INFO:__main__:MLflow tracking URI and credentials set.


https://dagshub.com/jaathavan18/citi_bike_pred.mlflow


2025/05/11 13:14:42 INFO mlflow.tracking.fluent: Experiment with name 'LightGbmModelWithOutPCA' does not exist. Creating a new experiment.
INFO:__main__:Experiment set to: LightGbmModelWithOutPCA
INFO:__main__:Logged mean_absolute_error: 21.528011210702736
INFO:__main__:Model signature inferred.
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1683.30it/s]
2025/05/11 13:14:49 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'LightGBMRegressorModel' already exists. Creating a new version of this model...
2025/05/11 13:14:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LightGBMRegressorModel, version 2


🏃 View run gifted-whale-368 at: https://dagshub.com/jaathavan18/citi_bike_pred.mlflow/#/experiments/12/runs/285a702f1a164a9c9d682ce2c51f584d
🧪 View experiment at: https://dagshub.com/jaathavan18/citi_bike_pred.mlflow/#/experiments/12


<mlflow.models.model.ModelInfo at 0x21779d5c490>

In [6]:
from dotenv import load_dotenv
load_dotenv() 

mlflow = set_mlflow_tracking()
log_model_to_mlflow(model=pca_model,
    input_data=X_test_pca,
    experiment_name="LightGbmModelWithOutPCA",
    metric_name="mean_absolute_error",
    mae=pca_metrics['MAE'],      
    mape=pca_metrics['MAPE'],
    rmse=pca_metrics['RMSE'],
    r2=pca_metrics['R²'])

INFO:__main__:MLflow tracking URI and credentials set.


https://dagshub.com/jaathavan18/citi_bike_pred.mlflow


INFO:__main__:Experiment set to: LightGbmModelWithOutPCA
INFO:__main__:Logged mean_absolute_error: 23.142474214359762
INFO:__main__:Model signature inferred.
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1750.34it/s]
Registered model 'LightGBMRegressorModel' already exists. Creating a new version of this model...
2025/05/11 13:15:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LightGBMRegressorModel, version 3
Created version '3' of model 'LightGBMRegressorModel'.
INFO:__main__:Model logged with name: LightGBMRegressorModel


🏃 View run merciful-stork-350 at: https://dagshub.com/jaathavan18/citi_bike_pred.mlflow/#/experiments/12/runs/cec0b27508b540389947b1fad1cae1ab
🧪 View experiment at: https://dagshub.com/jaathavan18/citi_bike_pred.mlflow/#/experiments/12


<mlflow.models.model.ModelInfo at 0x217028a0b10>