In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess_data(file_path):
    print("📥 Loading dataset...")
    df = pd.read_parquet(file_path)
    print("✅ Dataset loaded successfully.")

    print("🕒 Converting datetime columns...")
    df['started_at'] = pd.to_datetime(df['started_at'], format='mixed')
    df['ended_at'] = pd.to_datetime(df['ended_at'], format='mixed')  
    df['pickup_hour'] = df['started_at'].dt.floor('6H')
    df['location_id'] = df['start_station_id'].astype(str)

    print("⏱️ Calculating ride duration...")
    df['duration_minutes'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60.0

    print("📊 Aggregating target (trip counts)...")
    ride_counts = df.groupby(['pickup_hour', 'location_id']).size().reset_index(name='target')

    print("🔁 Creating 112 lag features (28 days × 4 bins/day)...")
    lagged_data = []
    for loc in ride_counts['location_id'].unique():
        loc_df = ride_counts[ride_counts['location_id'] == loc].sort_values('pickup_hour')
        for lag in range(1, 113):
            loc_df[f'target_lag_{lag}'] = loc_df['target'].shift(lag)
        lagged_data.append(loc_df)

    df_lagged = pd.concat(lagged_data)

    print("📅 Extracting time-based features...")
    df_lagged['hour'] = df_lagged['pickup_hour'].dt.hour
    df_lagged['day_of_week'] = df_lagged['pickup_hour'].dt.dayofweek
    df_lagged['month'] = df_lagged['pickup_hour'].dt.month
    df_lagged['is_weekend'] = df_lagged['day_of_week'].isin([5, 6]).astype(int)

    print("🧹 Dropping missing values...")
    df_lagged = df_lagged.dropna()

    print("✅ Preprocessing complete.")
    return df_lagged

def train_lgb_regression_model(df):
    features = [col for col in df.columns if col not in ['pickup_hour', 'target', 'location_id']]
    X = df[features]
    y = df['target']

    model = lgb.LGBMRegressor(random_state=42)
    tscv = TimeSeriesSplit(n_splits=5)

    rmse_scores, mae_scores, r2_scores, mape_scores = [], [], [], []

    print("🚀 Training LightGBM regression model...")
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
        mape_scores.append(np.mean(np.abs((y_test - y_pred) / y_test)) * 100)

    print("\n📊 Evaluation Metrics (5-fold CV):")
    print(f"➡️  Mean Absolute Error (MAE): {np.mean(mae_scores):.2f}")
    print(f"➡️  Mean Absolute Percentage Error (MAPE): {np.mean(mape_scores):.2f}%")
    print(f"➡️  Root Mean Squared Error (RMSE): {np.mean(rmse_scores):.2f}")
    print(f"➡️  R-squared (R²): {np.mean(r2_scores):.2f}")

    return model, features

if __name__ == "__main__":
    file_path = "BikeRide2024Top3Location.parquet"
    
    df_transformed = load_and_preprocess_data(file_path)
    df_transformed.to_parquet("transformeddata2024.parquet", index=False)
    
    print("📈 Training regression model...")
    reg_model, reg_features = train_lgb_regression_model(df_transformed)
    
    print("📌 Calculating feature importance...")
    feature_importance = pd.DataFrame({
        'feature': reg_features,
        'importance': reg_model.feature_importances_
    }).sort_values(by='importance', ascending=False)

    print("\n🏆 Top 5 Important Features:")
    print(feature_importance.head())


📥 Loading dataset...
✅ Dataset loaded successfully.
🕒 Converting datetime columns...
⏱️ Calculating ride duration...
📊 Aggregating target (trip counts)...
🔁 Creating 112 lag features (28 days × 4 bins/day)...
📅 Extracting time-based features...
🧹 Dropping missing values...
✅ Preprocessing complete.
📈 Training regression model...
🚀 Training LightGBM regression model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15237
[LightGBM] [Info] Number of data points in the train set: 664, number of used features: 116
[LightGBM] [Info] Start training from score 99.081325
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002508 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27808
[LightGBM] [Info] Number of data points in the train set: 1326, number of used features: 

In [2]:
import numpy as np
import pandas as pd

class BaselineModelPreviousHour:
    """
    A simple baseline model that uses the previous time step's value (e.g., rides_t-1)
    as the prediction for the current time step.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        # No training needed for baseline model
        pass

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        if "target_lag_1" not in X_test.columns:
            raise ValueError("X_test must contain 'target_lag_1' column.")
        return X_test["target_lag_1"].values


In [3]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load transformed data
df = pd.read_parquet("transformeddata2024.parquet")

# Define features and target
# features = [col for col in df.columns if col not in ['Pickup_hour', 'target', 'location_id']]
features = [col for col in df.columns if col not in ['pickup_hour', 'target', 'location_id']]
X = df[features]
y = df['target']

# Simulate train/test split (use last 20% as test)
split_idx = int(0.8 * len(df))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# Baseline model
baseline = BaselineModelPreviousHour()
baseline.fit(X_train, y_train)
y_pred = baseline.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("📊 BaselineModelPreviousHour Evaluation:")
print(f"➡️ MAE:  {mae:.2f}")
print(f"➡️ MAPE: {mape:.2f}%")
print(f"➡️ RMSE: {rmse:.2f}")
print(f"➡️ R²:   {r2:.2f}")


📊 BaselineModelPreviousHour Evaluation:
➡️ MAE:  103.48
➡️ MAPE: 485.11%
➡️ RMSE: 115.55
➡️ R²:   -0.75


In [4]:
print(X_train.dtypes)

target_lag_1      float64
target_lag_2      float64
target_lag_3      float64
target_lag_4      float64
target_lag_5      float64
                   ...   
target_lag_112    float64
hour                int32
day_of_week         int32
month               int32
is_weekend          int32
Length: 116, dtype: object


In [5]:
import lightgbm as lgb
import numpy as np
import pandas as pd

class LightGBMRegressorModel:
    """
    A wrapper around LightGBM Regressor for consistent interface.
    """

    def __init__(self, **kwargs):
        self.model = lgb.LGBMRegressor(random_state=42, **kwargs)

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        self.model.fit(X_train, y_train)

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        return self.model.predict(X_test)

    def feature_importance(self, feature_names: list) -> pd.DataFrame:
        return pd.DataFrame({
            "feature": feature_names,
            "importance": self.model.feature_importances_
        }).sort_values(by="importance", ascending=False)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load transformed data
df = pd.read_parquet("transformeddata2024.parquet")

# Define features and target
features = [col for col in df.columns if col not in ['pickup_hour', 'target', 'location_id']]
X = df[features]
y = df['target']

# Train/test split
split_idx = int(0.8 * len(df))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# Train LightGBM model
lgb_model = LightGBMRegressorModel()
lgb_model.fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("📊 LightGBM Model Evaluation:")
print(f"➡️ MAE:  {mae:.2f}")
print(f"➡️ MAPE: {mape:.2f}%")
print(f"➡️ RMSE: {rmse:.2f}")
print(f"➡️ R²:   {r2:.2f}")

# Feature importance
print("\n🏆 Top 5 Feature Importances:")
print(lgb_model.feature_importance(features).head())


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28586
[LightGBM] [Info] Number of data points in the train set: 3179, number of used features: 116
[LightGBM] [Info] Start training from score 102.463982
📊 LightGBM Model Evaluation:
➡️ MAE:  20.92
➡️ MAPE: 41.63%
➡️ RMSE: 30.53
➡️ R²:   0.88

🏆 Top 5 Feature Importances:
           feature  importance
0     target_lag_1         185
3     target_lag_4         140
113    day_of_week          90
1     target_lag_2          68
27   target_lag_28          54


In [6]:
import joblib

# Save the trained LightGBM model to a file
joblib.dump(lgb_model.model, "lightgbm_bikeride_model.joblib")
print("✅ Model saved to 'lightgbm_bikeride_model.joblib'")

✅ Model saved to 'lightgbm_bikeride_model.joblib'
