In [66]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


folder_path = '../../data/processed/ETL/sensor_id=496'

df_sensor = pd.read_parquet(folder_path).set_index('timestamp')

# Extract hour, day, and month as integers
df_sensor["hour"] = df_sensor.index.hour
df_sensor["day"] = df_sensor.index.day
df_sensor["month"] = df_sensor.index.month
df_sensor["weekday"] = df_sensor.index.day_of_week

df_sensor.head()

Unnamed: 0_level_0,noise_db,hour,day,month,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-12-21 05:00:00,65.7,5,21,12,0
2023-07-05 00:00:00,62.5,0,5,7,2
2020-12-21 19:00:00,71.3,19,21,12,0
2023-07-07 17:00:00,70.4,17,7,7,4
2020-12-22 07:00:00,70.6,7,22,12,1


In [67]:
lockdown_start = '2020-03-14'
lockdown_end = '2020-06-07'

# Categorical values for pre, during and post lockdown periods
df_sensor['Lockdown'] = np.where(
    df_sensor.index < pd.to_datetime(lockdown_start), 'Pre-lockdown',
    np.where(
        df_sensor.index <= pd.to_datetime(lockdown_end), 'During lockdown', 'Post-lockdown'
    )
)
df_sensor.head()

Unnamed: 0_level_0,noise_db,hour,day,month,weekday,Lockdown
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-21 05:00:00,65.7,5,21,12,0,Post-lockdown
2023-07-05 00:00:00,62.5,0,5,7,2,Post-lockdown
2020-12-21 19:00:00,71.3,19,21,12,0,Post-lockdown
2023-07-07 17:00:00,70.4,17,7,7,4,Post-lockdown
2020-12-22 07:00:00,70.6,7,22,12,1,Post-lockdown


In [68]:
# One-hot-encoding for lockdown periods (categorical data to numerical)
df_sensor_encoded = pd.get_dummies(df_sensor, columns=['Lockdown'], prefix='lockdown')

# Prepare features (X) and target (y)
X = df_sensor_encoded.drop(columns=['noise_db'])
y = df_sensor_encoded['noise_db']

# Define the models to evaluate
model_definitions = {
    "Gradient Boosting Regressor": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
}

# Create a TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=3)

# Initialize lists to store results
all_results = []
all_predictions = []

# Iterate over each model
for model_name, model in model_definitions.items():
    print(f"Evaluating {model_name}...")
    fold_results = []
    predictions_list = []

    # Perform TimeSeriesSplit
    for fold, (train_index, test_index) in enumerate(tscv.split(X), start=1):
        # Split the data into train and test sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model on the current fold
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate metrics for the current fold
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mape = mean_absolute_percentage_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store fold results
        fold_results.append({"Model": model_name, "Fold": fold, "RMSE": rmse, "MAPE": mape, "R2": r2})

        # Save predictions
        fold_predictions = pd.DataFrame({
            "Model": model_name,
            "Fold": fold,
            "Actual": y_test,
            "Predicted": y_pred
        })
        predictions_list.append(fold_predictions)

    # Aggregate results for this model
    all_results.extend(fold_results)
    all_predictions.extend(predictions_list)

# Create a DataFrame for all results
all_results_df = pd.DataFrame(all_results)

# Ensure each DataFrame in `all_predictions` has a datetime index before concatenating
for df in all_predictions:
    if 'timestamp' in df.columns:  # Replace 'Timestamp' with the column representing datetime
        df.set_index('timestamp', inplace=True)

# Combine all predictions into a single DataFrame
all_predictions_df = pd.concat(all_predictions, ignore_index=False)

Evaluating Gradient Boosting Regressor...


In [69]:
all_predictions_df.drop(columns=['Fold', 'Model'], inplace=True)
all_predictions_df.sort_index(inplace=True)
all_predictions_df.head(10)

Unnamed: 0_level_0,Actual,Predicted
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-12-07 00:00:00,66.6,65.745972
2015-12-07 02:00:00,63.9,65.584679
2015-12-07 03:00:00,62.6,65.652319
2015-12-07 04:00:00,65.3,65.901424
2015-12-07 05:00:00,66.4,68.428865
2015-12-07 06:00:00,69.7,70.366837
2015-12-07 11:00:00,72.7,72.254227
2015-12-07 12:00:00,77.0,72.254227
2015-12-07 13:00:00,72.0,72.156587
2015-12-07 15:00:00,72.4,72.17302
