## Gradient Boosting Machine (GBM)

In [21]:
# import libraries
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [22]:
# Load your data
file_name = 'waterTank_Golden_reduced.csv'
df = pd.read_csv(file_name, index_col=0)

# # Example: Create a sample multivariate time series data
# date_range = pd.date_range(start='1/1/2020', periods=100, freq='D')
# np.random.seed(42)
# data = {
#     'feature1': np.random.rand(100),
#     'feature2': np.random.rand(100),
#     'feature3': np.random.rand(100)
# }
# df = pd.DataFrame(data, index=date_range)

# Create lag features for each column
def create_lag_features(df, lag=1):
    for col in df.columns:
        df[f'{col}_lag{lag}'] = df[col].shift(lag)
    return df

df = create_lag_features(df, lag=1).dropna()
df

Unnamed: 0_level_0,Tank2OutFlow,Tank2.puddle,Tank3OutFlow,Tank2.level,wt3_valve,Tank2OutFlow_lag1,Tank2.puddle_lag1,Tank3OutFlow_lag1,Tank2.level_lag1,wt3_valve_lag1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.05,0.000000,0.022568,0.0,0.000000,0,0.000000,0.022568,0.0,0.000000,0.0
0.10,0.000000,0.022568,0.0,0.000000,0,0.000000,0.022568,0.0,0.000000,0.0
0.15,0.000000,0.022568,0.0,0.000000,0,0.000000,0.022568,0.0,0.000000,0.0
0.20,0.000000,0.022568,0.0,0.000000,0,0.000000,0.022568,0.0,0.000000,0.0
0.25,0.000000,0.022568,0.0,0.000000,0,0.000000,0.022568,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...
749.80,0.500300,43.209799,0.0,1.941046,0,0.499005,43.209799,0.0,1.911106,0.0
749.85,0.501029,43.209799,0.0,1.961054,0,0.500300,43.209799,0.0,1.941046,0.0
749.90,0.500285,43.209799,0.0,1.991114,0,0.501029,43.209799,0.0,1.961054,0.0
749.95,0.499207,43.209799,0.0,2.011127,0,0.500285,43.209799,0.0,1.991114,0.0


In [23]:
## Feature Engineering
# Define target variables and features
# features = df.columns.tolist()
features = ['Tank2OutFlow', 'Tank2.puddle', 'Tank3OutFlow', 'Tank2.level', 'wt3_valve']
targets = ['Tank2OutFlow', 'Tank2.puddle', 'Tank3OutFlow', 'Tank2.level']
# targets = [f'{col}_lag1' for col in df.columns]

# Split the data into training and testing sets
X = df[features]
y = df[targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [24]:
# Model Training
# Train a separate model for each target variable
models = {}
for target in targets:
    model = LGBMRegressor()
    model.fit(X_train, y_train[target])
    models[target] = model


In [25]:
# Forecasting and Evaluation
# Forecast each target and evaluate
forecasts = {}
for target in targets:
    model = models[target]
    y_pred = model.predict(X_test)
    forecasts[target] = y_pred
    mse = mean_squared_error(y_test[target], y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test[target], y_pred)
    # r2_sqr = r2_score(y_test[target], y_pred)
    # print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}, R2 Score: {r2_sqr}')
    print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}')

    
# Combine forecasts into a DataFrame
forecast_df = pd.DataFrame(forecasts, index=X_test.index)
print(forecast_df)


Errors for Tank2OutFlow: MSE: 1.5328625254191893e-11, RMSE: 3.915178827868772e-06. MAE: 2.1347400363447844e-06
Errors for Tank2.puddle: MSE: 7.490499722816562, RMSE: 2.736877732529636. MAE: 2.3824318838731617
Errors for Tank3OutFlow: MSE: 3.1370336005609033e-07, RMSE: 0.0005600922781614564. MAE: 0.00012492862017496383
Errors for Tank2.level: MSE: 7.064925113387436e-06, RMSE: 0.002657992684976284. MAE: 0.0014651778800981816
        Tank2OutFlow  Tank2.puddle  Tank3OutFlow  Tank2.level
time                                                         
600.05      0.500299     38.573176      0.000013     1.431104
600.10      0.500992     38.573176      0.000013     1.461095
600.15      0.500284     38.573176      0.000013     1.480811
600.20      0.498973     38.573176      0.000013     1.510724
600.25      0.499270     38.573176      0.000013     1.531147
...              ...           ...           ...          ...
749.80      0.500299     38.573176      0.000013     1.938794
749.85      0.5

In [26]:
### With valueFlip dataset

### With Air Pollution dataset

In [27]:
# Load your data
file_name = 'pollution.csv'
df = pd.read_csv(file_name, index_col=0)

# # Example: Create a sample multivariate time series data
# date_range = pd.date_range(start='1/1/2020', periods=100, freq='D')
# np.random.seed(42)
# data = {
#     'feature1': np.random.rand(100),
#     'feature2': np.random.rand(100),
#     'feature3': np.random.rand(100)
# }
# df = pd.DataFrame(data, index=date_range)

# Create lag features for each column
def create_lag_features(df, lag=1):
    for col in df.columns:
        df[f'{col}_lag{lag}'] = df[col].shift(lag)
    return df

df = create_lag_features(df, lag=1).dropna()

## Feature Engineering
# Define target variables and features
# features = df.columns.tolist()
features = ['pollution', 'dew', 'temp', 'press', 'wnd_spd', 'snow', 'rain']
targets = ['pollution', 'dew', 'temp', 'press', 'wnd_spd']
# targets = [f'{col}_lag1' for col in df.columns]

# Split the data into training and testing sets
X = df[features]
y = df[targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Model Training
# Train a separate model for each target variable
models = {}
for target in targets:
    model = LGBMRegressor()
    model.fit(X_train, y_train[target])
    models[target] = model

# Forecasting and Evaluation
# Forecast each target and evaluate
forecasts = {}
for target in targets:
    model = models[target]
    y_pred = model.predict(X_test)
    forecasts[target] = y_pred
    mse = mean_squared_error(y_test[target], y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test[target], y_pred)
    # r2_sqr = r2_score(y_test[target], y_pred)
    # print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}, R2 Score: {r2_sqr}')
    print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}')

    
# Combine forecasts into a DataFrame
forecast_df = pd.DataFrame(forecasts, index=X_test.index)
print(forecast_df)



Errors for pollution: MSE: 8.669804212853634, RMSE: 2.944453126278908. MAE: 0.2916742186372812
Errors for dew: MSE: 0.11375473650622395, RMSE: 0.33727546087170934. MAE: 0.025088080115638615
Errors for temp: MSE: 0.001584045335745269, RMSE: 0.039800067031919295. MAE: 0.0020394805170559174
Errors for press: MSE: 4.216809203384775e-05, RMSE: 0.0064936963305845885. MAE: 0.0005867000620735346
Errors for wnd_spd: MSE: 0.810139139257657, RMSE: 0.9000772962683021. MAE: 0.13664994474818354
                  pollution        dew      temp        press     wnd_spd
date                                                                     
01/01/2014 00:00  24.003426 -19.999411  7.000121  1014.000049  141.906611
01/01/2014 01:00  52.999336 -19.999411  7.000121  1013.000044  147.697614
01/01/2014 02:00  64.997654 -19.999411  6.000141  1013.000044  150.922587
01/01/2014 03:00  70.004021 -19.999411  6.000141  1013.000044  154.609005
01/01/2014 04:00  79.004555 -17.999492  3.000245  1012.000096    0.895

### With Water tank dataset (real-world data)

In [None]:
# Load your data
file_name = 'pollution.csv'
df = pd.read_csv(file_name, index_col=0)

# # Example: Create a sample multivariate time series data
# date_range = pd.date_range(start='1/1/2020', periods=100, freq='D')
# np.random.seed(42)
# data = {
#     'feature1': np.random.rand(100),
#     'feature2': np.random.rand(100),
#     'feature3': np.random.rand(100)
# }
# df = pd.DataFrame(data, index=date_range)

# Create lag features for each column
def create_lag_features(df, lag=1):
    for col in df.columns:
        df[f'{col}_lag{lag}'] = df[col].shift(lag)
    return df

df = create_lag_features(df, lag=1).dropna()

## Feature Engineering
# Define target variables and features
# features = df.columns.tolist()
features = ['pollution', 'dew', 'temp', 'press', 'wnd_spd', 'snow', 'rain']
targets = ['pollution', 'dew', 'temp', 'press', 'wnd_spd']
# targets = [f'{col}_lag1' for col in df.columns]

# Split the data into training and testing sets
X = df[features]
y = df[targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Model Training
# Train a separate model for each target variable
models = {}
for target in targets:
    model = LGBMRegressor()
    model.fit(X_train, y_train[target])
    models[target] = model

# Forecasting and Evaluation
# Forecast each target and evaluate
forecasts = {}
for target in targets:
    model = models[target]
    y_pred = model.predict(X_test)
    forecasts[target] = y_pred
    mse = mean_squared_error(y_test[target], y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test[target], y_pred)
    # r2_sqr = r2_score(y_test[target], y_pred)
    # print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}, R2 Score: {r2_sqr}')
    print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}')

    
# Combine forecasts into a DataFrame
forecast_df = pd.DataFrame(forecasts, index=X_test.index)
print(forecast_df)

