## Gradient Boosting Machine (GBM)

In [17]:
# import libraries
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [18]:
# Load your data
file_name = 'phy_cps.csv'
df = pd.read_csv(file_name, index_col=0)

# # Example: Create a sample multivariate time series data
# date_range = pd.date_range(start='1/1/2020', periods=100, freq='D')
# np.random.seed(42)
# data = {
#     'feature1': np.random.rand(100),
#     'feature2': np.random.rand(100),
#     'feature3': np.random.rand(100)
# }
# df = pd.DataFrame(data, index=date_range)

# Create lag features for each column
def create_lag_features(df, lag=1):
    for col in df.columns:
        df[f'{col}_lag{lag}'] = df[col].shift(lag)
    return df

df = create_lag_features(df, lag=1).dropna()
df

Unnamed: 0_level_0,Tank_1,Tank_2,Tank_3,Pump_1,Pump_2,Flow_sensor,Tank_1_lag1,Tank_2_lag1,Tank_3_lag1,Pump_1_lag1,Pump_2_lag1,Flow_sensor_lag1
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
09/04/2021 18:23,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
09/04/2021 18:23,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
09/04/2021 18:23,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
09/04/2021 18:23,0,0,0,1,1,0,0.0,0.0,0.0,0.0,0.0,0.0
09/04/2021 18:23,0,0,0,1,1,0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
09/04/2021 19:03,237,1873,248,0,1,0,188.0,1883.0,221.0,0.0,1.0,0.0
09/04/2021 19:03,292,1858,273,0,1,0,237.0,1873.0,248.0,0.0,1.0,0.0
09/04/2021 19:03,332,1845,296,0,1,0,292.0,1858.0,273.0,0.0,1.0,0.0
09/04/2021 19:03,358,1833,319,0,1,0,332.0,1845.0,296.0,0.0,1.0,0.0


In [19]:
## Feature Engineering
# Define target variables and features
# features = df.columns.tolist()
# features = ['Tank2OutFlow', 'Tank2.puddle', 'Tank3OutFlow', 'Tank2.level', 'wt3_valve']
# targets = ['Tank2OutFlow', 'Tank2.puddle', 'Tank3OutFlow', 'Tank2.level']
features = ['Tank_1', 'Tank_2', 'Tank_3', 'Pump_1', 'Pump_2', 'Flow_sensor']
targets = ['Tank_1', 'Tank_2', 'Tank_3', 'Pump_1', 'Pump_2', 'Flow_sensor']
# targets = [f'{col}_lag1' for col in df.columns]

# Split the data into training and testing sets
X = df[features]
y = df[targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=False)


In [20]:
# Model Training
# Train a separate model for each target variable
models = {}
for target in targets:
    model = LGBMRegressor()
    model.fit(X_train, y_train[target])
    models[target] = model


In [21]:
# Forecasting and Evaluation
# Forecast each target and evaluate
forecasts = {}
for target in targets:
    model = models[target]
    y_pred = model.predict(X_test)
    forecasts[target] = y_pred
    mse = mean_squared_error(y_test[target], y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test[target], y_pred)
    r2_sqr = r2_score(y_test[target], y_pred)
    print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}, R2 Score: {r2_sqr}')
    # print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}')

    
# Combine forecasts into a DataFrame
forecast_df = pd.DataFrame(forecasts, index=X_test.index)
print(forecast_df)


Errors for Tank_1: MSE: 11.443514763689743, RMSE: 3.3828264459900605. MAE: 2.0509517813326577, R2 Score: 0.9999741882545451
Errors for Tank_2: MSE: 8.745936725203837, RMSE: 2.9573529929996245. MAE: 2.043755470309985, R2 Score: 0.9999797615060919
Errors for Tank_3: MSE: 300487.63633007975, RMSE: 548.1675257894066. MAE: 247.6187418686941, R2 Score: 0.7741599846762658
Errors for Pump_1: MSE: 1.5773967661533137e-10, RMSE: 1.2559445712901958e-05. MAE: 1.1199785309035342e-05, R2 Score: 0.9999999992844654
Errors for Pump_2: MSE: 1.2180161307242104e-10, RMSE: 1.1036376809099127e-05. MAE: 9.836736127747248e-06, R2 Score: 0.999999999282017
Errors for Flow_sensor: MSE: 268.65485080053816, RMSE: 16.390694030471625. MAE: 10.813527910810775, R2 Score: 0.9999128124985013
                      Tank_1       Tank_2      Tank_3    Pump_1    Pump_2   
Time                                                                        
09/04/2021 18:43    0.429711     3.738246  132.228894  0.000007  0.000007  \
09

In [22]:
### With valueFlip dataset

### With Air Pollution dataset

In [23]:
# Load your data
file_name = 'pollution.csv'
df = pd.read_csv(file_name, index_col=0)

# # Example: Create a sample multivariate time series data
# date_range = pd.date_range(start='1/1/2020', periods=100, freq='D')
# np.random.seed(42)
# data = {
#     'feature1': np.random.rand(100),
#     'feature2': np.random.rand(100),
#     'feature3': np.random.rand(100)
# }
# df = pd.DataFrame(data, index=date_range)

# Create lag features for each column
def create_lag_features(df, lag=1):
    for col in df.columns:
        df[f'{col}_lag{lag}'] = df[col].shift(lag)
    return df

df = create_lag_features(df, lag=1).dropna()

## Feature Engineering
# Define target variables and features
# features = df.columns.tolist()
features = ['pollution', 'dew', 'temp', 'press', 'wnd_spd', 'snow', 'rain']
targets = ['pollution', 'dew', 'temp', 'press', 'wnd_spd']
# targets = [f'{col}_lag1' for col in df.columns]

# Split the data into training and testing sets
X = df[features]
y = df[targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Model Training
# Train a separate model for each target variable
models = {}
for target in targets:
    model = LGBMRegressor()
    model.fit(X_train, y_train[target])
    models[target] = model

# Forecasting and Evaluation
# Forecast each target and evaluate
forecasts = {}
for target in targets:
    model = models[target]
    y_pred = model.predict(X_test)
    forecasts[target] = y_pred
    mse = mean_squared_error(y_test[target], y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test[target], y_pred)
    # r2_sqr = r2_score(y_test[target], y_pred)
    # print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}, R2 Score: {r2_sqr}')
    print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}')

    
# Combine forecasts into a DataFrame
forecast_df = pd.DataFrame(forecasts, index=X_test.index)
print(forecast_df)



Errors for pollution: MSE: 8.669804212853634, RMSE: 2.944453126278908. MAE: 0.2916742186372812
Errors for dew: MSE: 0.11375473650622395, RMSE: 0.33727546087170934. MAE: 0.025088080115638615
Errors for temp: MSE: 0.001584045335745269, RMSE: 0.039800067031919295. MAE: 0.0020394805170559174
Errors for press: MSE: 4.216809203384775e-05, RMSE: 0.0064936963305845885. MAE: 0.0005867000620735346
Errors for wnd_spd: MSE: 0.810139139257657, RMSE: 0.9000772962683021. MAE: 0.13664994474818354
                  pollution        dew      temp        press     wnd_spd
date                                                                     
01/01/2014 00:00  24.003426 -19.999411  7.000121  1014.000049  141.906611
01/01/2014 01:00  52.999336 -19.999411  7.000121  1013.000044  147.697614
01/01/2014 02:00  64.997654 -19.999411  6.000141  1013.000044  150.922587
01/01/2014 03:00  70.004021 -19.999411  6.000141  1013.000044  154.609005
01/01/2014 04:00  79.004555 -17.999492  3.000245  1012.000096    0.895

In [24]:
print(End)

NameError: name 'End' is not defined

### With Water tank dataset (real-world data)

In [None]:
# Load your data
file_name = 'pollution.csv'
df = pd.read_csv(file_name, index_col=0)

# # Example: Create a sample multivariate time series data
# date_range = pd.date_range(start='1/1/2020', periods=100, freq='D')
# np.random.seed(42)
# data = {
#     'feature1': np.random.rand(100),
#     'feature2': np.random.rand(100),
#     'feature3': np.random.rand(100)
# }
# df = pd.DataFrame(data, index=date_range)

# Create lag features for each column
def create_lag_features(df, lag=1):
    for col in df.columns:
        df[f'{col}_lag{lag}'] = df[col].shift(lag)
    return df

df = create_lag_features(df, lag=1).dropna()

## Feature Engineering
# Define target variables and features
# features = df.columns.tolist()
features = ['pollution', 'dew', 'temp', 'press', 'wnd_spd', 'snow', 'rain']
targets = ['pollution', 'dew', 'temp', 'press', 'wnd_spd']
# targets = [f'{col}_lag1' for col in df.columns]

# Split the data into training and testing sets
X = df[features]
y = df[targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Model Training
# Train a separate model for each target variable
models = {}
for target in targets:
    model = LGBMRegressor()
    model.fit(X_train, y_train[target])
    models[target] = model

# Forecasting and Evaluation
# Forecast each target and evaluate
forecasts = {}
for target in targets:
    model = models[target]
    y_pred = model.predict(X_test)
    forecasts[target] = y_pred
    mse = mean_squared_error(y_test[target], y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test[target], y_pred)
    # r2_sqr = r2_score(y_test[target], y_pred)
    # print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}, R2 Score: {r2_sqr}')
    print(f'Errors for {target}: MSE: {mse}, RMSE: {rmse}. MAE: {mae}')

    
# Combine forecasts into a DataFrame
forecast_df = pd.DataFrame(forecasts, index=X_test.index)
print(forecast_df)

