<a href="https://www.kaggle.com/code/jackren000/lstm-predict-energy-behavior-of-prosumers?scriptVersionId=160972179" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
#### load the libraries
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dropout, Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.optimizers import Adam



In [2]:
#### set up T4 x 2 GPU
# check the available GPUs
gpus = tf.config.experimental.list_physical_devices('GPU')
print(f"Num GPUs Available: {len(gpus)}")

# set up MirroredStrategy, 
# which is an API provided by TensorFlow for distributed training across multiple GPUs on a single machine.
mirrored_strategy = tf.distribute.MirroredStrategy()

Num GPUs Available: 2


## Data Analysis
For more detailed data analysis, please refer to https://www.kaggle.com/code/jackren000/predict-energy-behavior-of-prosumers-dataanalysis

### Data Collection

In [3]:
#### update data directory path
DATA_DIR = '/kaggle/input/predict-energy-behavior-of-prosumers'

In [4]:
#### read the CSV files into DataFrames
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
gas_df = pd.read_csv(os.path.join(DATA_DIR, "gas_prices.csv"))
electricity_df = pd.read_csv(os.path.join(DATA_DIR, "electricity_prices.csv"))
client_df = pd.read_csv(os.path.join(DATA_DIR, "client.csv"))
fw_df = pd.read_csv(os.path.join(DATA_DIR, "forecast_weather.csv"))
hw_df = pd.read_csv(os.path.join(DATA_DIR, "historical_weather.csv"))

### Data Exploration

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2018352 entries, 0 to 2018351
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   county              int64  
 1   is_business         int64  
 2   product_type        int64  
 3   target              float64
 4   is_consumption      int64  
 5   datetime            object 
 6   data_block_id       int64  
 7   row_id              int64  
 8   prediction_unit_id  int64  
dtypes: float64(1), int64(7), object(1)
memory usage: 138.6+ MB


In [6]:
train.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
1,0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0
2,0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1
3,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
4,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2


In [7]:
# display datetime range
train['datetime'].unique()

array(['2021-09-01 00:00:00', '2021-09-01 01:00:00',
       '2021-09-01 02:00:00', ..., '2023-05-31 21:00:00',
       '2023-05-31 22:00:00', '2023-05-31 23:00:00'], dtype=object)

Note that in the `train.csv` dataset, the datetime change begins with the hour, followed by the day, and then the month.

Here is the pseudocode of `train.csv` dataset:

In [8]:

################## The pseudocode of the train dataset ##################
#for year in range(2021, 2024):  
#    for month in range(1, 13):  # Adjusted to correctly range from 1 to 12  
#        for hour in range(24):  
#           for county in range(15):  
#                for is_business in range(2):  # Adjusted to correctly range from 0 to 1  
#                    for product_type in range(4):  
#                        print(target)  
######################################################################

In [9]:
train['datetime'] = pd.to_datetime(train['datetime'])

In [10]:
# 'datetime' column is changed to datetime64[ns]
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2018352 entries, 0 to 2018351
Data columns (total 9 columns):
 #   Column              Dtype         
---  ------              -----         
 0   county              int64         
 1   is_business         int64         
 2   product_type        int64         
 3   target              float64       
 4   is_consumption      int64         
 5   datetime            datetime64[ns]
 6   data_block_id       int64         
 7   row_id              int64         
 8   prediction_unit_id  int64         
dtypes: datetime64[ns](1), float64(1), int64(7)
memory usage: 138.6 MB


In [11]:
train.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01,0,0,0
1,0,0,1,96.59,1,2021-09-01,0,1,0
2,0,0,2,0.0,0,2021-09-01,0,2,1
3,0,0,2,17.314,1,2021-09-01,0,3,1
4,0,0,3,2.904,0,2021-09-01,0,4,2


In [12]:
# display datetime range
train['datetime'].unique()

<DatetimeArray>
['2021-09-01 00:00:00', '2021-09-01 01:00:00', '2021-09-01 02:00:00',
 '2021-09-01 03:00:00', '2021-09-01 04:00:00', '2021-09-01 05:00:00',
 '2021-09-01 06:00:00', '2021-09-01 07:00:00', '2021-09-01 08:00:00',
 '2021-09-01 09:00:00',
 ...
 '2023-05-31 14:00:00', '2023-05-31 15:00:00', '2023-05-31 16:00:00',
 '2023-05-31 17:00:00', '2023-05-31 18:00:00', '2023-05-31 19:00:00',
 '2023-05-31 20:00:00', '2023-05-31 21:00:00', '2023-05-31 22:00:00',
 '2023-05-31 23:00:00']
Length: 15312, dtype: datetime64[ns]

In [13]:
train.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01,0,0,0
1,0,0,1,96.59,1,2021-09-01,0,1,0
2,0,0,2,0.0,0,2021-09-01,0,2,1
3,0,0,2,17.314,1,2021-09-01,0,3,1
4,0,0,3,2.904,0,2021-09-01,0,4,2


In [14]:
client_df.head()

Unnamed: 0,product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
0,1,0,108,952.89,0,2021-09-01,2
1,2,0,17,166.4,0,2021-09-01,2
2,3,0,688,7207.88,0,2021-09-01,2
3,0,0,5,400.0,1,2021-09-01,2
4,1,0,43,1411.0,1,2021-09-01,2


In [15]:
client_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41919 entries, 0 to 41918
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_type        41919 non-null  int64  
 1   county              41919 non-null  int64  
 2   eic_count           41919 non-null  int64  
 3   installed_capacity  41919 non-null  float64
 4   is_business         41919 non-null  int64  
 5   date                41919 non-null  object 
 6   data_block_id       41919 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 2.2+ MB


### Data Transformation

In [16]:
# preprocess data and generate essential features
def preprocess(train: pd.DataFrame, client: pd.DataFrame):
    """
    Preprocess the training and client data by merging, filling missing values,
    extracting datetime features, and scaling numerical features.
    """
    # reduce 'data_block_id' in the 'client' DataFrame by 2 to match 'train' DataFrame
    client['data_block_id'] -= 2

    # merge the client data with the train data
    train = train.merge(
        client[['installed_capacity', 'eic_count', 'data_block_id', 'county', 'is_business', 'product_type']],
        how='left',
        on=['data_block_id', 'county', 'is_business', 'product_type']
    )

    # fill missing values for 'installed_capacity' and 'eic_count' with 0
    train['installed_capacity'].fillna(0, inplace=True)
    train['eic_count'].fillna(0, inplace=True)

    # convert 'datetime' to datetime object and extract features
    train['datetime'] = pd.to_datetime(train['datetime'])
    train['month'] = train['datetime'].dt.month
    train['hour'] = train['datetime'].dt.hour
    train['dayofweek'] = train['datetime'].dt.dayofweek

    # set 'datetime' as the DataFrame index and sort it
    train.set_index('datetime', inplace=True)
    train.sort_index(inplace=True)

    # fill missing values in the 'target' column with 0
    train['target'].fillna(0, inplace=True)

    # initialize the MinMaxScaler and scale selected features
    scaler = MinMaxScaler(feature_range=(0, 1))
    train['installed_capacity'] = scaler.fit_transform(train[['installed_capacity']])
    train['eic_count'] = scaler.fit_transform(train[['eic_count']])
    train['target'] = scaler.fit_transform(train[['target']])

    return train, scaler

In [17]:
train, scaler = preprocess(train, client_df)

### Data Modeling

In [18]:
# drop non-feature columns from the dataset
X = train.drop(['target', 'row_id', 'data_block_id'], axis=1)
# isolate the target variable
y = train['target']

train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2018352 entries, 2021-09-01 00:00:00 to 2023-05-31 23:00:00
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   county              int64  
 1   is_business         int64  
 2   product_type        int64  
 3   target              float64
 4   is_consumption      int64  
 5   data_block_id       int64  
 6   row_id              int64  
 7   prediction_unit_id  int64  
 8   installed_capacity  float64
 9   eic_count           float64
 10  month               int32  
 11  hour                int32  
 12  dayofweek           int32  
dtypes: float64(3), int32(3), int64(7)
memory usage: 192.5 MB


In [19]:
# split the dataset into 70% training and 30% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# reshape features for LSTM: [samples, timesteps, features]
X_train = np.array(X_train).reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))

# print the shapes of the train and test data
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (1412846, 1, 10)
y_train shape: (1412846,)
X_test shape: (605506, 1, 10)
y_test shape: (605506,)


In [20]:
train.head()

Unnamed: 0_level_0,county,is_business,product_type,target,is_consumption,data_block_id,row_id,prediction_unit_id,installed_capacity,eic_count,month,hour,dayofweek
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-09-01,0,0,1,4.6e-05,0,0,0,0,0.049336,0.071193,9,0,2
2021-09-01,0,0,1,0.00624,1,0,1,0,0.049336,0.071193,9,0,2
2021-09-01,0,0,2,0.0,0,0,2,1,0.008615,0.011206,9,0,2
2021-09-01,0,0,2,0.001118,1,0,3,1,0.008615,0.011206,9,0,2
2021-09-01,0,0,3,0.000188,0,0,4,2,0.373189,0.453527,9,0,2


### Data Modeling

#### Model 1
Consider using stacked LSTM layers for their ability to represent complex patterns within time series data.

In [21]:
#### build the model architecture
with mirrored_strategy.scope():
    # define the LSTM model
    lst_model = Sequential()

    # first LSTM layer with dropout
    lst_model.add(LSTM(
        units=1024,
        return_sequences=True,
        activation='swish',
        input_shape=(X_train.shape[1], X_train.shape[2]),
    ))
    lst_model.add(Dropout(0.2))

    # second LSTM layer with dropout
    lst_model.add(LSTM(
        units=1024,
        return_sequences=True,
        activation='swish'
    ))
    lst_model.add(Dropout(0.2))

    # third LSTM layer with dropout, returning only the last output
    lst_model.add(LSTM(
        units=1024,
        return_sequences=False,
        activation='swish'
    ))
    lst_model.add(Dropout(0.2))

    # dense layer for output
    lst_model.add(Dense(units=1))

    custom_learning_rate = 0.002
    adam_optimizer = Adam(learning_rate=custom_learning_rate)

    # compile the model with the custom optimizer
    lst_model.compile(optimizer=adam_optimizer, loss='mean_absolute_error')

# print the model summary
lst_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 1024)           4239360   
                                                                 
 dropout (Dropout)           (None, 1, 1024)           0         
                                                                 
 lstm_1 (LSTM)               (None, 1, 1024)           8392704   
                                                                 
 dropout_1 (Dropout)         (None, 1, 1024)           0         
                                                                 
 lstm_2 (LSTM)               (None, 1024)              8392704   
                                                                 
 dropout_2 (Dropout)         (None, 1024)              0         
                                                                 
 dense (Dense)               (None, 1)                 1

In [22]:
#### train the model
# set up the EarlyStopping callback to monitor the validation loss
earlyStop = EarlyStopping(
    monitor="val_loss", 
    verbose=1,          # verbose mode will print out extra information
    mode='min',         # the training will stop when the quantity monitored has stopped decreasing
    patience=5          # number of epochs with no improvement after which training will be stopped
)

# fit the LSTM model to the training data
history = lst_model.fit(
    X_train, y_train,                       # training data and labels
    epochs=15,                             # maximum number of epochs to run
    batch_size=1024,                        # batch size for training
    validation_data=(X_test, y_test),       # validation data for evaluating the model
    callbacks=[earlyStop],                  # list of callbacks, in this case just EarlyStopping
    verbose=1,                              # verbose mode will print out extra information per epoch
    shuffle=False                           # don't shuffle the data, usually important in time series
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [23]:
#### calculate MAE after rescaling
# predict the y_test
y_pred = lst_model.predict(X_test)

# inverse scaling predictions and true values
y_pred_rescaled = scaler.inverse_transform(y_pred)
y_test_rescaled = scaler.inverse_transform(y_test.values.reshape(-1, 1))

# calculate MAE
true_mae = mean_absolute_error(y_test_rescaled, y_pred_rescaled)
print(f"True MAE: {true_mae}")

True MAE: 58.11881438518989


In [24]:
# save the model
lst_model.save('lstm.h5')

  saving_api.save_model(


### Submission

In [25]:
# adjust preprocess() to suit generated dataset
def preprocess(train: pd.DataFrame, client: pd.DataFrame):
    """
    Preprocesses the training and client data by merging, filling missing values,
    extracting datetime features, and scaling numerical features.
    """

    # merge the client data with the train data
    train = train.merge(
        client[['installed_capacity', 'eic_count', 'county', 'is_business', 'product_type']],
        how='left',
        on=['county', 'is_business', 'product_type']
    )

    # fill missing values for 'installed_capacity' and 'eic_count' with 0
    train['installed_capacity'].fillna(0, inplace=True)
    train['eic_count'].fillna(0, inplace=True)

    # convert 'datetime' to datetime object and extract features
    train['datetime'] = pd.to_datetime(train['datetime'])
    train['month'] = train['datetime'].dt.month
    train['hour'] = train['datetime'].dt.hour
    train['dayofweek'] = train['datetime'].dt.dayofweek

    # set 'datetime' as the DataFrame index and sort it
    train.set_index('datetime', inplace=True)
    train.sort_index(inplace=True)

    # initialize the MinMaxScaler and scale selected features
    scaler = MinMaxScaler(feature_range=(0, 1))
    train['installed_capacity'] = scaler.fit_transform(train[['installed_capacity']])
    train['eic_count'] = scaler.fit_transform(train[['eic_count']])

    return train, scaler

In [26]:
cd /kaggle/input/predict-energy-behavior-of-prosumers

/kaggle/input/predict-energy-behavior-of-prosumers


In [27]:
import enefit
# create an environment for testing
env = enefit.make_env()
# generate a test dataset (iterator)
iter_test = env.iter_test()

In [28]:
model = load_model('/kaggle/working/lstm.h5')

In [29]:
# # display generated dataset information
# counter = 0
# for (test, revealed_targets, client, historical_weather,
#         forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:
#     if counter == 0:
#         print(test.head(3))
#         print(revealed_targets.head(3))
#         print(client.head(3))
#         print(historical_weather.head(3))
#         print(forecast_weather.head(3))
#         print(electricity_prices.head(3))
#         print(gas_prices.head(3))
#         print(sample_prediction.head(3))
#     sample_prediction['target'] = 0
#     env.predict(sample_prediction)
#     counter += 1

In [30]:
cd /kaggle/working

/kaggle/working


In [31]:
counter = 0

for (test, revealed_targets, client, historical_weather, forecast_weather,
     electricity_prices, gas_prices, sample_prediction) in iter_test:
    test.rename(columns={'prediction_datetime': 'datetime'}, inplace=True)
    test, scaler = preprocess(test, client)
    
    
    # drop non-feature columns from the dataset
    X = test.drop(['row_id', 'currently_scored'], axis=1)
    # assign the matched dtype
    X = X.astype('float64')
    # reshape features for LSTM: [samples, timesteps, features]
    X = np.array(X).reshape((X.shape[0], 1, X.shape[1]))


    # ensure the data type is float32 unless your model was trained with float64
    sample_prediction['target'] = model.predict(X)
    sample_prediction['target'] = scaler.inverse_transform(sample_prediction[['target']])
    
    env.predict(sample_prediction)
    print(counter)
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
0
1
2
3


#### Model 2
Try reduce the overfitting by increase **dropout** percentage, decrease **stacked layer** number, and adjust learning rate.

In [32]:
# from tensorflow.keras.optimizers import Adam

In [33]:
# # define the LSTM model
# lst_model_2 = Sequential()

# # first LSTM layer with dropout
# lst_model_2.add(LSTM(
#     units=1024,
#     return_sequences=True,
#     activation='swish',
#     input_shape=(X_train.shape[1], X_train.shape[2]),
# ))
# lst_model_2.add(Dropout(0.3))

# # second LSTM layer with dropout
# lst_model_2.add(LSTM(
#     units=1024,
#     return_sequences=False,
#     activation='swish'
# ))
# lst_model_2.add(Dropout(0.3))

# # dense layer for output
# lst_model_2.add(Dense(units=1))

# # initialize the Adam optimizer with a custom learning rate
# custom_learning_rate = 0.02
# adam_optimizer = Adam(learning_rate=custom_learning_rate)

# # compile the model with the custom optimizer
# lst_model_2.compile(optimizer=adam_optimizer, loss='mean_absolute_error')

# # print the model summary
# lst_model_2.summary()

In [34]:
# # fit the LSTM model to the training data
# history_2 = lst_model_2.fit(
#     X_train, y_train,                       # training data and labels
#     epochs=100,                             # maximum number of epochs to run
#     batch_size=2048,                         # batch size for training
#     validation_data=(X_test, y_test),       # validation data for evaluating the model
#     callbacks=[earlyStop],                  # list of callbacks, in this case just EarlyStopping
#     verbose=2,                              # verbose mode will print out extra information per epoch
#     shuffle=False                           # don't shuffle the data, usually important in time series
# )

In [35]:
# mae = mean_absolute_error(lst_model.predict(test_X), test_y)
# print('Test MAE: %.3f' % mae)

In [36]:
# save model
# lst_model.save('lstm.h5')

### Model 3

Before we tried using 1 timestep for the models, this does not take the advantage of LSTM. Here, we try a 5.

In [37]:
#### read the CSV files into DataFrames
# train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))

# drop the non-feature columns to create the feature set X
# X = train.drop(['target', 'row_id', 'data_block_id'], axis=1)

# isolate the target variable to create the label set y
# y = train['target']

# print(X.shape)
# print(y.shape)

# X.head(20)


In [38]:
# # convert the DataFrame to a NumPy array if necessary
# X_values = X.values
# y_values = y.values

# # define a function to create sequences from the feature set X
# def create_sequences(X_values, y_values, look_back):
#     X_seq, y_seq = [], []
#     for i in range(len(X_values) - look_back):
#         # retrieve the input sequence
#         sequence = X_values[i:(i + look_back)]
#         X_seq.append(sequence)
#         # retrieve the corresponding target
#         target = y_values[i + look_back]
#         y_seq.append(target)
#     return np.array(X_seq), np.array(y_seq)

# # specify the look_back period
# # try 7!
# look_back = 7  

# # create sequences using the defined function
# X_seq, y_seq = create_sequences(X_values, y_values, look_back)

# print(X_seq.shape)  # This will print the shape of X_seq
# print(y_seq.shape)  # This will print the shape of y_seq

In [39]:
# import tensorflow as tf

# X_seq = tf.convert_to_tensor(X_seq, dtype=tf.float32)
# y_seq = tf.convert_to_tensor(y_seq, dtype=tf.float32)

In [40]:
# # X_seq.shape should be (num_samples, look_back, num_features)
# # y_seq.shape should be (num_samples,)

# # define the LSTM model architecture
# model = Sequential([
#     # LSTM layer with 50 units, input shape is based on the features and look_back
#     LSTM(50, input_shape=(X_seq.shape[1], X_seq.shape[2]), activation='relu'),
#     # Dense layer with one neuron for regression output
#     Dense(1)
# ])

# # compile the model with an optimizer and a loss function
# model.compile(optimizer='adam', loss='mean_squared_error')

# # train the model with the training data
# # use a validation split to monitor the model's performance on unseen data during training
# history = model.fit(
#     X_seq, y_seq,
#     epochs=100,                # Adjust the number of epochs based on your needs
#     batch_size=1024,            # Batch size can be adjusted based on computational resources
#     validation_split=0.2,     # 20% of the data will be used for validation
#     verbose=1                 # Set verbose to 0 for no output, 1 for progress bars, 2 for one line per epoch
# )


## Step 2: Try preprocessed data

Examine the data preprocessing steps detailed here: [Predict Energy Behavior of Prosumers - Data Analysis on Kaggle](https://www.kaggle.com/code/jackren000/predict-energy-behavior-of-prosumers-dataanalysis).

In [41]:
# # load the file
# train = pd.read_csv('/kaggle/input/enefit-processed-train/processed_train.csv')

In [42]:
# train.head(100)