In [1]:
#### load the libraries
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dropout, Dense
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split



### Data Collection

In [2]:
#### update data directory path
DATA_DIR = '/kaggle/input/predict-energy-behavior-of-prosumers'

In [3]:
#### read the CSV files into DataFrames
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))

### Data Exploration

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2018352 entries, 0 to 2018351
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   county              int64  
 1   is_business         int64  
 2   product_type        int64  
 3   target              float64
 4   is_consumption      int64  
 5   datetime            object 
 6   data_block_id       int64  
 7   row_id              int64  
 8   prediction_unit_id  int64  
dtypes: float64(1), int64(7), object(1)
memory usage: 138.6+ MB


In [5]:
train.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
1,0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0
2,0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1
3,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
4,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2


In [6]:
# display datetime range
train['datetime'].unique()

array(['2021-09-01 00:00:00', '2021-09-01 01:00:00',
       '2021-09-01 02:00:00', ..., '2023-05-31 21:00:00',
       '2023-05-31 22:00:00', '2023-05-31 23:00:00'], dtype=object)

Note that in the `train.csv` dataset, the datetime change begins with the hour, followed by the day, and then the month.

Here is the pseudocode of `train.csv` dataset:

In [7]:

################## The pseudocode of the train dataset ##################
#for year in range(2021, 2024):  
#    for month in range(1, 13):  # Adjusted to correctly range from 1 to 12  
#        for hour in range(24):  
#           for county in range(15):  
#                for is_business in range(2):  # Adjusted to correctly range from 0 to 1  
#                    for product in range(4):  
#                        print(target)  
######################################################################

### Data Transformation

In [8]:
train['datetime'] = pd.to_datetime(train['datetime'])

In [9]:
# 'datetime' column is changed to datetime64[ns]
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2018352 entries, 0 to 2018351
Data columns (total 9 columns):
 #   Column              Dtype         
---  ------              -----         
 0   county              int64         
 1   is_business         int64         
 2   product_type        int64         
 3   target              float64       
 4   is_consumption      int64         
 5   datetime            datetime64[ns]
 6   data_block_id       int64         
 7   row_id              int64         
 8   prediction_unit_id  int64         
dtypes: datetime64[ns](1), float64(1), int64(7)
memory usage: 138.6 MB


In [10]:
train.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01,0,0,0
1,0,0,1,96.59,1,2021-09-01,0,1,0
2,0,0,2,0.0,0,2021-09-01,0,2,1
3,0,0,2,17.314,1,2021-09-01,0,3,1
4,0,0,3,2.904,0,2021-09-01,0,4,2


In [11]:
# display datetime range
train['datetime'].unique()

<DatetimeArray>
['2021-09-01 00:00:00', '2021-09-01 01:00:00', '2021-09-01 02:00:00',
 '2021-09-01 03:00:00', '2021-09-01 04:00:00', '2021-09-01 05:00:00',
 '2021-09-01 06:00:00', '2021-09-01 07:00:00', '2021-09-01 08:00:00',
 '2021-09-01 09:00:00',
 ...
 '2023-05-31 14:00:00', '2023-05-31 15:00:00', '2023-05-31 16:00:00',
 '2023-05-31 17:00:00', '2023-05-31 18:00:00', '2023-05-31 19:00:00',
 '2023-05-31 20:00:00', '2023-05-31 21:00:00', '2023-05-31 22:00:00',
 '2023-05-31 23:00:00']
Length: 15312, dtype: datetime64[ns]

In [12]:
# set index as ascending datetime
train.set_index('datetime', inplace=True)
train.sort_index()
train['target'].fillna(value=0, inplace=True)
# train.dropna(axis=0, inplace=True)
train.head()

Unnamed: 0_level_0,county,is_business,product_type,target,is_consumption,data_block_id,row_id,prediction_unit_id
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-09-01,0,0,1,0.713,0,0,0,0
2021-09-01,0,0,1,96.59,1,0,1,0
2021-09-01,0,0,2,0.0,0,0,2,1
2021-09-01,0,0,2,17.314,1,0,3,1
2021-09-01,0,0,3,2.904,0,0,4,2


In [13]:
# drop non-feature columns from the dataset
X = train.drop(['target', 'row_id', 'data_block_id'], axis=1)
# isolate the target variable
y = train['target']

# split the dataset into 70% training and 30% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# reshape features for LSTM: [samples, timesteps, features]
X_train = np.array(X_train).reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = np.array(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))

# print the shapes of the train and test data
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (1412846, 1, 5)
y_train shape: (1412846,)
X_test shape: (605506, 1, 5)
y_test shape: (605506,)


### Data Modeling

#### Model 1

In [14]:
# define the LSTM model
lst_model = Sequential()

# first LSTM layer with dropout
lst_model.add(LSTM(
    units=1024,
    return_sequences=True,
    activation='swish',
    input_shape=(X_train.shape[1], X_train.shape[2]),
))
lst_model.add(Dropout(0.2))

# second LSTM layer with dropout
lst_model.add(LSTM(
    units=1024,
    return_sequences=True,
    activation='swish'
))
lst_model.add(Dropout(0.2))

# third LSTM layer with dropout, returning only the last output
lst_model.add(LSTM(
    units=1024,
    return_sequences=False,
    activation='swish'
))
lst_model.add(Dropout(0.2))

# dense layer for output
lst_model.add(Dense(units=1))

# compile the model
lst_model.compile(optimizer='adam', loss='mean_absolute_error')

# print the model summary
lst_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 1024)           4218880   
                                                                 
 dropout (Dropout)           (None, 1, 1024)           0         
                                                                 
 lstm_1 (LSTM)               (None, 1, 1024)           8392704   
                                                                 
 dropout_1 (Dropout)         (None, 1, 1024)           0         
                                                                 
 lstm_2 (LSTM)               (None, 1024)              8392704   
                                                                 
 dropout_2 (Dropout)         (None, 1024)              0         
                                                                 
 dense (Dense)               (None, 1)                 1

In [15]:
# set up the EarlyStopping callback to monitor the validation loss
earlyStop = EarlyStopping(
    monitor="val_loss", 
    verbose=2,          # verbose mode will print out extra information
    mode='min',         # the training will stop when the quantity monitored has stopped decreasing
    patience=5          # number of epochs with no improvement after which training will be stopped
)

# fit the LSTM model to the training data
history = lst_model.fit(
    X_train, y_train,                       # training data and labels
    epochs=100,                             # maximum number of epochs to run
    batch_size=1024,                        # batch size for training
    validation_data=(X_test, y_test),       # validation data for evaluating the model
    callbacks=[earlyStop],                  # list of callbacks, in this case just EarlyStopping
    verbose=2,                              # verbose mode will print out extra information per epoch
    shuffle=False                           # don't shuffle the data, usually important in time series
)

Epoch 1/100
1380/1380 - 67s - loss: 184.8303 - val_loss: 151.9915 - 67s/epoch - 49ms/step
Epoch 2/100
1380/1380 - 64s - loss: 145.6404 - val_loss: 130.4620 - 64s/epoch - 46ms/step
Epoch 3/100
1380/1380 - 64s - loss: 137.1894 - val_loss: 129.1006 - 64s/epoch - 46ms/step
Epoch 4/100
1380/1380 - 64s - loss: 133.7388 - val_loss: 128.2595 - 64s/epoch - 46ms/step
Epoch 5/100
1380/1380 - 64s - loss: 132.4064 - val_loss: 130.6793 - 64s/epoch - 46ms/step
Epoch 6/100
1380/1380 - 64s - loss: 131.7563 - val_loss: 128.7879 - 64s/epoch - 46ms/step
Epoch 7/100
1380/1380 - 64s - loss: 131.3576 - val_loss: 128.9569 - 64s/epoch - 46ms/step
Epoch 8/100
1380/1380 - 64s - loss: 130.8578 - val_loss: 128.3190 - 64s/epoch - 46ms/step
Epoch 9/100
1380/1380 - 64s - loss: 130.7255 - val_loss: 129.0964 - 64s/epoch - 46ms/step
Epoch 9: early stopping


The early stopping callback stopped the training process because the validation loss did not decrease for several epochs, despite potential improvements in training loss. This divergence between training and validation performance is a classic sign of overfitting.

#### Model 2

reduce the overfitting by

In [16]:
from tensorflow.keras.optimizers import Adam

In [17]:
# define the LSTM model
lst_model_2 = Sequential()

# first LSTM layer with dropout
lst_model_2.add(LSTM(
    units=1024,
    return_sequences=True,
    activation='swish',
    input_shape=(X_train.shape[1], X_train.shape[2]),
))
lst_model_2.add(Dropout(0.3))

# second LSTM layer with dropout
lst_model_2.add(LSTM(
    units=1024,
    return_sequences=False,
    activation='swish'
))
lst_model_2.add(Dropout(0.3))

# dense layer for output
lst_model_2.add(Dense(units=1))

# initialize the Adam optimizer with a custom learning rate
custom_learning_rate = 0.02
adam_optimizer = Adam(learning_rate=custom_learning_rate)

# compile the model with the custom optimizer
lst_model_2.compile(optimizer=adam_optimizer, loss='mean_absolute_error')

# print the model summary
lst_model_2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 1, 1024)           4218880   
                                                                 
 dropout_3 (Dropout)         (None, 1, 1024)           0         
                                                                 
 lstm_4 (LSTM)               (None, 1024)              8392704   
                                                                 
 dropout_4 (Dropout)         (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 1)                 1025      
                                                                 
Total params: 12612609 (48.11 MB)
Trainable params: 12612609 (48.11 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
# fit the LSTM model to the training data
history_2 = lst_model_2.fit(
    X_train, y_train,                       # training data and labels
    epochs=100,                             # maximum number of epochs to run
    batch_size=2048,                         # batch size for training
    validation_data=(X_test, y_test),       # validation data for evaluating the model
    callbacks=[earlyStop],                  # list of callbacks, in this case just EarlyStopping
    verbose=2,                              # verbose mode will print out extra information per epoch
    shuffle=False                           # don't shuffle the data, usually important in time series
)

Epoch 1/100
690/690 - 42s - loss: 173.1839 - val_loss: 148.9331 - 42s/epoch - 61ms/step
Epoch 2/100
690/690 - 37s - loss: 159.1522 - val_loss: 137.3373 - 37s/epoch - 54ms/step
Epoch 3/100
690/690 - 37s - loss: 146.3663 - val_loss: 135.2917 - 37s/epoch - 54ms/step
Epoch 4/100
690/690 - 37s - loss: 143.6358 - val_loss: 140.6883 - 37s/epoch - 54ms/step
Epoch 5/100
690/690 - 37s - loss: 143.0775 - val_loss: 136.2490 - 37s/epoch - 54ms/step
Epoch 6/100
690/690 - 37s - loss: 142.4927 - val_loss: 133.2635 - 37s/epoch - 54ms/step
Epoch 7/100
690/690 - 37s - loss: 142.9490 - val_loss: 135.6977 - 37s/epoch - 54ms/step
Epoch 8/100
690/690 - 37s - loss: 140.7601 - val_loss: 131.0321 - 37s/epoch - 54ms/step
Epoch 9/100
690/690 - 37s - loss: 140.3795 - val_loss: 130.9897 - 37s/epoch - 54ms/step
Epoch 10/100
690/690 - 37s - loss: 138.8741 - val_loss: 131.8532 - 37s/epoch - 54ms/step
Epoch 11/100
690/690 - 37s - loss: 142.1758 - val_loss: 133.7669 - 37s/epoch - 54ms/step
Epoch 12/100
690/690 - 37s - l

In [19]:
# mae = mean_absolute_error(lst_model.predict(test_X), test_y)
# print('Test MAE: %.3f' % mae)

In [20]:
# save model
# lst_model.save('lstm.h5')

### Submission