In [1]:
#### load the libraries
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dropout, Dense
from keras.callbacks import EarlyStopping

### Data Collection

In [2]:
#### directory holding the competition CSV files (adjust when running outside Kaggle)
DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers"

In [3]:
#### load train.csv from DATA_DIR into a DataFrame
train = pd.read_csv(filepath_or_buffer=os.path.join(DATA_DIR, 'train.csv'))

### Data Exploration

In [4]:
# summarize the columns, dtypes and memory usage of the freshly loaded frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2018352 entries, 0 to 2018351
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   county              int64  
 1   is_business         int64  
 2   product_type        int64  
 3   target              float64
 4   is_consumption      int64  
 5   datetime            object 
 6   data_block_id       int64  
 7   row_id              int64  
 8   prediction_unit_id  int64  
dtypes: float64(1), int64(7), object(1)
memory usage: 138.6+ MB


In [5]:
# preview the first five rows
train.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
1,0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0
2,0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1
3,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
4,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2


In [6]:
# list the distinct 'datetime' values (still object dtype at this point);
# the truncated output shows the first and last entries, i.e. the covered range
train['datetime'].unique()

array(['2021-09-01 00:00:00', '2021-09-01 01:00:00',
       '2021-09-01 02:00:00', ..., '2023-05-31 21:00:00',
       '2023-05-31 22:00:00', '2023-05-31 23:00:00'], dtype=object)

Note that the rows of the `train.csv` dataset are ordered chronologically: the hour changes fastest, followed by the day, and then the month.

Here is pseudocode describing the order in which the rows of the `train.csv` dataset appear:

In [7]:

################## The pseudocode of the train dataset ##################
#for year in range(2021, 2024):
#    for month in range(1, 13):  # months 1 to 12
#        for day in range(1, days_in_month + 1):
#            for hour in range(24):
#                for county in range(15):
#                    for is_business in range(2):  # 0 or 1
#                        for product in range(4):
#                            for is_consumption in range(2):  # 0 or 1, as seen in train.head()
#                                print(target)
######################################################################

### Data Transformation

In [8]:
# parse the 'datetime' strings into pandas timestamps, keeping the column in place
train = train.assign(datetime=pd.to_datetime(train['datetime']))

In [9]:
# verify the conversion: the 'datetime' column should now report dtype datetime64[ns]
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2018352 entries, 0 to 2018351
Data columns (total 9 columns):
 #   Column              Dtype         
---  ------              -----         
 0   county              int64         
 1   is_business         int64         
 2   product_type        int64         
 3   target              float64       
 4   is_consumption      int64         
 5   datetime            datetime64[ns]
 6   data_block_id       int64         
 7   row_id              int64         
 8   prediction_unit_id  int64         
dtypes: datetime64[ns](1), float64(1), int64(7)
memory usage: 138.6 MB


In [10]:
# preview the first five rows after the datetime conversion
train.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01,0,0,0
1,0,0,1,96.59,1,2021-09-01,0,1,0
2,0,0,2,0.0,0,2021-09-01,0,2,1
3,0,0,2,17.314,1,2021-09-01,0,3,1
4,0,0,3,2.904,0,2021-09-01,0,4,2


In [11]:
# list the distinct timestamps again, now as datetime64[ns] values;
# the first and last entries show the covered range
train['datetime'].unique()

<DatetimeArray>
['2021-09-01 00:00:00', '2021-09-01 01:00:00', '2021-09-01 02:00:00',
 '2021-09-01 03:00:00', '2021-09-01 04:00:00', '2021-09-01 05:00:00',
 '2021-09-01 06:00:00', '2021-09-01 07:00:00', '2021-09-01 08:00:00',
 '2021-09-01 09:00:00',
 ...
 '2023-05-31 14:00:00', '2023-05-31 15:00:00', '2023-05-31 16:00:00',
 '2023-05-31 17:00:00', '2023-05-31 18:00:00', '2023-05-31 19:00:00',
 '2023-05-31 20:00:00', '2023-05-31 21:00:00', '2023-05-31 22:00:00',
 '2023-05-31 23:00:00']
Length: 15312, dtype: datetime64[ns]

In [12]:
# Index the frame by timestamp and order it chronologically.
# sort_index() returns a new frame, so its result has to be assigned back
# (the bare call was a no-op). A stable sort (mergesort) keeps the existing
# row order among rows that share the same timestamp.
train = train.set_index('datetime').sort_index(kind='mergesort')

# Replace missing targets with 0. Assigning the filled column back is reliable,
# whereas an in-place fillna on a column selection is chained mutation that
# does not update the frame under pandas Copy-on-Write.
train['target'] = train['target'].fillna(0)

train.head()

Unnamed: 0_level_0,county,is_business,product_type,target,is_consumption,data_block_id,row_id,prediction_unit_id
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-09-01,0,0,1,0.713,0,0,0,0
2021-09-01,0,0,1,96.59,1,0,1,0
2021-09-01,0,0,2,0.0,0,0,2,1
2021-09-01,0,0,2,17.314,1,0,3,1
2021-09-01,0,0,3,2.904,0,0,4,2
