In [1]:
test_dir = '../../5_min_complete_dataset_testing/'
train_dir = '../../5_min_complete_dataset/'

In [2]:
import os
import csv

In [3]:
test_dir_path = os.path.abspath(test_dir)
train_dir_path = os.path.abspath(train_dir)

In [4]:
import pandas as pd
from numpy import nan
from numpy import isnan
import numpy as np
from math import sqrt
from numpy import split
from numpy import array
import sys
from scipy.stats import randint
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv), data manipulation as in SQL
import matplotlib.pyplot as plt # this is used for the plot the graph 
import seaborn as sns # used for plot interactive graph. 
from sklearn.model_selection import train_test_split, KFold # to split the data into two parts
from sklearn.preprocessing import StandardScaler # for normalization
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline # pipeline making
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics # for the check the error and accuracy of the model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


#Deep learning specific

import keras
from keras import regularizers
import itertools
from keras.layers import Dense, Dropout, LSTM, Bidirectional, RepeatVector, TimeDistributed, Flatten, BatchNormalization
from keras.models import Sequential, load_model
from keras.utils import to_categorical
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import ConvLSTM2D

Using TensorFlow backend.


In [5]:
plt.rcParams['figure.figsize'] = [90, 45]
plt.rcParams['font.size'] = 40

In [6]:
def split_dataset(data, count):
    train = data[0:count]
    # restructure into windows of 60 min data
    train = array(split(train, len(train)/12))
    return train

In [7]:
def read_csv_file(filepath):
    df = pd.read_csv(filepath, sep = ',', parse_dates = {'dt' : ['time']}, infer_datetime_format=True, 
                 low_memory = False, na_values=['nan'], index_col = 'dt')
    df.drop(['gl_predict'], inplace = True, axis = 1)
    return df

In [8]:
def evaluate_forecasts(actual, predicted):
    print("Actual Shape : {}".format(actual.shape))
    print("Predicted Shape : {}".format(predicted.shape))
    scores = list()
    maes = list()
    for i in range(actual.shape[1]):
        mse = mean_squared_error(actual[:, i], predicted[:, i])
        mae = mean_absolute_error(actual[:, i], predicted[:, i])
        rmse = sqrt(mse)
        maes.append(mae)
        scores.append(rmse)
    s = 0
    for row in range(actual.shape[0]):
        for col in range(actual.shape[1]):
            s += (actual[row, col] - predicted[row, col])**2
    score = sqrt(s / (actual.shape[0] * actual.shape[1]))
    return score, scores, maes

In [9]:
def summarize_scores(name, score, scores, maes):
    rmse_05_min = scores[0]
    rmse_30_min = scores[5]
    rmse_60_min = scores[11]
    mae_05_min = maes[0]
    mae_30_min = maes[5]
    mae_60_min = maes[11]
    print("==========================")
    print("RMSE : ")
    print("5 Minutes : {0:0.1f}".format(rmse_05_min))
    print("30 Minutes : {0:0.1f}".format(rmse_30_min))
    print("60 Minutes : {0:0.1f}".format(rmse_60_min))
    print("==========================")
    print("==========================")
    print("MAE : ")
    print("5 Minutes : {0:0.1f}".format(mae_05_min))
    print("30 Minutes : {0:0.1f}".format(mae_30_min))
    print("60 Minutes : {0:0.1f}".format(mae_60_min))
    print("===========================")

In [10]:
def forecast(model, history, n_input):
    data = array(history)
    data = data.reshape((data.shape[0]*data.shape[1], data.shape[2]))
    # retrieve last observations for input data
    input_x = data[-n_input:, :]
    # reshape into [1, n_input, n]
    input_x = input_x.reshape((1, input_x.shape[0], input_x.shape[1]))
    # forecast the next 30 mins
    yhat = model.predict(input_x, verbose=0)
    # we only want the vector forecast
    yhat = yhat[0]
    return yhat

In [11]:
def evaluate_model(model, train, test, n_input):
    history = [x for x in train]
    # walk-forward validation over each week
    predictions = []
    for i in range(len(test)):
        # predict the 30 mins
        yhat_sequence = forecast(model, history, n_input)
        # store the predictions
        predictions.append(yhat_sequence)
        # get real observation and add to history for predicting the next 30 mins
        history.append(test[i, :])
    # evaluate predictions days for each 30 mins
    predictions = array(predictions)
    score, scores, maes = evaluate_forecasts(test[:, :, 0], predictions)

    return score, scores, maes, predictions, test[:, :, 0]

In [12]:
csv_train_file_path = [os.path.join(train_dir_path, item) for item in os.listdir(train_dir_path)]
csv_test_file_path = [os.path.join(test_dir_path, item) for item in os.listdir(test_dir_path)]

In [13]:
csv_test_file_path

['/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset_testing/584-ws-testing.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset_testing/540-ws-testing.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset_testing/544-ws-testing.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset_testing/552-ws-testing.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset_testing/596-ws-testing.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset_testing/567-ws-testing.csv']

In [14]:
csv_train_file_path

['/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset/567-ws-training.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset/563-ws-training.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset/540-ws-training.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset/570-ws-training.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset/559-ws-training.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset/575-ws-training.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset/544-ws-training.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset/596-ws-training.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submission/5_min_complete_dataset/591-ws-training.csv',
 '/KDH2020/KDH2020_BGLP_Challenge/practice/final_submis

In [15]:
train_df_552 = read_csv_file(csv_train_file_path[11])
test_df_552 = read_csv_file(csv_test_file_path[3])

train_df_552.drop(['meal_type_Lunch'], inplace = True, axis = 1)

In [16]:
train_values_552 = train_df_552.values
test_values_552 = test_df_552.values

train = split_dataset(train_values_552, 11088)
test = split_dataset(test_values_552, 3948)

print(train.shape)
print(test.shape)

(924, 12, 20)
(329, 12, 20)


In [17]:
model_552 = load_model('552_bi_lstm_encoder_decoder_final.h5')

In [18]:
score, scores, maes, predictions, actual = evaluate_model(model_552, train, test, 6)

Actual Shape : (329, 12)
Predicted Shape : (329, 12, 1)


In [19]:
summarize_scores('seq2seq_bilstm', score, scores, maes)

RMSE : 
5 Minutes : 5.8
30 Minutes : 14.1
60 Minutes : 24.8
MAE : 
5 Minutes : 4.1
30 Minutes : 10.6
60 Minutes : 19.0


In [20]:
pred = np.reshape(predictions, (predictions.shape[0]*predictions.shape[1]))

In [21]:
act = np.reshape(actual, (actual.shape[0]*actual.shape[1]))

In [22]:
#timestamp_col = ['{:%Y-%m-%d %H:%M:%S}'.format(item) for item in list(test_df_567.index.to_pydatetime())]
timestamp_col = [pd.to_datetime(item) for item in list(test_df_552.index.to_pydatetime())]

In [23]:
timestamp_col = timestamp_col[:-1]
len(timestamp_col)

3948

In [24]:
df = pd.DataFrame({
    'timestamp' : timestamp_col,
    'predicted_BGL' : pred,
    'actual_BGL' : act
})

In [25]:
df.head()

Unnamed: 0,timestamp,predicted_BGL,actual_BGL
0,2025-05-25 00:00:00,205.518219,181.0
1,2025-05-25 00:05:00,197.817764,180.0
2,2025-05-25 00:10:00,190.74884,178.0
3,2025-05-25 00:15:00,187.622757,178.0
4,2025-05-25 00:20:00,183.755463,176.0


In [26]:
df.to_csv('552_seq2seq_bi_lstm_5_min.csv', sep=' ', index = False, quoting=0)

In [27]:
df_30 = pd.DataFrame({
    'timestamp' : timestamp_col[::6],
    'predicted_BGL' : pred[::6],
    'actual_BGL' : act[::6]
})

In [28]:
df_30.head()

Unnamed: 0,timestamp,predicted_BGL,actual_BGL
0,2025-05-25 00:00:00,205.518219,181.0
1,2025-05-25 00:30:00,176.106873,168.0
2,2025-05-25 01:00:00,150.290741,153.0
3,2025-05-25 01:30:00,137.064606,129.0
4,2025-05-25 02:00:00,102.546745,105.0


In [29]:
df_30.to_csv('552_seq2seq_bi_lstm_30_min.csv', sep=' ', index = False, quoting=0)

In [30]:
df_60 = pd.DataFrame({
    'timestamp' : timestamp_col[::12],
    'predicted_BGL' : pred[::12],
    'actual_BGL' : act[::12]
})

In [31]:
df_60.head()

Unnamed: 0,timestamp,predicted_BGL,actual_BGL
0,2025-05-25 00:00:00,205.518219,181.0
1,2025-05-25 01:00:00,150.290741,153.0
2,2025-05-25 02:00:00,102.546745,105.0
3,2025-05-25 03:00:00,70.707893,69.0
4,2025-05-25 04:00:00,106.117508,109.0


In [32]:
df_60.to_csv('552_seq2seq_bi_lstm_60_min.csv', sep=' ', index = False, quoting=0)