In [3]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
import time 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:


# 커스텀 BayesianLSTM 셀 구현
class BayesianLSTMCell(tf.keras.layers.LSTMCell):
    def __init__(self, units, **kwargs):
        super(BayesianLSTMCell, self).__init__(units, **kwargs)
        self.units = units
        self.kernel_posterior_fn = tfp.layers.default_mean_field_normal_fn()
        self.kernel_prior_fn = lambda dtype, shape, name, trainable, add_variable_fn: tfp.layers.default_multivariate_normal_fn(
            dtype=dtype, shape=shape, name=name, trainable=trainable, add_variable_fn=add_variable_fn)
        self.recurrent_kernel_posterior_fn = tfp.layers.default_mean_field_normal_fn()
        self.recurrent_kernel_prior_fn = lambda dtype, shape, name, trainable, add_variable_fn: tfp.layers.default_multivariate_normal_fn(
            dtype=dtype, shape=shape, name=name, trainable=trainable, add_variable_fn=add_variable_fn)

    def build(self, input_shape):
        self.kernel_posterior = self.kernel_posterior_fn(
            dtype=tf.float32,
            shape=[input_shape[-1], self.units * 4],
            name='kernel_posterior',
            trainable=True,
            add_variable_fn=self.add_weight
        )
        self.kernel_prior = self.kernel_prior_fn(
            dtype=tf.float32,
            shape=[input_shape[-1], self.units * 4],
            name='kernel_prior',
            trainable=True,
            add_variable_fn=self.add_weight
        )
        self.recurrent_kernel_posterior = self.recurrent_kernel_posterior_fn(
            dtype=tf.float32,
            shape=[self.units, self.units * 4],
            name='recurrent_kernel_posterior',
            trainable=True,
            add_variable_fn=self.add_weight
        )
        self.recurrent_kernel_prior = self.recurrent_kernel_prior_fn(
            dtype=tf.float32,
            shape=[self.units, self.units * 4],
            name='recurrent_kernel_prior',
            trainable=True,
            add_variable_fn=self.add_weight
        )
        super(BayesianLSTMCell, self).build(input_shape)

    def call(self, inputs, states, training=None):
        self.kernel = self.kernel_posterior.sample()
        self.recurrent_kernel = self.recurrent_kernel_posterior.sample()
        return super(BayesianLSTMCell, self).call(inputs, states, training=training)

def build_bayesian_lstm_model(input_shape):
    model = Sequential([
        Input(shape=input_shape),
        tf.keras.layers.RNN(BayesianLSTMCell(10)),
        Dense(1)
    ])
    # model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')
    # return model
    kl_divergence = sum(model.losses)
    model.compile(optimizer=Adam(learning_rate=0.01),
                  loss=lambda y_true, 
                  y_pred: tf.keras.losses.mean_squared_error(y_true, y_pred) + kl_divergence)
    return model

In [5]:

def load_and_prepare_data(data_file):
    """
    주어진 CSV 파일을 불러와 기본 전처리를 수행.
    
    매개변수:
    - data_file: CSV 파일의 경로
    
    Returns:
    - data: 전처리된 데이터프레임
    """
    
    # Load the data
    data = pd.read_csv(data_file)

    # Prepare the data
    data['Time'] = pd.to_datetime(data['Time'])


    data['Target_MHC_Water_Level'] = data['MHC_Water_Level'].shift(-3)

    # Fill NaN values in 'Target_MHC_Water_Level' with the last value of 'MHC_Water_Level'
    data['Target_MHC_Water_Level'].fillna(data['MHC_Water_Level'].iloc[-1], inplace=True)
    
    
    
    # # Add lag variables for MHC_Water_Level from t-1 to t-10
    # for i in range(3, 11):
    #     data[f'MHC_Water_Level_lag_{i}'] = data['MHC_Water_Level'].shift(i)
    # data.dropna(inplace=True)
    
    
    #Add lag variables for MHC_Water_Level from t-1 to t-10
    for i in range(3, 6):
        data[f'MHC_Water_Level_lag_{i}'] = data['MHC_Water_Level'].shift(i)
        data[f'MH_Water_Level_lag_{i}'] = data['MH_Water_Level'].shift(i)
        data[f'PG_Water_Level_lag_{i}'] = data['PG_Water_Level'].shift(i)
        data[f'HH_Water_Level_lag_{i}'] = data['HH_Water_Level'].shift(i)
        data[f'GG_Water_Level_lag_{i}'] = data['GG_Water_Level'].shift(i)
    data.dropna(inplace=True)


    


    return data

def split_data(data):
    """
    데이터를 학습 및 테스트 데이터로 분할.
    
    매개변수:
    - data: 전체 데이터프레임
    
    Returns:
    - train_data: 학습 데이터
    - test_data: 테스트 데이터
    """
    # Split data
    train_size = int(len(data) * 0.8)
    train_data = data.iloc[:train_size]
    test_data = data.iloc[train_size:]
    
    return train_data, test_data

In [6]:


def bayes_prepare_train_test_sets(train_data, test_data):
    X_train = train_data.drop(columns=['Time', 'Target_MHC_Water_Level']).values
    y_train = train_data['Target_MHC_Water_Level'].values
    X_test = test_data.drop(columns=['Time', 'Target_MHC_Water_Level']).values
    y_test = test_data['Target_MHC_Water_Level'].values
    return X_train, y_train, X_test, y_test

In [7]:

                data = load_and_prepare_data('water_data.csv')
                train_data, test_data = split_data(data)
                X_train, y_train, X_test, y_test = bayes_prepare_train_test_sets(train_data, test_data)
                
                
                scaler_X = MinMaxScaler(feature_range=(0, 1))
                scaler_y = MinMaxScaler(feature_range=(0, 1))
                X_train_scaled = scaler_X.fit_transform(X_train)
                X_test_scaled = scaler_X.transform(X_test)
                y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()
                
                model = build_bayesian_lstm_model(input_shape=(X_train_scaled.shape[1], 1))
                X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
                X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))
                model.fit(X_train_scaled, y_train_scaled, epochs=10, batch_size=32, verbose=1)

2024-08-12 12:15:53.077074: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-08-12 12:15:53.077247: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-08-12 12:15:53.077386: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-08-12 12:15:53.081192: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory
2024-08-12 12:15:53.081273: W te

Epoch 1/10
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff988366410>

In [8]:
bayes_pred_uncer =pd.read_csv('bayes_pred_uncer.csv')


In [69]:
def save_bayesian_visual_predictions_to_csv(model, 
                                            x,
                                            time_points, 
                                            scaler_y,
                                            n_iter=100):
    """
    예측 결과를 CSV 파일로 저장하는 함수
    
    매개변수:
    - model: 학습된 Bayesian LSTM 모델
    - x: 테스트 데이터의 특성 (정규화된 상태)
    - time_points: 시간대 포인트 (원래 시간 데이터)
    - scaler_y: 타겟 변수의 스케일러
    - n_iter: 예측 수행 횟수
    - filename: 저장할 CSV 파일 이름
    """

    
    # time_points = time_points[-3:]
    # print(time_points)
    
    predictions_scaled = np.zeros((len(time_points), n_iter))
    
    for j in range(n_iter):
        predictions_scaled[:, j] = model(x, training=True).numpy().flatten()
    
    # 예측 결과 역정규화
    predictions = scaler_y.inverse_transform(predictions_scaled)
    
    # 시간대와 예측 결과를 DataFrame으로 변환
    df_predictions = pd.DataFrame(
                                  predictions, 
                                  columns=[f'Prediction_{i+1}' for i in range(n_iter)])
    df_predictions.insert(0, 'Time', time_points)

    
    return df_predictions


In [71]:
import time


time_points = bayes_pred_uncer['Time']

# 시작 시간 기록
start_time = time.time()

# 실행할 코드
bayes_ppd_data = save_bayesian_visual_predictions_to_csv(model, 
                                                         X_test_scaled,
                                                         time_points, 
                                                         scaler_y,
                                                         n_iter=300)

# 종료 시간 기록
end_time = time.time()

# 경과 시간 계산
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")


Execution time: 61.48241329193115 seconds


In [72]:
bayes_ppd_data = bayes_ppd_data.assign(Time=time_points.values)


In [22]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import pandas as pd

# @tf.function
def predict_and_save_bayesian_visual_predictions(model, 
                                                 x, 
                                                 time_points, 
                                                 scaler_y, 
                                                 true_value,
                                                 n_iter=5000):
    """
    Bayesian LSTM 모델의 예측 결과를 여러 번 수행하여 불확실성과 신뢰 구간을 포함한
    예측 분포를 계산하고 이를 DataFrame으로 반환하는 함수.
    
    매개변수:
    - model: 학습된 Bayesian LSTM 모델
    - x: 테스트 데이터의 특성 (정규화된 상태)
    - time_points: 시간대 포인트 (원래 시간 데이터)
    - scaler_y: 타겟 변수의 스케일러
    - n_iter: 예측 수행 횟수
    
    반환값:
    - df_predictions: 예측 결과와 신뢰 구간을 포함한 DataFrame
    """

    # 여러 번의 예측 수행
    result = tf.TensorArray(tf.float32, size=n_iter)
    for i in tf.range(n_iter):
        result = result.write(i, model(x, training=True))
    result = result.stack()  # (n_iter, batch_size, output_size)
    
    # 평균 예측 값 계산
    prediction = tf.reduce_mean(result, axis=0)
    
    # 불확실성 계산 (표준편차)
    uncertainty = tf.math.reduce_std(result, axis=0)
    
    # 95% 신뢰구간 (credible interval) 계산
    lower_bound = tfp.stats.percentile(result, 2.5, axis=0)
    upper_bound = tfp.stats.percentile(result, 97.5, axis=0)
    
    # 예측 결과 역정규화
    prediction = scaler_y.inverse_transform(prediction.numpy())
    lower_bound = scaler_y.inverse_transform(lower_bound.numpy())
    upper_bound = scaler_y.inverse_transform(upper_bound.numpy())
    
    # 각 예측을 역정규화하고 DataFrame에 포함시키기 위해 loop
    predictions_scaled = np.zeros((len(time_points), n_iter))
    for j in range(n_iter):
        predictions_scaled[:, j] = scaler_y.inverse_transform(result[j].numpy().flatten().reshape(-1, 1)).flatten()

    # 시간대와 예측 결과를 DataFrame으로 변환
    df_predictions = pd.DataFrame(predictions_scaled, 
                                  columns=[f'Prediction_{i+1}' for i in range(n_iter)])
    df_predictions.insert(0, 'Time', time_points)
    df_predictions['Lower_Bound'] = lower_bound.flatten()
    df_predictions['Upper_Bound'] = upper_bound.flatten()
    df_predictions['Mean_Prediction'] = prediction.flatten()
    df_predictions['Uncertainty'] = uncertainty.numpy().flatten()
    df_predictions['True_value'] = true_value

    
    # 열 순서 변경: Mean_Prediction, Uncertainty, Lower_Bound, Upper_Bound, Time + 나머지 Prediction 열
    cols = ['Time', 'True_value', 'Mean_Prediction', 'Uncertainty', 'Lower_Bound', 'Upper_Bound'] + [col for col in df_predictions.columns if col.startswith('Prediction_')]
    df_predictions = df_predictions[cols]
    
    return df_predictions


In [14]:
test_data['MHC_Water_Level']

16022    1.50
16023    1.49
16024    1.50
16025    1.49
16026    1.49
         ... 
20022    1.51
20023    1.50
20024    1.50
20025    1.50
20026    1.49
Name: MHC_Water_Level, Length: 4005, dtype: float64

In [21]:
# 예측 및 불확실성 계산, Bayesian PPD 데이터 생성
time_points = test_data["Time"] + pd.Timedelta(hours=3)
bayes_ppd_data = predict_and_save_bayesian_visual_predictions(model, 
                                                              X_test_scaled, 
                                                              time_points,
                                                              scaler_y,
                                                              # true_value = test_data['MHC_Water_Level'], 
                                                              n_iter=50)

# # 예측 데이터 정리 및 저장
# bayes_pred_uncer = pd.DataFrame({
#     'Time': time_points,
#     "True_Value": test_data['MHC_Water_Level'].shift(-3),
#     "Prediction": bayes_ppd_data['Mean_Prediction'],
#     "Uncertainty": bayes_ppd_data['Uncertainty']
# })

In [17]:
bayes_ppd_data = bayes_ppd_data.assign(Time=time_points.values)

In [18]:
bayes_ppd_data

Unnamed: 0,Time,Mean_Prediction,Uncertainty,Lower_Bound,Upper_Bound,Prediction_1,Prediction_2,Prediction_3,Prediction_4,Prediction_5,...,Prediction_41,Prediction_42,Prediction_43,Prediction_44,Prediction_45,Prediction_46,Prediction_47,Prediction_48,Prediction_49,Prediction_50
0,2024-02-27 18:00:00,1.538655,0.001561,1.514522,1.564837,1.561734,1.560965,1.539598,1.523686,1.540223,...,1.544753,1.520570,1.560267,1.520646,1.549334,1.521666,1.543660,1.535542,1.524243,1.520331
1,2024-02-27 19:00:00,1.531852,0.001561,1.507713,1.557985,1.554882,1.554369,1.532951,1.516842,1.533418,...,1.537768,1.513883,1.553423,1.513901,1.542416,1.514757,1.536861,1.528666,1.517352,1.513580
2,2024-02-27 20:00:00,1.532189,0.001563,1.507949,1.558395,1.555318,1.554549,1.533150,1.517219,1.533839,...,1.538197,1.514169,1.553789,1.514261,1.542825,1.515106,1.537159,1.529060,1.517823,1.513758
3,2024-02-27 21:00:00,1.529958,0.001563,1.505636,1.556122,1.553142,1.552327,1.530916,1.514979,1.531645,...,1.535936,1.511957,1.551532,1.512063,1.540624,1.512861,1.534943,1.526857,1.515546,1.511516
4,2024-02-27 22:00:00,1.529198,0.001564,1.504930,1.555406,1.552385,1.551578,1.530155,1.514184,1.530884,...,1.535199,1.511189,1.550806,1.511306,1.539843,1.512095,1.534160,1.526049,1.514859,1.510716
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4000,2024-08-12 10:00:00,1.379095,0.001356,1.350863,1.403237,1.401853,1.389992,1.374061,1.363255,1.384079,...,1.380759,1.362036,1.397719,1.383117,1.386615,1.367756,1.384974,1.373151,1.372735,1.363863
4001,2024-08-12 11:00:00,1.377228,0.001356,1.348909,1.401379,1.400023,1.388170,1.372154,1.361411,1.382249,...,1.378847,1.360209,1.395856,1.381284,1.384706,1.365914,1.383103,1.371323,1.370803,1.361995
4002,2024-08-12 12:00:00,1.377502,0.001355,1.349223,1.401635,1.400277,1.388383,1.372456,1.361690,1.382509,...,1.379203,1.360388,1.396121,1.381549,1.385026,1.366188,1.383413,1.371519,1.371034,1.362301
4003,2024-08-12 13:00:00,1.376140,0.001355,1.347851,1.400322,1.398862,1.387080,1.371112,1.360341,1.381177,...,1.377731,1.359096,1.394772,1.380229,1.383674,1.364816,1.382009,1.370085,1.369700,1.360955


In [None]:
# # CSV 파일로 저장
# bayes_pred_uncer.to_csv('../streamlit/data/bayes_pred_uncer.csv', index=False)
# bayes_ppd_data.to_csv('../streamlit/data/bayesian_ppd_visual.csv', index=False)

# print('Bayesian LSTM Model Learning & Prediction Finish\n')