In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

# Seed NumPy and TensorFlow so repeated runs start from the same random state.
np.random.seed(42)
tf.random.set_seed(42)

# Sliding-window sizes handed to create_dataset as (past, future):
# number of input rows per sample and number of target rows to forecast.
input_steps, output_steps = 3, 6


In [None]:
# Load the raw observations from the Excel workbook.
data_file = "data_readed.xlsx"
df = pd.read_excel(data_file)
print("DataFrame head:")
print(df.head())

# Select the 10 input features (assumed to sit in the 4th through 13th columns).
features = df.columns[3:13].tolist()
target_label = 'AWS'

# Parse the timestamp column and derive week / month / year calendar features.
df['datetime'] = pd.to_datetime(df['datetime'])
df['week'] = df['datetime'].dt.isocalendar().week
df['month'] = df['datetime'].dt.month
df['year'] = df['datetime'].dt.year

# Sort chronologically first so the forward fill below carries values forward in time.
df = df.sort_values(by='datetime').reset_index(drop=True)
time_features = ['week', 'month', 'year']
features += time_features

# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
# NOTE(review): NaNs at the very start of a column have nothing to fill from and remain.
df[features] = df[features].ffill()

# Min-max scale every feature column to [0, 1].
# NOTE(review): min/max are taken over the whole frame, including rows that later
# end up in the test split — confirm whether that is acceptable.
for col in features:
    min_val = df[col].min()
    value_range = df[col].max() - min_val
    if pd.notna(value_range) and value_range == 0:
        # Constant column (e.g. `year` when the data covers a single year):
        # the plain formula would compute 0/0 and turn the whole column into NaN.
        df[col] = 0.0
    else:
        df[col] = (df[col] - min_val) / value_range
df[features] = df[features].astype(np.float32)


In [None]:
def create_dataset(df, features, target_label, past=6, future=6):
    """Slice a frame into sliding (input window, target window) pairs.

    Sample ``i`` uses rows ``i .. i+past-1`` of ``df[features]`` as input and
    rows ``i+past .. i+past+future-1`` of ``df[target_label]`` as target.

    Args:
        df: DataFrame whose rows are already in the desired (time) order.
        features: Column label(s) used as model inputs.
        target_label: Column label(s) to forecast.
        past: Number of consecutive rows in each input window.
        future: Number of consecutive rows in each target window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. With a list of feature columns and a
        single target column the shapes are ``(n, past, n_features)`` and
        ``(n, future)``, where ``n = max(len(df) - past - future + 1, 0)``.
        When ``df`` is too short for even one window, correctly shaped empty
        arrays are returned so downstream ``X.shape[1]`` lookups still work.
    """
    feature_values = df[features].values
    target_values = df[target_label].values
    n_windows = max(len(df) - past - future + 1, 0)

    if n_windows == 0:
        # np.array([]) would collapse to shape (0,); keep the trailing dimensions.
        empty_X = np.empty((0, past) + feature_values.shape[1:], dtype=feature_values.dtype)
        empty_y = np.empty((0, future) + target_values.shape[1:], dtype=target_values.dtype)
        return empty_X, empty_y

    X = np.stack([feature_values[i:i + past] for i in range(n_windows)])
    y = np.stack([target_values[i + past:i + past + future] for i in range(n_windows)])
    return X, y

# Build the sliding-window samples from the preprocessed frame.
X, y = create_dataset(df, features, target_label, input_steps, output_steps)
print("Dataset shape:", X.shape, y.shape)

# Chronological 80/20 split (no shuffling, so the test windows come last).
split = int(0.8 * len(X))

# Neighbouring windows share target rows: window i and window i+k overlap in their
# targets whenever k < output_steps. Skipping output_steps - 1 windows after the
# training block keeps every test target row out of the training targets.
test_start = split + output_steps - 1
X_train, X_test = X[:split], X[test_start:]
y_train, y_test = y[:split], y[test_start:]


In [None]:
def create_model(input_shape, output_steps):
    """Build and compile the LSTM forecaster.

    Args:
        input_shape: Shape forwarded to the LSTM layer's ``input_shape`` argument.
        output_steps: Number of units in the final Dense layer (one per forecast step).

    Returns:
        A ``Sequential`` model (64-unit tanh LSTM followed by a Dense layer),
        compiled with the Adam optimizer and mean-squared-error loss.
    """
    layers = [
        LSTM(64, activation='tanh', input_shape=input_shape),
        Dense(output_steps),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss='mse')
    return model
# Instantiate one model only to print its layer / parameter summary.
sample_model = create_model(
    input_shape=X_train.shape[1:3],
    output_steps=output_steps,
)
sample_model.summary()


In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score

# Expanding-window cross-validation over the training samples.
# Adjacent sliding windows share target rows, so a gap of output_steps - 1 samples
# is left between each fold's training and validation block; without it the first
# validation targets would also appear among the training targets.
tscv = TimeSeriesSplit(n_splits=5, gap=output_steps - 1)
cv_mse = []
cv_r2 = []

for fold, (train_index, val_index) in enumerate(tscv.split(X_train), start=1):
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train[train_index], y_train[val_index]

    # A fresh model per fold so no weights carry over between folds.
    model_cv = create_model(input_shape=(X_tr.shape[1], X_tr.shape[2]), output_steps=output_steps)
    history_cv = model_cv.fit(
        X_tr, y_tr.astype(np.float32),
        epochs=50, batch_size=32,
        validation_data=(X_val, y_val.astype(np.float32)),
        verbose=0,
    )

    # Predict on this fold's validation block and score over all forecast steps.
    y_val_pred = model_cv.predict(X_val)
    mse_fold = mean_squared_error(y_val.flatten(), y_val_pred.flatten())
    r2_fold = r2_score(y_val.flatten(), y_val_pred.flatten())
    cv_mse.append(mse_fold)
    cv_r2.append(r2_fold)

    print(f"Fold {fold}: MSE = {mse_fold:.4f}, R2 = {r2_fold:.4f}")

print("CV Average MSE:", np.mean(cv_mse))
print("CV Average R2:", np.mean(cv_r2))


In [None]:
# Train the final model on the full training split.
final_model = create_model(input_shape=(X_train.shape[1], X_train.shape[2]), output_steps=output_steps)

# EarlyStopping is imported at the top of the notebook; wire it in so the held-out
# validation loss actually controls training. Training stops after 10 epochs without
# improvement and the best-scoring weights are restored.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = final_model.fit(
    X_train, y_train.astype(np.float32),  # same float32 cast as the CV cell
    epochs=100, batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop],
)
final_model.save('rain_prediction_model_v2.h5')


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the final LSTM on the held-out test windows (all forecast steps pooled).
y_pred = final_model.predict(X_test)
y_true_flat = y_test.ravel()
y_pred_flat = y_pred.ravel()

mse_test = mean_squared_error(y_true_flat, y_pred_flat)
mae_test = mean_absolute_error(y_true_flat, y_pred_flat)
r2_test = r2_score(y_true_flat, y_pred_flat)

print("Test MSE:", mse_test)
print("Test MAE:", mae_test)
print("Test R2:", r2_test)

# Plot the forecast for the first test window against the observed values.
fig_lstm, ax_lstm = plt.subplots(figsize=(10, 6))
ax_lstm.plot(y_test[0], marker='o', label='Actual')
ax_lstm.plot(y_pred[0], marker='x', label='Predicted')
ax_lstm.set_xlabel('Hour')
ax_lstm.set_ylabel('Rain value')
ax_lstm.set_title('Actual vs Predicted Rain for the first test sample')
ax_lstm.legend()
plt.show()


In [None]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Flatten each window from (samples, input_steps, features) to
# (samples, input_steps * features) so it fits a tabular regressor.
X_train_xgb = X_train.reshape(len(X_train), -1)
X_test_xgb = X_test.reshape(len(X_test), -1)

# One XGBoost regressor per forecast step, wrapped for multi-output regression.
base_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_reg = MultiOutputRegressor(base_regressor)
xgb_reg.fit(X_train_xgb, y_train)

# Predict on the test windows and score with all forecast steps pooled.
y_pred_xgb = xgb_reg.predict(X_test_xgb)
y_true_flat = y_test.ravel()
y_pred_xgb_flat = y_pred_xgb.ravel()

mse_xgb = mean_squared_error(y_true_flat, y_pred_xgb_flat)
mae_xgb = mean_absolute_error(y_true_flat, y_pred_xgb_flat)
r2_xgb = r2_score(y_true_flat, y_pred_xgb_flat)

print("XGB Test MSE:", mse_xgb)
print("XGB Test MAE:", mae_xgb)
print("XGB Test R2:", r2_xgb)

# Plot the XGBoost forecast for the first test window against the observed values.
fig_xgb, ax_xgb = plt.subplots(figsize=(10, 6))
ax_xgb.plot(y_test[0], marker='o', label='Actual')
ax_xgb.plot(y_pred_xgb[0], marker='x', label='Predicted XGB')
ax_xgb.set_xlabel('Hour')
ax_xgb.set_ylabel('Rain value')
ax_xgb.set_title('Actual vs Predicted Rain for the first test sample (XGB)')
ax_xgb.legend()
plt.show()


In [None]:
# Relies on y_pred (LSTM) and y_pred_xgb (XGBoost) from the earlier cells.
# The ensemble is the unweighted mean of the two forecasts.
y_pred_ensemble = 0.5 * (y_pred + y_pred_xgb)

# Score the combined forecast with all forecast steps pooled.
y_true_flat = y_test.ravel()
y_ens_flat = y_pred_ensemble.ravel()

mse_ensemble = mean_squared_error(y_true_flat, y_ens_flat)
mae_ensemble = mean_absolute_error(y_true_flat, y_ens_flat)
r2_ensemble = r2_score(y_true_flat, y_ens_flat)

print("Ensemble Test MSE:", mse_ensemble)
print("Ensemble Test MAE:", mae_ensemble)
print("Ensemble Test R2:", r2_ensemble)

# Plot the ensemble forecast for the first test window against the observed values.
fig_ens, ax_ens = plt.subplots(figsize=(10, 6))
ax_ens.plot(y_test[0], marker='o', label='Actual')
ax_ens.plot(y_pred_ensemble[0], marker='x', label='Ensemble Predicted')
ax_ens.set_xlabel('Hour')
ax_ens.set_ylabel('Rain value')
ax_ens.set_title('Actual vs Predicted Rain for the first test sample (Ensemble)')
ax_ens.legend()
plt.show()
