In [None]:
import numpy as np

import pandas as pd

from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import LSTM, Dense, Dropout

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split


In [None]:
# Location of the merged input CSV; replace the path to suit your environment.
csv_path = '/kaggle/input/rain-forest/MERGE_DATA_HANDING_NULL.csv'
data = pd.read_csv(csv_path)
# Keep the preview as the cell's last expression so the notebook renders it.
data.head()


In [None]:
# Columns identifying one location; each (lat, lon) pair is handled as its own series.
group_keys = ['lat', 'lon']

# Parse the timestamp strings, then order the rows by location and time so that
# "next row within a group" means "next recorded time at that location".
data = (
    data
    .assign(time=pd.to_datetime(data['time'], format="%m/%d/%Y %H:%M"))
    .sort_values(by=group_keys + ['time'])
    .reset_index(drop=True)
)

# Target column: the AWS value of the following row in the same (lat, lon) group.
# NOTE(review): this treats consecutive rows as consecutive time steps; gaps in
# the time axis are not detected here - confirm the data has none.
data = data.assign(rainfall=data.groupby(group_keys)['AWS'].shift(-1))

# Rows without a next value (e.g. the last row of every group) have no target.
# NOTE: re-running this cell on the already processed frame drops further rows.
data = data.dropna(subset=['rainfall']).reset_index(drop=True)

data.head(5)

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

def prepare_data(data, n_steps, times):
    """Cut one series into fixed-length input windows with their targets.

    For every row index ``i`` in ``range(n_steps, len(data))`` one sample is
    produced: the inputs are rows ``i - n_steps .. i - 1`` of all columns
    except the last, the target is the last column of row ``i`` and the
    time stamp is ``times[i]``.

    Parameters
    ----------
    data : 2-D array-like of shape (n_rows, n_columns)
        Feature columns followed by the target in the last column.
    n_steps : int
        Number of consecutive rows in each input window.
    times : 1-D array-like of length n_rows
        Time stamp belonging to each row of ``data``.

    Returns
    -------
    X : ndarray of shape (n_samples, n_steps, n_columns - 1)
    y : ndarray of shape (n_samples,)
    time_stamps : ndarray of shape (n_samples,)

    When the series has ``n_steps`` rows or fewer, ``n_samples`` is 0 and the
    arrays are still returned with the shapes above, so the results of several
    series can always be concatenated along axis 0.
    """
    data = np.asarray(data)
    times = np.asarray(times)
    n_rows = data.shape[0]

    if n_rows <= n_steps:
        # Too short for a single window: return empty arrays with the right rank
        # instead of the shape-(0,) array that np.array([]) would give.
        return (
            np.empty((0, n_steps, data.shape[1] - 1), dtype=data.dtype),
            np.empty((0,), dtype=data.dtype),
            times[:0],
        )

    target_rows = np.arange(n_steps, n_rows)
    # Inputs: the n_steps rows preceding each target row, target column excluded.
    X = np.stack([data[i - n_steps:i, :-1] for i in target_rows])
    # Targets: the last column at each target row.
    y = data[target_rows, -1]
    # Time stamp stored alongside each sample.
    time_stamps = times[target_rows]
    return X, y, time_stamps

# Number of consecutive rows that form one model input window.
n_steps = 6

# Columns handed to prepare_data: every column except the last is a model input,
# the last one ('rainfall') is the prediction target.
window_columns = ['AWS', 'ERA5_TCW', 'ERA5_U850', 'ERA5_EWSS', 'ERA5_V850',
                  'ERA5_TCLW', 'ERA5_U250', 'ERA5_R850', 'ERA5_R500', 'ERA5_CAPE',
                  'ERA5_KX', 'ERA5_V250', 'ERA5_R250', 'rainfall']

lat_lon_time_all = []
X_all, y_all = [], []

# Windows are built per (lat, lon) group so that no window spans two locations.
# NOTE(review): 'rainfall' was defined as the next row's AWS, while each window
# ends one row before the target row and the stored time is the target row's
# time. The prediction therefore refers to the step after the stored time and
# the target row's own features are never used - confirm this alignment is
# intended.
for (lat, lon), group_data in data.groupby(['lat', 'lon']):
    times = group_data['time'].values
    features = group_data[window_columns]

    X_group, y_group, time_group = prepare_data(features.values, n_steps, times)

    # A group with n_steps rows or fewer yields no samples. Skip it: its empty
    # result would otherwise break the concatenation below.
    if len(y_group) == 0:
        continue

    lat_lon_time_all.extend((lat, lon, time) for time in time_group)
    X_all.append(X_group)
    y_all.append(y_group)

# Merge the per-location samples into single arrays; row k of each array
# describes the same sample.
X_all = np.concatenate(X_all, axis=0)
y_all = np.concatenate(y_all, axis=0)
lat_lon_time_all = np.array(lat_lon_time_all)
print(len(y_all), len(lat_lon_time_all))


In [None]:
# Random 80/20 split applied jointly to inputs, targets and (lat, lon, time) labels.
# NOTE(review): the windows overlap in time, so a shuffled split puts nearly
# identical windows on both sides. Consider a chronological split if the test
# score should reflect forecasting on unseen periods.
(X_train, X_test,
 y_train, y_test,
 lat_lon_time_train, lat_lon_time_test) = train_test_split(
    X_all, y_all, lat_lon_time_all, test_size=0.2, random_state=42
)

# MinMaxScaler works on 2-D input, so the windows are flattened to
# (samples * steps, features), scaled, and reshaped back. The scaler is fitted
# on the training windows only and then reused for the test windows.
n_features = X_train.shape[2]
scaler = MinMaxScaler()
train_rows_scaled = scaler.fit_transform(X_train.reshape(-1, n_features))
X_train_scaled = train_rows_scaled.reshape(X_train.shape)
test_rows_scaled = scaler.transform(X_test.reshape(-1, n_features))
X_test_scaled = test_rows_scaled.reshape(X_test.shape)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam

def build_model(n_steps, input_dim, lstm_units=(64, 128, 96), dropout_rate=0.2,
                learning_rate=0.001):
    """Build and compile a stacked-LSTM regressor with a single output.

    The network is a sequence of LSTM layers (ReLU activation), each followed
    by a Dropout layer, and a final ``Dense(1)`` output. It is compiled with
    the Adam optimizer and mean-squared-error loss. With the default arguments
    the architecture is LSTM(64) -> LSTM(128) -> LSTM(96) with dropout 0.2 and
    learning rate 0.001.

    Parameters
    ----------
    n_steps : int
        Number of time steps in each input window.
    input_dim : int
        Number of features per time step.
    lstm_units : sequence of int, optional
        Width of each LSTM layer, in order. Must not be empty.
    dropout_rate : float, optional
        Dropout rate applied after every LSTM layer.
    learning_rate : float, optional
        Learning rate passed to Adam.

    Returns
    -------
    The compiled ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``lstm_units`` is empty.
    """
    lstm_units = tuple(lstm_units)
    if not lstm_units:
        raise ValueError("lstm_units must contain at least one layer size")

    model = Sequential()
    last_index = len(lstm_units) - 1
    for index, units in enumerate(lstm_units):
        layer_kwargs = {
            'activation': 'relu',
            # Every LSTM except the last passes the full sequence on to the next
            # LSTM; the last one emits only its final state for the Dense layer.
            'return_sequences': index < last_index,
        }
        if index == 0:
            # Only the first layer declares the input window shape.
            layer_kwargs['input_shape'] = (n_steps, input_dim)
        model.add(LSTM(units, **layer_kwargs))
        model.add(Dropout(dropout_rate))

    model.add(Dense(1))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

# Feature count per time step, taken from the last axis of the training windows.
input_dim = X_train.shape[2]
global_model = build_model(n_steps, input_dim)

# NOTE(review): the test set is also used as the validation set during fitting,
# so the validation curve and the final test loss come from the same data.
history = global_model.fit(
    X_train_scaled,
    y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=32,
    batch_size=64,
    verbose=1,
)

# Test-set loss (MSE, as compiled) and the model's predictions on the test windows.
loss = global_model.evaluate(X_test_scaled, y_test, verbose=0)
y_pred = global_model.predict(X_test_scaled)

print(f'Loss trên tập kiểm tra: {loss}')


In [None]:
import matplotlib.pyplot as plt

# Per-epoch loss values recorded by Keras during fit(): training and validation.
train_loss, val_loss = history.history['loss'], history.history['val_loss']


In [None]:
# Draw the training and validation loss per epoch on one set of axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(train_loss, label='Train Loss', marker='o')
ax.plot(val_loss, label='Validation Loss', marker='o')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Curve')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# The model output has one column per sample; flatten it so that every row gets
# a plain float. Storing the raw length-1 array would be written to the CSV as
# text like "[0.12]".
y_pred_flat = np.asarray(y_pred).reshape(-1)

# One [lat, lon, time, prediction] row per test sample, in test-set order.
result_array = [
    [lat, lon, time, float(pred)]
    for (lat, lon, time), pred in zip(lat_lon_time_test, y_pred_flat)
]




In [None]:
import pandas as pd

# Tabulate the prediction rows; column order matches the order used in each row.
result_columns = ['Lat', 'Lon', 'Time', 'y_pred']
df_result = pd.DataFrame(data=result_array, columns=result_columns)

# Plain-text preview of the first rows.
preview = df_result.head()
print(preview)


In [None]:
# Write the predictions to a CSV in the working directory, without the index column.
output_path = 'output.csv'
df_result.to_csv(output_path, index=False, encoding='utf-8')

print("File CSV đã được ghi thành công với tên 'output.csv'")


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Regression scores on the test set: mean squared error, mean absolute error
# and the coefficient of determination, in that order.
mse, mae, r2 = (
    score_fn(y_test, y_pred)
    for score_fn in (mean_squared_error, mean_absolute_error, r2_score)
)

# Pearson correlation between targets and predictions. Both are flattened to
# 1-D first, and the off-diagonal entry of the 2x2 matrix is the coefficient.
y_true_1d = y_test.flatten()
y_pred_1d = y_pred.flatten()
correlation_matrix = np.corrcoef(y_true_1d, y_pred_1d)
cc = correlation_matrix[0, 1]

print(f'MSE trên tập kiểm tra: {mse}')
print(f'MAE trên tập kiểm tra: {mae}')
print(f"R² (Hệ số xác định): {r2}")
print(f"Hệ số tương quan CC: {cc}")
