In [1]:
!pip install rasterio

Collecting rasterio
  Downloading rasterio-1.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Downloading rasterio-1.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading affine-2.4.0-py3-none-any.whl (15 kB)
Installing collected packages: affine, rasterio
Successfully installed affine-2.4.0 rasterio-1.4.3


In [2]:
import os
import numpy as np
import pandas as pd
import rasterio
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from scipy.ndimage import gaussian_filter
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

In [3]:
# Dataset locations on Kaggle.
# load_files() walks <RADAR_DIR>/<year>/<MM> for radar rasters and
# <ERA5_BASE_DIR>/<variable>/<year>/<MM> for ERA5 rasters.
RADAR_DIR = "/kaggle/input/radar1/Radar"
ERA5_BASE_DIR = "/kaggle/input/era005/ERA5"

In [4]:
# List of ERA5 variables.
# Each name is used both as the sub-directory under ERA5_BASE_DIR and as the
# column name of that variable in the assembled DataFrame.
ERA5_VARIABLES = [
    "CAPE", "CIN", "EWSS", "IE", "ISOR", "KX", "PEV", "R250", "R500", "R850",
    "SLHF", "SLOR", "SSHF", "TCLW", "TCW", "TCWV", "U250", "U850", "V250", "V850"
]

In [5]:
# Extract the timestamp encoded in a raster file name
def extract_time_from_filename(filename):
    """Return the timestamp encoded in a ``<prefix>_<YYYYmmddHHMMSS>.tif`` name.

    Args:
        filename: Bare file name or full path. Only the final path component
            is inspected, so underscores in directory names (for example
            ``/data/my_input/...``) do not disturb the parsing.

    Returns:
        datetime.datetime: The parsed (naive) timestamp.

    Raises:
        IndexError: If the base name contains no underscore.
        ValueError: If the second ``_``-separated field does not match
            ``%Y%m%d%H%M%S``.
    """
    base_name = os.path.basename(filename)
    time_str = base_name.split('_')[1].replace('.tif', '')
    return datetime.strptime(time_str, '%Y%m%d%H%M%S')

# Hàm tải file Radar và ERA5
def load_files(base_dir, year, month, is_radar=True):
    files = {}
    if is_radar:
        month_dir = os.path.join(base_dir, str(year), f'{month:02d}')
        files['Radar'] = [os.path.join(root, f) for root, _, filenames in os.walk(month_dir) 
                          for f in filenames if f.endswith('.tif')]
        files['Radar'].sort(key=extract_time_from_filename)
    else:
        for var in ERA5_VARIABLES:
            var_dir = os.path.join(base_dir, var, str(year), f'{month:02d}')
            if os.path.exists(var_dir):
                files[var] = [os.path.join(root, f) for root, _, filenames in os.walk(var_dir) 
                              for f in filenames if f.endswith('.tif')]
                files[var].sort(key=extract_time_from_filename)
    return files

In [6]:
# Locate the April 2019 GeoTIFF files that will be converted into a DataFrame
radar_files = load_files(RADAR_DIR, 2019, 4, is_radar=True)
era5_files = load_files(ERA5_BASE_DIR, 2019, 4, is_radar=False)

# Time synchronisation: keep only the timestamps that exist in the radar series
# and in every ERA5 variable for which files were found
common_times = {extract_time_from_filename(path) for path in radar_files['Radar']}
for var in ERA5_VARIABLES:
    if var not in era5_files:
        continue
    common_times.intersection_update(
        extract_time_from_filename(path) for path in era5_files[var]
    )
common_times = sorted(common_times)

In [None]:
# Read the GeoTIFF rasters with rasterio and flatten them into one row per
# (timestamp, pixel). Every raster is opened exactly once per timestamp and the
# pixel coordinates are computed for the whole grid at once.
GRID_ROWS, GRID_COLS = 90, 250  # grid size the later cells reshape to
PIXELS_PER_FRAME = GRID_ROWS * GRID_COLS


def _index_by_time(paths):
    """Map timestamp -> path, keeping the first path if a timestamp repeats."""
    lookup = {}
    for path in paths:
        lookup.setdefault(extract_time_from_filename(path), path)
    return lookup


def _read_grid(path):
    """Read band 1 of ``path``; return its GRID_ROWS x GRID_COLS window and transform."""
    with rasterio.open(path) as src:
        band = src.read(1)
        transform = src.transform
    if band.shape[0] < GRID_ROWS or band.shape[1] < GRID_COLS:
        raise ValueError(
            f"{path}: raster shape {band.shape} is smaller than "
            f"the expected {GRID_ROWS}x{GRID_COLS} grid"
        )
    return band[:GRID_ROWS, :GRID_COLS], transform


radar_by_time = _index_by_time(radar_files['Radar'])
era5_by_time = {
    var: _index_by_time(era5_files[var])
    for var in ERA5_VARIABLES
    if era5_files.get(var)
}

value_columns = ['Lat', 'Lon', 'Radar'] + ERA5_VARIABLES
chunks = {name: [] for name in value_columns}

# Row-major pixel indices: all columns of row 0, then row 1, ...
pixel_rows, pixel_cols = np.meshgrid(
    np.arange(GRID_ROWS), np.arange(GRID_COLS), indexing='ij'
)
pixel_rows = pixel_rows.ravel()
pixel_cols = pixel_cols.ravel()

for t in common_times:
    radar_grid, transform = _read_grid(radar_by_time[t])

    # An affine geotransform maps (col, row) to (x, y), i.e. (lon, lat):
    #   x = a*col + b*row + c,   y = d*col + e*row + f
    # The integer indices used here address the upper-left corner of each pixel.
    chunks['Lon'].append(transform.a * pixel_cols + transform.b * pixel_rows + transform.c)
    chunks['Lat'].append(transform.d * pixel_cols + transform.e * pixel_rows + transform.f)
    chunks['Radar'].append(radar_grid.ravel())

    for var in ERA5_VARIABLES:
        if var in era5_by_time:
            # NOTE(review): ERA5 rasters are sampled index-for-index, which
            # assumes they share the radar grid - confirm.
            var_grid, _ = _read_grid(era5_by_time[var][t])
            chunks[var].append(var_grid.ravel())
        else:
            # No files for this variable: keep the column, filled with NaN.
            chunks[var].append(np.full(PIXELS_PER_FRAME, np.nan))

# Assemble the columns; each timestamp is repeated once per pixel.
data_dict = {
    'Time': np.repeat(np.array(common_times, dtype='datetime64[ns]'), PIXELS_PER_FRAME)
}
for name in value_columns:
    parts = chunks.pop(name)  # release the per-frame pieces as we go
    data_dict[name] = np.concatenate(parts) if parts else np.array([], dtype=float)

# Build the DataFrame
df = pd.DataFrame(data_dict)
assert len(df) == len(common_times) * PIXELS_PER_FRAME

In [None]:
# Save the assembled table as CSV in the Kaggle working directory
csv_path = '/kaggle/working/rainfall_era5_data.csv'
df.to_csv(csv_path, index=False)
print("Đã lưu file CSV: rainfall_era5_data.csv")

In [None]:
# Check missing values: percentage of null entries per column
missing_pct = df.isnull().mean() * 100
print("Tỷ lệ giá trị thiếu trong mỗi cột:")
print(missing_pct)

In [None]:
# Missing-value handling: work on a copy so that `df` keeps the raw values
df_clean = df.copy()
# In the Radar column, -inf is treated as a missing value
radar_without_neg_inf = df_clean['Radar'].replace(-np.inf, np.nan)
df_clean['Radar'] = radar_without_neg_inf

In [None]:
# Method 1: fill missing values with 0
id_columns = ['Time', 'Lat', 'Lon']
measurements = df_clean.drop(columns=id_columns)
imputer_zero = SimpleImputer(strategy='constant', fill_value=0)
zero_filled = imputer_zero.fit_transform(measurements)
df_zero = pd.DataFrame(zero_filled, columns=measurements.columns)
# Re-attach the identifier columns, which were excluded from imputation
df_zero[id_columns] = df_clean[id_columns]

In [None]:
# Method 2: kernel-based imputation (NaN-aware Gaussian smoothing)
def _gaussian_smooth_nan_aware(frame, sigma=1):
    """Gaussian-smooth a 2-D frame without letting missing values spread.

    A plain ``gaussian_filter`` turns every pixel whose kernel window touches a
    NaN into NaN, so gaps grow instead of being filled. Here the missing pixels
    get weight zero and the result is renormalised by the smoothed weights
    (normalised convolution), so gaps are filled from their valid neighbours.

    Args:
        frame: 2-D array; NaN and +/-inf mark missing pixels.
        sigma: Standard deviation of the Gaussian kernel, in pixels.

    Returns:
        numpy.ndarray: Smoothed frame of the same shape. Pixels with no valid
        value inside the kernel window stay NaN. For a frame without missing
        values this is exactly ``gaussian_filter(frame, sigma, mode='nearest')``.
    """
    frame = np.asarray(frame)
    valid = np.isfinite(frame)
    if valid.all():
        return gaussian_filter(frame, sigma=sigma, mode='nearest')

    weighted_sum = gaussian_filter(np.where(valid, frame, 0), sigma=sigma, mode='nearest')
    weight_sum = gaussian_filter(valid.astype(weighted_sum.dtype), sigma=sigma, mode='nearest')
    smoothed = np.full(frame.shape, np.nan, dtype=weighted_sum.dtype)
    np.divide(weighted_sum, weight_sum, out=smoothed, where=weight_sum > 0)
    return smoothed


# NOTE: this cell overwrites the columns of df_clean, so re-running it smooths
# the data a second time; re-run the cell that creates df_clean first.
for col in ERA5_VARIABLES + ['Radar']:
    # Rows are ordered time-major, then row-major within each 90x250 frame
    frames = df_clean[col].to_numpy().reshape(-1, 90, 250)
    smoothed_frames = np.empty_like(frames)
    for frame_idx in range(frames.shape[0]):
        smoothed_frames[frame_idx] = _gaussian_smooth_nan_aware(frames[frame_idx], sigma=1)
    df_clean[col] = smoothed_frames.reshape(-1)

In [None]:
# Correlation analysis: how strongly each variable correlates with rainfall
corr_input = df_clean.drop(columns=['Time', 'Lat', 'Lon'])
corr_matrix = corr_input.corr()
radar_corr = corr_matrix['Radar'].sort_values(ascending=False)
print("Hệ số tương quan với lượng mưa (Radar):")
print(radar_corr)

In [None]:
# 2. Feature engineering
# Humidity difference: R850 minus R500
df_clean['R850_R500'] = df_clean['R850'].sub(df_clean['R500'])

# Wind magnitude from the U850 / V850 components
u850, v850 = df_clean['U850'], df_clean['V850']
df_clean['WS850'] = np.sqrt(u850**2 + v850**2)

# Interaction term ISOR x EWSS
# NOTE(review): the original note described this as "vorticity and wind shear";
# confirm that this matches what ISOR and EWSS actually represent.
df_clean['ISOR_EWSS'] = df_clean['ISOR'].mul(df_clean['EWSS'])

# Lag features: previous values of the same pixel, 1 / 3 / 6 steps back.
# NOTE(review): shift() counts rows within each (Lat, Lon) group, so these are
# hour lags only if consecutive timestamps are one hour apart - confirm.
for var in ['TCWV', 'R850']:
    per_pixel = df_clean.groupby(['Lat', 'Lon'])[var]
    for lag in [1, 3, 6]:
        df_clean[f'{var}_lag{lag}'] = per_pixel.shift(lag)

In [None]:
# Normalise the predictor columns to [0, 1] (min-max scaling)
# NOTE(review): the scaler is fitted on the full table; if this data is split
# into folds afterwards, consider fitting per training fold instead.
passthrough_cols = ['Time', 'Lat', 'Lon', 'Radar']
features = df_clean.drop(columns=passthrough_cols).columns
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_clean[features])
df_scaled = pd.DataFrame(scaled_values, columns=features)
# Identifier columns and the Radar target are carried over unscaled
df_scaled[passthrough_cols] = df_clean[passthrough_cols]

In [None]:
# Resize every frame to 240x80
def resize_image(data, new_shape=(240, 80), src_shape=(90, 250)):
    """Resize a flattened stack of 2-D frames with bilinear interpolation.

    Args:
        data: pandas Series or array-like holding the frames flattened in
            frame-major, then row-major order; its length must be a multiple
            of ``src_shape[0] * src_shape[1]``.
        new_shape: Output frame shape as ``(rows, cols)``.
        src_shape: Shape ``(rows, cols)`` of each input frame. Defaults to the
            90x250 grid used in this notebook.

    Returns:
        numpy.ndarray: float64 array of shape ``(n_frames, *new_shape)``.

    NOTE(review): with the defaults a 90x250 (wide) frame becomes 240x80 (tall),
    so the aspect ratio is flipped - confirm that (240, 80) rather than
    (80, 240) is the intended output shape.
    """
    frames = np.asarray(data).reshape(-1, *src_shape)
    out_rows, out_cols = new_shape
    resized_data = np.zeros((frames.shape[0], out_rows, out_cols))
    for idx, frame in enumerate(frames):
        # cv2.resize expects dsize as (width, height), i.e. (cols, rows)
        resized_data[idx] = cv2.resize(frame, (out_cols, out_rows), interpolation=cv2.INTER_LINEAR)
    return resized_data

In [None]:
# Resize the Radar target, then every predictor, and stack the predictors as
# the last (channel) axis
radar_resized = resize_image(df_scaled['Radar'])
feature_planes = [resize_image(df_scaled[var]) for var in features]
X_resized = np.stack(feature_planes, axis=-1)  # (samples, 240, 80, n_features)
del feature_planes  # the stacked copy is all that is needed from here on

In [None]:
# Prepare the inputs and targets for K-Fold
# X: (samples, 240, 80, n_features), y: (samples, 240, 80)
X, y = X_resized, radar_resized