# In [None]:
import os
import json
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from boruta import BorutaPy
from pathlib import Path

# ROOT is the directory two levels above the current working directory;
# all data folders are derived from it.
ROOT = Path.cwd().resolve().parent.parent
DATA = ROOT.joinpath("data")
TRAINED_DATA = DATA.joinpath("trained_data")

# Name of the feature-selection method handled by this script.
METHOD = "Boruta"

# Load the cleaned daily weather data. The path is built from DATA so that it
# stays consistent with the other project paths (it resolves to the same file
# as "../../data/processed/clean_daily_weather.csv").
df = pd.read_csv(DATA / "processed" / "clean_daily_weather.csv")

# Target variables; feature selection is run once for each of them.
target_variables = [
    "temperature_2m_max",
    "apparent_temperature_min",
    "relative_humidity_2m_max",
    "wind_speed_10m_max",
    "winddirection_10m_dominant",
    "rain_sum",
    "shortwave_radiation_sum",
]

# Absolute pairwise correlations of the numeric columns. `df` is never mutated
# inside the loop, so the matrix is computed once and shared by all targets.
abs_corr_matrix = df.corr(numeric_only=True).abs()

# Run Boruta feature selection independently for every target variable.
for target_variable in target_variables:
    print(f"\n[INFO] Running Boruta for target: {target_variable}")

    # Absolute correlation of every numeric column with the current target.
    corr_with_target = abs_corr_matrix[target_variable]

    # Remove proxies of the target: keep only columns whose |correlation| is
    # below 0.95 (columns with a NaN correlation fail the comparison and are
    # dropped as well), and never keep the target itself.
    safe_features = corr_with_target[corr_with_target < 0.95].index.tolist()
    safe_features = [f for f in safe_features if f != target_variable]

    # Categorical columns that must stay candidates even though they are not
    # part of the numeric correlation matrix.
    categorical_to_force_keep = ["season"]
    for cat in categorical_to_force_keep:
        if cat in df.columns and cat not in safe_features:
            safe_features.append(cat)

    print(f"[INFO] Số lượng biến còn lại sau khi loại proxy: {len(safe_features)}")

    # Handle missing data on a per-target copy. Reassigning `df` here would
    # permanently remove rows for every later target and make the results
    # depend on the order of `target_variables`.
    target_df = df.dropna(subset=safe_features + [target_variable])

    # Chronological train/test split: first 80% train, last 20% test.
    # NOTE(review): this assumes the rows are already sorted by time - confirm.
    split_index = int(len(target_df) * 0.8)
    train_df = target_df.iloc[:split_index]
    test_df = target_df.iloc[split_index:]

    X_train_full = train_df[safe_features]
    y_train = train_df[target_variable]
    X_test_full = test_df[safe_features]
    y_test = test_df[target_variable]

    # One-hot encode `season` so Boruta only sees numeric columns. The train
    # split defines the dummy columns (and the dropped baseline); the test
    # split is aligned to that schema so both frames always share the same
    # columns, even when a season is missing from one of the splits.
    if "season" in safe_features:
        X_train_boruta = pd.get_dummies(X_train_full, columns=["season"], drop_first=True)
        X_test_boruta = pd.get_dummies(X_test_full, columns=["season"]).reindex(
            columns=X_train_boruta.columns, fill_value=0
        )
    else:
        X_train_boruta = X_train_full
        X_test_boruta = X_test_full

    # Standardize the training features for Boruta.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_boruta)

    # Run Boruta.
    rf_for_boruta = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    boruta_selector = BorutaPy(rf_for_boruta, n_estimators='auto', random_state=42)
    boruta_selector.fit(X_train_scaled, y_train.values)

    # Names of the features Boruta confirmed.
    selected_features = X_train_boruta.columns[boruta_selector.support_].tolist()
    print(f"[INFO] Selected features for '{target_variable}': {selected_features}")
    if not selected_features:
        print(f"[WARN] Boruta confirmed no features for '{target_variable}'; saved feature sets will be empty.")

    # Restrict train/test to the selected features.
    X_train_selected = X_train_boruta[selected_features]
    X_test_selected = X_test_boruta[selected_features]

    # Separate the season dummy columns (if any survived) from the numeric ones.
    categorical_features = [col for col in selected_features if col.startswith("season_")]
    numerical_features = [col for col in selected_features if col not in categorical_features]

    # Unfitted preprocessor: scale numeric columns, pass season dummies through.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numerical_features),
            ("cat", "passthrough", categorical_features)
        ]
    )
    target_output_dir = TRAINED_DATA / METHOD.lower() / target_variable
    target_output_dir.mkdir(parents=True, exist_ok=True)

    # Save the artifacts for this target.
    joblib.dump(X_train_selected, target_output_dir / "X_train_selected.pkl")
    joblib.dump(X_test_selected, target_output_dir / "X_test_selected.pkl")
    joblib.dump(y_train, target_output_dir / "y_train.pkl")
    joblib.dump(y_test, target_output_dir / "y_test.pkl")
    joblib.dump(preprocessor, target_output_dir / "preprocessor.pkl")
    with open(target_output_dir / "selected_features.json", "w", encoding="utf-8") as f:
        json.dump(selected_features, f, indent=2)

    print(f"[INFO] Saved all artifacts to: {target_output_dir}")

    # `test_df` is a row slice of `target_df`, so it already carries every
    # original column (including the target); no columns need re-attaching.
    test_df.to_csv(target_output_dir / "test_df.csv", index=False)

print("\n✅ DONE: Boruta feature selection and saving complete for all target variables.")