In [5]:
import os
import json
import pandas as pd
import joblib
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from pathlib import Path

ROOT = Path(os.getcwd()).resolve().parent.parent
DATA = ROOT / "data"
TRAINED_DATA = DATA / "trained_data"

METHOD = "Boruta"

# Load dữ liệu
df = pd.read_csv("../../data/processed/clean_daily_weather.csv")

def remove_redundant_mean_features(features):
    """
    Loại bỏ các đặc trưng *_mean nếu đã có cả *_min và *_max tương ứng.

    Args:
        features (list): Danh sách tên các đặc trưng đã chọn.

    Returns:
        list: Danh sách đặc trưng sau khi loại bỏ các *_mean dư thừa.
    """
    features_set = set(features)
    cleaned_features = features.copy()

    # Tìm tất cả các nhóm có dạng <prefix>_min, <prefix>_mean, <prefix>_max
    for feature in features:
        if feature.endswith("_mean"):
            prefix = feature[:-5]  # Bỏ "_mean"
            min_feat = f"{prefix}_min"
            max_feat = f"{prefix}_max"
            if min_feat in features_set and max_feat in features_set:
                print(f"[INFO] Detected redundant features: {min_feat}, {feature}, {max_feat}. Removing {feature}.")
                cleaned_features.remove(feature)
                print(f"[INFO] Removed redundant mean feature: {feature}")

    return cleaned_features

# Danh sách các biến mục tiêu
target_variables = [
    "temperature_2m_max",
    "apparent_temperature_min",
    "relative_humidity_2m_max",
    "wind_speed_10m_max",
    "winddirection_10m_dominant",
    "rain_sum",
    "shortwave_radiation_sum"
]

# Duyệt qua từng mục tiêu
for target_variable in target_variables:
    print(f"\n[INFO] Running features filtering for target: {target_variable}")
    # Tính tương quan với biến mục tiêu
    corr_with_target = df.corr(numeric_only=True)[target_variable].drop(target_variable)

    # Giữ lại các biến có tương quan < 0.97
    safe_features = corr_with_target[corr_with_target.abs() < 0.99].index.tolist()

    categorical_to_force_keep = ["season"]
    for cat in categorical_to_force_keep:
        if cat in df.columns and cat not in safe_features:
            safe_features.append(cat)

    print(f"[INFO] The number of remaining features after proxying: {len(safe_features)}")

    # Chia dữ liệu train/test theo thời gian: 80% train, 20% test
    split_index = int(len(df) * 0.8)
    train_df = df.iloc[:split_index]
    test_df = df.iloc[split_index:]
    
    X_corr = df[safe_features]
    X_scaled = StandardScaler().fit_transform(X_corr)

    vif = pd.DataFrame()
    vif["feature"] = safe_features
    vif["VIF"] = [variance_inflation_factor(X_scaled, i) for i in range(X_scaled.shape[1])]

    # Giữ lại những biến có VIF thấp hơn ngưỡng < 10
    vif_selected = vif[vif["VIF"] < 10]["feature"].tolist()

    # Gộp vào kết quả cuối cùng
    selected_features = sorted(set(vif_selected + ["season_sin", "season_cos"]))
    print(f"[INFO] Detecting redundant features...")
    selected_features = remove_redundant_mean_features(selected_features)
    print("[INFO] Final selected features:", selected_features)


    # Tách lại theo đặc trưng được chọn cho train/test
    X_train = train_df[selected_features]
    X_test = test_df[selected_features]
    y_train = train_df[target_variable]
    y_test = test_df[target_variable]

    # Tạo pipeline chính
    preprocessor = ColumnTransformer(transformers=[("num", StandardScaler(), selected_features)])
    target_output_dir = TRAINED_DATA / METHOD.lower() / target_variable
    target_output_dir.mkdir(parents=True, exist_ok=True)
    
    # Lưu dữ liệu
    joblib.dump(X_train, target_output_dir / "X_train.pkl")
    joblib.dump(X_test, target_output_dir / "X_test.pkl")
    joblib.dump(y_train, target_output_dir / "y_train.pkl")
    joblib.dump(y_test, target_output_dir / "y_test.pkl")
    joblib.dump(preprocessor, target_output_dir / "preprocessor.pkl")
    with open(target_output_dir / "selected_features.json", "w") as f:
        json.dump(selected_features, f, indent=2)

    print(f"[INFO] Saved all artifacts to: {target_output_dir}")
    
    test_df = test_df.copy()
    test_df["date"] = df.iloc[split_index:]["date"].values
    test_df[target_variable] = y_test.values
    test_df.to_csv(target_output_dir / "test_df.csv", index=False)

print("\n✅ DONE: Feature selection and saving complete for all target variables.")


[INFO] Running features filtering for target: temperature_2m_max
[INFO] The number of remaining features after proxying: 51


  vif = 1. / (1. - r_squared_i)


[INFO] Detecting redundant features...
[INFO] Detected redundant features: cloud_cover_min, cloud_cover_mean, cloud_cover_max. Removing cloud_cover_mean.
[INFO] Removed redundant mean feature: cloud_cover_mean
[INFO] Final selected features: ['cloud_cover_max', 'cloud_cover_min', 'precipitation_hours', 'season_cos', 'season_sin', 'wind_gusts_10m_max', 'wind_gusts_10m_min', 'wind_speed_10m_min', 'winddirection_10m_dominant']
[INFO] Saved all artifacts to: C:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\data\trained_data\boruta\temperature_2m_max

[INFO] Running features filtering for target: apparent_temperature_min
[INFO] The number of remaining features after proxying: 49


  vif = 1. / (1. - r_squared_i)


[INFO] Detecting redundant features...
[INFO] Detected redundant features: cloud_cover_min, cloud_cover_mean, cloud_cover_max. Removing cloud_cover_mean.
[INFO] Removed redundant mean feature: cloud_cover_mean
[INFO] Final selected features: ['cloud_cover_max', 'cloud_cover_min', 'precipitation_hours', 'season_cos', 'season_sin', 'wind_gusts_10m_max', 'wind_gusts_10m_min', 'wind_speed_10m_min', 'winddirection_10m_dominant']
[INFO] Saved all artifacts to: C:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\data\trained_data\boruta\apparent_temperature_min

[INFO] Running features filtering for target: relative_humidity_2m_max
[INFO] The number of remaining features after proxying: 51


  vif = 1. / (1. - r_squared_i)


[INFO] Detecting redundant features...
[INFO] Detected redundant features: cloud_cover_min, cloud_cover_mean, cloud_cover_max. Removing cloud_cover_mean.
[INFO] Removed redundant mean feature: cloud_cover_mean
[INFO] Final selected features: ['cloud_cover_max', 'cloud_cover_min', 'precipitation_hours', 'season_cos', 'season_sin', 'wind_gusts_10m_max', 'wind_gusts_10m_min', 'wind_speed_10m_min', 'winddirection_10m_dominant']
[INFO] Saved all artifacts to: C:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\data\trained_data\boruta\relative_humidity_2m_max

[INFO] Running features filtering for target: wind_speed_10m_max
[INFO] The number of remaining features after proxying: 51


  vif = 1. / (1. - r_squared_i)


[INFO] Detecting redundant features...
[INFO] Detected redundant features: cloud_cover_min, cloud_cover_mean, cloud_cover_max. Removing cloud_cover_mean.
[INFO] Removed redundant mean feature: cloud_cover_mean
[INFO] Final selected features: ['cloud_cover_max', 'cloud_cover_min', 'precipitation_hours', 'season_cos', 'season_sin', 'wind_gusts_10m_max', 'wind_gusts_10m_min', 'wind_speed_10m_min', 'winddirection_10m_dominant']
[INFO] Saved all artifacts to: C:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\data\trained_data\boruta\wind_speed_10m_max

[INFO] Running features filtering for target: winddirection_10m_dominant
[INFO] The number of remaining features after proxying: 51


  vif = 1. / (1. - r_squared_i)


[INFO] Detecting redundant features...
[INFO] Detected redundant features: cloud_cover_min, cloud_cover_mean, cloud_cover_max. Removing cloud_cover_mean.
[INFO] Removed redundant mean feature: cloud_cover_mean
[INFO] Final selected features: ['cloud_cover_max', 'cloud_cover_min', 'precipitation_hours', 'season_cos', 'season_sin', 'wind_gusts_10m_max', 'wind_gusts_10m_min', 'wind_speed_10m_min']
[INFO] Saved all artifacts to: C:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\data\trained_data\boruta\winddirection_10m_dominant

[INFO] Running features filtering for target: rain_sum
[INFO] The number of remaining features after proxying: 50


  vif = 1. / (1. - r_squared_i)


[INFO] Detecting redundant features...
[INFO] Detected redundant features: cloud_cover_min, cloud_cover_mean, cloud_cover_max. Removing cloud_cover_mean.
[INFO] Removed redundant mean feature: cloud_cover_mean
[INFO] Final selected features: ['cloud_cover_max', 'cloud_cover_min', 'precipitation_hours', 'season_cos', 'season_sin', 'wind_gusts_10m_max', 'wind_gusts_10m_min', 'wind_speed_10m_min', 'winddirection_10m_dominant']
[INFO] Saved all artifacts to: C:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\data\trained_data\boruta\rain_sum

[INFO] Running features filtering for target: shortwave_radiation_sum
[INFO] The number of remaining features after proxying: 51


  vif = 1. / (1. - r_squared_i)


[INFO] Detecting redundant features...
[INFO] Detected redundant features: cloud_cover_min, cloud_cover_mean, cloud_cover_max. Removing cloud_cover_mean.
[INFO] Removed redundant mean feature: cloud_cover_mean
[INFO] Final selected features: ['cloud_cover_max', 'cloud_cover_min', 'precipitation_hours', 'season_cos', 'season_sin', 'wind_gusts_10m_max', 'wind_gusts_10m_min', 'wind_speed_10m_min', 'winddirection_10m_dominant']
[INFO] Saved all artifacts to: C:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\data\trained_data\boruta\shortwave_radiation_sum

✅ DONE: Feature selection and saving complete for all target variables.
