In [15]:
import h5py
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor

# ===============================
# 1. Load & Clean Dataset
# ===============================

In [2]:
# Build the working frame in a single chain: read the CSV, discard the
# columns that are not used, index the rows by their parsed timestamp and
# cast every remaining column to float.
df = (
    pd.read_csv("../../datasets/latestdataset - Copy.csv")
    .drop(columns=["rain (mm)", "precipitation (mm)", "soil_moisture_0_to_7cm (mÂ³/mÂ³)"])
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
    .set_index("time")
    .astype(float)
)

In [3]:
# Preview the first rows of the cleaned, time-indexed frame.
df.head()

Unnamed: 0_level_0,temperature_2m (Â°C),relative_humidity_2m (%),wind_speed_10m (km/h),soil_temperature_0_to_7cm (Â°C)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01 00:00:00,20.2,96.0,6.6,22.2
2015-01-01 01:00:00,20.2,95.0,9.3,22.0
2015-01-01 02:00:00,20.3,96.0,10.5,22.0
2015-01-01 03:00:00,21.1,93.0,8.7,22.4
2015-01-01 04:00:00,22.5,86.0,8.2,23.3


# ===============================
# 2. Scale Data
# ===============================

In [18]:
# Column order of `df`; the evaluation cell relies on this order to map the
# flattened targets back to feature names.
features = df.columns.tolist()

# The scaler has to be instantiated before it is fitted; with the definition
# commented out this cell raised NameError on a fresh kernel.
scaler_X = MinMaxScaler()

# NOTE(review): the windowing cell below is called with `df`, not `df_scaled`,
# so the model is trained and scored in original units. If `df_scaled` is ever
# used for modelling, fit the scaler on the training portion only; fitting on
# the full frame as done here would leak test-set ranges.
df_scaled = pd.DataFrame(scaler_X.fit_transform(df), columns=features, index=df.index)

# ===============================
# 3. Create Supervised Dataset
# ===============================

In [19]:
def create_supervised(data, look_back=72, horizon=6):
    """Turn a time-ordered table into flattened sliding-window samples.

    Sample ``i`` uses rows ``i .. i+look_back-1`` as input and rows
    ``i+look_back .. i+look_back+horizon-1`` as target. Each window is
    flattened row by row, so all columns of one time step are adjacent.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Time-ordered observations, one row per time step.
    look_back : int, default 72
        Number of past rows in each input window.
    horizon : int, default 6
        Number of future rows in each target window.

    Returns
    -------
    X : numpy.ndarray, shape (samples, look_back * columns)
    y : numpy.ndarray, shape (samples, horizon * columns)
        ``samples`` is ``len(data) - look_back - horizon + 1``, or 0 when the
        series is too short. The arrays stay 2-D in that case instead of
        collapsing to shape ``(0,)``.
    """
    values = np.asarray(data)
    # Number of values per time step (1 for a 1-D series).
    per_step = int(np.prod(values.shape[1:]))
    n_samples = max(len(values) - look_back - horizon + 1, 0)

    # Row indices of every window, built by broadcasting start offsets
    # against the within-window offsets.
    starts = np.arange(n_samples)[:, None]
    input_rows = starts + np.arange(look_back)
    target_rows = starts + look_back + np.arange(horizon)

    # Explicit widths (rather than -1) keep reshape valid when n_samples == 0.
    X = values[input_rows].reshape(n_samples, look_back * per_step)
    y = values[target_rows].reshape(n_samples, horizon * per_step)  # (samples, horizon * features)
    return X, y

# Window sizes in rows: input length and forecast length.
look_back = 72
horizon = 6

# Windows are built from the unscaled frame `df`.
X, y = create_supervised(df, look_back=look_back, horizon=horizon)

# Report the sample count and flattened width of inputs and targets.
for label, arr in (("X shape:", X), ("y shape:", y)):
    print(label, arr.shape)

X shape: (87619, 288)
y shape: (87619, 24)


# ===============================
# 4. Train/Validation/Test Split (chronological)
# ===============================

In [20]:
# Cut points for an unshuffled 70 % / 15 % / 15 % split: the earliest
# samples train the model, the next slice is validation, the latest is test.
n = len(X)
train_idx, val_idx = int(0.70 * n), int(0.85 * n)

# np.split at the two cut points yields the same three consecutive pieces
# as slicing [:train_idx], [train_idx:val_idx] and [val_idx:].
X_train, X_val, X_test = np.split(X, [train_idx, val_idx])
y_train, y_val, y_test = np.split(y, [train_idx, val_idx])

# ===============================
# 5. Train XGBoost (MultiOutput)
# ===============================

In [16]:
# MultiOutputRegressor fits one independent copy of the wrapped estimator per
# target column, i.e. one booster for each (horizon step, feature) pair in
# y_train. MultiOutputRegressor is imported with the other dependencies in
# the notebook's first cell.
xgb_multi = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        objective="reg:squarederror",
        random_state=42,  # fixed seed for reproducible boosters
        verbosity=0
    )
)

print("\nðŸš€ Training XGBoost...")
# NOTE(review): only the training split is passed to fit; X_val / y_val are
# not used here (no early stopping or tuning in this cell).
xgb_multi.fit(X_train, y_train)


ðŸš€ Training XGBoost...


# ===============================
# 6. Predict 
# ===============================

In [21]:
# Forecast every test window; each row holds the flattened predictions that
# the evaluation cell reshapes to (samples, horizon, features).
y_pred = xgb_multi.predict(X_test)

# ===============================
# 7. Evaluation (per feature, per horizon)
# ===============================

In [22]:
n_features = len(features)

# Undo the per-sample flattening: axis 1 is the forecast step, axis 2 the
# feature, in the same column order as `features`.
y_test_reshaped = y_test.reshape(-1, horizon, n_features)
y_pred_reshaped = y_pred.reshape(-1, horizon, n_features)

# results[feature]["Horizon_k"] -> {"MAE", "RMSE", "R2"} for forecast step k.
results = {}
for f_idx, feature in enumerate(features):
    results[feature] = {}
    for h in range(horizon):
        actual = y_test_reshaped[:, h, f_idx]
        predicted = y_pred_reshaped[:, h, f_idx]
        results[feature][f"Horizon_{h+1}"] = {
            "MAE": mean_absolute_error(actual, predicted),
            # Take the square root explicitly: the `squared=False` argument of
            # mean_squared_error is deprecated and removed in recent
            # scikit-learn releases, whereas this form works on every version.
            "RMSE": np.sqrt(mean_squared_error(actual, predicted)),
            "R2": r2_score(actual, predicted),
        }

# Average per feature across horizons
avg_results = {
    feature: {
        metric: np.mean([scores[metric] for scores in per_horizon.values()])
        for metric in ("MAE", "RMSE", "R2")
    }
    for feature, per_horizon in results.items()
}

print("\nâœ… Average metrics per feature across all horizons:")
for f, metrics in avg_results.items():
    print(f"{f} -> MAE: {metrics['MAE']:.4f}, RMSE: {metrics['RMSE']:.4f}, R2: {metrics['R2']:.4f}")


âœ… Average metrics per feature across all horizons:
temperature_2m (Â°C) -> MAE: 0.6052, RMSE: 0.8652, R2: 0.9559
relative_humidity_2m (%) -> MAE: 3.8651, RMSE: 5.5000, R2: 0.9329
wind_speed_10m (km/h) -> MAE: 1.9527, RMSE: 2.5828, R2: 0.7623
soil_temperature_0_to_7cm (Â°C) -> MAE: 0.3953, RMSE: 0.5714, R2: 0.9807


In [24]:
# Save the trained XGBoost model (all fitted per-target boosters) to the
# current working directory. joblib is imported with the other dependencies
# in the notebook's first cell.
joblib.dump(xgb_multi, "xgb_multi_model.pkl")
print("âœ… XGBoost model saved as 'xgb_multi_model.pkl'")

# Later, you can reload it with:
# xgb_loaded = joblib.load("xgb_multi_model.pkl")
# y_pred = xgb_loaded.predict(X_test)
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.

âœ… XGBoost model saved as 'xgb_multi_model.pkl'


In [25]:
# Flatten results into a DataFrame for tabular view: one row per
# (feature, horizon) pair with its MAE / RMSE / R2. Previously the rows were
# collected but the DataFrame was never built.
rows = [
    {"Feature": feat, "Horizon": h, **metrics}
    for feat, horizons in results.items()
    for h, metrics in horizons.items()
]
results_df = pd.DataFrame(rows)

# Average metrics per feature
avg_df = pd.DataFrame(avg_results).T.reset_index().rename(columns={"index":"Feature"})
# The averages come from the single XGBoost multi-output model evaluated
# above, so the banner names that model rather than an ensemble.
print("\n===== XGBoost Average Metrics per Feature =====")
print(avg_df)


===== Weighted Ensemble Average Metrics per Feature =====
                          Feature       MAE      RMSE        R2
0             temperature_2m (Â°C)  0.605216  0.865152  0.955870
1        relative_humidity_2m (%)  3.865136  5.500032  0.932895
2           wind_speed_10m (km/h)  1.952744  2.582843  0.762304
3  soil_temperature_0_to_7cm (Â°C)  0.395333  0.571366  0.980657
