In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Load data

In [2]:
# Location of the cleaned, labeled climate table (a pickle file) on the author's machine.
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook will not
# run on another machine without editing it — consider a configurable data directory.
path = r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\clean_labeled_climate_data.pkl"
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle(path)
# Display (rows, columns) as the cell output to confirm what was loaded.
df.shape

(470342, 125)

In [3]:
# Columns that are set aside into `labels` and removed from the working table.
label_columns = ['Level_1', 'Level_2', 'Level_3', 'Level_4', 'ECO_NAME', 'climates_f']

# Keep the label columns in their own frame, then carry on with a table that no longer has them.
# NOTE(review): this cell rebinds `df`, so running it a second time fails because the
# label columns are already gone.
labels = df[label_columns].copy()
df = df.drop(columns=label_columns)

# Train test split

In [6]:
# Predictor columns: the two coordinates plus the *_tmin / *_meant / *_maxt columns.
feature_columns = [
    'longitude', 'latitude',
    'jan_tmin', 'annual_tmin', 'annual_meant', 'jul_maxt', 'annual_maxt',
]

# Target columns: the *_dptmean columns followed by monthly and annual *_precip columns.
target_columns = [
    'jan_dptmean', 'jul_dptmean', 'annual_dptmean',
    'jan_precip', 'feb_precip', 'mar_precip', 'apr_precip', 'may_precip', 'jun_precip',
    'jul_precip', 'aug_precip', 'sep_precip', 'oct_precip', 'nov_precip', 'dec_precip',
    'annual_precip',
]

X = df[feature_columns]
y = df[target_columns]

In [9]:
# Split the rows 50/50 into a training half and a held-out test half.
# The fixed random_state makes the split — and every metric computed from it —
# reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Modeling

## Random forest

In [10]:
# Baseline model: MultiOutputRegressor fits one independent random forest per target column.
# random_state pins the forests' bootstrap/feature sampling so the reported metrics are
# reproducible; n_jobs=-1 builds each forest's trees on all available cores.
rfr = MultiOutputRegressor(RandomForestRegressor(random_state=42, n_jobs=-1))

rfr.fit(X_train, Y_train)

In [11]:
# Predict the held-out rows and wrap the result so it carries the same column labels AND
# the same row index as Y_test. Without the index, the frame would get a fresh 0..n-1 index
# and any pandas arithmetic between Y_test and Y_pred would align on mismatched labels.
Y_pred = pd.DataFrame(rfr.predict(X_test), columns=Y_test.columns, index=Y_test.index)

In [12]:
# Score every target column on its own: RMSE (in the target's units) and R².
metrics = {}

for target in y.columns:
    actual = Y_test[target]
    predicted = Y_pred[target]
    metrics[target] = {
        'RMSE': np.sqrt(mean_squared_error(actual, predicted)),
        'R²': r2_score(actual, predicted),
    }

# One row per target, one column per metric.
metrics_df = pd.DataFrame.from_dict(metrics, orient='index')
print(metrics_df)

                     RMSE        R²
jan_dptmean      0.153369  0.999303
jul_dptmean      0.159837  0.999412
annual_dptmean   0.136104  0.999441
jan_precip       6.786577  0.987652
feb_precip       5.632384  0.986882
mar_precip       5.555368  0.987863
apr_precip       4.067121  0.990770
may_precip       3.113565  0.993947
jun_precip       2.335357  0.997475
jul_precip       1.896810  0.998206
aug_precip       1.864187  0.998080
sep_precip       2.062243  0.997163
oct_precip       3.651093  0.991070
nov_precip       5.849168  0.988798
dec_precip       7.375561  0.987007
annual_precip   43.166459  0.991321


## XGBoost

In [None]:
# Objective function for Optuna
def objective(trial):
    """Score one XGBoost hyperparameter configuration for the Optuna study.

    The configuration is sampled from ``trial``, fitted on part of the training
    data and scored on a validation hold-out carved from ``X_train``/``Y_train``.
    The test set is deliberately not touched here: selecting hyperparameters on
    the same rows used for the final report would bias that report upward.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Negative mean R² across all target columns on the validation hold-out
        (negated so that a study with ``direction='minimize'`` maximizes R²).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 800),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 3.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        # Fixed (not tuned) settings.
        'tree_method': 'hist',
        'n_jobs': -1,
        'random_state': 42
    }

    # Fixed seed: every trial is fitted and scored on the same rows, so trial
    # values are comparable with each other.
    X_fit, X_val, Y_fit, Y_val = train_test_split(
        X_train, Y_train, test_size=0.2, random_state=42
    )

    model = MultiOutputRegressor(XGBRegressor(**params))
    model.fit(X_fit, Y_fit)
    Y_val_pred = model.predict(X_val)

    # For 2-D targets, 'uniform_average' is the unweighted mean of the per-column R² scores.
    mean_r2 = r2_score(Y_val, Y_val_pred, multioutput='uniform_average')
    return -mean_r2  # negate to make Optuna minimize the negative R²

# Run the Optuna study
# direction='minimize' because the objective returns the NEGATIVE mean R².
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable
# from run to run (TPE is Optuna's default single-objective sampler).
study = optuna.create_study(
    direction='minimize',
    study_name='xgboost_multioutput_tuning',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10, show_progress_bar=True)

# Results
# The study stores the negated score, so flip the sign back when reporting R².
best_value = study.best_trial.value
print("\n✅ Best trial:")
print(f"Value (negative R²): {best_value}")
print(f"Best R²: {-best_value:.4f}")
print("Best hyperparameters:")
for name, setting in study.best_params.items():
    print(f"  {name}: {setting}")

# Plotting is best-effort: any failure is reported and the notebook keeps going.
try:
    for make_figure in (
        optuna.visualization.plot_optimization_history,
        optuna.visualization.plot_param_importances,
    ):
        make_figure(study).show()
except Exception as err:
    print(f"Visualization skipped ({err})")

[I 2025-10-22 18:21:34,531] A new study created in memory with name: xgboost_multioutput_tuning


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Retrain the best model
# study.best_params contains only the values that were *suggested* during tuning.
# The fixed settings used inside the objective (tree_method, n_jobs, random_state)
# are not part of it, so they are re-applied here; otherwise the retrained model
# would not match the configuration that was actually tuned, and would be unseeded.
best_params = {
    **study.best_params,
    'tree_method': 'hist',
    'n_jobs': -1,
    'random_state': 42,
}
best_model = MultiOutputRegressor(XGBRegressor(**best_params))
best_model.fit(X_train, Y_train)

# Predictions for the held-out test rows; one column per target, in Y_test's column order.
Y_pred = best_model.predict(X_test)

# Compute final per-variable metrics
# Y_pred is a 2-D array here; column i holds the predictions for Y_test's i-th target.
metrics = {}
for position, target in enumerate(Y_test.columns):
    residuals = Y_test[target] - Y_pred[:, position]
    metrics[target] = {
        'RMSE': np.sqrt(np.mean(residuals ** 2)),
        'R²': r2_score(Y_test[target], Y_pred[:, position])
    }

# One row per target, one column per metric.
metrics_df = pd.DataFrame(metrics).transpose()
print("\n📊 Per-variable performance:")
print(metrics_df)
print("\nMean R²:", metrics_df["R²"].mean())