In [1]:
!pip install optuna catboost -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.4/247.4 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.3/47.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m607.6/607.6 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import files

# Ask the user for the dataset through the Colab upload widget.
print("Please upload your master3.3.csv file.")
uploaded = files.upload()

# The result is iterated for filenames below; if the dialog was cancelled it is
# empty, and `next(iter(...))` would otherwise fail with a bare StopIteration.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select master3.3.csv."
    )

# Only the first uploaded file is used as the input dataset.
INPUT_FILE = next(iter(uploaded))
print(f"\nSuccessfully uploaded {INPUT_FILE}")

Please upload your master3.3.csv file.


Saving master3.3.csv to master3.3.csv

Successfully uploaded master3.3.csv


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import optuna
from sklearn.metrics import mean_squared_error

# --- 1. Data Preparation ---
print(f"Loading and preparing data from {INPUT_FILE}...")

# Load the CSV, parse the 'date' column as datetimes, and drop every row
# that contains at least one missing value.
df = pd.read_csv(INPUT_FILE)
df = df.assign(date=pd.to_datetime(df['date'])).dropna()

# Chronological split by calendar year: <= 2014 train, 2015 validation, 2016 test.
record_year = df['date'].dt.year
train_df = df[record_year <= 2014]
valid_df = df[record_year == 2015]
test_df = df[record_year == 2016]

# Training uses only rows flagged is_measured == 1; the validation and test
# splits keep all of their rows.
train_df_real_only = train_df.loc[train_df['is_measured'].eq(1)]
print(f"Using {len(train_df_real_only)} real measurements for training.")

# Column roles: everything that is not the date, the target or the
# measurement flag is treated as a model feature.
target_col = "gw_level"
drop_cols = ["date", target_col, "is_measured"]
categorical_cols = ["hru_id"]
feature_cols = [col for col in train_df.columns if col not in drop_cols]

# Feature matrices are copied so the dtype casts below do not write into
# slices of the source frames.
X_train = train_df_real_only[feature_cols].copy()
y_train = train_df_real_only[target_col]
X_valid = valid_df[feature_cols].copy()
y_valid = valid_df[target_col]
X_test = test_df[feature_cols].copy()
y_test = test_df[target_col]

# Cast each categorical feature to pandas' category dtype in every split.
for split_features in (X_train, X_valid, X_test):
    for cat_col in categorical_cols:
        split_features[cat_col] = split_features[cat_col].astype("category")
print("Data preparation complete.")

# --- 2. Define the Objective Function (Unchanged) ---
def objective(trial):
    """Optuna objective: train one CatBoost regressor and score it on validation.

    Args:
        trial: Optuna trial from which learning_rate, depth, l2_leaf_reg and
            subsample are drawn.

    Returns:
        Root-mean-squared error of the fitted model's predictions on
        (X_valid, y_valid).
    """
    # Settings that are identical for every trial.
    fixed_settings = dict(
        objective="RMSE",
        random_state=42,
        n_estimators=4000,
        verbose=0,
        cat_features=categorical_cols,
    )
    # Hyperparameters sampled for this particular trial.
    sampled_settings = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.1),
        depth=trial.suggest_int("depth", 6, 12),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        subsample=trial.suggest_float("subsample", 0.6, 1.0),
    )

    regressor = CatBoostRegressor(**fixed_settings, **sampled_settings)
    # The validation split doubles as the early-stopping monitor.
    regressor.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100,
    )

    valid_mse = mean_squared_error(y_valid, regressor.predict(X_valid))
    return np.sqrt(valid_mse)

# --- 3. Run the Focused Optimization ---
# Best parameters from the INTERRUPTED run (Trial 14); used to warm-start this study.
best_params_from_previous_run = {
    'learning_rate': 0.06599093771128242,
    'depth': 11,
    'l2_leaf_reg': 1.5411099462892297,
    'subsample': 0.726056962147056
}

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# between runs (TPE is Optuna's default sampler, so only the seed is new).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Enqueue the best trial to give the study a headstart
study.enqueue_trial(best_params_from_previous_run)

print("\nStarting FOCUSED hyperparameter search (restarted)...")
# n_trials is the TOTAL for this call: the enqueued configuration runs first
# (as Trial 0), followed by 69 newly sampled trials.
study.optimize(objective, n_trials=70)

# --- 4. Store and Print the Best Results ---
print("\n--- Optuna Search Complete ---")
best_trial = study.best_trial
print(f"  Value (Validation RMSE): {best_trial.value:.4f}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# Store best params for the final model training.
# Take a copy so later edits to `best_params` cannot alter the study's own record.
best_params = dict(best_trial.params)

Loading and preparing data from master3.3.csv...


[I 2025-09-03 16:10:17,011] A new study created in memory with name: no-name-701787f0-1e3c-487a-bdbf-40dc11a88548


Using 5615 real measurements for training.
Data preparation complete.

Starting FOCUSED hyperparameter search (restarted)...


[I 2025-09-03 16:11:42,791] Trial 0 finished with value: 1.4455958397603181 and parameters: {'learning_rate': 0.06599093771128242, 'depth': 11, 'l2_leaf_reg': 1.5411099462892297, 'subsample': 0.726056962147056}. Best is trial 0 with value: 1.4455958397603181.
[I 2025-09-03 16:18:58,442] Trial 1 finished with value: 1.4733594041076348 and parameters: {'learning_rate': 0.056504375966806616, 'depth': 12, 'l2_leaf_reg': 7.991523072519586, 'subsample': 0.6663867272543055}. Best is trial 0 with value: 1.4455958397603181.
[I 2025-09-03 16:27:43,248] Trial 2 finished with value: 1.567453539026798 and parameters: {'learning_rate': 0.023145871314178633, 'depth': 12, 'l2_leaf_reg': 2.6740592907771314, 'subsample': 0.9266820902405758}. Best is trial 0 with value: 1.4455958397603181.
[I 2025-09-03 16:27:57,910] Trial 3 finished with value: 1.799364317992188 and parameters: {'learning_rate': 0.08199380602579537, 'depth': 6, 'l2_leaf_reg': 4.616078653960468, 'subsample': 0.6197429701146066}. Best is 

In [None]:
import joblib
from sklearn.metrics import r2_score

# Combine the tuned hyperparameters with the fixed settings for the final model.
# A NEW dict is built on purpose: assigning `final_params = best_params` and then
# adding keys would mutate `best_params` (and the study's stored best-trial params).
final_params = {
    **best_params,
    'objective': 'RMSE',
    'random_state': 42,
    'n_estimators': 4000,
    'verbose': 0,
    'cat_features': categorical_cols,
}

print("\nTraining final model with newly optimized parameters...")
final_model = CatBoostRegressor(**final_params)
# The validation split is used as the early-stopping monitor.
final_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=100
)

# Report RMSE and R² on every split.
# NOTE(review): the validation split was also used for hyperparameter selection and
# early stopping, so its score is likely optimistic; the test split is the held-out one.
print("\n--- Final Model Performance ---")
for name, X, y in [("Train (Real Only)", X_train, y_train),
                   ("Validation (Full)", X_valid, y_valid),
                   ("Test (Full)", X_test, y_test)]:
    preds = final_model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)
    print(f"[{name: <18}] RMSE={rmse:.4f}  R²={r2:.4f}")

# Persist the fitted model to disk, then trigger a browser download from Colab.
MODEL_OUTPUT_FILE = "catboost2.pkl"
print(f"\nSaving final model to {MODEL_OUTPUT_FILE}...")
joblib.dump(final_model, MODEL_OUTPUT_FILE)

from google.colab import files
files.download(MODEL_OUTPUT_FILE)