In [3]:
# TabNet regression for estimating ocean CO2 flux from global tabular data (PyTorch TabNet Version)
import os
import glob
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.preprocessing import StandardScaler
import torch


In [4]:
# ---------------------------
# Load and concatenate data
# ---------------------------

# One pickled DataFrame per year, named "<year>_df.pkl"
# (e.g. ../../data/2009_df.pkl).
data_files = sorted(glob.glob("../../data/*_df.pkl"))
if not data_files:
    # Fail early with an actionable message: pd.concat([]) would otherwise
    # raise the far less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No yearly data files matched '../../data/*_df.pkl' "
        f"(current working directory: {os.getcwd()})"
    )

all_data = []

for file in data_files:
    # The year is the filename prefix before the first underscore.
    year = int(os.path.basename(file).split('_')[0])
    with open(file, 'rb') as f:
        # NOTE(review): pickle.load can execute arbitrary code -- only point
        # this at trusted files.
        df = pickle.load(f)
    # The file handle is no longer needed once the frame is in memory.
    df['year'] = year
    if 'time_counter' in df.columns:
        # Frames without 'time_counter' get no 'month' column here; after the
        # concat below their 'month' values are NaN.
        df["month"] = df["time_counter"].apply(lambda x: x.month)
    all_data.append(df)

# Stack all years into one frame with a fresh 0..n-1 index.
data = pd.concat(all_data, ignore_index=True)

In [5]:
# ---------------------------
# Feature and target columns
# ---------------------------

# Predictor columns; the order here fixes the column order of X.
features = [
    'SST',
    'SAL',
    'ice_frac',
    'mixed_layer_depth',
    'heat_flux_down',
    'water_flux_up',
    'stress_X',
    'stress_Y',
    'currents_X',
    'currents_Y',
    'month',
    'tmask',
    'year',
    'nav_lat',
    'nav_lon',
]
target = 'co2flux'

# Keep only rows where every predictor and the target are present
# (imputation would be the alternative).
required_columns = features + [target]
data = data.dropna(subset=required_columns)

# Plain arrays for the model; y becomes a single-column 2-D array.
X = data.loc[:, features].values
target_values = data[target].values
y = target_values.reshape(-1, 1)

In [None]:
# ---------------------------
# Train/test split & scaling
# ---------------------------

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scaler statistics come from the training portion only and are then
# reused unchanged on the test portion.
scaler_x = StandardScaler().fit(X_train)
scaler_y = StandardScaler().fit(y_train)

X_train_scaled = scaler_x.transform(X_train)
X_test_scaled = scaler_x.transform(X_test)

y_train_scaled = scaler_y.transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

In [None]:
# ---------------------------
# Train TabNet Regressor
# ---------------------------

# Early stopping (patience) and best-weight selection are driven by the
# eval set, so it must NOT be the held-out test set: that would make the
# test RMSE / R^2 reported afterwards optimistically biased. Instead, carve
# a validation subset (10%) out of the already-scaled training data and keep
# the test set untouched for the final evaluation.
X_fit_scaled, X_val_scaled, y_fit_scaled, y_val_scaled = train_test_split(
    X_train_scaled, y_train_scaled, test_size=0.1, random_state=42
)

regressor = TabNetRegressor(verbose=1)

regressor.fit(
    X_fit_scaled, y_fit_scaled,
    eval_set=[(X_val_scaled, y_val_scaled)],
    eval_name=['valid'],
    eval_metric=['rmse'],  # RMSE in scaled-target units
    max_epochs=100,
    patience=10,           # stop after 10 epochs without validation improvement
    batch_size=1024,
    virtual_batch_size=128
)

In [None]:
# ---------------------------
# Evaluate model
# ---------------------------

# Predict in scaled space, then undo the target scaling so the metrics
# below are computed against the unscaled y_test.
y_pred_scaled = regressor.predict(X_test_scaled)
y_pred = scaler_y.inverse_transform(y_pred_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"Test RMSE: {rmse:.3f}",
    f"Test R^2: {r2:.3f}",
    sep="\n",
)

# Optionally: Save model
# regressor.save_model('tabnet_co2flux_model')