In [1]:
using CSV
using DataFrames
using JSON
using ArchGDAL
using Proj
using Rasters
using Base.Threads
using JLD2
using Lux
using LuxCore
using EasyHybrid
using Optimisers
using Statistics
using Plots
using Distributed
using Parquet
include("helpers.jl")
using .Helpers
# Version tag; it is interpolated into the file names of the saved model and predictor list.
version = "v20251219"

"v20251219"

## Train the production model with all available data

In [2]:
# Data preparation: read the preprocessed table and derive the predictor list from it.
datafile = "/mnt/tupi/HybridModeling/EasyDensity.jl/data/lucas_preprocessed_v20251125.csv"
oridf = CSV.read(datafile, DataFrame; normalizenames = true)
# Predictors are selected purely by column position (18th column up to the 7th from
# the end), so this slice has to be re-validated whenever the CSV layout changes.
# CHECK EVERY TIME
predictors = propertynames(oridf)[18:end-6];


In [1]:

# Parameters handed to the hybrid model. Each entry is a 3-tuple of Float32 values;
# presumably (default, lower bound, upper bound) — TODO confirm against EasyHybrid.
parameters = (
    SOCconc = (0.01f0, 0.0f0, 1.0f0),   # fraction
    CF      = (0.15f0, 0.0f0, 1.0f0),   # fraction,
    oBD     = (0.20f0, 0.05f0, 0.40f0),  # also NN learnt, g/cm3
    mBD     = (1.20f0, 0.75f0, 2.0f0),  # NN learnt
)
# All four parameters above are listed as neural-network outputs.
neural_param_names = [:SOCconc, :CF, :mBD, :oBD]
# No forcing variables are passed to the model.
forcing = Symbol[]
targets = [:BD, :SOCconc, :SOCdensity, :CF]   

# Build the hybrid model from the predictor list, the targets and `SOCD_model`.
# `SOCD_model` is not defined in this view — presumably exported by `Helpers`
# (helpers.jl); verify.
hmb = constructHybridModel(
    predictors,
    forcing,
    targets,
    SOCD_model,
    parameters,
    neural_param_names,
    [];  # seventh positional argument left empty — presumably the global parameter names; confirm
    hidden_layers = [256, 128, 64, 32],
    activation = gelu,
    scale_nn_outputs = true,
    input_batchnorm = false,
    start_from_default = true
);

In [2]:
# Train `hmb` on the full preprocessed table `oridf`; the result `rlt` carries the
# fitted parameters/states (`rlt.ps`, `rlt.st`) that are saved afterwards.
# NOTE(review): the logged output of this cell reports 44894 training rows and
# 11223 validation rows, i.e. a validation subset is still held out even though
# this section is titled "all available data" — confirm that this is intended.
rlt = train(
    hmb, oridf, ();
    nepochs = 200,
    batchsize = 512,
    opt = AdamW(0.0005),
    training_loss = :mse,
    loss_types = [:mse, :r2],
    shuffleobs = true,
    file_name = "prod_SiNN.jld2",
    random_seed = 42,
    patience = 15,
    yscale = identity,
    monitor_names = [:oBD, :mBD],
    agg = mean,
    return_model = :best,  # the log reports the best model coming from epoch 4 of 200
    show_progress = false,
    plotting = false,
    hybrid_name = "model_prod_SiNN" 
)


[33m[1m└ [22m[39m[90m@ EasyHybrid /opt/julia/packages/EasyHybrid/PjjBT/src/train.jl:156[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPlotting disabled.
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mCheck the saved output (.png, .mp4, .jld2) from training at: /mnt/tupi/HybridModeling/EasyDensity.jl-main/output_tmp
[33m[1m└ [22m[39m[90m@ EasyHybrid /opt/julia/packages/EasyHybrid/PjjBT/src/train.jl:319[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mReturning best model from epoch 4 of 200 epochs with best validation loss wrt mse: 0.014769173349902721


[38;5;6m  train_history[39m[33m: [39m[90m(20, 2)[39m
    mse  (BD, SOCconc, SOCdensity, CF, mean)
    r2   (BD, SOCconc, SOCdensity, CF, mean)
[38;5;6m  val_history[39m[33m: [39m[90m(20, 2)[39m
    mse  (BD, SOCconc, SOCdensity, CF, mean)
    r2   (BD, SOCconc, SOCdensity, CF, mean)
[38;5;6m  ps_history[39m[33m: [39m[90m(20, 2)[39m
    ϕ        ()
    monitor  (train, val)
[38;5;6m  train_obs_pred[39m[33m: [39m[90m44894×9 DataFrame[39m
[34m    [39mBD, SOCconc, SOCdensity, CF, index, BD_pred, SOCconc_pred, SOCdensity_pred, CF_pred
[38;5;6m  val_obs_pred[39m[33m: [39m[90m11223×9 DataFrame[39m
[34m    [39mBD, SOCconc, SOCdensity, CF, index, BD_pred, SOCconc_pred, SOCdensity_pred, CF_pred
[38;5;6m  train_diffs[39m[33m: [39m
[38;5;10m    oBD         [39m[90m(44894,)[39m
[38;5;10m    mBD         [39m[90m(44894,)[39m
    parameters  (SOCconc, CF, mBD, oBD)
[38;5;6m  val_diffs[39m[33m: [39m
[38;5;10m    oBD         [39m[90m(11223,)[39m
[38

In [6]:
# Persist the trained model, its parameters and states (`rlt.ps` / `rlt.st`) and the
# predictor list, all tagged with `version`.
pss = rlt.ps
stt = rlt.st
# Create the output folders first: writing into a missing directory would throw
# and lose the training result. `mkpath` does nothing when they already exist.
mkpath("./map")
mkpath("./data")
@save "./map/prod_SiNN_model_$(version).jld2" hmb pss stt
@save "./data/predictors_$(version).jld2" predictors


## Save the covariate scalers used before training

In [3]:
# ? move the `csv` file into the `BulkDSOC/data` folder (create folder)
df_o = CSV.read("/mnt/tupi/HybridModeling/EasyDensity.jl/data/lucas_overlaid.csv", DataFrame, normalizenames=true);
println(size(df_o));

############################
###### clean targets #######
############################

# Keep only the rows with horizon depth == 10, then drop the now-constant column.
df_o = df_o[df_o.hzn_dep .== 10, :];
select!(df_o, Not(:hzn_dep));
println(size(df_o))

# Noise screening per `id`: `maxdiff` is the largest gap between consecutive sorted
# `soc` values of that id. Ids with fewer than two records get the sentinel -1, so
# they always pass the threshold filter below.
gdf = groupby(df_o, :id);
# A single grouped pass: the scalar returned for a group is recycled over all of its
# rows. This replaces re-scanning the whole `id` column once per group.
# `Float64(...)` keeps `maxdiff` a Float64 column regardless of the `soc` eltype.
transform!(
    gdf,
    :soc => (soc -> length(soc) < 2 ? -1.0 : Float64(maximum(abs, diff(sort(soc))))) => :maxdiff,
);
println(size(df_o))
# Discard every id whose repeated measurements differ by more than 50.
df_o = df_o[df_o.maxdiff .<= 50, :];
println(size(df_o))

# coords = collect(zip(df_o.lat, df_o.lon));

########################
###### clean cov #######
########################
# Covariate columns are taken by position: from the 18th column up to the
# second-to-last one (the last column at this point is `maxdiff`).
names_cov = Symbol.(names(df_o))[18:end-1];

# Fix soilsuite and cropland extent columns: missing values become 0, and cropland
# extent is additionally binarised (any value > 0 becomes 1).
for col in names_cov
    colname = String(col)
    if occursin("_soilsuite_", colname)
        df_o[!, col] = replace(df_o[!, col], missing => 0)
    elseif occursin("cropland_extent_", colname)
        df_o[!, col] = replace(df_o[!, col], missing => 0)
        df_o[!, col] .= ifelse.(df_o[!, col] .> 0, 1, 0)
    end
end

# Handle missing values. Every covariate is classified exactly once:
#   * `CHELSA_kg` columns (categorical)            -> drop the column
#   * more than 5 % missing                        -> drop the column
#   * some missing values, but at most 5 %         -> drop the affected rows
# Classifying first and dropping afterwards guarantees that a column is never
# removed twice and that a removed column never ends up in `cols_to_drop_row`
# (either case would make `select!` / `subset` throw on a non-existent column).
cols_to_drop_row = Symbol[];
cols_to_drop_col = Symbol[];
for col in names_cov
    if occursin("CHELSA_kg", String(col))
        push!(cols_to_drop_col, col)  # rm kg categorical col
        continue
    end
    n_missing = count(ismissing, df_o[!, col])
    if n_missing / nrow(df_o) > 0.05
        println(n_missing, " ", col)
        push!(cols_to_drop_col, col)
    elseif n_missing > 0
        push!(cols_to_drop_row, col)
    end
end
select!(df_o, Not(cols_to_drop_col))  # drop all flagged columns in one go

names_cov = filter(x -> !(x in cols_to_drop_col), names_cov) # remove cols-to-drop from names_cov
if !isempty(cols_to_drop_row)
    df_o = subset(df_o, cols_to_drop_row .=> ByRow(!ismissing)) # drop rows with missing values in cols_to_drop_row
end
println(size(df_o))

# Remove constant covariates (std == 0); they carry no information and cannot be
# standardised.
cols_to_drop_col = filter(col -> std(df_o[!, col]) == 0, names_cov)
select!(df_o, Not(cols_to_drop_col))
names_cov = filter(x -> !(x in cols_to_drop_col), names_cov) # remove cols-to-drop from names_cov
println(size(df_o))

# for col in names_cov # to check covairate distribution
#     println(string(col)[1:10], ' ', round(std(df[:, col]); digits=2), ' ', round(mean(df[:, col]); digits=2))
# end

# Per-predictor mean and standard deviation (missing values skipped); these are the
# statistics for normalising covariates as (x - mean) / std.
means = [mean(skipmissing(df_o[!, c])) for c in predictors];
stds = [std(skipmissing(df_o[!, c])) for c in predictors];

using JLD2

# Lookup table: predictor name => (mean, std), stored as Float64.
scaler = Dict{Symbol, NamedTuple{(:mean, :std), Tuple{Float64, Float64}}}(
    col => (mean = m, std = s) for (col, m, s) in zip(predictors, means, stds)
)

# Saved under the name `cov_scaler`, which is the key it has to be loaded with.
cov_scaler = scaler
@save "./data/covs_scaler.jld2" cov_scaler


(62577, 422)
(62199, 421)
(62199, 422)
(57343, 422)
33487 CHELSA_swe_1981_2010_V_2_1
(56117, 415)
(56117, 380)
