In [None]:
using DataFrames, CSV, MLJ, VegaLite

In [None]:
# Read the predicted-biomass CSV into a DataFrame (empty fields are parsed as
# `missing`) and keep only the response column and the three predictor columns
lichen = select(
    CSV.read(
        "C:/Users/julio/Downloads/predicted_biomass_Nov2021.csv",
        DataFrame;
        missingstring="",
    ),
    :biomass_log, :BASAL_AREA, :PL, :LON,
)

In [None]:
# Show the table schema (column names with their scientific and machine types)
schema(lichen)

In [None]:
# Separate the response (y = :biomass_log) from the predictors (X = every
# remaining column). Passing `rng` makes `unpack` shuffle the rows with that seed.
y, X = unpack(lichen, name -> name == :biomass_log, name -> true; rng=1010);

In [None]:
# List every registered model compatible with the scitypes of X and y,
# skipping those that are not implemented in pure Julia
for m in models(matching(X, y))
    m.is_pure_julia || continue
    println(rpad(m.name, 40), "($(m.package_name))")
end

In [None]:
# Load the RandomForestRegressor model code provided by the DecisionTree package.
# `add=true` lets MLJ add the required interface package to the active
# environment; `verbosity=0` silences the loading messages.
# Note: `model` holds what `@load` returns and is called like a constructor
# below (`model(n_trees=50)`), i.e. it is the model type, not an instance.
model = @load RandomForestRegressor pkg="DecisionTree" add=true verbosity=0

In [None]:
# Display the metadata MLJ records for the loaded model
info(model)

In [None]:
# Instantiate a random forest regressor with 50 trees.
# The forest is seeded (same seed as the data split) so that repeated runs of
# the notebook produce the same trees and therefore the same scores.
model_rf = model(n_trees=50, rng=1010)

In [None]:
# Create an MLJ machine, which binds the model to the data (X, y).
# Training happens later, when `fit!` is called on the machine.
mach_rf = machine(model_rf, X, y)

In [None]:
# Build shuffled row-index sets: 70% of the rows for training, the rest for
# testing (seeded for reproducibility)
train, test = partition(eachindex(y), 0.7; shuffle=true, rng=1010)

In [None]:
# Train the machine using only the training rows
fit!(mach_rf; rows=train)

In [None]:
# Inspect the parameters learned by the fitted machine
fitted_params(mach_rf)

In [None]:
# Predict the response for the held-out test rows
y_test_pred = predict(mach_rf; rows=test)

In [None]:
# Print the names of all measures that apply to deterministic predictions.
# (An alternative filter would be on the target scitype:
#  `m.target_scitype <: scitype(y)`.)
println([
    m.name for m in measures() if m.prediction_type == :deterministic
])

In [None]:
# Show some accuracy measures for the test-row predictions.
# Each measure is called as measure(prediction, ground_truth).
# The `l1` and `l2` results are piped through `mean` to reduce them to one number.
# NOTE(review): `rmsl` is a log-based error and presumably needs strictly
# positive predictions and targets; `biomass_log` may contain values <= 0 —
# TODO confirm before relying on this figure.
@show rms(y_test_pred, y[test])
@show l1(y_test_pred, y[test]) |> mean
@show l2(y_test_pred, y[test]) |> mean
@show mav(y_test_pred, y[test])
@show rmsl(y_test_pred, y[test]);

In [None]:
# Evaluate the model on the full data set with 3-fold cross-validation,
# reporting the root-mean-squared error.
# The fold shuffle is seeded (same seed as the train/test split) so the fold
# assignment is the same on every run.
evaluate(model_rf, X, y;
         resampling=CV(nfolds=3, shuffle=true, rng=1010),
         measure=rms, verbosity=1)

In [None]:
# Evaluate the machine with 3-fold cross-validation, reporting the
# root-mean-squared error.
# The fold shuffle is seeded (same seed as the train/test split) so the fold
# assignment is the same on every run.
evaluate!(mach_rf;
          resampling=CV(nfolds=3, shuffle=true, rng=1010),
          measure=rms, verbosity=1)

In [None]:
# Increase the number of trees and re-evaluate the machine on a 70/30 holdout.
# `mach_rf` was built from `model_rf`, so the hyperparameter change made here
# is what the machine is evaluated with.
# The holdout shuffle is seeded so the split is the same on every run.
model_rf.n_trees = 100
evaluate!(mach_rf;
          resampling=Holdout(fraction_train=0.7, shuffle=true, rng=1010),
          measure=rms, verbosity=1)