In [1]:
using DataFrames, CSV, MLJ, VegaLite

In [2]:
# Read the predicted-biomass CSV into a DataFrame (empty fields are treated as missing)
# and keep only the response column plus the three predictor columns
lichen = select(
    CSV.read("C:/Users/julio/Downloads/predicted_biomass_Nov2021.csv", DataFrame; missingstring=""),
    :biomass_log, :BASAL_AREA, :PL, :LON,
)

Unnamed: 0_level_0,biomass_log,BASAL_AREA,PL,LON
Unnamed: 0_level_1,Float64,Float64,Float64,Float64
1,675.195,29.1059,95.0,-125.514
2,270.647,33.2638,75.0,-125.46
3,441.115,32.757,95.0,-125.335
4,755.119,15.1756,80.0,-125.356
5,2008.27,8.77539,100.0,-125.361
6,391.87,37.5881,100.0,-125.331
7,389.608,37.5881,100.0,-125.326
8,482.739,35.5002,100.0,-125.405
9,353.893,37.5944,95.0,-125.373
10,602.184,27.9706,95.0,-125.371


In [3]:
# Show column names, machine types and scientific types of the table
lichen |> schema

┌─────────────┬─────────┬────────────┐
│ _.names     │ _.types │ _.scitypes │
├─────────────┼─────────┼────────────┤
│ biomass_log │ Float64 │ Continuous │
│ BASAL_AREA  │ Float64 │ Continuous │
│ PL          │ Float64 │ Continuous │
│ LON         │ Float64 │ Continuous │
└─────────────┴─────────┴────────────┘
_.nrows = 213821


In [4]:
# Separate the response (y = :biomass_log) from the predictors (X).
# The first selector picks the response column; the second is a catch-all for X.
# rng=1010 fixes the seed unpack uses when it shuffles rows (see the MLJ `unpack` docs).
y, X = unpack(lichen,
              col -> col == :biomass_log,
              col -> true;
              rng=1010);

In [5]:
# List every registered model compatible with the scitypes of (X, y),
# skipping anything that is not implemented in pure Julia
for m in models(matching(X, y))
    m.is_pure_julia || continue
    println(rpad(m.name, 40), "($(m.package_name))")
end

ConstantRegressor                       (MLJModels)
DecisionTreeRegressor                   (BetaML)
DecisionTreeRegressor                   (DecisionTree)
DeterministicConstantRegressor          (MLJModels)
ElasticNetRegressor                     (MLJLinearModels)
EvoTreeGaussian                         (EvoTrees)
EvoTreeRegressor                        (EvoTrees)
HuberRegressor                          (MLJLinearModels)
KNNRegressor                            (NearestNeighborModels)
KPLSRegressor                           (PartialLeastSquaresRegressor)
LADRegressor                            (MLJLinearModels)
LassoRegressor                          (MLJLinearModels)
LinearRegressor                         (GLM)
LinearRegressor                         (MLJLinearModels)
LinearRegressor                         (MultivariateStats)


NeuralNetworkRegressor                  (MLJFlux)
PLSRegressor                            (PartialLeastSquaresRegressor)
QuantileRegressor                       (MLJLinearModels)
RandomForestRegressor                   (BetaML)
RandomForestRegressor                   (DecisionTree)
RidgeRegressor                          (MLJLinearModels)
RidgeRegressor                          (MultivariateStats)
RobustRegressor                         (MLJLinearModels)


In [6]:
# Load the RandomForestRegressor model type provided via the DecisionTree package.
# `add=true` lets @load install the interface package when the environment lacks it;
# `model` is bound to the model *type*, which is instantiated in a later cell.
model = @load RandomForestRegressor pkg="DecisionTree" add=true verbosity=0

[32m[1m    Updating[22m[39m registry at `C:\Users\julio\.julia\registries\General`


[32m[1m    Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`


[32m[1m   Resolving[22m[39m package versions...


[32m[1m    Updating[22m[39m `C:\Users\julio\.julia\environments\v1.6\Project.toml`
 [90m [c6f25543] [39m

[92m+ MLJDecisionTreeInterface v0.1.3[39m
[32m[1m    Updating[22m[39m `C:\Users\julio\.julia\environments\v1.6\Manifest.toml`
 [90m [7806a523] [39m[92m+ DecisionTree v0.10.11[39m
 [90m [c6f25543] [39m[92m+ MLJDecisionTreeInterface v0.1.3[39m
 [90m [6e75b9c4] [39m[92m+ ScikitLearnBase v0.5.0[39m


MLJDecisionTreeInterface.RandomForestRegressor

In [7]:
# Print the registry metadata (traits) recorded for the loaded model type
model |> info

Random forest regressor.
→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).
→ do `@load RandomForestRegressor pkg="DecisionTree"` to use the model.
→ do `?RandomForestRegressor` for documentation.
(name = "RandomForestRegressor",
 package_name = "DecisionTree",
 is_supervised = true,
 abstract_type = Deterministic,
 deep_properties = (),
 docstring = "Random forest regressor.\n→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).\n→ do `@load RandomForestRegressor pkg=\"DecisionTree\"` to use the model.\n→ do `?RandomForestRegressor` for documentation.",
 fit_data_scitype = Tuple{Table{var"#s53"} where var"#s53"<:Union{AbstractVector{var"#s52"} where var"#s52"<:Count, AbstractVector{var"#s52"} where var"#s52"<:OrderedFactor, AbstractVector{var"#s52"} where var"#s52"<:Continuous}, AbstractVector{Continuous}},
 hyperparameter_ranges = (nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing,

In [8]:
# Build a random forest regressor with 50 trees; every other hyperparameter keeps
# its default. NOTE: `rng` defaults to the global RNG, so fits are not seeded.
model_rf = model(; n_trees=50)

RandomForestRegressor(
    max_depth = -1,
    min_samples_leaf = 1,
    min_samples_split = 2,
    min_purity_increase = 0.0,
    n_subfeatures = -1,
    n_trees = 50,
    sampling_fraction = 0.7,
    pdf_smoothing = 0.0,
    rng = Random._GLOBAL_RNG())

In [9]:
# Bind the model to the full data set in an MLJ machine (model + data).
# Nothing is trained yet; the rows used for fitting are chosen at fit! time.
mach_rf = machine(model_rf, X, y)

Machine{RandomForestRegressor,…} trained 0 times; caches data
  args: 
    1:	Source @103 ⏎ `Table{AbstractVector{Continuous}}`
    2:	Source @424 ⏎ `AbstractVector{Continuous}`


In [10]:
# Shuffle the row indices of y with a fixed seed and split them 70/30 into
# train and test index vectors
train, test = partition(eachindex(y), 0.7; shuffle=true, rng=1010)

([147639, 177698, 91889, 34295, 48724, 125313, 119613, 148157, 57157, 191173  …  171075, 44873, 201843, 85709, 126579, 48495, 169544, 63318, 180986, 115741], [177578, 21749, 148862, 163174, 165760, 95349, 195482, 75326, 180782, 17438  …  34885, 49657, 113535, 166407, 112955, 52881, 124062, 41582, 55339, 192691])

In [11]:
# Train the machine on the training rows only
fit!(mach_rf; rows=train)

┌ Info: Training Machine{RandomForestRegressor,…}.
└ @ MLJBase C:\Users\julio\.julia\packages\MLJBase\QXObv\src\machines.jl:403


Machine{RandomForestRegressor,…} trained 1 time; caches data
  args: 
    1:	Source @103 ⏎ `Table{AbstractVector{Continuous}}`
    2:	Source @424 ⏎ `AbstractVector{Continuous}`


In [12]:
# Inspect what was learned (the fitted forest)
mach_rf |> fitted_params

(forest = Ensemble of Decision Trees
Trees:      50
Avg Leaves: 75250.82
Avg Depth:  30.88,)

In [13]:
# Predict the response for the held-out test rows
y_test_pred = predict(mach_rf; rows=test)

64146-element Vector{Float64}:
  345.90909316884296
   40.85744981503151
   52.22712952730869
   51.94727156592433
   73.02638432847881
  102.24555000774224
  306.818678813781
  542.1554043029845
   71.74544182696063
  141.74636732545682
    ⋮
   32.697064211941886
   15.918151952175776
  142.96103602856704
   97.95152686190634
   40.41504104221105
  103.64116298061818
 1085.4341340748072
   46.00926678814909
   29.855037935233995

In [14]:
# Print the names of all registered measures that score deterministic (point) predictions.
# Disabled alternative that filters on the target scitype instead:
# [m.name for m in measures() if m.target_scitype <: scitype(y)]
println(map(m -> m.name, filter(m -> m.prediction_type == :deterministic, measures())))

["LPLoss", "LogCoshLoss", "Accuracy", "BalancedAccuracy", "ConfusionMatrix", "FScore", "FalseDiscoveryRate", "FalseNegative", "FalseNegativeRate", "FalsePositive", "FalsePositiveRate", "MatthewsCorrelation", "MeanAbsoluteError", "MeanAbsoluteProportionalError", "MisclassificationRate", "MulticlassFScore", "MulticlassFalseDiscoveryRate", "MulticlassFalseNegative", "MulticlassFalseNegativeRate", "MulticlassFalsePositive", "MulticlassFalsePositiveRate", "MulticlassNegativePredictiveValue", "MulticlassPrecision", "MulticlassTrueNegative", "MulticlassTrueNegativeRate", "MulticlassTruePositive", "MulticlassTruePositiveRate", "NegativePredictiveValue", "Precision", "RSquared", "RootMeanSquaredError", "RootMeanSquaredLogError", "RootMeanSquaredLogProportionalError", "RootMeanSquaredProportionalError", "TrueNegative", "TrueNegativeRate", "TruePositive", "TruePositiveRate", "HuberLoss", "L1EpsilonInsLoss", "L2EpsilonInsLoss", "LPDistLoss", "LogitDistLoss", "PeriodicLoss", "QuantileLoss"]


In [15]:
# Report several error measures on the held-out test rows.
# Measures are called as measure(prediction, ground_truth).
# MLJ measures accept no `sigdigits` keyword, so rounding is applied to the returned
# value with `round` instead of being passed to `rms`.
@show round(rms(y_test_pred, y[test]), sigdigits=3)
@show l1(y_test_pred, y[test]) |> mean   # mean of the per-observation absolute errors
@show l2(y_test_pred, y[test]) |> mean   # mean of the per-observation squared errors
@show mav(y_test_pred, y[test])
@show rmsl(y_test_pred, y[test]);

rms(y_test_pred, y[test]) = 31.72304784769363


l1(y_test_pred, y[test]) |> mean = 4.751955310945135
l2(y_test_pred, y[test]) |> mean = 1006.3517647470552


mav(y_test_pred, y[test]) = 4.751955310945115


rmsl(y_test_pred, y[test]) = 0.020049349192033973


In [16]:
# Estimate the RMSE of the model with shuffled 3-fold cross-validation over all rows
# (no seed is given to CV, so the folds differ between runs)
evaluate(model_rf, X, y;
         resampling=CV(; nfolds=3, shuffle=true),
         measure=rms,
         verbosity=1)





PerformanceEvaluation object with these fields:
  measure, measurement, operation, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_pairs
Extract:
┌────────────────────────┬─────────────┬───────────┬────────────────────┐
│ measure                │ measurement │ operation │ per_fold           │
├────────────────────────┼─────────────┼───────────┼────────────────────┤
│ RootMeanSquaredError() │ 39.5        │ predict   │ [39.4, 36.2, 42.5] │
└────────────────────────┴─────────────┴───────────┴────────────────────┘


In [17]:
# Same shuffled 3-fold CV estimate of the RMSE, run through the existing machine.
# Disabled alternative using a single 70/30 holdout split:
# evaluate!(mach_rf,
#           resampling=Holdout(fraction_train=0.7, shuffle=true),
#           measure=rms, verbosity=1)
evaluate!(mach_rf;
          resampling=CV(; nfolds=3, shuffle=true),
          measure=rms,
          verbosity=1)





PerformanceEvaluation object with these fields:
  measure, measurement, operation, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_pairs
Extract:
┌────────────────────────┬─────────────┬───────────┬────────────────────┐
│ measure                │ measurement │ operation │ per_fold           │
├────────────────────────┼─────────────┼───────────┼────────────────────┤
│ RootMeanSquaredError() │ 38.6        │ predict   │ [40.9, 39.1, 35.7] │
└────────────────────────┴─────────────┴───────────┴────────────────────┘


In [18]:
# Raise the forest size to 100 trees (mutating model_rf, the model mach_rf was built
# from) and score the machine again on a single shuffled 70/30 holdout split.
# NOTE(review): this uses Holdout while the previous cell used 3-fold CV, so the two
# RMSE figures are not directly comparable.
model_rf.n_trees = 100
evaluate!(mach_rf;
          resampling=Holdout(; fraction_train=0.7, shuffle=true),
          measure=rms,
          verbosity=1)

PerformanceEvaluation object with these fields:
  measure, measurement, operation, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_pairs
Extract:
┌────────────────────────┬─────────────┬───────────┬──────────┐
│ measure                │ measurement │ operation │ per_fold │
├────────────────────────┼─────────────┼───────────┼──────────┤
│ RootMeanSquaredError() │ 41.5        │ predict   │ [41.5]   │
└────────────────────────┴─────────────┴───────────┴──────────┘
