In [1]:
using DataFrames, CSV, MLJ, VegaLite

In [2]:
# Read the training CSV, keep the response and three predictors, give them short
# names (positionally, in the order selected) and coerce Count columns to Continuous.
lichen_training = coerce(
    rename(
        select(
            CSV.read("C:/Users/julio/Downloads/lichen_training.csv", DataFrame),
            "Total.lichen", "basal_area", "PL", "mean_Long",
        ),
        ["biomass", "basalarea", "pl", "lon"],
    ),
    Count => Continuous,
);

In [3]:
# Model the response on the log scale: add `biomass_log`, then drop the raw column.
# NOTE(review): `log` yields -Inf for a zero biomass value — confirm the data has no zeros.
lichen_training.biomass_log = log.(lichen_training.biomass)
select!(lichen_training, Not(:biomass));

In [5]:
# Inspect the column names, machine types and scientific types of the prepared table
schema(lichen_training)

┌─────────────┬─────────┬────────────┐
│ _.names     │ _.types │ _.scitypes │
├─────────────┼─────────┼────────────┤
│ basalarea   │ Float64 │ Continuous │
│ pl          │ Float64 │ Continuous │
│ lon         │ Float64 │ Continuous │
│ biomass_log │ Float64 │ Continuous │
└─────────────┴─────────┴────────────┘
_.nrows = 78


In [8]:
# Split into the response `y` (the `biomass_log` column) and the predictors `X`
# (every remaining column); `rng=1010` seeds the row shuffling `unpack` performs
# when an `rng` is given, so the split is reproducible.
y, X = unpack(lichen_training, col -> col == :biomass_log, col -> true; rng=1010);

In [9]:
# List every registered model compatible with the scitypes of (X, y), keeping
# only pure-Julia implementations; prints the padded model name and its package.
for candidate in models(matching(X, y))
    candidate.is_pure_julia || continue
    println(rpad(candidate.name, 40), "($(candidate.package_name))")
end

ConstantRegressor                       (MLJModels)
DecisionTreeRegressor                   (BetaML)
DecisionTreeRegressor                   (DecisionTree)
DeterministicConstantRegressor          (MLJModels)
ElasticNetRegressor                     (MLJLinearModels)
EvoTreeGaussian                         (EvoTrees)
EvoTreeRegressor                        (EvoTrees)
HuberRegressor                          (MLJLinearModels)
KNNRegressor                            (NearestNeighborModels)
KPLSRegressor                           (PartialLeastSquaresRegressor)
LADRegressor                            (MLJLinearModels)
LassoRegressor                          (MLJLinearModels)
LinearRegressor                         (GLM)
LinearRegressor                         (MLJLinearModels)
LinearRegressor                         (MultivariateStats)
NeuralNetworkRegressor                  (MLJFlux)
PLSRegressor                            (PartialLeastSquaresRegressor)
QuantileRegressor                     

In [14]:
# Load the RidgeRegressor model code from the MLJLinearModels package.
# `ridge_regressor` is bound to the model *type*; an instance is created by
# calling it, e.g. `ridge_regressor()`.
ridge_regressor = @load RidgeRegressor pkg=MLJLinearModels

import MLJLinearModels ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\julio\.julia\packages\MLJModels\GKDnU\src\loading.jl:168


MLJLinearModels.RidgeRegressor

In [15]:
# Create a pipeline: standardize the features, then fit a ridge regression.
# Both components are constructed with their default hyperparameters.
# NOTE(review): `@pipeline` is reportedly deprecated in newer MLJ releases in favour
# of `Standardizer() |> ridge_regressor()` — confirm before upgrading MLJ.
ridge_regressor_pipe = @pipeline(Standardizer(), ridge_regressor())

Pipeline276(
    standardizer = Standardizer(
            features = Symbol[],
            ignore = false,
            ordered_factor = false,
            count = false),
    ridge_regressor = RidgeRegressor(
            lambda = 1.0,
            fit_intercept = true,
            penalize_intercept = false,
            solver = nothing))

In [16]:
# Create an MLJ machine binding the pipeline (model) to the data (X, y).
# Constructing the machine does not train anything; training happens in `fit!`.
rr_model = machine(ridge_regressor_pipe, X, y)

Machine{Pipeline276,…} trained 0 times; caches data
  args: 
    1:	Source @482 ⏎ `Table{AbstractVector{Continuous}}`
    2:	Source @217 ⏎ `AbstractVector{Continuous}`


In [18]:
# Split the row indices 70/30 into shuffled train/test subsets (seeded for
# reproducibility). `y` and `X` are re-created with the same seed used when the
# machine was built, so these indices refer to the row order the machine holds.
y, X = unpack(lichen_training, col -> col == :biomass_log, col -> true; rng=1010);
train, test = partition(eachindex(y), 0.7; shuffle=true, rng=1010)

([66, 63, 5, 22, 49, 15, 47, 64, 35, 12  …  11, 67, 54, 73, 32, 4, 62, 61, 57, 21], [14, 78, 40, 27, 74, 68, 18, 2, 53, 50  …  77, 29, 69, 7, 60, 59, 17, 30, 43, 51])

In [19]:
# Train the pipeline on the training rows only
fit!(rr_model; rows=train)

┌ Info: Training Machine{Pipeline276,…}.
└ @ MLJBase C:\Users\julio\.julia\packages\MLJBase\QXObv\src\machines.jl:403
┌ Info: Training Machine{Standardizer,…}.
└ @ MLJBase C:\Users\julio\.julia\packages\MLJBase\QXObv\src\machines.jl:403
┌ Info: Training Machine{RidgeRegressor,…}.
└ @ MLJBase C:\Users\julio\.julia\packages\MLJBase\QXObv\src\machines.jl:403


Machine{Pipeline276,…} trained 1 time; caches data
  args: 
    1:	Source @482 ⏎ `Table{AbstractVector{Continuous}}`
    2:	Source @217 ⏎ `AbstractVector{Continuous}`


In [20]:
# Check the trained parameters: the ridge coefficients and intercept, plus the
# per-feature pair of values stored by the standardizer (presumably mean and
# standard deviation — confirm against the Standardizer docs)
fitted_params(rr_model)

(ridge_regressor = (coefs = [:basalarea => -0.7942611119507311, :pl => 1.2122873454925065, :lon => -1.0287575660792603],
                    intercept = 4.4364774738687665,),
 standardizer = Dict(:basalarea => (20.008545454545455, 9.491712255901016), :pl => (61.6, 42.11562567415954), :lon => (-125.2877088, 0.839681844712773)),
 machines = Machine[Machine{Standardizer,…}, Machine{RidgeRegressor,…}],
 fitted_params_given_machine = OrderedCollections.LittleDict{Any, Any, Vector{Any}, Vector{Any}}(Machine{Standardizer,…} => Dict(:basalarea => (20.008545454545455, 9.491712255901016), :pl => (61.6, 42.11562567415954), :lon => (-125.2877088, 0.839681844712773)), Machine{RidgeRegressor,…} => (coefs = [:basalarea => -0.7942611119507311, :pl => 1.2122873454925065, :lon => -1.0287575660792603], intercept = 4.4364774738687665)),)

In [22]:
# Predict log-biomass for the training rows and for the held-out test rows
y_train_pred = predict(rr_model; rows=train)
y_test_pred = predict(rr_model; rows=test);

In [14]:
# Print the names of all registered measures that score deterministic predictions.
# Disabled alternative that filters on the target scitype instead:
# [m.name for m in measures() if m.target_scitype <: scitype(y)]
println(map(m -> m.name, filter(m -> m.prediction_type == :deterministic, measures())))

["LPLoss", "LogCoshLoss", "Accuracy", "BalancedAccuracy", "ConfusionMatrix", "FScore", "FalseDiscoveryRate", "FalseNegative", "FalseNegativeRate", "FalsePositive", "FalsePositiveRate", "MatthewsCorrelation", "MeanAbsoluteError", "MeanAbsoluteProportionalError", "MisclassificationRate", "MulticlassFScore", "MulticlassFalseDiscoveryRate", "MulticlassFalseNegative", "MulticlassFalseNegativeRate", "MulticlassFalsePositive", "MulticlassFalsePositiveRate", "MulticlassNegativePredictiveValue", "MulticlassPrecision", "MulticlassTrueNegative", "MulticlassTrueNegativeRate", "MulticlassTruePositive", "MulticlassTruePositiveRate", "NegativePredictiveValue", "Precision", "RSquared", "RootMeanSquaredError", "RootMeanSquaredLogError", "RootMeanSquaredLogProportionalError", "RootMeanSquaredProportionalError", "TrueNegative", "TrueNegativeRate", "TruePositive", "TruePositiveRate", "HuberLoss", "L1EpsilonInsLoss", "L2EpsilonInsLoss", "LPDistLoss", "LogitDistLoss", "PeriodicLoss", "QuantileLoss"]


In [12]:
# Print the full metadata entry of the RootMeanSquaredError measure
println([entry for entry in measures() if entry.name == "RootMeanSquaredError"])

NamedTuple{(:name, :instances, :human_name, :target_scitype, :supports_weights, :supports_class_weights, :prediction_type, :orientation, :reports_each_observation, :aggregation, :is_feature_dependent, :docstring, :distribution_type), Tuple{String, Vector{String}, String, Union, Bool, Bool, Symbol, Symbol, Bool, StatisticalTraits.RootMeanSquare, Bool, String, DataType}}[(name = RootMeanSquaredError, instances = [rms, rmse, root_mean_squared_error], ...)]


In [24]:
# Show some accuracy measures on the test and training subsets.
# Arguments are passed as (predictions, observed values). Both are on the
# log-biomass scale, so the reported errors are in log units.
# `rms` is an instance of RootMeanSquaredError; `rmsp` is presumably the
# proportional variant (RootMeanSquaredProportionalError) — confirm in the MLJ docs.
@show rms(y_test_pred, y[test])
@show rms(y_train_pred, y[train])
@show rmsp(y_test_pred, y[test])
@show rmsp(y_train_pred, y[train]);

rms(y_test_pred, y[test]) = 2.6312134768852733
rms(y_train_pred, y[train]) = 2.2170870123096473
rmsp(y_test_pred, y[test]) = 2.728853375593934
rmsp(y_train_pred, y[train]) = 2.607014111366455


In [29]:
# Fit the machine using the complete training dataset: no `rows` keyword is
# given, so training is not restricted to the `train` subset used earlier.
# Re-running this cell without changes does not retrain unless `force=true` is passed.
fit!(rr_model)

┌ Info: Not retraining Machine{Pipeline276,…}. Use `force=true` to force.
└ @ MLJBase C:\Users\julio\.julia\packages\MLJBase\QXObv\src\machines.jl:406


Machine{Pipeline276,…} trained 2 times; caches data
  args: 
    1:	Source @482 ⏎ `Table{AbstractVector{Continuous}}`
    2:	Source @217 ⏎ `AbstractVector{Continuous}`


In [30]:
# Import the CSV to predict on, keep the same predictors used for training (same
# names, same order) and apply the same Count => Continuous coercion as the
# training table. Without the coercion, integer-valued columns would keep the
# Count scitype, which the pipeline's Standardizer (count = false) leaves
# untouched, so the ridge model would receive unstandardized features and
# return silently wrong predictions. The coercion is a no-op for columns that
# are already Continuous.
lichen = CSV.read("C:/Users/julio/Downloads/predicted_biomass_Nov2021.csv", DataFrame) |>
x -> select(x, :BASAL_AREA, :PL, :LON) |>
x -> rename(x, ["basalarea", "pl", "lon"]) |>
x -> coerce(x, Count => Continuous);

In [32]:
# Predict log-biomass for the new dataset and back-transform to the original scale
biomass = map(exp, predict(rr_model, lichen));

In [33]:
# Write the back-transformed predictions to a single-column CSV file
DataFrame(biomass=biomass) |>
    CSV.write("C:/Users/julio/Downloads/predicted_biomass_rr.csv")

"C:/Users/julio/Downloads/predicted_biomass_rr.csv"