In [3]:
###Rudimentary test suite to ensure that updates do not break the code 
using Ronin
using Missings 
using HDF5 
using NCDatasets
using BenchmarkTools 
using StatsBase
using Scratch

using JLD2 
include("../src/DecisionTree/DecisionTree.jl")
###Scratch directory (keyed by "ronin_testing") shared by the cells below for test artifacts
global scratchspace = @get_scratch!("ronin_testing")


###The suite runs a basic training/QC pipeline. The model is not expected to produce
###correct results; the point is only to show that the code can produce data, train a
###model, and apply that model to a scan.

###Inputs for the single-pass model checks

TRAINING_PATH = "../BENCHMARKING/benchmark_cfrads/"
config_file_path = "../BENCHMARKING/benchmark_setup/config.txt"
sample_model = "../BENCHMARKING/benchmark_setup/benchmark_model.joblib"



"../BENCHMARKING/benchmark_setup/benchmark_model.joblib"

In [92]:
###TODO: this still needs to be able to ignore comments
tasks = Ronin.get_task_params(config_file_path)

###3x3 all-ones matrix used wherever a task needs no dedicated weights
placeholder_matrix = allowmissing(fill(1.0, 3, 3))
###Weight written into the middle cell of every spatial window below
center_weight::Float64 = 0

###Weight matrices for calculating spatial parameters: each one is an all-ones window
###whose central element is replaced by center_weight
iso_window::Tuple{Int64, Int64} = (7, 7)
iso_weights::Matrix{Union{Missing, Float64}} = allowmissing(fill(1.0, iso_window))
iso_weights[cld.(iso_window, 2)...] = center_weight

avg_window::Tuple{Int64, Int64} = (5, 5)
avg_weights::Matrix{Union{Missing, Float64}} = allowmissing(fill(1.0, avg_window))
avg_weights[cld.(avg_window, 2)...] = center_weight

std_window::Tuple{Int64, Int64} = (5, 5)
std_weights::Matrix{Union{Missing, Float64}} = allowmissing(fill(1.0, std_window))
std_weights[cld.(std_window, 2)...] = center_weight


###One weight matrix per task; the same placeholder object is reused for several entries
weight_matrixes = [
    placeholder_matrix,
    placeholder_matrix,
    std_weights,
    placeholder_matrix,
    placeholder_matrix,
    iso_weights,
]

###Two HDF5 paths inside the scratch directory
path1 = joinpath(scratchspace, "_1.h5")
path2 = joinpath(scratchspace, "_2.h5")

        
"""
    clean_config()

Build and return a fresh three-pass `ModelConfig` for the tests.

The configuration reads the global `ds_path` as its `input_path` and places the trained
models and calculated features inside the global `scratchspace` directory. Every pass
uses the task file `./tasks.txt`, the probability thresholds `(.1, .9)` and the same pair
of task weights.
"""
function clean_config()

    task_path = "./tasks.txt"

    ###One task file per pass
    task_paths = [task_path, task_path, task_path]
    input_path = ds_path
    num_models = 3
    initial_met_prob = (.1, .9)
    final_met_prob = (.1,.9)

    ###Combine into vector for model configuration object
    ###It's important to note that len(met_probs) is enforced to be equal to num_models
    met_probs = [initial_met_prob, initial_met_prob, final_met_prob]

    ###Default windows come from Ronin (the original notes place them in RoninConstants.jl).
    ###Besides the two used here, Ronin.standard_window (standard 7x7) and
    ###Ronin.azi_window (7x7, nonzero weights only in azimuth) are also available.
    ###7x7 window with only nonzero weights in range dimension
    rw = Ronin.range_window
    ###Placeholder window for tasks that do not require spatial context
    pw = Ronin.placeholder_window

    ###Specify a weight matrix for each individual task in the configuration file
    weight_vec = [pw, rw]
    ###Specify a weight vector for each model pass
    ###len(weight_vector) is enforced to be equal to num_models (should have a set of weights for each pass)
    task_weights = [weight_vec, weight_vec, weight_vec]

    base_name = joinpath(scratchspace, "raw_model")
    base_name_features = joinpath(scratchspace, "output_features")
    ###List of paths to output trained models to. Enforced to be same size as num_models
    model_output_paths = [base_name * "_$(i-1).jld2" for i in 1:num_models ]
    ###List of paths to output calculated features to. Enforced to be same size as num_models
    feature_output_paths = [base_name_features * "_$(i-1).h5" for i in 1:num_models]


    ###Options are "balanced" or "". If "balanced", the decision trees will be trained
    ###on a weighted version of the existing classes in order to combat class imbalance
    class_weights = "balanced"

    ###Name of variable in cfradials that has already had interactive QC applied
    QC_var = "VG"

    ###Name of a variable in the input cfradials that has not had postprocessing applied.
    ###Missing values in this variable mark the gates that are removed from prediction.
    ###(A dead `remove_var = "VV"` assignment that was immediately overwritten used to
    ###precede this line; only the effective value is kept.)
    remove_var = "VEL"

    ###Whether or not the input features for the model have already been calculated
    file_preprocessed = [false, false, false]

    ###Where to write out the masks to in cfradial file.
    mask_names = ["PASS_1_MASK", "PASS_2_MASK", "PASS_3_MASK"]


    ###Create and return the model config object
    return ModelConfig(num_models = num_models,model_output_paths =  model_output_paths,met_probs =  met_probs,
                        feature_output_paths = feature_output_paths, input_path = input_path,task_mode="nan",file_preprocessed = file_preprocessed,
                         task_paths = task_paths, QC_var = QC_var, remove_var = remove_var, QC_mask = false, mask_names = mask_names,
                         VARS_TO_QC = ["VEL"], class_weights = class_weights, HAS_INTERACTIVE_QC=true, task_weights = task_weights,
                         REMOVE_HIGH_PGG=false, REMOVE_LOW_NCP=false)

end

clean_config (generic function with 1 method)

In [28]:
###Build a toy-sized cfradial file whose contents are known exactly, so that the results
###of calculations on it can be checked by hand
ds_path = joinpath(scratchspace, "toy_set.nc")
###Start from a clean slate if an earlier run left the file behind
if isfile(ds_path)
    rm(ds_path)
end
ds = NCDataset(ds_path, "c")

###Start with a 5x5 grid
range_dim = 5
time_dim  = 5

times = collect(1:5)
ranges = collect(1:5)

###Randomly drawn integer reflectivity / velocity values, stored as (range, time) grids
sample_DBZ = convert(
    Matrix{Union{Missing, Float32}},
    reshape(sample(1:65, range_dim * time_dim), range_dim, time_dim),
)
sample_VEL = convert(
    Matrix{Float32},
    reshape(sample(-20:20, range_dim * time_dim), range_dim, time_dim),
)

###NCP is 1 everywhere except the first time column
sample_NCP = ones(range_dim, time_dim)
sample_NCP[:, 1] .= .1

###PGG is .1 everywhere except the first range row
sample_PGG = fill(.1, range_dim, time_dim)
sample_PGG[1, :] .= 1

###VG starts as an independent copy of VEL that can also hold missing values
sample_VG = Matrix{Union{Missing, Float32}}(sample_VEL)
###Collocate the missing gates with the low values of NCP
sample_VG[:, 1] .= missing
###Also blank out the third range row
sample_VG[3, :] .= missing

defDim(ds, "range", range_dim)
defDim(ds, "time", time_dim)

###Coordinate variables
tv = defVar(ds, "time", Float32, ("time",), attrib = Dict("units" => "s"))
tv[:] = times

rv = defVar(ds, "range", Float32, ("range",), attrib = Dict("units" => "m"))
rv[:] = ranges

###Data variables, all laid out as (range, time)
NCP = defVar(ds, "NCP", Float32, ("range", "time"), attrib = Dict("units" => "NCP units"))
NCP[:, :] = sample_NCP

VEL = defVar(ds, "VEL", Float32, ("range", "time"), attrib = Dict("units" => "m/s"))
VEL[:, :] = sample_VEL

defVar(ds, "DBZ", sample_DBZ, ("range", "time"), attrib = Dict("units" => "log"))
defVar(ds, "PGG", sample_PGG, ("range", "time"), attrib = Dict("units" => "percent"))
defVar(ds, "VG", sample_VG, ("range", "time"), attrib = Dict("units" => "m/s"))

close(ds)

closed Dataset

In [6]:
###Global settings for a two-pass configuration run against the toy dataset
task_path = "./tasks.txt"


###One task file per pass
task_paths = [task_path, task_path]
input_path = ds_path
num_models = 2
initial_met_prob = (.1, .9)
final_met_prob = (.1,.9)

###Combine into vector for model configuration object
###It's important to note that len(met_probs) is enforced to be equal to num_models
met_probs = [initial_met_prob, final_met_prob]

###The following are default windows specified in RoninConstants.jl
###Standard 7x7 window
sw = Ronin.standard_window
###7x7 window with only nonzero weights in azimuth dimension
aw = Ronin.azi_window
###7x7 window with only nonzero weights in range dimension
rw = Ronin.range_window
###Placeholder window for tasks that do not require spatial context
pw = Ronin.placeholder_window

###Specify a weight matrix for each individual task in the configuration file
weight_vec = [pw, rw]
###Specify a weight vector for each model pass
###len(weight_vector) is enforced to be equal to num_models (should have a set of weights for each pass)
task_weights = [weight_vec, weight_vec]

###NOTE(review): these base names are relative, so models/features land in the current
###working directory rather than in the scratch space -- confirm that this is intended
base_name = "raw_model"
base_name_features = "output_features"
###List of paths to output trained models to. Enforced to be same size as num_models
model_output_paths = [base_name * "_$(i-1).jld2" for i in 1:num_models ]
###List of paths to output calculated features to. Enforced to be same size as num_models
feature_output_paths = [base_name_features * "_$(i-1).h5" for i in 1:num_models]


###Options are "balanced" or "". If "balanced", the decision trees will be trained
###on a weighted version of the existing classes in order to combat class imbalance
class_weights = "balanced"

###Name of variable in cfradials that has already had interactive QC applied
QC_var = "VG"

###Name of a variable in the input cfradials that has not had postprocessing applied.
###Missing values in this variable mark the gates that are removed from prediction.
###(A dead `remove_var = "VV"` assignment that was immediately overwritten used to
###precede this line; only the effective value is kept.)
remove_var = "VEL"

###Whether or not the input features for the model have already been calculated
file_preprocessed = [false, false]

###Where to write out the masks to in cfradial file.
mask_names = ["PASS_1_MASK", "PASS_2_MASK"]




2-element Vector{String}:
 "PASS_1_MASK"
 "PASS_2_MASK"

In [25]:
###Display the scratch directory that holds the test artifacts
scratchspace

"/Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing"

In [29]:
###Start from a fresh configuration, then point it at the NOAA benchmark cfradials
###instead of the toy dataset
config = clean_config()
config.input_path = "../BENCHMARKING/benchmark_NOAA_cfrads"

"../BENCHMARKING/benchmark_NOAA_cfrads"

In [30]:
###Inspect the paths the trained models are written to
config.model_output_paths

2-element Vector{String}:
 "/Users/ischluesche/.julia/scrat"[93m[1m ⋯ 45 bytes ⋯ [22m[39m"/ronin_testing/raw_model_0.jld2"
 "/Users/ischluesche/.julia/scrat"[93m[1m ⋯ 45 bytes ⋯ [22m[39m"/ronin_testing/raw_model_1.jld2"

In [31]:
###Evaluate the configured model; the captured output ends with a one-row metrics table
###(precision, recall, f1 and the confusion-matrix counts)
evaluate_model(config)


[32mCALCULATING FEATURES FOR PASS: 1[39m
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222204.187_to_20240705_222208.161_N42RF-TM_AIR.nc in 0.25669312477111816 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222216.825_to_20240705_222220.799_N42RF-TM_AIR.nc in 0.11336898803710938 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222229.464_to_20240705_222233.438_N42RF-TM_AIR.nc in 0.10779500007629395 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222242.102_to_20240705_222246.076_N42RF-TM_AIR.nc in 0.12096595764160156 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222254.752_to_20240705_222258.731_N42RF-TM_AIR.nc in 0.10852599143981934 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222307.390_to_20240705_222311.370_N42RF-TM_AIR.nc in 0.11613702774047852 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222320.034_to_20240705_222324

Excessive output truncated after 524292 bytes.

units                = [36mdBm[39m
     meta_group           = [36mradar_calibration[39m
     _FillValue           = [36m-9999.0[39m

[32m  r_calib_test_power_v[39m   (1)
    Datatype:    [0m[1mUnion{Missing, Float32}[22m (Float32)
    Dimensions:  r_calib
    Attributes:
     long_name            = [36mradar_calibration_test_power_v_channel[39m
     units                = [36mdBm[39m
     meta_group           = [36mradar_calibration[39m
     _FillValue           = [36m-9999.0[39m

[32m  time[39m   (360)
    Datatype:    [0m[1mDates.DateTime[22m (Float64)
    Dimensions:  time
    Attributes:
     standard_name        = [36mtime[39m
     long_name            = [36mtime in seconds since volume start[39m
     calendar             = [36mgregorian[39m
     units                = [36mseconds since 2024-07-05T22:23:57Z[39m
     comment              = [36mtimes are relative to the volume start_time[39m

[32m  range[39m   (721)
    Datatype:    [0m[1mFloat

Row,met_probs,task_paths,class_weights,n_trees,max_depth,precision,recall,f1,true_positives,false_positives,true_negatives,false_negatives
Unnamed: 0_level_1,Array…,Array…,String,Int64,Int64,Float32,Float32,Float32,Int64,Int64,Int64,Int64
1,"Tuple{Float32, Float32}[(0.1, 0.9), (0.1, 0.9)]","[""./tasks.txt"", ""./tasks.txt""]",balanced,21,14,0.648193,0.813151,0.721362,367307,199356,366086,84401


In [76]:
using JLD2 


LoadError: UndefVarError: `lenght` not defined in `Main`
Suggestion: check for spelling errors or missing imports.

In [94]:
###Test composite prediction: rebuild the configuration, point it at the NOAA benchmark
###cfradials and train every pass
config = clean_config()
config.input_path =  "../BENCHMARKING/benchmark_NOAA_cfrads"
train_multi_model(config)
###At this point a 3-pass model has been trained




[32mCALCULATING FEATURES FOR PASS: 1[39m
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222204.187_to_20240705_222208.161_N42RF-TM_AIR.nc in 0.2894599437713623 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222216.825_to_20240705_222220.799_N42RF-TM_AIR.nc in 0.11131691932678223 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222229.464_to_20240705_222233.438_N42RF-TM_AIR.nc in 0.10559296607971191 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222242.102_to_20240705_222246.076_N42RF-TM_AIR.nc in 0.11139893531799316 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222254.752_to_20240705_222258.731_N42RF-TM_AIR.nc in 0.1089320182800293 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222307.390_to_20240705_222311.370_N42RF-TM_AIR.nc in 0.1069939136505127 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222320.034_to_20240705_222324.00

Excessive output truncated after 524291 bytes.


     units                = [36mmeters[39m
     _FillValue           = [36m-9999.0[39m
     meta_group           = [36minstrument_parameters[39m

[32m  antenna_transition[39m   (360)
    Datatype:    [0m[1mUnion{Missing, Int8}[22m (Int8)
    Dimensions:  time
    Attributes:
     long_name            = [36mantenna_is_in_transition_between_sweeps[39m
     units                = 
     _FillValue           = [36m-128[39m
     comment              = [36m1 if antenna is in transition, 0 otherwise[39m

[32m  georefs_applied[39m   (360)
    Datatype:    [0m[1mUnion{Missing, Int8}[22m (Int8)
    Dimensions:  time
    Attributes:
     long_name            = [36mgeorefs_have_been_applied_to_ray[39m
     units                = 
     _FillValue           = [36m-128[39m
     comment              = [36m1 if georefs have been applied, 0 otherwise[39m

[32m  n_samples[39m   (360)
    Datatype:    [0m[1mUnion{Missing, Int32}[22m (Int32)
    Dimensions:  time
    Attrib

In [95]:
###Per-pass class probabilities and the matching targets, filled in by the loop below
predictions = []
targets = []
###Test this by interactively going through the models, starting with the first one:
###for each pass, load the trained model and the feature file written for that pass
for (i, model) in enumerate(config.model_output_paths)
    currm = load_object(model)

    ###The do-block form closes the HDF5 file even when one of the reads throws,
    ###so a failed read no longer leaks the open handle
    currfeatures, currtargets = h5open(config.feature_output_paths[i], "r") do currh5
        (currh5["X"][:, :], currh5["Y"][:, :][:])
    end

    push!(predictions, DecisionTree.predict_proba(currm, currfeatures))
    push!(targets, currtargets)
end

In [108]:
###Inspect the targets read from the first pass's feature file
targets[1]

1017150-element Vector{Int64}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 1
 1
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [112]:
###Scratch expression combining a `> .9` mask with the raw second-column values of pass 1.
###NOTE(review): `.|` is applied here to a Bool mask and Float64 values, which does not
###look like a defined operation -- the right-hand side was presumably meant to be a
###comparison as well. TODO confirm the intent.
(predictions[1][:,2] .> .9) .| (predictions[1][:,2])

1017150-element Vector{Float64}:
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 0.047619047619047616
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.047619047619047616
 1.0
 0.047619047619047616
 0.6190476190476191
 0.047619047619047616
 0.0
 1.0
 1.0
 0.5238095238095238
 1.0
 1.0
 1.0

In [114]:
###Inspect the targets of the first pass
targets[1]

1017150-element Vector{Int64}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 1
 1
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [113]:
###Inspect the targets of the second pass
targets[2]

95378-element Vector{Int64}:
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 0
 1
 1
 1
 1
 1
 1
 1
 1
 1
 0
 0

In [115]:
###Number of gates seen by the first pass (rows of its probability matrix)
n_gates = size(predictions[1], 1)

###-1 marks a gate that has not received a prediction yet
total_predictions = fill(-1, n_gates)
total_targets = fill(-1, n_gates)
idxer = fill(true, n_gates)
###true for every gate that no pass has confidently classified so far
still_to_predict = fill(true, n_gates)

###Iteratively construct the predictions vector.
###NOTE: this is technically not testing on a new set of features, but rather on ones
###that have already been calculated.
for (i, prediction_vec) in enumerate(predictions)

    ###Second column of the current pass's probabilities
    ###(presumably the meteorological-class probability -- TODO confirm)
    cp_mps = prediction_vec[:, 2]
    cp_metprobs = config.met_probs[i]
    ###Gates of the current pass that fall outside the (low, high) band and are therefore settled
    curr_idx = (cp_mps .< cp_metprobs[1]) .| (cp_mps .> cp_metprobs[2])
    println(sum(curr_idx))
    ###Threshold against the current pass's upper bound. This line used to reference
    ###`fp_metprobs`, a name that is defined nowhere in this file.
    curr_predictions = cp_mps .> cp_metprobs[2]

    ###Every gate that is still open receives the current pass's prediction; gates that
    ###stay open afterwards are simply overwritten by the next pass
    total_predictions[still_to_predict] .= curr_predictions
    ###Only the gates the current pass could not settle remain open
    still_to_predict[still_to_predict] .= .! curr_idx
end

###DON'T NEED TO ACTUALLY ITERATIVE CONSTRUCT THE FEATURE VALUES? 


921772
46749
6684


In [116]:
###Score the composite predictions against the targets of the first pass.
###The Bool conversion only succeeds when every entry is 0 or 1, i.e. when no gate was
###left at the -1 placeholder.
evaluate_model(Vector{Bool}(total_predictions), Vector{Bool}(targets[1]))

(0.64814544f0, 0.81856865f0, 0.72345597f0, 369754, 200726, 364716, 81954, 1017150)

In [110]:
###Run the evaluation on the full configuration; the metrics row in the captured output
###can be compared with the tuple returned by the manual composite evaluation
evaluate_model(config)


[32mCALCULATING FEATURES FOR PASS: 1[39m
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222204.187_to_20240705_222208.161_N42RF-TM_AIR.nc in 0.6051180362701416 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222216.825_to_20240705_222220.799_N42RF-TM_AIR.nc in 0.11199712753295898 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222229.464_to_20240705_222233.438_N42RF-TM_AIR.nc in 0.10510802268981934 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222242.102_to_20240705_222246.076_N42RF-TM_AIR.nc in 0.1123502254486084 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222254.752_to_20240705_222258.731_N42RF-TM_AIR.nc in 0.10483479499816895 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222307.390_to_20240705_222311.370_N42RF-TM_AIR.nc in 0.10565018653869629 seconds
Processed ../BENCHMARKING/benchmark_NOAA_cfrads/cfrad.20240705_222320.034_to_20240705_222324.0

Excessive output truncated after 524289 bytes.


    Dimensions:  time
    Attributes:
     long_name            = [36mstart_range_for_ray[39m
     units                = [36mmeters[39m
     _FillValue           = [36m-9999.0[39m

[32m  ray_gate_spacing[39m   (360)
    Datatype:    [0m[1mUnion{Missing, Float32}[22m (Float32)
    Dimensions:  time
    Attributes:
     long_name            = [36mgate_spacing_for_ray[39m
     units                = [36mmeters[39m
     _FillValue           = [36m-9999.0[39m

[32m  azimuth[39m   (360)
    Datatype:    [0m[1mUnion{Missing, Float32}[22m (Float32)
    Dimensions:  time
    Attributes:
     long_name            = [36mray_azimuth_angle[39m
     units                = [36mdegrees[39m
     _FillValue           = [36m-9999.0[39m

[32m  elevation[39m   (360)
    Datatype:    [0m[1mUnion{Missing, Float32}[22m (Float32)
    Dimensions:  time
    Attributes:
     long_name            = [36mray_elevation_angle[39m
     units                = [36mdegrees[39m
     _

Row,met_probs,task_paths,class_weights,n_trees,max_depth,precision,recall,f1,true_positives,false_positives,true_negatives,false_negatives
Unnamed: 0_level_1,Array…,Array…,String,Int64,Int64,Float32,Float32,Float32,Int64,Int64,Int64,Int64
1,"Tuple{Float32, Float32}[(0.1, 0.9), (0.1, 0.9), (0.1, 0.9)]","[""./tasks.txt"", ""./tasks.txt"", ""./tasks.txt""]",balanced,21,14,0.648145,0.818569,0.723456,369754,200726,364716,81954


In [85]:
###Inspect the composite prediction vector (-1 marks gates without a prediction)
total_predictions

1017150-element Vector{Int64}:
  1
  1
  1
  1
  1
  1
  1
  0
  0
  0
  0
  0
  0
  ⋮
  0
  1
  0
 -1
  0
  0
  1
  1
 -1
  1
  1
  1

In [71]:
###Inspect the targets of the second pass
targets[2]

95378-element Vector{Int64}:
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 0
 1
 1
 1
 1
 1
 1
 1
 1
 1
 0
 0

In [75]:
###Inspect the first probability column of the second pass
predictions[2][:, 1]

95378-element Vector{Float64}:
 0.14285714285714285
 0.0
 0.38095238095238093
 0.42857142857142855
 0.47619047619047616
 0.8095238095238095
 0.38095238095238093
 0.19047619047619047
 0.6190476190476191
 0.0
 0.0
 0.0
 0.6666666666666666
 ⋮
 0.9523809523809523
 0.0
 0.0
 0.19047619047619047
 0.047619047619047616
 0.09523809523809523
 0.047619047619047616
 0.09523809523809523
 0.3333333333333333
 0.23809523809523808
 0.8571428571428571
 1.0

In [69]:
###Keep the first probability column of the second pass for the checks that use `mps`
mps = predictions[2][:,1]


95378-element Vector{Float64}:
 0.14285714285714285
 0.0
 0.38095238095238093
 0.42857142857142855
 0.47619047619047616
 0.8095238095238095
 0.38095238095238093
 0.19047619047619047
 0.6190476190476191
 0.0
 0.0
 0.0
 0.6666666666666666
 ⋮
 0.9523809523809523
 0.0
 0.0
 0.19047619047619047
 0.047619047619047616
 0.09523809523809523
 0.047619047619047616
 0.09523809523809523
 0.3333333333333333
 0.23809523809523808
 0.8571428571428571
 1.0

In [68]:
###Count the entries of mps lying inside the closed [low, high] band of the first pass
count(p -> config.met_probs[1][1] <= p <= config.met_probs[1][2], mps)

95378

In [67]:
###Upper probability threshold of the first pass
config.met_probs[1][2]

0.9f0

In [48]:
###NOTE: `currm` is only ever assigned inside the model-loading `for` loop, so it is
###local to that loop and not visible at top level (hence the UndefVarError recorded below)
currm

LoadError: UndefVarError: `currm` not defined in `Main`
Suggestion: check for spelling errors or missing imports.

In [7]:
###Assemble the two-pass model config object from the global settings
config = ModelConfig(
    num_models = num_models,
    model_output_paths = model_output_paths,
    met_probs = met_probs,
    feature_output_paths = feature_output_paths,
    input_path = input_path,
    task_mode = "nan",
    file_preprocessed = [false, false],
    task_paths = task_paths,
    QC_var = QC_var,
    remove_var = remove_var,
    QC_mask = false,
    mask_names = mask_names,
    VARS_TO_QC = ["VEL"],
    class_weights = class_weights,
    HAS_INTERACTIVE_QC = true,
    task_weights = task_weights,
    REMOVE_HIGH_PGG = false,
    REMOVE_LOW_NCP = false,
)

ModelConfig(2, ["raw_model_0.jld2", "raw_model_1.jld2"], Tuple{Float32, Float32}[(0.1, 0.9), (0.1, 0.9)], ["output_features_0.h5", "output_features_1.h5"], "/Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc", "nan", Bool[0, 0], ["./tasks.txt", "./tasks.txt"], [""], Vector[Matrix{Union{Missing, Float32}}[[1.0 1.0 1.0; 1.0 1.0 1.0; 1.0 1.0 1.0], [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]], Matrix{Union{Missing, Float32}}[[1.0 1.0 1.0; 1.0 1.0 1.0; 1.0 1.0 1.0], [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]]], true, false, false, true, "VG", "VEL", -32000.0f0, false, true, false, ["PASS_1_MASK", "PASS_2_MASK"], ["VEL"], "_QC", "balanced", 21, 14, false, 0.2f0, 1.0f0)

In [8]:
###Tests to ensure that removing high-PGG / low-NCP gates functions as expected.
###Each check runs the training pipeline and then compares the number of rows in the
###first pass's feature file with a gate count derived from the toy dataset.


config.REMOVE_HIGH_PGG = false
config.REMOVE_LOW_NCP = false

###Expected row counts: gates above the NCP threshold, and all gates
valid_NCP_gates = sum(sample_NCP .> config.NCP_THRESHOLD)
total_gates = length(sample_DBZ)

config.REMOVE_LOW_NCP = true

###Only the feature file is inspected afterwards, so an exception raised by the
###pipeline is tolerated here -- but it is now reported instead of silently discarded
try
    train_multi_model(config)
catch err
    @warn "train_multi_model raised with REMOVE_LOW_NCP = true" exception = (err, catch_backtrace())
end

NCDataset(config.feature_output_paths[1]) do f
    @assert size(f["X"][:,:])[1] == valid_NCP_gates
end

config.REMOVE_LOW_NCP = false
try
    train_multi_model(config)
catch err
    @warn "train_multi_model raised with REMOVE_LOW_NCP = false" exception = (err, catch_backtrace())
end

NCDataset(config.feature_output_paths[1]) do f
    @assert size(f["X"][:,:])[1] == total_gates
end

###TODO: the matching REMOVE_HIGH_PGG check is currently disabled
# valid_PGG_gates = sum(sample_PGG .< config.PGG_THRESHOLD)

# config.REMOVE_HIGH_PGG = true

# try
#     train_multi_model(config)
# catch

# end

# NCDataset(config.feature_output_paths[1]) do f
#     print(size(f["X"][:,:]))
#     @assert size(f["X"][:,:])[1] == valid_PGG_gates
# end






[32mCALCULATING FEATURES FOR PASS: 1[39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 1.1991300582885742 seconds
COMPLETED PROCESSING 1 FILES IN 1.35 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: output_features_0.h5

WRITING DATA TO FILE OF SHAPE (20, 2)
X TYPE: Matrix{Float32}
[32mFINISHED CALCULATING FEATURES FOR PASS 1 in 2.257 seconds...[39m

[32mTRAINING MODEL FOR PASS: 1[39m

[32m...TRAINING FOR PASS: 1 ON 20 GATES...[39m

[34mOpening HDF5.File: (read-only) output_features_0.h5...[39m
FITTING MODEL
COMPLETED FITTING MODEL IN 0.7590851783752441 seconds

MODEL VERIFICATION:
ACCURACY ON TRAINING SET: 90.0%

[32mSAVING MODEL TO: raw_model_0.jld2 [39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0016109943389892578 seconds
COMPLETED PROCESSING 1 FILES IN 0.0 SECONDS
RESULTANT GATES: 10[31mDataset: /Users/ischluesche/.julia/scrat

In [7]:
###Cannot change fill value 
# QC_scan(config)

# currs = NCDataset(ds_path)
# #currs["VEL_QC"].attrib["_FillValue"] == config.FILL_VAL
# #currs["PASS_2_MASK"].attrib["_FillValue"] == config.FILL_VAL 
# close(currs)

# config.FILL_VAL = 123.456 
# QC_scan(config)

# currs = NCDataset(ds_path)
# @assert currs["VEL_QC"].attrib["_FillValue"] == config.FILL_VAL
# #@assert currs["PASS_2_MASK"].attrib["_FillValue"] == config.FILL_VAL 
# close(currs)

In [10]:
###Check that QC_var reaches calculate_features: the returned target array Y has to
###match the non-missing pattern of whichever variable QC_var names
VG_map = map(v -> !ismissing(v), sample_VG)
DBZ_map = map(v -> !ismissing(v), sample_DBZ)
###The two patterns must differ, otherwise the check below could not tell them apart
@assert DBZ_map != VG_map

config.QC_var = "VG"
X, Y = calculate_features(
    config.input_path,
    config.task_paths[1],
    config.feature_output_paths[1],
    config.HAS_INTERACTIVE_QC;
    verbose = config.verbose,
    REMOVE_LOW_NCP = config.REMOVE_LOW_NCP,
    NCP_THRESHOLD = config.NCP_THRESHOLD,
    REMOVE_HIGH_PGG = config.REMOVE_HIGH_PGG,
    PGG_THRESHOLD = config.PGG_THRESHOLD,
    QC_variable = config.QC_var,
    remove_variable = config.remove_var,
    replace_missing = config.replace_missing,
    write_out = config.write_out,
)

@assert reshape(Y, range_dim, time_dim) == VG_map

config.QC_var = "DBZ"
X, Y = calculate_features(
    config.input_path,
    config.task_paths[1],
    config.feature_output_paths[1],
    config.HAS_INTERACTIVE_QC;
    verbose = config.verbose,
    REMOVE_LOW_NCP = config.REMOVE_LOW_NCP,
    NCP_THRESHOLD = config.NCP_THRESHOLD,
    REMOVE_HIGH_PGG = config.REMOVE_HIGH_PGG,
    PGG_THRESHOLD = config.PGG_THRESHOLD,
    QC_variable = config.QC_var,
    remove_variable = config.remove_var,
    replace_missing = config.replace_missing,
    write_out = config.write_out,
)

@assert reshape(Y, range_dim, time_dim) == DBZ_map

Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.01797008514404297 seconds
COMPLETED PROCESSING 1 FILES IN 0.03 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: output_features_0.h5

WRITING DATA TO FILE OF SHAPE (25, 2)
X TYPE: Matrix{Float32}
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0039370059967041016 seconds
COMPLETED PROCESSING 1 FILES IN 0.01 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: output_features_0.h5

WRITING DATA TO FILE OF SHAPE (25, 2)
X TYPE: Matrix{Float32}


In [11]:
### Check that remove_var reaches calculate_features: the number of feature rows has to
### follow the number of non-missing gates in whichever variable remove_var names
config.remove_var = "VG"
X, Y = calculate_features(
    config.input_path,
    config.task_paths[1],
    config.feature_output_paths[1],
    config.HAS_INTERACTIVE_QC;
    verbose = config.verbose,
    REMOVE_LOW_NCP = config.REMOVE_LOW_NCP,
    NCP_THRESHOLD = config.NCP_THRESHOLD,
    REMOVE_HIGH_PGG = config.REMOVE_HIGH_PGG,
    PGG_THRESHOLD = config.PGG_THRESHOLD,
    QC_variable = config.QC_var,
    remove_variable = config.remove_var,
    replace_missing = config.replace_missing,
    write_out = config.write_out,
)
@assert size(X, 1) == sum(VG_map)

config.remove_var = "DBZ"
X, Y = calculate_features(
    config.input_path,
    config.task_paths[1],
    config.feature_output_paths[1],
    config.HAS_INTERACTIVE_QC;
    verbose = config.verbose,
    REMOVE_LOW_NCP = config.REMOVE_LOW_NCP,
    NCP_THRESHOLD = config.NCP_THRESHOLD,
    REMOVE_HIGH_PGG = config.REMOVE_HIGH_PGG,
    PGG_THRESHOLD = config.PGG_THRESHOLD,
    QC_variable = config.QC_var,
    remove_variable = config.remove_var,
    replace_missing = config.replace_missing,
    write_out = config.write_out,
)

@assert size(X, 1) == sum(DBZ_map)

Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.29209303855895996 seconds
COMPLETED PROCESSING 1 FILES IN 0.3 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: output_features_0.h5

WRITING DATA TO FILE OF SHAPE (16, 2)
X TYPE: Matrix{Float32}
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0012340545654296875 seconds
COMPLETED PROCESSING 1 FILES IN 0.0 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: output_features_0.h5

WRITING DATA TO FILE OF SHAPE (25, 2)
X TYPE: Matrix{Float32}


In [10]:
"""
    train_multi_model_bench(config::ModelConfig)

Train one model per entry of `config.model_output_paths` (a multi-pass pipeline).

For each pass `i` the function
1. obtains the features `X` and labels `Y`, either by reading them from
   `config.feature_output_paths[i]` (when `config.file_preprocessed[i]` is set) or by
   calling `calculate_features`;
2. builds per-gate class weights when `config.class_weights` is `"balanced"`;
3. trains and saves the model through `Ronin.train_model`;
4. unless this is the last pass, predicts probabilities with the freshly trained model
   for every input file and writes a mask field named `config.mask_names[i+1]` marking
   the gates whose probability lies in `(min, max]` of `config.met_probs[i]`.

Throws an `AssertionError` when the per-pass configuration vectors differ in length and
a `DomainError` when the trained model only knows a single class.
"""
function train_multi_model_bench(config::ModelConfig)
    ##Quick input sanitation check: every per-pass vector needs one entry per model
    @assert (length(config.model_output_paths) == length(config.feature_output_paths)
             == length(config.met_probs) == length(config.task_paths) == length(config.task_weights))

    full_start_time = time()
    ###Iteratively train models and apply QC_scan with the specified probabilities to train a multi-pass model
    ###pipeline
    for (i, model_path) in enumerate(config.model_output_paths)

        out = config.feature_output_paths[i]
        currt = config.task_paths[i]
        cw = config.task_weights[i]

        ##If execution proceeds past the first iteration, a composite model is being created, and
        ##so a further mask will be applied to the features
        QC_mask = i > 1 ? true : config.QC_mask
        mask_name = QC_mask ? config.mask_names[i] : ""
        println("MASK NAME: $(mask_name)")
        starttime = time()

        if config.file_preprocessed[i]

            print("Reading input features from file $(out)...\n")
            ###Return the arrays from the do-block instead of assigning to the enclosing
            ###locals from inside the closure
            X, Y = h5open(out) do f
                (f["X"][:,:], f["Y"][:,:])
            end

        else
            printstyled("\nCALCULATING FEATURES FOR PASS: $(i)\n", color=:green)

            ###Check to see if the features file already exists, if so, delete it so
            ###that it may be overwritten
            if config.write_out && config.overwrite_output && isfile(out)
                rm(out)
            end

            X, Y = calculate_features(config.input_path, currt, out, config.HAS_INTERACTIVE_QC;
                                verbose = config.verbose, REMOVE_LOW_NCP = config.REMOVE_LOW_NCP, NCP_THRESHOLD = config.NCP_THRESHOLD,
                                REMOVE_HIGH_PGG = config.REMOVE_HIGH_PGG, PGG_THRESHOLD = config.PGG_THRESHOLD, QC_variable = config.QC_var,
                                remove_variable = config.remove_var, replace_missing = config.replace_missing,
                                write_out = config.write_out, QC_mask = QC_mask, mask_name = mask_name, weight_matrixes = cw)
            printstyled("FINISHED CALCULATING FEATURES FOR PASS $(i) in $(round(time() - starttime, digits = 3)) seconds...\n", color=:green)
        end

        printstyled("\nTRAINING MODEL FOR PASS: $(i)\n", color=:green)

        ###Default weights used when no class weighting is requested
        class_weights = Vector{Float32}([0.0,1.0])
        ##Train model based on these features
        if config.class_weights != ""

            if lowercase(config.class_weights) != "balanced"
                printstyled("ERROR: UNKNOWN CLASS WEIGHT $(config.class_weights)... \nContinuing with no weighting\n", color=:yellow)
            else
                ###Flatten the labels once rather than re-copying them for every class
                labels = vec(Y)
                class_weights = zeros(Float32, length(labels))
                weight_dict = compute_balanced_class_weights(labels)
                for class in keys(weight_dict)
                    class_weights[labels .== class] .= weight_dict[class]
                end
            end
        end

        printstyled("\n...TRAINING FOR PASS: $(i) ON $(size(X, 1)) GATES...\n", color=:green)

        Ronin.train_model(out, model_path, n_trees = config.n_trees, max_depth = config.max_depth, class_weights = class_weights)

        ###If this was the last pass, we don't need to write out a mask, and we're done!
        ###Otherwise, we need to mask out the features we want to apply the model to on the next pass
        if i < config.num_models

            curr_model = load_object(model_path)
            curr_metprobs = config.met_probs[i]

            file_path = config.input_path
            paths = isdir(file_path) ? parse_directory(file_path) : [file_path]

            for path in paths

                dims = Dataset(path) do f
                    (f.dim["range"], f.dim["time"])
                end

                ###NEED to update this if it's beyond two pass so we can pass it the correct mask
                X_pass, _, idxer = calculate_features(path, currt, out, true;
                                    verbose = config.verbose, REMOVE_LOW_NCP = config.REMOVE_LOW_NCP, NCP_THRESHOLD = config.NCP_THRESHOLD,
                                    REMOVE_HIGH_PGG = config.REMOVE_HIGH_PGG, PGG_THRESHOLD = config.PGG_THRESHOLD, QC_variable = config.QC_var,
                                    remove_variable = config.remove_var, replace_missing = config.replace_missing, return_idxer = true,
                                    write_out = false, QC_mask = QC_mask, mask_name = mask_name, weight_matrixes = cw)

                met_probs = DecisionTree.predict_proba(curr_model, X_pass)
                if size(met_probs, 2) < 2
                    throw(DomainError(1, "ERROR: ONLY ONE CLASS IN INPUT DATASET"))
                end
                ###Second column of the prediction matrix is used as the probability of interest
                met_probs = met_probs[:, 2]
                valid_idxs = (met_probs .> minimum(curr_metprobs)) .& (met_probs .<= maximum(curr_metprobs))
                ###println (not print) so that subsequent output starts on its own line
                println("RESULTANT GATES: $(sum(valid_idxs))")
                ##Create mask field, fill it, and then write out
                new_mask = Matrix{Union{Missing, Float32}}(missings(dims))[:]

                ##We only care about gates that have met the base QC thresholds, so first index
                ##by indexer returned from calculate_features, and then set the gates between
                ##the specified probability levels to valid in the mask. The next model pass will
                ##thus only be calculated upon these features.
                gate_idxer = idxer[1][:]
                gate_idxer[gate_idxer] .= Vector{Bool}(valid_idxs)
                new_mask[gate_idxer] .= 1.
                new_mask = reshape(new_mask, dims)

                write_field(path, config.mask_names[i+1], new_mask, attribs=Dict("Units" => "Bool", "Description" => "Gates between met prob theresholds"))

            end
        end
    end
    printstyled("\n COMPLETED TRAINING MODEL IN $(round(time() - full_start_time, digits = 3)) seconds...\n", color=:green)
end 


train_multi_model_bench (generic function with 1 method)

In [11]:
config.QC_mask = true

true

In [12]:
###Ensure that we have the full number of gates 
@assert sum(DBZ_map) == (range_dim * time_dim)
###Then try to mask something out... first we need to write it to file though 
###Can just use the QC'ed stuff 

config.QC_mask = true 
config.mask_names = ["VG", "OK"]

try 
    train_multi_model(config)
catch err
    ###NOTE: `catch DomainError` would NOT filter on the exception type; it binds whatever
    ###was thrown to a local variable called DomainError. Check the type explicitly and
    ###let anything unexpected propagate.
    err isa DomainError || rethrow()
    println("OK")
    NCDataset(config.feature_output_paths[1]) do f1 
        @assert size(f1["X"][:,:])[1] == sum(.! map(ismissing, sample_VG)) 
    end 
    ###We should get a DomainError because we are removing the non-met 
    ###gates in the first pass 
else 
    @assert false
end 

###Now let's try it without the mask 
config.QC_mask = false 

try 
    train_multi_model(config)
catch err
    ###Only a DomainError is tolerated here: possible we're just getting 100% accuracy 
    err isa DomainError || rethrow()
    println("DOMAIN ERROR") 
    NCDataset(config.feature_output_paths[1]) do f1 
        println(size(f1["X"][:,:])[1] )
        ###Check to ensure that it's the full size 
        @assert size(f1["X"][:,:])[1] == length(sample_DBZ)
    end 
end 
###Check to ensure that it's the full size 





[32mCALCULATING FEATURES FOR PASS: 1[39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0019328594207763672 seconds
COMPLETED PROCESSING 1 FILES IN 0.0 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: output_features_0.h5

WRITING DATA TO FILE OF SHAPE (16, 2)
X TYPE: Matrix{Float32}
[32mFINISHED CALCULATING FEATURES FOR PASS 1 in 0.005 seconds...[39m

[32mTRAINING MODEL FOR PASS: 1[39m

[32m...TRAINING FOR PASS: 1 ON 16 GATES...[39m

[34mOpening HDF5.File: (read-only) output_features_0.h5...[39m
FITTING MODEL
COMPLETED FITTING MODEL IN 0.00040221214294433594 seconds

MODEL VERIFICATION:
ACCURACY ON TRAINING SET: 100.0%

[32mSAVING MODEL TO: raw_model_0.jld2 [39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0007100105285644531 seconds
COMPLETED PROCESSING 1 FILES IN 0.0 SECONDS
OK

[32mCALCULATING FEATURES FOR PASS: 1[39m
Processed 

In [13]:
###With too few mask names, both training and prediction are expected to fail their
###input assertions. `catch AssertionError` would swallow ANY exception (it only names
###the caught value), so the type is checked explicitly and other errors are rethrown.
config.mask_names = ["OK"]
try
    train_multi_model(config)
catch err
    err isa AssertionError || rethrow()
else 
    @assert false 
end 

try 
    composite_prediction(config) 
catch err
    err isa AssertionError || rethrow()
else 
    @assert false 
end 

###Restore a valid configuration for the following cells
config.mask_names = ["OK", "MASK_2"]
config.QC_mask = false 

false

In [8]:
process_single_file(currs, "./tasks.txt"; NCP_THRESHOLD = Float32(.2), PGG_THRESHOLD=Float32(1.))

LoadError: NetCDF error: [31mVariable 'VV' not found in file ./toy_set.nc[39m (NetCDF error code: -49)

In [12]:
####Test tree depth, n trees, etc. 
####Train with explicit hyperparameters, then reload the first saved model and check
####that it carries the same n_trees / max_depth that were set on the config
config.HAS_INTERACTIVE_QC = true
config.QC_var = "VG"
config.n_trees = 40 
config.max_depth = 20 
train_multi_model(config)
###Only the first pass' model is inspected here
classifier = load_object(config.model_output_paths[1])
@assert classifier.n_trees == config.n_trees 
@assert classifier.max_depth == config.max_depth 




[32mCALCULATING FEATURES FOR PASS: 1[39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0063059329986572266 seconds
COMPLETED PROCESSING 1 FILES IN 0.01 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: output_features_0.h5

WRITING DATA TO FILE OF SHAPE (25, 2)
X TYPE: Matrix{Float32}
[32mFINISHED CALCULATING FEATURES FOR PASS 1 in 0.038 seconds...[39m

[32mTRAINING MODEL FOR PASS: 1[39m

[32m...TRAINING FOR PASS: 1 ON 25 GATES...[39m

[34mOpening HDF5.File: (read-only) output_features_0.h5...[39m
FITTING MODEL
COMPLETED FITTING MODEL IN 0.0006389617919921875 seconds

MODEL VERIFICATION:
ACCURACY ON TRAINING SET: 96.0%

[32mSAVING MODEL TO: raw_model_0.jld2 [39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0022912025451660156 seconds
COMPLETED PROCESSING 1 FILES IN 0.01 SECONDS
RESULTANT GATES: 17[31mDataset: /Users/ischluesche/.juli

In [15]:
###Start from a state where neither threshold-based removal is active
config.REMOVE_HIGH_PGG = false  
config.REMOVE_LOW_NCP = false  

###Number of gates above the NCP threshold, and total number of gates in the toy scan
###NOTE(review): neither value is asserted against anywhere in this cell -- presumably
###the feature row count was meant to be compared with valid_NCP_gates; confirm and add
valid_NCP_gates = sum(sample_NCP .> config.NCP_THRESHOLD)
total_gates = length(sample_DBZ)

###Re-enable NCP-based removal before training
config.REMOVE_LOW_NCP = true 

train_multi_model(config)


[32mCALCULATING FEATURES FOR PASS: 1[39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0016720294952392578 seconds
COMPLETED PROCESSING 1 FILES IN 0.0 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: output_features_0.h5

WRITING DATA TO FILE OF SHAPE (20, 2)
X TYPE: Matrix{Float32}
[32mFINISHED CALCULATING FEATURES FOR PASS 1 in 0.007 seconds...[39m

[32mTRAINING MODEL FOR PASS: 1[39m

[32m...TRAINING FOR PASS: 1 ON 20 GATES...[39m

[34mOpening HDF5.File: (read-only) output_features_0.h5...[39m
FITTING MODEL
COMPLETED FITTING MODEL IN 0.00024819374084472656 seconds

MODEL VERIFICATION:
ACCURACY ON TRAINING SET: 95.0%

[32mSAVING MODEL TO: raw_model_0.jld2 [39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0009279251098632812 seconds
COMPLETED PROCESSING 1 FILES IN 0.0 SECONDS
RESULTANT GATES: 16[31mDataset: /Users/ischluesche/.julia

In [106]:
sample_DBZ

5×5 Matrix{Float32}:
 32.0  65.0  54.0  18.0  52.0
 44.0  22.0  12.0  35.0  63.0
 39.0  12.0  62.0   6.0  30.0
 10.0  24.0  55.0  59.0  49.0
 44.0  31.0  23.0  17.0  54.0

In [13]:
###Create model config object
###Ensure that the file_preprocessed flag works correctly by not 
###modifying the existing features file if it has already been processed 
config = ModelConfig(num_models = num_models,model_output_paths =  model_output_paths,met_probs =  met_probs, 
                    feature_output_paths = feature_output_paths, input_path = input_path,task_mode="nan",file_preprocessed = [false, false],
                     task_paths = task_paths, QC_var = QC_var, remove_var = remove_var, QC_mask = false, mask_names = mask_names,
                     VARS_TO_QC = ["VEL"], class_weights = class_weights, HAS_INTERACTIVE_QC=true, task_weights = task_weights,
                     REMOVE_HIGH_PGG=false, REMOVE_LOW_NCP=false)
sleep(1) 

features_file = config.feature_output_paths[1]

###First run: nothing is marked as preprocessed, so the features file must be (re)written.
###Compare against the time the run STARTED rather than asserting that the whole training
###run finished within one second of the write, which depends on machine speed.
config.file_preprocessed = [false, false] 
run_start = Base.time()
train_multi_model(config)
fresh_mtime = mtime(features_file)
###One second of slack for filesystems with coarse modification timestamps
@assert fresh_mtime >= run_start - 1

###Second run: features are marked as preprocessed, so the file must be left untouched.
###The pause guarantees a rewrite would produce a visibly different timestamp.
sleep(2) 
config.file_preprocessed = [true, true] 
train_multi_model(config) 
@assert mtime(features_file) == fresh_mtime



[32mCALCULATING FEATURES FOR PASS: 1[39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0031919479370117188 seconds
COMPLETED PROCESSING 1 FILES IN 0.01 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: output_features_0.h5

WRITING DATA TO FILE OF SHAPE (25, 2)
X TYPE: Matrix{Float32}
[32mFINISHED CALCULATING FEATURES FOR PASS 1 in 0.014 seconds...[39m

[32mTRAINING MODEL FOR PASS: 1[39m

[32m...TRAINING FOR PASS: 1 ON 25 GATES...[39m

[34mOpening HDF5.File: (read-only) output_features_0.h5...[39m
FITTING MODEL
COMPLETED FITTING MODEL IN 0.0005218982696533203 seconds

MODEL VERIFICATION:
ACCURACY ON TRAINING SET: 88.0%

[32mSAVING MODEL TO: raw_model_0.jld2 [39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0008840560913085938 seconds
COMPLETED PROCESSING 1 FILES IN 0.0 SECONDS
RESULTANT GATES: 16[31mDataset: /Users/ischluesche/.julia

In [27]:
###Start from a clean slate: drop any features file left over from earlier cells
if isfile(config.feature_output_paths[1])
    rm(config.feature_output_paths[1])
end

###Baseline: with HAS_INTERACTIVE_QC = true training is run without any error handling
config = ModelConfig(
    num_models = num_models, model_output_paths = model_output_paths, met_probs = met_probs,
    feature_output_paths = feature_output_paths, input_path = input_path, task_mode = "nan",
    file_preprocessed = [false, false], task_paths = task_paths, QC_var = QC_var,
    remove_var = remove_var, QC_mask = false, mask_names = mask_names,
    VARS_TO_QC = ["VEL"], class_weights = class_weights, HAS_INTERACTIVE_QC = true,
    task_weights = task_weights, REMOVE_HIGH_PGG = false, REMOVE_LOW_NCP = false,
)

train_multi_model(config)


###Same configuration but with HAS_INTERACTIVE_QC = false: building the config and/or
###training is expected to throw. Any exception counts as a pass here.
try
    config = ModelConfig(
        num_models = num_models, model_output_paths = model_output_paths, met_probs = met_probs,
        feature_output_paths = feature_output_paths, input_path = input_path, task_mode = "nan",
        file_preprocessed = [false, false], task_paths = task_paths, QC_var = QC_var,
        remove_var = remove_var, QC_mask = false, mask_names = mask_names,
        VARS_TO_QC = ["VEL"], class_weights = class_weights, HAS_INTERACTIVE_QC = false,
        task_weights = task_weights, REMOVE_HIGH_PGG = false, REMOVE_LOW_NCP = false,
    )

    train_multi_model(config)
catch
    println("GOOD!")
else
    @assert false
end




[32mCALCULATING FEATURES FOR PASS: 1[39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.001068115234375 seconds
COMPLETED PROCESSING 1 FILES IN 0.0 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: output_features_0.h5

WRITING DATA TO FILE OF SHAPE (25, 2)
X TYPE: Matrix{Float32}
[32mFINISHED CALCULATING FEATURES FOR PASS 1 in 0.013 seconds...[39m

[32mTRAINING MODEL FOR PASS: 1[39m

[32m...TRAINING FOR PASS: 1 ON 25 GATES...[39m

[34mOpening HDF5.File: (read-only) output_features_0.h5...[39m
FITTING MODEL
COMPLETED FITTING MODEL IN 0.00019407272338867188 seconds

MODEL VERIFICATION:
ACCURACY ON TRAINING SET: 88.0%

[32mSAVING MODEL TO: raw_model_0.jld2 [39m
Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0008571147918701172 seconds
COMPLETED PROCESSING 1 FILES IN 0.0 SECONDS
RESULTANT GATES: 16[31mDataset: /Users/ischluesche/.julia/scr

In [None]:
###Test to ensure that calculate_features returns only a placeholder Y (no labels) when 
###HAS_INTERACTIVE_QC is set to false 

In [79]:
###Reproduce the set-up of the first training pass by hand
i = 1
out = config.feature_output_paths[i]
currt = config.task_paths[i]
cw = config.task_weights[i]

###No output file, no interactive QC labels, but NCP-based gate removal switched on
config.write_out = false
config.HAS_INTERACTIVE_QC = false
config.REMOVE_LOW_NCP = true

##Past the first pass a composite model is being built, so a mask is always applied;
##on the first pass the configured setting decides
QC_mask = i > 1 ? true : config.QC_mask
mask_name = QC_mask ? config.mask_names[i] : ""

X, Y, idxer = calculate_features(
    config.input_path, currt, out, config.HAS_INTERACTIVE_QC;
    verbose = config.verbose,
    REMOVE_LOW_NCP = config.REMOVE_LOW_NCP,
    NCP_THRESHOLD = config.NCP_THRESHOLD,
    REMOVE_HIGH_PGG = config.REMOVE_HIGH_PGG,
    PGG_THRESHOLD = config.PGG_THRESHOLD,
    QC_variable = config.QC_var,
    remove_variable = config.remove_var,
    replace_missing = config.replace_missing,
    write_out = config.write_out,
    QC_mask = QC_mask,
    mask_name = mask_name,
    weight_matrixes = cw,
    return_idxer = true,
)

###Without interactive QC the labels must be the 1x1 zero placeholder
@assert Y == [0;;]
###The returned indexer must flag exactly the gates above the NCP threshold
@assert sum(idxer[1][:]) == sum(sample_NCP .> config.NCP_THRESHOLD)

Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.0023729801177978516 seconds
COMPLETED PROCESSING 1 FILES IN 0.01 SECONDS


true

20

In [50]:
sum(idxer)

5×5 BitMatrix:
 1  1  1  1  1
 1  1  1  1  1
 1  1  1  1  1
 1  1  1  1  1
 1  1  1  1  1

In [40]:
[0 ;]

1-element Vector{Int64}:
 0

In [43]:
Y

1×1 Matrix{Int64}:
 0

In [23]:
isfile(config.feature_output_paths[1]) ? rm(config.feature_output_paths[1]) : ""


In [64]:
QC_mask = Matrix{Union{Missing, Float32}}(fill(1.,(5,5)))
QC_mask[3,:] .= missing

5-element view(::Matrix{Union{Missing, Float32}}, 3, :) with eltype Union{Missing, Float32}:
 missing
 missing
 missing
 missing
 missing

In [15]:
rets = composite_prediction(config, return_probs=true)

[32mLOADING MODELS....[39m
(25, 1)[31mDataset: ./toy_set.nc[39m
Group: /

[31mDimensions[39m
   range = 5
   time = 5

[31mVariables[39m
[32m  time[39m   (5)
    Datatype:    [0m[1mFloat32[22m (Float32)
    Dimensions:  time
    Attributes:
     units                = [36ms[39m

[32m  range[39m   (5)
    Datatype:    [0m[1mFloat32[22m (Float32)
    Dimensions:  range
    Attributes:
     units                = [36mm[39m

[32m  NCP[39m   (5 × 5)
    Datatype:    [0m[1mFloat32[22m (Float32)
    Dimensions:  range × time
    Attributes:
     units                = [36mNCP units[39m

[32m  DBZ[39m   (5 × 5)
    Datatype:    [0m[1mFloat32[22m (Float32)
    Dimensions:  range × time
    Attributes:
     units                = [36mdBz[39m

[32m  VEL[39m   (5 × 5)
    Datatype:    [0m[1mFloat32[22m (Float32)
    Dimensions:  range × time
    Attributes:
     units                = [36mm/s[39m

[32m  VG[39m   (5 × 5)
    Datatype:    [0m[1mUnion{Mi

(Bool[0, 0, 0, 0, 0, 1, 0, 1, 1, 1  …  1, 1, 0, 1, 0, 0, 1, 1, 0, 0], [0; 0; … ; 1; 1;;], [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0  …  1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]], [0.3333333333333333, 0.6190476190476191, 0.14285714285714285, 0.14285714285714285, 0.23809523809523808, 1.0, 0.6190476190476191, 0.9523809523809523, 1.0, 0.9523809523809523  …  1.0, 1.0, 0.6190476190476191, 0.9523809523809523, 0.47619047619047616, 0.8095238095238095, 1.0, 1.0, 0.8095238095238095, 0.7142857142857143])

In [17]:
X, y, idxers, probs = rets

(Bool[0, 0, 0, 0, 0, 1, 0, 1, 1, 1  …  1, 1, 0, 1, 0, 0, 1, 1, 0, 0], [0; 0; … ; 1; 1;;], [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0  …  1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]], [0.3333333333333333, 0.6190476190476191, 0.14285714285714285, 0.14285714285714285, 0.23809523809523808, 1.0, 0.6190476190476191, 0.9523809523809523, 1.0, 0.9523809523809523  …  1.0, 1.0, 0.6190476190476191, 0.9523809523809523, 0.47619047619047616, 0.8095238095238095, 1.0, 1.0, 0.8095238095238095, 0.7142857142857143])

In [19]:
reshape(probs, (5,5))

5×5 Matrix{Float64}:
 0.333333  1.0       0.952381  1.0       0.809524
 0.619048  0.619048  1.0       1.0       1.0
 0.142857  0.952381  0.714286  0.619048  1.0
 0.142857  1.0       1.0       0.952381  0.809524
 0.238095  0.952381  0.904762  0.47619   0.714286

In [118]:
sum(idxers)

25-element Vector{Float64}:
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0

In [112]:
curr_model = load_object("raw_model_0.jld2")
mps = predict_proba(curr_model, X)[:,2]
calced_probs = reshape(mps, (5,5))


pass_2_valid = (calced_probs .>= config.met_probs[1][1]) .& (calced_probs .<= config.met_probs[1][2])
pass_2_real = currs[config.mask_names[2]][:,:]

5×5 Matrix{Union{Missing, Float64}}:
 1.0   missing   missing   missing  1.0
 1.0  1.0        missing   missing   missing
 1.0   missing  1.0       1.0        missing
 1.0   missing   missing   missing  1.0
 1.0  1.0        missing  1.0       1.0

In [113]:
pass_2_real[BitMatrix(map(ismissing, pass_2_real))] .= 0

12-element view(reshape(::Matrix{Union{Missing, Float64}}, 25), [6, 8, 9, 11, 12, 14, 15, 16, 17, 19, 22, 23]) with eltype Union{Missing, Float64}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [116]:
Matrix{Bool}(pass_2_real) == pass_2_valid

true

In [104]:
pass_2_real

5×5 Matrix{Union{Missing, Float64}}:
 1.0   missing   missing   missing  1.0
 1.0  1.0        missing   missing   missing
 1.0   missing  1.0       1.0        missing
 1.0   missing   missing   missing  1.0
 1.0  1.0        missing  1.0       1.0

In [98]:
pass_2_real

5×5 Matrix{Union{Missing, Float64}}:
 1.0   missing   missing   missing  1.0
 1.0  1.0        missing   missing   missing
 1.0   missing  1.0       1.0        missing
 1.0   missing   missing   missing  1.0
 1.0  1.0        missing  1.0       1.0

In [89]:
idx = pass_2_valid .== 0 
pass_2_valid = Matrix{Union{Bool, Missing}}(pass_2_valid)

5×5 Matrix{Union{Missing, Bool}}:
 1  0  0  0  1
 1  1  0  0  0
 1  0  1  1  0
 1  0  0  0  1
 1  1  0  1  1

In [90]:
pass_2_valid[.! pass_2_valid] .= missing

12-element view(reshape(::Matrix{Union{Missing, Bool}}, 25), [6, 8, 9, 11, 12, 14, 15, 16, 17, 19, 22, 23]) with eltype Union{Missing, Bool}:
 missing
 missing
 missing
 missing
 missing
 missing
 missing
 missing
 missing
 missing
 missing
 missing

In [92]:
pass_2_valid == pass_2_real

missing

In [75]:
idx

5×5 BitMatrix:
 0  1  1  1  0
 0  0  1  1  1
 0  1  0  0  1
 0  1  1  1  0
 0  0  1  0  0

In [74]:
pass_2_valid[idx] .== missing

12-element Vector{Missing}:
 missing
 missing
 missing
 missing
 missing
 missing
 missing
 missing
 missing
 missing
 missing
 missing

In [67]:
pass_2_valid .== pass_2_real

5×5 Matrix{Union{Missing, Bool}}:
 1   missing   missing   missing  1
 1  1          missing   missing   missing
 1   missing  1         1          missing
 1   missing   missing   missing  1
 1  1          missing  1         1

In [61]:
currs["./toy_set.nc"]

LoadError: NetCDF error: [31mVariable './toy_set.nc' not found in file ./toy_set.nc[39m (NetCDF error code: -49)

In [57]:
###let's write a test here. 
currs = NCDataset("./toy_set.nc")
QCed_set = currs["VEL_QC"][:,:]

5×5 Matrix{Union{Missing, Float32}}:
 missing  -17.0       -11.0         1.0          missing
 missing     missing    7.0       -19.0       -15.0
 missing  -10.0          missing     missing   -6.0
 missing   -9.0       -14.0         6.0          missing
 missing   17.0       -15.0          missing     missing

In [143]:
sample_DBZ

5×5 Matrix{Union{Missing, Float32}}:
  8.0  28.0  35.0  57.0  65.0
 22.0  57.0  55.0  26.0  22.0
 33.0  19.0  16.0  54.0  13.0
 33.0   2.0  48.0  53.0  43.0
 43.0  29.0  10.0  54.0  63.0

In [249]:
##Set up a toy example to do the analytical calculations 

"""
    slide_window(var, window, func; replace_missing = false)

Apply `func` to the weighted neighbourhood of every element of `var`.

For each position `(r, c)` the neighbourhood of `var` centred on `(r, c)` is cut out
(clipped at the array edges), multiplied element-wise with the matching part of
`window`, and the product is handed to `func`. The per-element results are returned as a
`Matrix{Float32}` with the same size as `var`.

# Arguments
- `var`: input field.
- `window`: weight matrix; its centre is taken as `div.(size(window), 2) .+ 1`, so odd
  dimensions are expected.
- `func`: reduction called on each weighted neighbourhood; must return a real number
  (a `missing` result cannot be stored in the output).

# Keywords
- `replace_missing`: accepted for interface compatibility; currently has no effect.
"""
function slide_window(var::Matrix{Union{Missing, Float32}}, window::Matrix{Union{Missing,Float32}}, func; replace_missing = false)

    nrow, ncol = size(var)
    wrow, wcol = size(window)

    ###Integer division gives how far the window extends either side of its center
    half_r = div(wrow, 2)
    half_c = div(wcol, 2)

    ###Center coordinates of the window
    center_r = half_r + 1
    center_c = half_c + 1

    res = zeros(size(var))
    for c in 1:ncol
        ###Column extent of the neighbourhood, clipped to the array
        cmin = max(c - half_c, 1)
        cmax = min(c + half_c, ncol)
        ###Matching columns of the weight window, measured relative to its center
        wcols = (center_c - (c - cmin)):(center_c + (cmax - c))

        for r in 1:nrow
            ###Row extent of the neighbourhood, clipped to the array
            rmin = max(r - half_r, 1)
            rmax = min(r + half_r, nrow)
            wrows = (center_r - (r - rmin)):(center_r + (rmax - r))

            weighted = var[rmin:rmax, cmin:cmax] .* window[wrows, wcols]
            res[r, c] = func(weighted)
        end
    end
    return Matrix{Float32}(res)
end 

slide_window (generic function with 3 methods)

In [250]:
sample_DBZ[2:5, 1:5] .* sw[1:4, 1:5]

4×5 Matrix{Float32}:
 22.0  57.0  55.0  26.0  22.0
 33.0  19.0  16.0  54.0  13.0
 33.0   2.0  48.0  53.0  43.0
 43.0  29.0  10.0  54.0  63.0

In [255]:
sw

7×7 Matrix{Union{Missing, Float32}}:
 1.0  1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0  1.0  1.0

In [268]:
@btime slide_window(sample_DBZ, sw, std)[:]

  6.942 μs (207 allocations: 14.83 KiB)


25-element Vector{Float32}:
 18.169113
 17.82045
 17.82045
 17.82045
 17.86197
 18.602419
 18.757486
 18.757486
 18.757486
 18.191597
 18.602419
 18.757486
 18.757486
 18.757486
 18.191597
 18.602419
 18.757486
 18.757486
 18.757486
 18.191597
 19.333801
 19.68295
 19.68295
 19.68295
 20.07818

In [88]:
###Dump every task to ./ok.txt, each one preceded by a single space
open("./ok.txt", "w") do io
    foreach(tasks) do task
        write(io, " " * task)
    end
end

In [269]:
sample_DBZ

5×5 Matrix{Union{Missing, Float32}}:
  8.0  28.0  35.0  57.0  65.0
 22.0  57.0  55.0  26.0  22.0
 33.0  19.0  16.0  54.0  13.0
 33.0   2.0  48.0  53.0  43.0
 43.0  29.0  10.0  54.0  63.0

In [14]:
####Writing tests for calculate_features 

###Checks that calculate_features gives the same result whether the tasks are passed
###directly as a vector or read from a task file. Returns both feature matrices.
function test_calculate_features()

    tasks = ["VEL", "DBZ", "STD(DBZ)", "STD(DBZ)"]
    task_weights = [pw, pw, sw, rw]
    features_out = joinpath(scratchspace, "trash_output.h5")

    ###Variant 1: tasks and weights handed over as positional arguments
    X1, Y1 = calculate_features(ds_path, tasks, task_weights, features_out, true;
                                verbose = true, REMOVE_LOW_NCP = false, NCP_THRESHOLD = Float32(.2),
                                QC_variable = "VG", remove_variable = "VEL")

    ###Raw variables must pass through untouched, labels mark the non-missing VG gates
    @assert X1[:,2] == sample_DBZ[:]
    @assert X1[:,1] == sample_VEL[:]
    @assert Y1[:]   == .! map(ismissing, sample_VG[:])

    ###Variant 2: the same tasks, written to a file that is passed in instead
    print(tasks)
    task_file = joinpath(scratchspace, "./sample_tasks.txt")
    open(task_file, "w") do io
        foreach(task -> write(io, "," * task), tasks)
    end

    X2, Y2 = calculate_features(ds_path, task_file, features_out, true;
                                QC_variable = "VG", remove_variable = "VEL",
                                weight_matrixes = task_weights)

    @assert X1 == X2
    @assert Y1 == Y2 

    return X1, X2
end 

test_calculate_features (generic function with 1 method)

In [15]:
X1, X2 = test_calculate_features()

Processed /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/toy_set.nc in 0.10117197036743164 seconds
COMPLETED PROCESSING 1 FILES IN 0.1 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/trash_output.h5

WRITING DATA TO FILE OF SHAPE (25, 4)
X TYPE: Matrix{Float32}
["VEL", "DBZ", "STD(DBZ)", "STD(DBZ)"]COMPLETED PROCESSING 1 FILES IN 0.03 SECONDS
OUTPUTTING DATA IN HDF5 FORMAT TO FILE: /Users/ischluesche/.julia/scratchspaces/905eca60-9fa9-4fb3-8835-f5cd63a3719c/ronin_testing/trash_output.h5

WRITING DATA TO FILE OF SHAPE (25, 4)
X TYPE: Matrix{Float32}


(Float32[-2.0 27.0 22.49435 22.286674; 5.0 1.0 22.177752 22.827671; … ; -17.0 10.0 22.122326 23.450901; -17.0 16.0 21.28184 22.681032], Float32[-2.0 27.0 22.49435 22.286674; 5.0 1.0 22.177752 22.827671; … ; -17.0 10.0 22.122326 23.450901; -17.0 16.0 21.28184 22.681032])

In [29]:
Ronin._weighted_func

_weighted_func (generic function with 3 methods)

In [31]:
using ImageFiltering

In [80]:
###Debug probe for window functions: prints and returns the element at index [4,4] of
###the window it is handed
function get_window_matrixes(x)
    probe = x[4,4]
    println("OK $(probe)")
    return probe
end 

get_window_matrixes (generic function with 1 method)

In [81]:
mapwindow(x -> get_window_matrixes(x), sample_DBZ, size(aw), border=Fill(missing))

OK 8.0
OK 8.0
OK 22.0
OK 33.0
OK 33.0
OK 43.0
OK 28.0
OK 57.0
OK 19.0
OK 2.0
OK 29.0
OK 35.0
OK 55.0
OK 16.0
OK 48.0
OK 10.0
OK 57.0
OK 26.0
OK 54.0
OK 53.0
OK 54.0
OK 65.0
OK 22.0
OK 13.0
OK 43.0
OK 63.0


5×5 Matrix{Float32}:
  8.0  28.0  35.0  57.0  65.0
 22.0  57.0  55.0  26.0  22.0
 33.0  19.0  16.0  54.0  13.0
 33.0   2.0  48.0  53.0  43.0
 43.0  29.0  10.0  54.0  63.0

In [60]:
things[12] === things[11]

true