In [1]:
using Pkg
Pkg.activate("..")
using SVDD
using OneClassActiveLearning
using MLKernels
using Random
using JuMP, Gurobi

In [2]:
using Memento
# Configure the Memento logger exposed by SVDD (`SVDD.LOGGER`): level "warn" and a
# compact "[level | name]: msg" record format.
Memento.config!(SVDD.LOGGER, "warn"; fmt="[{level} | {name}]: {msg}")

Logger(SVDD)

Evaluation helpers

In [3]:
"""
    EvaluationException()

Field-less exception type that marks a failed evaluation. `evaluate_with_svdd` does not
throw it; it stores the type's name (`string(EvaluationException)`) under `:eval_status`
when an evaluation run fails.
"""
struct EvaluationException <: Exception
end

"""
    train_svdd_model(model, init_strategy, solver, data, labels_init, labels_train)

Build, initialize and fit a model of type `model`, and return the fitted instance.

Subtypes of `VanillaSVDD` are constructed from `data` alone; every other model type is
constructed from `data` and `labels_init`. After `SVDD.initialize!` with `init_strategy`,
an `SSAD` instance gets kappa set to `0.01`. For all models `set_adjust_K!` is enabled,
the pools are set to `labels_train`, and the model is fitted with `solver`.
"""
function train_svdd_model(model::DataType, init_strategy, solver,
                          data::Array{Float64, 2}, labels_init::Vector{Symbol}, labels_train::Vector{Symbol})
    # Only the unsupervised variant is built without initial labels.
    svdd = model <: VanillaSVDD ? model(data) : model(data, labels_init)
    SVDD.initialize!(svdd, init_strategy)
    svdd isa SSAD && set_kappa!(svdd, 0.01)
    set_adjust_K!(svdd, true)
    # The training labels replace whatever pools the constructor set up.
    set_pools!(svdd, labels_train)
    SVDD.fit!(svdd, solver)
    return svdd
end

"""
    predict_svdd_model(model, test_data)

Run `SVDD.predict` on `test_data` and map `SVDD.classify` element-wise over the result.
"""
function predict_svdd_model(model, test_data)
    raw_predictions = SVDD.predict(model, test_data)
    return broadcast(SVDD.classify, raw_predictions)
end

"""
    evaluate_with_svdd(model, init_strategy, solver, data, labels_init, labels_train,
                       test_data, test_labels, quality_metrics)

Train a model of type `model` with `train_svdd_model`, classify `test_data` with
`predict_svdd_model`, and score the predictions against `test_labels`.

Returns a `Dict{Symbol, Any}`. On success it holds the entries produced by
`evaluate_prediction` plus `:time_train` and `:time_pred` (seconds, measured with
`@elapsed`) and `:eval_status => :success`. If any step throws, the error is logged with
its backtrace and a dictionary holding only `:eval_status => string(EvaluationException)`
is returned, so that a single failing run does not abort a whole series of experiments.

An `InterruptException` is re-thrown instead of being recorded as a failed run, so a long
experiment can still be cancelled by the user.
"""
function evaluate_with_svdd(model::DataType, init_strategy, solver,
                            data::Array{Float64, 2}, labels_init::Vector{Symbol}, labels_train::Vector{Symbol},
                            test_data::Array{Float64, 2}, test_labels::Vector{Symbol},
                            quality_metrics)
    try
        time_train = @elapsed trained = train_svdd_model(model, init_strategy, solver, data, labels_init, labels_train)
        time_pred = @elapsed pred = predict_svdd_model(trained, test_data)
        scores = evaluate_prediction(quality_metrics, test_labels, pred)
        add_evaluation_stats!(scores, time_train, time_pred)
        return scores
    catch e
        # A user interrupt must stop the experiment rather than count as a failed run.
        e isa InterruptException && rethrow()
        # Keep the best-effort behaviour, but leave a trace of what went wrong.
        @warn "Evaluation failed; recording failure status." exception=(e, catch_backtrace())
        return Dict{Symbol, Any}(:eval_status => string(EvaluationException))
    end
end

"""
    add_evaluation_stats!(scores, time_train, time_pred)

Store `time_train` and `time_pred` under `:time_train` and `:time_pred` in `scores` and
mark the run as successful via `:eval_status => :success`. Mutates `scores` in place and
returns `nothing`.
"""
function add_evaluation_stats!(scores, time_train, time_pred)
    stats = (:time_train => time_train, :time_pred => time_pred, :eval_status => :success)
    for (key, value) in stats
        scores[key] = value
    end
    return nothing
end

"""
    evaluate_prediction(quality_metrics, labels, pred)

Build a `ConfusionMatrix` from `pred` and `labels` and return a `Dict{Symbol, Any}` with
one entry per `(name, metric)` pair in `quality_metrics` (the metric applied to the
confusion matrix) plus the matrix's `tp`, `fp`, `tn` and `fn` values under the same names.
"""
function evaluate_prediction(quality_metrics, labels, pred)
    cm = ConfusionMatrix(pred, labels)
    scores = Dict{Symbol, Any}()
    for (metric_name, metric) in quality_metrics
        scores[metric_name] = metric(cm)
    end
    # The raw counts are written last, so they win over a metric registered under the same key.
    for count_name in (:tp, :fp, :tn, :fn)
        scores[count_name] = getproperty(cm, count_name)
    end
    return scores
end

evaluate_prediction (generic function with 1 method)

Find data files.

In [4]:
# Root directory that is expected to hold one sub-directory per data set.
DATA_PATH = "../data/input/processed"
# Typed container: every entry is a file path.
data_files = String[]
for d in readdir(DATA_PATH)
    dataset_dir = joinpath(DATA_PATH, d)
    # Skip stray non-directory entries (e.g. hidden files); `readdir` throws on them.
    isdir(dataset_dir) || continue
    for file_name in readdir(dataset_dir)
        push!(data_files, joinpath(dataset_dir, file_name))
    end
end

Set the solver, the evaluation metrics, and the initialization strategy.

In [5]:
# Solver factory handed to the initialization strategy and to model fitting. The Gurobi
# parameters OutputFlag=0 and Threads=1 presumably silence solver logging and restrict the
# solver to one thread — confirm against the Gurobi parameter reference.
solver = with_optimizer(Gurobi.Optimizer; OutputFlag=0, Threads=1)
# Result key => metric function; each metric is applied to the confusion matrix of a run.
quality_metrics = Dict(:mcc => matthews_corr, :kappa => cohens_kappa, :f1 => f1_score)
# Initialization strategy built from the solver, the grid 2^-4, 2^-3, ..., 2^4 and a
# `BoundedTaxErrorEstimate`. NOTE(review): the meaning of the grid and of the three
# numeric arguments is not visible here — confirm in the SVDD package.
init_strategy = WangCombinedInitializationStrategy(solver, 2.0.^range(-4, stop=4, step=1.0), BoundedTaxErrorEstimate(0.05, 0.02, 0.98));
# (name, keyword arguments) forwarded to `get_splits_and_init_pools`; presumably the
# initial-pool strategy — TODO confirm.
ip = ("Pn", Dict(:n => 25))
# (name, keyword arguments) forwarded to `get_splits_and_init_pools`; presumably the
# split strategy — TODO confirm.
ss = ("Sf", Dict())
# Number of repetitions per data file; repetition n seeds the RNG with n.
num_resamples_initial_pool = 5

5

Train and evaluate models.

In [6]:
# One result dictionary per (data file, seed) combination.
results = Dict{Symbol, Any}[]

for x in data_files
    # Files are laid out as <DATA_PATH>/<data set>/<file>. Use the path helpers instead of
    # splitting on '/', so this also works where `joinpath` inserts a different separator.
    ds_name = basename(dirname(x))
    ds_file = basename(x)
    d, l = load_data(x)
    for n in 1:num_resamples_initial_pool
        # Seed the global RNG with the repetition index so each repetition is reproducible.
        Random.seed!(n)
        # Only the initial pool labels are used; the returned split strategy is discarded.
        _, labels_init = get_splits_and_init_pools(d, l, ss[1], ip[1];
                                                   ss[2]..., ip[2]...)
        # Training pools carry the full ground truth: :Lout where the label is :outlier,
        # :Lin everywhere else.
        labels_train = fill(:Lin, length(l))
        labels_train[l .== :outlier] .= :Lout
        # The model is evaluated on the same data (`d`, `l`) it is trained on.
        r = evaluate_with_svdd(SSAD, init_strategy, solver, d, labels_init, labels_train, d, l, quality_metrics)
        r[:data_set] = ds_name
        r[:data_file] = ds_file
        r[:seed] = n
        push!(results, r)
    end
end

Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic license - for non-commercial use only
Academic lice

Aggregate results

In [7]:
using DataFrames, CSV

In [8]:
# Convert every result dictionary into a DataFrame and stack them into one table.
# NOTE(review): dictionaries from failed evaluations contain fewer keys than successful
# ones — confirm that `vcat` accepts the resulting column mismatch.
df = vcat(DataFrame.(results)...)
# Show the first five rows as the cell output.
first(df, 5)

Unnamed: 0_level_0,data_file,data_set,eval_status,f1,fn,fp,kappa,mcc,seed,time_pred,time_train,tn,tp
Unnamed: 0_level_1,SubStrin…,SubStrin…,Symbol,Float64,Int64,Int64,Float64,Float64,Int64,Float64,Float64,Int64,Int64
1,ALOI_withoutdupl_norm_r01.csv,ALOI,success,0.0792541,33,362,-0.00997187,-0.0184426,1,0.291931,134.747,588,17
2,ALOI_withoutdupl_norm_r01.csv,ALOI,success,0.0792541,33,362,-0.00997187,-0.0184426,2,0.212361,120.405,588,17
3,ALOI_withoutdupl_norm_r01.csv,ALOI,success,0.0792541,33,362,-0.00997187,-0.0184426,3,0.174675,122.94,588,17
4,ALOI_withoutdupl_norm_r01.csv,ALOI,success,0.0792541,33,362,-0.00997187,-0.0184426,4,0.275412,119.223,588,17
5,ALOI_withoutdupl_norm_r01.csv,ALOI,success,0.0792541,33,362,-0.00997187,-0.0184426,5,0.187397,121.083,588,17


In [9]:
# Create the output directory if it is missing, so that the write cannot fail on a
# non-existent path after the long-running experiments have finished.
mkpath("../data/output")
CSV.write("../data/output/ssad_baseline.csv", df)

"../data/output/ssad_baseline.csv"

In [10]:
using Statistics

In [11]:
# Median of the :mcc column (filled by the `matthews_corr` metric) per data set.
# NOTE(review): `by` is deprecated in newer DataFrames releases in favour of
# `combine(groupby(...), ...)` — confirm against the DataFrames version pinned for this project.
by(df, [:data_set], :mcc => median)

Unnamed: 0_level_0,data_set,mcc_median
Unnamed: 0_level_1,SubStrin…,Float64
1,ALOI,0.00659741
2,Annthyroid,-0.0261539
3,Arrhythmia,1.0
4,Cardiotocography,-0.0659004
5,Glass,1.0
6,HeartDisease,1.0
7,Hepatitis,0.0999143
8,Ionosphere,1.0
9,KDDCup99,-0.035772
10,Lymphography,0.296857
