In [1]:
using Plots
using Statistics
using Surrogates
using CSV
using DataFrames
using Distributions
using Flux
import JSON
using MLUtils
using Plots
using StatsBase
using StatsPlots
using Tables
using HypothesisTests

include("../scan_hyperparameters_withcellnum_functions.jl")

main (generic function with 1 method)

In [2]:
"""
    model_id_to_nn(model_id, x_dim, y_dim)

Rebuild the network architecture encoded in a model identifier string.

`model_id` is split on `_`. Tokens of the form `key=value` are inspected and the keys
`w` (width), `d` (depth), `activation` and `dr` (dropout rate) are extracted. All other
tokens (date/time stamps, `bs=...`, `lr=...`, a trailing run index, ...) are ignored.

# Arguments
- `model_id`: identifier such as
  `"2023-06-17_01-16-33_w=100_d=5_activation=sigmoid_bs=1024_lr=0.001_dr=0.0_1"`.
- `x_dim`, `y_dim`: forwarded unchanged as the first two arguments of
  `neuralnetworkwithdropout`.

# Returns
The value of
`neuralnetworkwithdropout(x_dim, y_dim, width, depth, dropout_rate, activation)`.

# Throws
- `ArgumentError` if `model_id` lacks any of the four required hyperparameters, or if a
  width/depth/dropout value cannot be parsed.
"""
function model_id_to_nn(model_id, x_dim, y_dim)
    model_params = Dict{String,Any}()

    for token in split(model_id, "_")
        # Only `key=value` tokens carry hyperparameters; timestamps and the run index
        # contain no "=" and are skipped here.
        occursin("=", token) || continue
        key, value = split(token, "="; limit=2)

        # Compare the key exactly. A substring test would let e.g. `wd=0.01` be read as
        # the depth `d=...`, and would make the outcome depend on branch order.
        if key == "w"
            model_params["width"] = parse(Int, value)
        elseif key == "d"
            model_params["depth"] = parse(Int, value)
        elseif key == "activation"
            model_params["activation"] = parseactivationfunctions([value])[1]
        elseif key == "dr"
            model_params["dropout_rate"] = parse(Float32, value)
        end
    end

    # Fail with a message naming what is missing instead of a bare KeyError below.
    required = ("width", "depth", "dropout_rate", "activation")
    absent = [name for name in required if !haskey(model_params, name)]
    if !isempty(absent)
        throw(ArgumentError(
            "model_id \"$model_id\" is missing hyperparameter(s): $(join(absent, ", "))"
        ))
    end

    return neuralnetworkwithdropout(
        x_dim, y_dim, model_params["width"], model_params["depth"],
        model_params["dropout_rate"], model_params["activation"]
    )
end

"""
    predict(x_vec)

Evaluate the global model `m` on the elements of `x_vec`, gathered into a single vector,
and return the adjoint of the model output.

Note that `m` is looked up at call time, so the result depends on whichever model is
currently bound to that global.
"""
function predict(x_vec)
    inputs = [x_vec...]
    outputs = m(inputs)
    return outputs'
end

predict (generic function with 1 method)

## RFQNet1: Without transmission cut, width 100, depth 5

In [3]:
# Data preparation for the RFQNet1 section: load the raw tables, append the cell-number
# parity as an extra input column, decorrelate, scale, and split into train/test matrices.
# Every name assigned here is a notebook global that later cells read.
target_directory = "../data/full_with_cellnumber/"

println("Formatting data...")
x_raw_df, y_df, cellnumber_df = getrawdata_withcellnum(target_directory)

# we are interested in whether cellnumber is odd or even,
# so keep only the remainder of the cell number modulo 2
cellnumber_df = cellnumber_df .% 2

# how to process cellnumber, for now let's make it another dvar
# (it is appended to the inputs as an additional column)
x_raw_df = hcat(x_raw_df, cellnumber_df)

# NOTE: the block below is disabled (it is inside a block comment), so `applycut` is never
# called in this cell. NOTE(review): its message says "60-100 percent" while `upper` is 120.
#= cutting transmission
if cut_transmission
    println("Cutting Transmission to 60-100 percent...")
    lower::Float32 = 60
    upper::Float32 = 120
    x_raw_df, y_df = applycut(x_raw_df, y_df, "OBJ1", lower, upper; with_numcells=true)
end =#

# decorrelating
println("Decorrelating...")
x_df = decorrelatedvars(x_raw_df; with_numcells=true)

# scaling: each call returns the scaled table together with its scalers;
# `y_scalers` is indexed by objective name ("OBJ1", ...) when computing MAPEs later
x_scaled_df, x_scalers = minmaxscaledf(x_df)
y_scaled_df, y_scalers = minmaxscaledf(y_df)

# need to make sure that column names didn't switch orders
# (compares the raw input names with the decorrelated + scaled ones)
@assert names(x_raw_df) == names(x_scaled_df)
@assert names(y_df) == names(y_scaled_df)

# hardcoding that we are using the same train / test indexes for everything:
# with read_in=true the split reports using preexisting sets found under `path`
x_train_df, x_test_df, y_train_df, y_test_df = traintestsplit(
    x_scaled_df, y_scaled_df; read_in=true, path="../indexes/", cut_transmission=false
)

# Convert each split to a Float32 matrix. Later cells pass the adjoint (x') to the model
# and index objectives by column. Trailing semicolons suppress notebook output.
x_train = Float32.(Matrix(x_train_df));
x_test = Float32.(Matrix(x_test_df));
y_train = Float32.(Matrix(y_train_df));
y_test = Float32.(Matrix(y_test_df));

Formatting data...
Decorrelating...
- Using preexisting train and test sets from ../indexes/




In [4]:
# RFQ1 is D5 W100. Let's get that model
# RFQNet1 is the depth-5, width-100 network. Rebuild that architecture from the
# hyperparameters encoded in the model id, then load the saved state into it.
# `m` is the global model that `predict` and the evaluation cells call.
model_id = "2023-06-17_01-16-33_w=100_d=5_activation=sigmoid_bs=1024_lr=0.001_dr=0.0_1"
m = model_id_to_nn(model_id, size(x_df, 2), size(y_df, 2));
model_state = JLD2.load("../models/$model_id.jld2", "model_state");
Flux.loadmodel!(m, model_state);

In [5]:
# compute MAPEs for each objective in test set
# Per-objective MAPE of the current model `m` on the test split.
# The model is fed the adjoint of the test matrix and its output is flipped back,
# so column i of `test_preds` lines up with column i of `y_test`.
test_preds = m(x_test')'

for i in 1:6
    objective = "OBJ$i"
    println(objective)
    objective_mape = mape(y_test[:, i], test_preds[:, i], y_scalers[objective])
    println(objective_mape)
end

OBJ1
0.015064367592520542
OBJ2
0.018039971025603344
OBJ3
0.012843054362993565
OBJ4
0.06935988706256477
OBJ5
0.04776152641999147
OBJ6
0.04847033803329832


In [6]:
# check for overfitting
# Overfitting check: for every objective, run a two-sample KS test comparing the
# training residuals with the test residuals.
# Relies on `test_preds` computed in the MAPE cell.
train_preds = m(x_train')'
train_resids = y_train .- train_preds
test_resids = y_test .- test_preds

for i in 1:6
    println("OBJ$i")
    ks_result = HypothesisTests.ApproximateTwoSampleKSTest(
        train_resids[:, i], test_resids[:, i]
    )
    println(ks_result)
end

OBJ1
Approximate two sample Kolmogorov-Smirnov test
----------------------------------------------
Population details:
    parameter of interest:   Supremum of CDF differences
    value under h_0:         0.0
    point estimate:          0.00760502

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           0.0363

Details:
    number of observations:   [173260,43316]
    KS-statistic:              1.4156908752342232

OBJ2
Approximate two sample Kolmogorov-Smirnov test
----------------------------------------------
Population details:
    parameter of interest:   Supremum of CDF differences
    value under h_0:         0.0
    point estimate:          0.00929977

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           0.0050

Details:
    number of observations:   [173260,43316]
    KS-statistic:              1.7311733907849716

OBJ3
Approximate two sample Kolmogorov-Smirnov test
----------------------------------------

## RFQNet2: With transmission cut, width 100, depth 6

In [12]:
# Data preparation for the RFQNet2 section: same pipeline as the RFQNet1 section, but
# written to `*_cut*` globals and split with cut_transmission=true.
# NOTE(review): this cell also rebinds `x_scalers` / `y_scalers`, which the RFQNet1 cells
# read as well. Confirm the two sections are meant to share them.
target_directory = "../data/full_with_cellnumber/"

println("Formatting data...")
x_raw_cut_df, y_cut_df, cellnumber_cut_df = getrawdata_withcellnum(target_directory)

# we are interested in whether cellnumber is odd or even,
# so keep only the remainder of the cell number modulo 2
cellnumber_cut_df = cellnumber_cut_df .% 2

# how to process cellnumber, for now let's make it another dvar
# (it is appended to the inputs as an additional column)
x_raw_cut_df = hcat(x_raw_cut_df, cellnumber_cut_df)

# NOTE: the block below is disabled (it is inside a block comment), so `applycut` is never
# called here. The only cut-related switch that runs in this cell is the
# `cut_transmission=true` keyword passed to `traintestsplit` further down.
# NOTE(review): the message says "60-100 percent" while `upper` is 120.
#= cutting transmission
if cut_transmission
    println("Cutting Transmission to 60-100 percent...")
    lower::Float32 = 60
    upper::Float32 = 120
    x_raw_cut_df, y_cut_df = applycut(x_raw_cut_df, y_cut_df, "OBJ1", lower, upper; with_numcells=true)
end =#

# decorrelating
println("Decorrelating...")
x_cut_df = decorrelatedvars(x_raw_cut_df; with_numcells=true)

# scaling: each call returns the scaled table together with its scalers;
# `y_scalers` is indexed by objective name ("OBJ1", ...) when computing MAPEs later
x_scaled_cut_df, x_scalers = minmaxscaledf(x_cut_df)
y_scaled_cut_df, y_scalers = minmaxscaledf(y_cut_df)

# need to make sure that column names didn't switch orders
# (compares the raw input names with the decorrelated + scaled ones)
@assert names(x_raw_cut_df) == names(x_scaled_cut_df)
@assert names(y_cut_df) == names(y_scaled_cut_df)

# hardcoding that we are using the same train / test indexes for everything:
# with cut_transmission=true the split reports reading its preexisting sets from
# ../indexes/transmission_above_60/
x_train_cut_df, x_test_cut_df, y_train_cut_df, y_test_cut_df = traintestsplit(
    x_scaled_cut_df, y_scaled_cut_df; read_in=true, path="../indexes/", cut_transmission=true
)

# Convert each split to a Float32 matrix. Later cells pass the adjoint (x') to the model
# and index objectives by column. Trailing semicolons suppress notebook output.
x_train_cut = Float32.(Matrix(x_train_cut_df));
x_test_cut = Float32.(Matrix(x_test_cut_df));
y_train_cut = Float32.(Matrix(y_train_cut_df));
y_test_cut = Float32.(Matrix(y_test_cut_df));

Formatting data...
Decorrelating...
- Using preexisting train and test sets from ../indexes/transmission_above_60/


In [17]:
# Display the per-objective scalers currently bound to `y_scalers`
# (the second value returned by `minmaxscaledf` for the objectives table).
y_scalers

Dict{String, MinMaxScaler} with 6 entries:
  "OBJ4" => MinMaxScaler(0.00582, 1.34901)
  "OBJ6" => MinMaxScaler(0.00697, 0.10912)
  "OBJ2" => MinMaxScaler(0.0542, 0.087)
  "OBJ3" => MinMaxScaler(35.05, 386.37)
  "OBJ5" => MinMaxScaler(0.00731, 0.10398)
  "OBJ1" => MinMaxScaler(1.53, 99.5)

In [13]:
# RFQ2 is D6 W100. Let's get that model
# RFQNet2 is the depth-6, width-100 network. Rebuild that architecture from the
# hyperparameters encoded in the model id, then load the saved state into it.
# This rebinds the global `m`, so `predict` and every later cell now use RFQNet2.
model_id = "2023-06-16_19-27-42_w=100_d=6_activation=sigmoid_bs=1024_lr=0.001_dr=0.0_1"
model_state = JLD2.load("../models/$model_id.jld2", "model_state");
# Size the network from this section's own tables (`x_cut_df` / `y_cut_df`) rather than
# the RFQNet1 tables, so this cell does not depend on the RFQNet1 section having been run.
m = model_id_to_nn(model_id, size(x_cut_df, 2), size(y_cut_df, 2));
Flux.loadmodel!(m, model_state);

In [14]:
# compute MAPEs for each objective in test set
# Per-objective MAPE of the current model `m` on the transmission-cut test split.
# The model is fed the adjoint of the test matrix and its output is flipped back,
# so column i of `test_preds_cut` lines up with column i of `y_test_cut`.
test_preds_cut = m(x_test_cut')'

for i in 1:6
    objective = "OBJ$i"
    println(objective)
    objective_mape = mape(y_test_cut[:, i], test_preds_cut[:, i], y_scalers[objective])
    println(objective_mape)
end

OBJ1
0.009707415807886447
OBJ2
0.01786268787232915
OBJ3
0.01250091997066104
OBJ4
0.057817749745693786
OBJ5
0.040755643857710695
OBJ6
0.04047786215140378


In [15]:
# check for overfitting
# Overfitting check on the transmission-cut split: for every objective, run a two-sample
# KS test comparing the training residuals with the test residuals.
# Relies on `test_preds_cut` computed in the MAPE cell.
train_preds_cut = m(x_train_cut')'
train_resids_cut = y_train_cut .- train_preds_cut
test_resids_cut = y_test_cut .- test_preds_cut

for i in 1:6
    println("OBJ$i")
    ks_result = HypothesisTests.ApproximateTwoSampleKSTest(
        train_resids_cut[:, i], test_resids_cut[:, i]
    )
    println(ks_result)
end

OBJ1
Approximate two sample Kolmogorov-Smirnov test
----------------------------------------------
Population details:
    parameter of interest:   Supremum of CDF differences
    value under h_0:         0.0
    point estimate:          0.0109877

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           0.0039

Details:
    number of observations:   [128677,32357]
    KS-statistic:              1.7667789957857338

OBJ2
Approximate two sample Kolmogorov-Smirnov test
----------------------------------------------
Population details:
    parameter of interest:   Supremum of CDF differences
    value under h_0:         0.0
    point estimate:          0.0132115

Test summary:
    outcome with 95% confidence: reject h_0
    two-sided p-value:           0.0002

Details:
    number of observations:   [128677,32357]
    KS-statistic:              2.1243534884653217

OBJ3
Approximate two sample Kolmogorov-Smirnov test
------------------------------------------

In [16]:
# compute MAPEs for each objective in FULL test set
# Per-objective MAPE of the current model `m` on the FULL test split (`x_test` / `y_test`
# from the section without the transmission cut), for comparison with the cut-split numbers.
test_preds_full = m(x_test')'

for i in 1:6
    objective = "OBJ$i"
    println(objective)
    objective_mape = mape(y_test[:, i], test_preds_full[:, i], y_scalers[objective])
    println(objective_mape)
end

OBJ1
0.193832157378058
OBJ2
0.020860344582082403
OBJ3
0.020385291669277862
OBJ4
0.15457395643437846
OBJ5
0.1095556414780649
OBJ6
0.10413224008120145
