In [None]:
using JLD2 
using Plots
using StatsBase
using Random

In [2]:
# Load the original Kepler light-curve train/test splits.
# The do-block form of `jldopen` closes the file once the reads finish (or if one of
# them throws), instead of leaving an open handle behind for the rest of the session.
X_train, X_test, y_train, y_test = jldopen(
    "../../Data/NASA_KeplerV2/datasets/KeplerLightCurveOrig.jld2", "r"
) do f
    read(f, "X_train"), read(f, "X_test"), read(f, "y_train"), read(f, "y_test")
end;

In [3]:
# Positions of the class-0 and class-6 labels within each split.
class0_tr_idxs = findall(==(0), y_train);
class0_te_idxs = findall(==(0), y_test);
class6_tr_idxs = findall(==(6), y_train);
class6_te_idxs = findall(==(6), y_test);

# Report how many instances of each class every split contains.
println("Number of C0 training instances: $(length(class0_tr_idxs))")
println("Number of C0 testing instances: $(length(class0_te_idxs))")
println("Number of C6 training instances: $(length(class6_tr_idxs))")
println("Number of C6 testing instances: $(length(class6_te_idxs))")

Number of C0 training instances: 119
Number of C0 testing instances: 52
Number of C6 training instances: 17
Number of C6 testing instances: 8


In [4]:
# Pool the train and test instances of each class into one matrix (one series per row).
# The `*_te_idxs` were computed from `y_test`, so they must index `X_test`; indexing
# `X_train` with them selected unrelated training rows instead of the test instances.
c6_all = vcat(X_train[class6_tr_idxs, :], X_test[class6_te_idxs, :]);
c0_all = vcat(X_train[class0_tr_idxs, :], X_test[class0_te_idxs, :]);

In [5]:
"""
    detect_flat_regions(data)

Return the indices `i` for which `data[i] == data[i-1]`, i.e. every point that repeats
its immediate predecessor.

The comparison uses `==`, so consecutive `NaN`s are not reported. The result is a
`Vector{Int}` (empty when `data` has fewer than two elements).
"""
function detect_flat_regions(data)
    # Start at the second element; derive the bounds from `data` itself rather than
    # assuming the first index is 1.
    candidates = (firstindex(data) + 1):lastindex(data)
    return Int[i for i in candidates if data[i] == data[i - 1]]
end

detect_flat_regions (generic function with 1 method)

In [6]:
"""
    window_ts_instance(ts, window_size, stride=window_size, keep=0.5;
                       return_artefacts=false, max_flat_points=8)

Cut `ts` into windows of `window_size` points, one every `stride` points, and return a
random subset of the windows that are free of artefacts.

A window counts as containing an artefact when `detect_flat_regions` reports more than
`max_flat_points` indices for it.

# Arguments
- `ts`: the series to window.
- `window_size`: number of points per window; must be at least 1.
- `stride`: offset between consecutive window starts; must be at least 1.
- `keep`: a float is the fraction of clean windows to keep, an integer is the exact
  number to keep. Requests exceeding the number of clean windows are capped to it.

# Keywords
- `return_artefacts`: when `true`, all artefact windows are appended after the sampled
  clean windows.
- `max_flat_points`: flat-point count above which a window is treated as an artefact.

# Returns
A vector of windows. The clean windows are drawn with `sample(...; replace=false)`.

# Throws
- `ArgumentError` if `window_size < 1`, `stride < 1`, or `keep` is negative or `NaN`.
"""
function window_ts_instance(ts::AbstractVector{<:Real}, window_size::Integer,
    stride::Integer=window_size, keep::Union{AbstractFloat, Integer}=0.5;
    return_artefacts::Bool=false, max_flat_points::Integer=8)
    window_size >= 1 ||
        throw(ArgumentError("window_size must be >= 1, got $window_size"))
    stride >= 1 || throw(ArgumentError("stride must be >= 1, got $stride"))
    keep >= 0 || throw(ArgumentError("keep must be non-negative, got $keep"))

    # Only full-length windows are produced; a trailing remainder is dropped.
    starts = firstindex(ts):stride:(lastindex(ts) - window_size + 1)
    windows = [ts[s:(s + window_size - 1)] for s in starts]

    is_artefact = Bool[
        length(detect_flat_regions(w)) > max_flat_points for w in windows
    ]
    artefact_idxs = findall(is_artefact)
    clean_idxs = findall(!, is_artefact) # clean windows to sample from

    # `keep` is either a fraction of the clean windows or an exact count. Cap it so
    # that sampling without replacement cannot ask for more windows than exist.
    requested = keep isa AbstractFloat ? round(Int, length(clean_idxs) * keep) : Int(keep)
    num_keep = min(requested, length(clean_idxs))

    windows_subset = windows[sample(clean_idxs, num_keep; replace=false)]
    if return_artefacts
        windows_subset = vcat(windows_subset, windows[artefact_idxs])
    end
    return windows_subset
end

window_ts_instance (generic function with 3 methods)

In [7]:
# Window every class-6 series, then split each series' windows 80/20 into train/test.
X_train_c6 = []
X_test_c6 = []
for row in 1:size(c6_all, 1)
    # non-overlapping 100-point windows; keep=1.0 retains every clean window
    segments = window_ts_instance(c6_all[row, :], 100, 100, 1.0)
    n_segments = length(segments)
    n_train = round(Int, 0.8 * n_segments)
    picked = sample(1:n_segments, n_train; replace=false)
    held_out = setdiff(1:n_segments, picked)
    push!(X_train_c6, segments[picked])
    push!(X_test_c6, segments[held_out])
end

In [8]:
# Persist the class-6 windows; entry i of each array holds the windows of series i.
jldopen("kepler_c6_final.jld2", "w") do out
    out["X_train"] = X_train_c6
    out["X_test"] = X_test_c6
end

25-element Vector{Any}:
 [[-0.0199977938427818, -0.0432188050545508, -0.0520011175105861, -0.0476708982393146, -0.0388180026719455, -0.0299651071045763, -0.0211567710412752, -0.0140395405393234, -0.00684625334040168, -0.000808843792272862  …  0.0325835387342867, 0.0273875811209345, 0.00198139773295802, -0.0285880853055136, -0.0458601713890943, -0.0508524658271698, -0.0445406832451924, -0.0354912242705496, -0.0265898539233844, -0.0184752361850764], [-0.0235816508124549, -0.0146619253666807, -0.00740176762684319, -0.00131829237999836, 0.00393338748712557, 0.00900419823162746, 0.0138374139566392, 0.0184783457820041, 0.022665749559196, 0.0257397709187452  …  -0.0553796403211506, -0.0445002009910753, -0.0326557640020693, -0.021647035642637, -0.0124944149451371, -0.00535687679892982, 0.000378988554870285, 0.00549095420342349, 0.0102776588867051, 0.0149716445110332], [-0.0340284951909109, -0.0236656206339152, -0.015142792693523, -0.00811232863427591, -0.00204074133226095, 0.00349261881836471,

In [26]:
# Window every class-0 series, then split each series' windows 80/20 into train/test.
X_train_c0 = []
X_test_c0 = []
for row in 1:size(c0_all, 1)
    # non-overlapping 100-point windows; keep=1.0 retains every clean window
    segments = window_ts_instance(c0_all[row, :], 100, 100, 1.0)
    n_segments = length(segments)
    n_train = round(Int, 0.8 * n_segments)
    picked = sample(1:n_segments, n_train; replace=false)
    held_out = setdiff(1:n_segments, picked)
    push!(X_train_c0, segments[picked])
    push!(X_test_c0, segments[held_out])
end

In [32]:
# Persist the class-0 windows; entry i of each array holds the windows of series i.
jldopen("kepler_c0_final.jld2", "w") do out
    out["X_train"] = X_train_c0
    out["X_test"] = X_test_c0
end

171-element Vector{Any}:
 [[0.00914007609675271, 0.00795705442736472, 0.00605433967046931, 0.00378378516952715, 0.00140684127912183, -0.000971395649157381, -0.0031571627080581, -0.00501360781582816, -0.0064145513525945, -0.00721484819663087  …  -0.00646109902598135, -0.00549328315439135, -0.00393175163778203, -0.00198721952051428, 0.000231801934464926, 0.00255025474498527, 0.00469053463338898, 0.00635719123951994, 0.00715471728485162, 0.00689151766981366], [-0.00698723101310306, -0.00701809541624343, -0.00638232332067779, -0.00520297701244032, -0.00351572021837143, -0.0014486719869321, 0.000780387220269407, 0.00310098171943418, 0.00526099542194691, 0.00720809266069433  …  0.00910091453251549, 0.00828490236913249, 0.00654144603324225, 0.0043886415405292, 0.00204347446323894, -0.00032954962824927, -0.00243900689997267, -0.00454846417169608, -0.00606095769654991, -0.00707305960048366], [-0.0061591293460358, -0.00465220878862371, -0.00272413060553822, -0.00043000573061136, 0.00193231583848

# Make the folds

In [9]:
# Reload the saved class-0 windows. The do-block form of `jldopen` closes the file after
# the reads instead of leaving the handle open.
# NOTE(review): the path is absolute and machine-specific — confirm it points at the
# same `kepler_c0_final.jld2` written by the cell above.
X_train_c0_orig, X_test_c0_orig = jldopen("/Users/joshua/Desktop/QuantumInspiredMLFinal/QuantumInspiredML/FinalBenchmarks/NASA_KeplerV2/kepler_c0_final.jld2", "r") do f1
    read(f1, "X_train"), read(f1, "X_test")
end;

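The approach is as follows: for each instance, we merge its train and test windows and then resample a new train/test split of the same sizes. The cell below applies this to every instance (not just instance 1) and keeps the original split as the first fold.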

In [20]:
# Fold 1 is the original split; folds 2 to 30 re-draw every instance's split at random.
folds = [(X_train_c0_orig, X_test_c0_orig)] # train, test
for _ in 2:30
    fold_train = [] # resampled train windows, one entry per instance
    fold_test = [] # resampled test windows, one entry per instance
    for k in 1:length(X_train_c0_orig)
        # pool this instance's windows, train windows first
        pooled = vcat(X_train_c0_orig[k], X_test_c0_orig[k])
        # the new train split keeps the size of the original one
        n_train_k = size(X_train_c0_orig[k], 1)
        chosen = sample(1:length(pooled), n_train_k; replace=false)
        leftover = setdiff(1:length(pooled), chosen) # everything not chosen is test
        push!(fold_train, pooled[chosen])
        push!(fold_test, pooled[leftover])
    end
    push!(folds, (fold_train, fold_test))
end

In [34]:
# Write `folds` (a vector of (train, test) tuples) to disk under the name "folds".
@save "kepler_c0_all_folds.jld2" folds

Make the window locations

In [35]:
# Fractions passed to `generate_windows`: 0.05, 0.15, ..., 0.95.
# Each odd numerator over 20 divides to exactly the same Float64 as the decimal literal.
ps = [(2k - 1) / 20 for k in 1:10]

10-element Vector{Float64}:
 0.05
 0.15
 0.25
 0.35
 0.45
 0.55
 0.65
 0.75
 0.85
 0.95

In [51]:
"""
    generate_windows(T, fraction_missing, num_windows)

Draw up to `num_windows` distinct contiguous index windows from a series of length `T`.

Each window covers `round(Int, T * fraction_missing)` consecutive indices and lies
entirely inside `1:T`. Start positions are drawn without replacement, so no two windows
are identical. If fewer than `num_windows` start positions exist, one window per
available position is returned.

# Returns
A vector of index vectors, in the order the start positions were drawn.
"""
function generate_windows(T::Integer, fraction_missing::Real, num_windows::Integer)
    num_pts = round(Int, T * fraction_missing)
    # A window starting at `s` ends at `s + num_pts - 1`, so the last start that still
    # fits is `T - num_pts + 1`. Stopping at `T - num_pts` made the window ending at
    # index `T` impossible to draw.
    candidate_starts = 1:(T - num_pts + 1)
    num_windows_choose = min(length(candidate_starts), num_windows)
    start_idx = sample(candidate_starts, num_windows_choose; replace=false)
    return [collect(s:(s + num_pts - 1)) for s in start_idx]
end

generate_windows (generic function with 1 method)

In [72]:
# Seed the global RNG so the drawn window locations are repeatable.
Random.seed!(0)
# Maps each fraction in `ps` to the windows generated for it (series length 100,
# at most 15 windows per fraction).
windows_per_percentage = Dict()
foreach(ps) do frac
    windows_per_percentage[frac] = generate_windows(100, frac, 15)
end

In [74]:
# Write the fraction => windows dictionary to disk under the name
# "windows_per_percentage".
JLD2.@save "kepler_windows_julia_idx.jld2" windows_per_percentage