In [1]:
using JLD2 
using Plots
using StatsBase
using Random

In [2]:
# Load the original Kepler light-curve train/test splits and their labels.
f = jldopen("../../Data/NASA_KeplerV2/datasets/KeplerLightCurveOrig.jld2", "r");
X_train = read(f, "X_train");
X_test = read(f, "X_test");
y_train = read(f, "y_train");
y_test = read(f, "y_test");
close(f) # release the file handle once all four datasets have been read

In [5]:
# Locate the entries labelled 0, 6 and 4 in each split and report how many there are.
class0_tr_idxs = findall(==(0), y_train);
class0_te_idxs = findall(==(0), y_test);
println("Number of C0 training instances: $(length(class0_tr_idxs))")
println("Number of C0 testing instances: $(length(class0_te_idxs))")

class6_tr_idxs = findall(==(6), y_train);
class6_te_idxs = findall(==(6), y_test);
println("Number of C6 training instances: $(length(class6_tr_idxs))")
println("Number of C6 testing instances: $(length(class6_te_idxs))")

class4_tr_idxs = findall(==(4), y_train);
class4_te_idxs = findall(==(4), y_test);
println("Number of C4 training instances: $(length(class4_tr_idxs))")
println("Number of C4 testing instances: $(length(class4_te_idxs))")

Number of C0 training instances: 119
Number of C0 testing instances: 52
Number of C6 training instances: 17
Number of C6 testing instances: 8
Number of C4 training instances: 140
Number of C4 testing instances: 61


In [36]:
# Pool the train and test rows of each class into one matrix (train rows first).
c6_all = [X_train[class6_tr_idxs, :]; X_test[class6_te_idxs, :]];
c0_all = [X_train[class0_tr_idxs, :]; X_test[class0_te_idxs, :]];
c4_all = [X_train[class4_tr_idxs, :]; X_test[class4_te_idxs, :]];

In [37]:
"""
    detect_flat_regions(data)

Return the indices `i` at which `data[i] == data[i-1]`, i.e. every position whose
value repeats the value immediately before it.

The result is a `Vector{Int}`; it is empty when `data` has fewer than two elements
or no two neighbouring values are equal.
"""
function detect_flat_regions(data)
    # Start at the second element so that `i - 1` is always a valid index.
    candidates = (firstindex(data) + 1):lastindex(data)
    return Int[i for i in candidates if data[i] == data[i - 1]]
end

detect_flat_regions (generic function with 1 method)

In [38]:
# Number of windows to keep when `keep` is a fraction of the clean windows.
_num_windows_to_keep(n_clean::Integer, keep::AbstractFloat) = round(Int, n_clean * keep)
# Number of windows to keep when `keep` is already an absolute count.
_num_windows_to_keep(::Integer, keep::Integer) = Int(keep)

"""
    window_ts_instance(ts, window_size, stride=window_size, keep=0.5;
                       return_artefacts=false, max_flat_points=8)

Cut the time series `ts` into windows of `window_size` points whose start positions
are `stride` apart (only complete windows are produced), discard windows that look
like artefacts, and return a random subset of the remaining ones.

# Arguments
- `ts`: the time series to window.
- `window_size`: number of points per window.
- `stride`: distance between consecutive window starts (defaults to `window_size`,
  i.e. non-overlapping windows).
- `keep`: a floating-point value is read as the fraction of clean windows to keep
  (rounded to the nearest integer); an integer is read as the exact number to keep.

# Keywords
- `return_artefacts`: when `true`, the artefact windows are appended after the
  sampled clean windows.
- `max_flat_points`: a window is treated as an artefact when `detect_flat_regions`
  reports more than this many positions in it (default `8`).

# Returns
A vector of windows. The clean windows are drawn with `sample(...; replace=false)`,
so they are not guaranteed to come back in their original time order.

NOTE(review): an integer `keep` is passed to `sample` unchanged; a value larger than
the number of clean windows is expected to make `sample` throw.
"""
function window_ts_instance(ts::AbstractVector{<:Real}, window_size::Integer,
    stride::Integer=window_size, keep::Union{AbstractFloat, Integer}=0.5;
    return_artefacts::Bool=false, max_flat_points::Integer=8)
    # Start positions of all complete windows.
    starts = firstindex(ts):stride:(lastindex(ts) - window_size + 1)
    windows = [ts[s:(s + window_size - 1)] for s in starts]
    # Flag windows with too many repeated consecutive values.
    is_artefact = Bool[length(detect_flat_regions(w)) > max_flat_points for w in windows]
    artefact_idxs = findall(is_artefact)
    clean_idxs = findall(!, is_artefact) # clean windows to sample from
    n_keep = _num_windows_to_keep(length(clean_idxs), keep)
    kept_idxs = sample(clean_idxs, n_keep; replace=false)
    windows_subset = windows[kept_idxs]
    if return_artefacts
        windows_subset = vcat(windows_subset, windows[artefact_idxs])
    end
    return windows_subset
end

window_ts_instance (generic function with 3 methods)

In [39]:
# Window every class-6 instance and randomly hold out about 5% of its windows for testing.
X_train_c6 = Any[]
X_test_c6 = Any[]
for row in axes(c6_all, 1)
    # non-overlapping windows of length 100; keep=1.0 retains every clean window
    wins = window_ts_instance(c6_all[row, :], 100, 100, 1.0)
    n_wins = length(wins)
    n_train = round(Int, 0.95 * n_wins)
    train_pick = sample(1:n_wins, n_train; replace=false)
    test_pick = setdiff(1:n_wins, train_pick) # everything not drawn for training
    push!(X_train_c6, wins[train_pick])
    push!(X_test_c6, wins[test_pick])
end

In [40]:
# Inspect the held-out windows of the first class-6 instance.
X_test_c6[1]

2-element Vector{Vector{Float64}}:
 [0.0286349971358986, 0.0289348255593886, 0.0246541618763184, 0.00252260042697083, -0.0326583012494142, -0.0576252411731946, -0.0593754062682474, -0.0485290452900393, -0.0361796654940154, -0.0249898638433306  …  0.0301969105445661, 0.0287166562080546, 0.0279973492168318, 0.0294447360655162, 0.029253012772828, 0.0219316648213974, -0.00316031639916248, -0.0397954998415941, -0.0606612944532519, -0.0554311643875904]
 [0.017897377673066, 0.0205904759382445, 0.0225036112680396, 0.0230481631528899, 0.0272433968412497, 0.0302253108480981, 0.028861491584899, 0.0168112611489816, -0.00857536400871561, -0.0376172272128035  …  0.01653892792942, 0.0184086304565402, 0.0186413031315737, 0.0189088390005184, 0.0218160886339281, 0.022494601174738, 0.0239482062035631, 0.028078793212045, 0.0300214149220303, 0.0264454011148499]

In [41]:
# Persist the per-instance class-6 train/test windows.
jldopen("kepler_c6.jld2", "w") do file
    file["X_train"] = X_train_c6
    file["X_test"] = X_test_c6
end

25-element Vector{Any}:
 [[0.0286349971358986, 0.0289348255593886, 0.0246541618763184, 0.00252260042697083, -0.0326583012494142, -0.0576252411731946, -0.0593754062682474, -0.0485290452900393, -0.0361796654940154, -0.0249898638433306  …  0.0301969105445661, 0.0287166562080546, 0.0279973492168318, 0.0294447360655162, 0.029253012772828, 0.0219316648213974, -0.00316031639916248, -0.0397954998415941, -0.0606612944532519, -0.0554311643875904], [0.017897377673066, 0.0205904759382445, 0.0225036112680396, 0.0230481631528899, 0.0272433968412497, 0.0302253108480981, 0.028861491584899, 0.0168112611489816, -0.00857536400871561, -0.0376172272128035  …  0.01653892792942, 0.0184086304565402, 0.0186413031315737, 0.0189088390005184, 0.0218160886339281, 0.022494601174738, 0.0239482062035631, 0.028078793212045, 0.0300214149220303, 0.0264454011148499]]
 [[-0.0273973543245951, -0.0197653505036229, -0.0138411637479848, -0.0086866219482199, -0.00424662689316258, -0.000258577683867833, 0.00370265744687692, 0.0

Pick a subset of 25 instances to match the number of instances in class 6

In [50]:
# Check the dimensions of the pooled class-0 matrix (one row per instance).
size(c0_all)

(171, 4767)

In [None]:
# Fix the RNG seed, then draw as many distinct class-0 rows as there are class-6 instances.
Random.seed!(42)
selected_instances_c0 = let n_available = size(c0_all, 1), n_wanted = size(c6_all, 1)
    sample(1:n_available, n_wanted; replace=false)
end;

25-element Vector{Int64}:
 108
  78
  83
 122
 117
  33
   1
   5
   3
  58
   ⋮
  59
  47
   9
  93
   6
  99
  52
 112
  19

In [58]:
# Window every selected class-0 instance and randomly hold out about 5% of its windows.
X_train_c0 = Any[]
X_test_c0 = Any[]
for row in selected_instances_c0
    # non-overlapping windows of length 100; keep=1.0 retains every clean window
    wins = window_ts_instance(c0_all[row, :], 100, 100, 1.0)
    n_wins = length(wins)
    n_train = round(Int, 0.95 * n_wins)
    train_pick = sample(1:n_wins, n_train; replace=false)
    test_pick = setdiff(1:n_wins, train_pick) # everything not drawn for training
    push!(X_train_c0, wins[train_pick])
    push!(X_test_c0, wins[test_pick])
end

In [64]:
# Persist the per-instance class-0 train/test windows.
jldopen("kepler_c0.jld2", "w") do file
    file["X_train"] = X_train_c0
    file["X_test"] = X_test_c0
end

25-element Vector{Any}:
 [[-0.000200999113006972, -0.000280172024027747, -0.000328552343926258, -0.000366846259801434, -0.000372715621241859, -0.000376461346691226, -0.00034772220148116, -0.000301042178044497, -0.000257992627030523, -0.000177213998820536  …  -0.000162461096923083, -0.000268658136203914, -0.000347121829022967, -0.000422991796379391, -0.000468761343153767, -0.000505246919749069, -0.000522837215751526, -0.000540555139040721, -0.000531763105772631, -0.000522971072504541], [-0.000419803772298444, -0.000455158752577844, -0.000488250132592238, -0.000505379367679915, -0.000503874990048558, -0.000491359506374245, -0.000477707856712639, -0.0004226769148028, -0.000337778883387374, -0.000259972625420213  …  -0.000108523560295115, -0.000197866159833415, -0.000278580980509036, -0.000333705323840139, -0.00038214104812484, -0.000399169801798438, -0.000393885496752011, -0.000377019036342574, -0.000325123522577631, -0.00028261869493007]]
 [[0.000921940959643819, -0.000955023260489241, -

Repeat for class 4

In [69]:
# Fix the RNG seed, then draw as many distinct class-4 rows as there are class-6 instances.
Random.seed!(42)
selected_instances_c4 = let n_available = size(c4_all, 1), n_wanted = size(c6_all, 1)
    sample(1:n_available, n_wanted; replace=false)
end

25-element Vector{Int64}:
 127
  92
  98
 143
 137
  38
 126
   5
  97
  67
   ⋮
  10
  53
  96
 108
  35
 114
  58
 130
  95

In [70]:
# Window every selected class-4 instance and randomly hold out about 5% of its windows.
X_train_c4 = Any[]
X_test_c4 = Any[]
for row in selected_instances_c4
    # non-overlapping windows of length 100; keep=1.0 retains every clean window
    wins = window_ts_instance(c4_all[row, :], 100, 100, 1.0)
    n_wins = length(wins)
    n_train = round(Int, 0.95 * n_wins)
    train_pick = sample(1:n_wins, n_train; replace=false)
    test_pick = setdiff(1:n_wins, train_pick) # everything not drawn for training
    push!(X_train_c4, wins[train_pick])
    push!(X_test_c4, wins[test_pick])
end

In [72]:
# Persist the per-instance class-4 train/test windows.
jldopen("kepler_c4.jld2", "w") do file
    file["X_train"] = X_train_c4
    file["X_test"] = X_test_c4
end

25-element Vector{Any}:
 [[-5.5014499866779e-6, -4.53792799759434e-6, -1.62002871202382e-5, -2.35656438765108e-5, 2.67708171267644e-5, 4.67918871902562e-5, 3.11786797113633e-6, 2.726980406631e-5, -1.21935631436365e-5, -5.94093888506286e-6  …  7.40800956455789e-6, 1.09904305556352e-6, -7.71828544521291e-6, -4.78237771199819e-5, -1.56878841142127e-5, -1.43385176201161e-5, -1.11324592981799e-5, -5.36640206840744e-5, -1.58629023558343e-5, 2.19382159724058e-5], [5.01632885567815e-5, -4.31655819177479e-6, 5.5307852515174e-5, 5.22257567483475e-5, 4.3539772643153e-5, -3.78501596585146e-5, -2.99453489858159e-5, -8.90722914403952e-6, -3.99820080090318e-5, 5.12303338451314e-6  …  -4.19883784212871e-5, -2.29829396972692e-5, -3.97750097325122e-6, -2.09027455144728e-6, -7.77199808821294e-6, -5.77863568884673e-6, 4.53281971537489e-5, 2.96065487470631e-5, 1.02909153374409e-5, 1.67050048514028e-5]]
 [[-8.56018136496939e-6, -3.17600238386451e-6, -2.0464497539785e-5, -2.60903908821941e-6, -3.018291729584

# Make the folds

In [74]:
# Reload the per-instance class-0 train/test windows from disk.
f1 = jldopen("kepler_c0.jld2", "r");
X_train_c0_orig = read(f1, "X_train");
X_test_c0_orig = read(f1, "X_test");
close(f1) # close the handle, as the class-6 and class-4 loads do

The approach is the following: for each instance, we merge its train and test windows and then resample a new train/test split with the same number of train windows. Let's start with class 0.

In [75]:
# Fold 1 is the stored split; folds 2 to 30 reshuffle each instance's windows while
# keeping that instance's number of train windows unchanged.
folds = [(X_train_c0_orig, X_test_c0_orig)] # (train, test)
for _ in 2:30
    fold_train = Any[] # train windows, one entry per instance
    fold_test = Any[]  # test windows, one entry per instance
    for k in eachindex(X_train_c0_orig)
        train_wins = X_train_c0_orig[k]
        test_wins = X_test_c0_orig[k]
        # pool this instance's windows, then redraw which ones are used for training
        pool = vcat(train_wins, test_wins)
        n_pool = length(pool)
        new_train = sample(1:n_pool, length(train_wins); replace=false)
        new_test = setdiff(1:n_pool, new_train) # the rest becomes the test set
        push!(fold_train, pool[new_train])
        push!(fold_test, pool[new_test])
    end
    push!(folds, (fold_train, fold_test))
end

In [76]:
# Write the 30 class-0 folds to disk under the variable name `folds`.
@save "kepler_c0_folds.jld2" folds

Make the window locations

In [35]:
# Fractions later passed to `generate_windows` as `fraction_missing`.
ps = [0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95]

10-element Vector{Float64}:
 0.05
 0.15
 0.25
 0.35
 0.45
 0.55
 0.65
 0.75
 0.85
 0.95

In [51]:
"""
    generate_windows(T, fraction_missing, num_windows)

Draw up to `num_windows` distinct contiguous index windows inside `1:T`, each
covering `round(Int, T * fraction_missing)` consecutive points.

# Arguments
- `T`: length of the series the windows index into.
- `fraction_missing`: fraction of the `T` points each window covers.
- `num_windows`: number of windows requested; fewer are returned when there are not
  enough distinct start positions.

# Returns
A `Vector{Vector{Int}}` with one vector of consecutive indices per window, in random
order of start position. Start positions are drawn without repetition.
"""
function generate_windows(T::Integer, fraction_missing::Real, num_windows::Integer)
    num_pts = round(Int, T * fraction_missing)
    # A window starting at `s` spans `s:(s + num_pts - 1)`, so the last admissible
    # start is `T - num_pts + 1` (the window ending exactly on point `T`).
    # `max(num_pts, 1)` keeps the starts inside `1:T` when the window is empty.
    max_start_idx = max(T - max(num_pts, 1) + 1, 0)
    num_windows_choose = clamp(num_windows, 0, max_start_idx)
    # First entries of a random permutation = distinct starts drawn uniformly.
    start_idx = randperm(max_start_idx)[1:num_windows_choose]
    return [collect(s:(s + num_pts - 1)) for s in start_idx]
end

generate_windows (generic function with 1 method)

In [72]:
# Seed the RNG, then draw up to 15 windows over a length-100 series for each fraction in `ps`.
Random.seed!(0)
windows_per_percentage = Dict{Any, Any}()
foreach(ps) do frac
    windows_per_percentage[frac] = generate_windows(100, frac, 15)
end

In [74]:
# Write the fraction => windows dictionary to disk under the name `windows_per_percentage`.
JLD2.@save "kepler_windows_julia_idx.jld2" windows_per_percentage

### Class 6 folds

In [77]:
# Reload the per-instance class-6 train/test windows, then release the file handle.
f3 = jldopen("kepler_c6.jld2", "r");
X_train_c6_orig = f3["X_train"];
X_test_c6_orig = f3["X_test"];
close(f3)

In [78]:
# Fold 1 is the stored split; folds 2 to 30 reshuffle each instance's windows while
# keeping that instance's number of train windows unchanged.
folds = [(X_train_c6_orig, X_test_c6_orig)] # (train, test)
for _ in 2:30
    fold_train = Any[] # train windows, one entry per instance
    fold_test = Any[]  # test windows, one entry per instance
    for k in eachindex(X_train_c6_orig)
        train_wins = X_train_c6_orig[k]
        test_wins = X_test_c6_orig[k]
        # pool this instance's windows, then redraw which ones are used for training
        pool = vcat(train_wins, test_wins)
        n_pool = length(pool)
        new_train = sample(1:n_pool, length(train_wins); replace=false)
        new_test = setdiff(1:n_pool, new_train) # the rest becomes the test set
        push!(fold_train, pool[new_train])
        push!(fold_test, pool[new_test])
    end
    push!(folds, (fold_train, fold_test))
end

In [79]:
# Write the 30 class-6 folds to disk under the variable name `folds`.
@save "kepler_c6_folds.jld2" folds

In [80]:
# Reload the per-instance class-4 train/test windows, then release the file handle.
f4 = jldopen("kepler_c4.jld2", "r");
X_train_c4_orig = f4["X_train"];
X_test_c4_orig = f4["X_test"];
close(f4)

In [81]:
# Fold 1 is the stored split; folds 2 to 30 reshuffle each instance's windows while
# keeping that instance's number of train windows unchanged.
folds = [(X_train_c4_orig, X_test_c4_orig)] # (train, test)
for _ in 2:30
    fold_train = Any[] # train windows, one entry per instance
    fold_test = Any[]  # test windows, one entry per instance
    for k in eachindex(X_train_c4_orig)
        train_wins = X_train_c4_orig[k]
        test_wins = X_test_c4_orig[k]
        # pool this instance's windows, then redraw which ones are used for training
        pool = vcat(train_wins, test_wins)
        n_pool = length(pool)
        new_train = sample(1:n_pool, length(train_wins); replace=false)
        new_test = setdiff(1:n_pool, new_train) # the rest becomes the test set
        push!(fold_train, pool[new_train])
        push!(fold_test, pool[new_test])
    end
    push!(folds, (fold_train, fold_test))
end

In [82]:
# Write the 30 class-4 folds to disk under the variable name `folds`.
@save "kepler_c4_folds.jld2" folds