In [1]:
import matplotlib.pyplot as plt
import sklearn
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
# import tsdb
# import benchpots
# from pypots.optim import Adam
# from pypots.imputation import CSDI, MRNN, BRITS

# from pypots.utils.random import set_random_seed
# from pypots.utils.metrics import calc_mae
import random
import json

#from cdrec.python.recovery import centroid_recovery as CDrec


In [2]:
import numpy as np

# 1. Import Original Train/Test Split

In [4]:
# Each row of the text files holds the class label in column 0 followed by the
# series values, so labels and features are separated by column slicing.
train = np.loadtxt("../../../Data/italypower/datasets/ItalyPowerDemand_TRAIN.txt")
test = np.loadtxt("../../../Data/italypower/datasets/ItalyPowerDemand_TEST.txt")
y_train, X_train = train[:, 0], train[:, 1:]
y_test, X_test = test[:, 0], test[:, 1:]
# Sanity check: one shape per line, in the order X_train, y_train, X_test, y_test.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(67, 24)
(67,)
(1029, 24)
(1029,)


In [5]:
# Fraction of training samples whose label equals 1.
print(np.count_nonzero(y_train == 1) / len(y_train))

0.5074626865671642


In [6]:
# Fraction of test samples whose label equals 2.
print(np.count_nonzero(y_test == 2) / len(y_test))

0.5014577259475219


# 2. Make 29 resampled folds (plus the original split as fold 0)

In [7]:
# Pool the original train and test partitions (train rows first) so the data
# can be re-split; row i of Xs corresponds to label ys[i].
Xs = np.concatenate((X_train, X_test), axis=0)
ys = np.hstack((y_train, y_test))

In [8]:
# 29 stratified random re-splits whose training size equals the original
# training set's row count; random_state is fixed so the splits are repeatable.
ss = StratifiedShuffleSplit(
    n_splits=29,
    train_size=len(X_train),
    random_state=0,
)
# Lazy generator of (train_idx, test_idx) pairs. The fold-building cell calls
# ss.split(Xs, ys) itself rather than consuming this object.
splits = ss.split(Xs, ys)

In [9]:
# Fold 0 is the original split: the first len(y_train) rows of the pooled data
# are the training rows and the remaining rows are the test rows.
n_train, n_total = len(y_train), len(ys)
fold_idxs = {
    0: {
        "train": list(range(n_train)),
        "test": list(range(n_train, n_total)),
    }
}
# Folds 1..n_splits are the stratified resamples; indices are stored as plain
# Python lists so the dictionary can be serialised to JSON.
for fold, (train_idx, test_idx) in enumerate(ss.split(Xs, ys), start=1):
    fold_idxs[fold] = {"train": train_idx.tolist(), "test": test_idx.tolist()}

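Check class distributions. For each fold, the first line is the train split and the second line is the test split; "class 0" and "class 1" in the printout are the samples labelled 1 and 2, respectively.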

In [11]:
# For every fold print the share of samples with label 1 (reported as
# "class 0") and label 2 (reported as "class 1"): first line is the train
# split, second line is the test split.
for f, idxs in fold_idxs.items():
    tr_labels, te_labels = ys[idxs["train"]], ys[idxs["test"]]
    class0_tr_counts, class1_tr_counts = (
        np.count_nonzero(tr_labels == lab) / tr_labels.size for lab in (1, 2)
    )
    class0_te_counts, class1_te_counts = (
        np.count_nonzero(te_labels == lab) / te_labels.size for lab in (1, 2)
    )
    print(f"fold {f} | class 0: {class0_tr_counts} | class 1: {class1_tr_counts}")
    print(f"fold {f} | class 0: {class0_te_counts} | class 1: {class1_te_counts}")
    print("=" * 50)

fold 0 | class 0: 0.5074626865671642 | class 1: 0.4925373134328358
fold 0 | class 0: 0.49854227405247814 | class 1: 0.5014577259475219
fold 1 | class 0: 0.4925373134328358 | class 1: 0.5074626865671642
fold 1 | class 0: 0.49951409135082603 | class 1: 0.500485908649174
fold 2 | class 0: 0.4925373134328358 | class 1: 0.5074626865671642
fold 2 | class 0: 0.49951409135082603 | class 1: 0.500485908649174
fold 3 | class 0: 0.4925373134328358 | class 1: 0.5074626865671642
fold 3 | class 0: 0.49951409135082603 | class 1: 0.500485908649174
fold 4 | class 0: 0.4925373134328358 | class 1: 0.5074626865671642
fold 4 | class 0: 0.49951409135082603 | class 1: 0.500485908649174
fold 5 | class 0: 0.4925373134328358 | class 1: 0.5074626865671642
fold 5 | class 0: 0.49951409135082603 | class 1: 0.500485908649174
fold 6 | class 0: 0.4925373134328358 | class 1: 0.5074626865671642
fold 6 | class 0: 0.49951409135082603 | class 1: 0.500485908649174
fold 7 | class 0: 0.4925373134328358 | class 1: 0.50746268656

# 3. Generate window locations

In [12]:
# Missing-data fractions: 5% to 95% in steps of 10 percentage points.
# Dividing integers by 100 yields exactly the floats 0.05, 0.15, ..., 0.95.
ps = [pct / 100 for pct in range(5, 100, 10)]

In [13]:
def generate_windows(T, fraction_missing, num_windows, seed=0):
    """Sample distinct contiguous index windows inside a series of length ``T``.

    Every window spans ``round(T * fraction_missing)`` consecutive positions.
    Start positions are drawn without replacement, so no two windows are
    identical, although they may overlap.

    Parameters
    ----------
    T : int
        Length of the time series.
    fraction_missing : float
        Fraction of the series that each window covers.
    num_windows : int
        Requested number of windows. Capped at the number of candidate start
        positions.
    seed : int, optional
        Seed of the private random generator used for the draw. Identical
        arguments always produce identical windows.

    Returns
    -------
    list[list[int]]
        One list of consecutive indices per window, in draw order. Empty when
        no candidate start position exists.
    """
    # The draw uses the stdlib `random` sampler, so that is what must be
    # seeded. Seeding NumPy's global RNG (and ignoring `seed`) left the result
    # different on every run. A private generator also leaves the global RNG
    # state untouched.
    rng = random.Random(seed)

    num_pts = round(T * fraction_missing)
    max_start_idx = T - num_pts
    # NOTE(review): candidates are 0 .. max_start_idx - 1, so the last window
    # that would still fit (starting at T - num_pts) is never drawn. Kept
    # unchanged so the sampling domain matches earlier runs -- confirm intent.
    candidates = range(max(max_start_idx, 0))
    num_windows_choose = min(num_windows, len(candidates))
    start_idx = rng.sample(candidates, num_windows_choose)
    return [list(range(sidx, sidx + num_pts)) for sidx in start_idx]

In [14]:
# Map each missing fraction to (at most) 15 windows over a series of length 24.
window_per_percentage = {p: generate_windows(24, p, 15) for p in ps}

In [16]:
# Spot-check the windows generated for the 0.15 missing fraction
# (round(24 * 0.15) = 4 consecutive indices per window).
window_per_percentage[0.15]

[[8, 9, 10, 11],
 [10, 11, 12, 13],
 [3, 4, 5, 6],
 [18, 19, 20, 21],
 [4, 5, 6, 7],
 [6, 7, 8, 9],
 [1, 2, 3, 4],
 [5, 6, 7, 8],
 [0, 1, 2, 3],
 [16, 17, 18, 19],
 [2, 3, 4, 5],
 [13, 14, 15, 16],
 [9, 10, 11, 12],
 [19, 20, 21, 22],
 [11, 12, 13, 14]]

# 4. Save Data

Save window idxs:

In [17]:
# JSON object keys are always strings, so the float fractions are written as
# keys such as "0.15"; indices are 0-based (Python convention).
with open('ipd_windows_python_idx.json', mode='w') as out_file:
    out_file.write(json.dumps(window_per_percentage))

Save resample fold idxs:

In [18]:
# JSON object keys are always strings, so the integer fold ids are written as
# "0", "1", ...; indices are 0-based rows of the pooled train+test data.
with open('ipd_resample_folds_python_idx.json', mode='w') as out_file:
    out_file.write(json.dumps(fold_idxs))