In [1]:
# script version of the imputation benchmarks notebook for running on the cluster 
import json 
import numpy as np
import matplotlib.pyplot as plt
from pypots.optim import Adam
from pypots.imputation import CSDI, BRITS
from pypots.utils.random import set_random_seed
from pypots.utils.metrics import calc_mae
import pickle
import sys
#sys.path.append("../../Interpolation/Imputation_Algs")
from cdrec.python.recovery import centroid_recovery as CDrec
set_random_seed(1234)
# check that GPU acceleration is enabled
import torch
#torch.cuda.device_count()
#print(f"GPU: {torch.cuda.get_device_name()}")
#print(f"CUDA ENABLED: {torch.cuda.is_available()}")

2024-10-28 06:52:08 [INFO]: Have set the random seed as 1234 for numpy and pytorch.


[34m
████████╗██╗███╗   ███╗███████╗    ███████╗███████╗██████╗ ██╗███████╗███████╗    █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝    ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝   ██╔══██╗██║
   ██║   ██║██╔████╔██║█████╗█████╗███████╗█████╗  ██████╔╝██║█████╗  ███████╗   ███████║██║
   ██║   ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝  ██╔══██╗██║██╔══╝  ╚════██║   ██╔══██║██║
   ██║   ██║██║ ╚═╝ ██║███████╗    ███████║███████╗██║  ██║██║███████╗███████║██╗██║  ██║██║
   ╚═╝   ╚═╝╚═╝     ╚═╝╚══════╝    ╚══════╝╚══════╝╚═╝  ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai [0m



In [49]:
def evaluate_folds_cdrec(Xs, ys, fold_idxs, window_idxs):
    """Score CDrec imputation on every resample fold and every missing-data window.

    For each fold, every window listed in ``window_idxs`` is blanked out (set to
    NaN) in a copy of the test split. The train split and the corrupted test
    split are stacked row-wise into one 2D matrix, CDrec (default parameters)
    fills in the missing entries, and ``calc_mae`` is evaluated per test
    instance using the NaN mask of the corrupted copy.

    Parameters
    ----------
    Xs : np.ndarray
        Series array indexed by instance on axis 0 and time on axis 1. The
        reshape of the imputed matrix assumes a trailing singleton axis, i.e.
        shape (n_instances, n_timepoints, 1).
    ys : np.ndarray
        Labels aligned with ``Xs`` on axis 0; only used to print the class
        distribution of each split.
    fold_idxs : mapping or sequence
        Indexed by ``0 .. len(fold_idxs) - 1``; each entry provides ``"train"``
        and ``"test"`` index collections into ``Xs`` / ``ys``.
    window_idxs : dict
        Maps a percent-missing key to a list of windows, each window being the
        time indices to blank out.

    Returns
    -------
    dict
        ``{fold: {percent_missing: {window_position: [error per test instance]}}}``
    """
    fold_scores = dict()
    for fold in range(0, len(fold_idxs)):
        print(f"Evaluating fold {fold}/{len(fold_idxs)-1}...")
        # make the splits
        X_train_fold = Xs[fold_idxs[fold]["train"]]
        y_train_fold = ys[fold_idxs[fold]["train"]]
        X_test_fold = Xs[fold_idxs[fold]["test"]]
        y_test_fold = ys[fold_idxs[fold]["test"]]
        # check class distributions
        counts_tr = np.unique(y_train_fold, return_counts=True)[1]
        print(f"Training class distribution: {counts_tr/np.sum(counts_tr)}")
        counts_te = np.unique(y_test_fold, return_counts=True)[1]
        print(f"Testing class distribution: {counts_te/np.sum(counts_te)}")
        print(f"Computing CDrec on fold {fold}...")
        # Drop the trailing singleton axis with an explicit reshape: a bare
        # squeeze() would also collapse the instance axis when a split holds a
        # single series, breaking the row-wise concatenation below.
        n_train, n_timepoints = X_train_fold.shape[0], X_train_fold.shape[1]
        X_train_2d = X_train_fold.reshape(n_train, n_timepoints)
        percent_missing_score = dict()
        for pm in window_idxs:
            print(f"Imputing {pm}% missing data over {len(window_idxs[pm])} windows...")
            per_window_scores = dict()
            for (idx, widx) in enumerate(window_idxs[pm]):
                X_test_corrupted = X_test_fold.copy()
                X_test_corrupted[:, widx] = np.nan
                mask = np.isnan(X_test_corrupted) # mask ensures only missing values are scored
                X_test_2d = X_test_corrupted.reshape(X_test_corrupted.shape[0], X_test_corrupted.shape[1])
                Xdata = np.concatenate([X_train_2d, X_test_2d])
                cdrec_imputed_raw = CDrec(matrix=Xdata) # using default params
                # keep only the test rows of the concatenated matrix and restore the trailing axis
                cdrec_imputed = cdrec_imputed_raw[n_train:].reshape([-1, n_timepoints, 1])
                # individual errors per test instance, kept for uncertainty quantification
                errs = [calc_mae(cdrec_imputed[i], X_test_fold[i], mask[i]) for i in range(0, X_test_fold.shape[0])]
                per_window_scores[idx] = errs
            percent_missing_score[pm] = per_window_scores
        fold_scores[fold] = percent_missing_score
    return fold_scores



In [15]:
# Read the original IPD train/test split: column 0 is used as the label, the remaining columns as the series
train_f = np.loadtxt("../../Data/italypower/datasets/ItalyPowerDemand_TRAIN.txt")
test_f = np.loadtxt("../../Data/italypower/datasets/ItalyPowerDemand_TEST.txt")
y_train, X_train = train_f[:, 0], train_f[:, 1:]
y_test, X_test = test_f[:, 0], test_f[:, 1:]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

# Append a trailing singleton axis for the imputation models: (instances, timepoints, 1)
X_train_original = X_train[:, :, np.newaxis]
X_test_original = X_test[:, :, np.newaxis]
y_train_original, y_test_original = y_train, y_test
print(X_train_original.shape, X_test_original.shape, sep="\n")

# Pool train and test (train rows first) so the folds can be resampled from the full set
Xs = np.concatenate([X_train_original, X_test_original], axis=0)
print(Xs.shape)
ys = np.concatenate([y_train_original, y_test_original])
print(ys.shape)



(67, 24)
(67,)
(1029, 24)
(1029,)
(67, 24, 1)
(1029, 24, 1)
(1096, 24, 1)
(1096,)


In [12]:
# Read the resample fold indices; JSON object keys arrive as strings, so cast them back to int
with open("../../FinalBenchmarks/ItalyPower/Python/ipd_resample_folds_python_idx.json", "r") as f:
    resample_fold_idxs_f = json.load(f)
resample_fold_idxs = dict((int(key), idxs) for key, idxs in resample_fold_idxs_f.items())
print(resample_fold_idxs.keys())

# Materialise fold 0 from the pooled data
X_train_f1, X_test_f1 = Xs[resample_fold_idxs[0]["train"]], Xs[resample_fold_idxs[0]["test"]]
y_train_f1, y_test_f1 = ys[resample_fold_idxs[0]["train"]], ys[resample_fold_idxs[0]["test"]]

print(X_train_f1.shape, X_test_f1.shape, y_train_f1.shape, y_test_f1.shape, sep="\n")

# Check element-wise whether fold 0 matches the original train/test split
print(np.equal(X_train_f1, X_train_original).all())
print(np.equal(y_train_f1, y_train_original).all())
print(np.equal(X_test_f1, X_test_original).all())
print(np.equal(y_test_f1, y_test_original).all())


dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])
(67, 24, 1)
(1029, 24, 1)
(67,)
(1029,)
True
True
True
True


In [13]:
# load imputation window indices
with open("../../FinalBenchmarks/ItalyPower/Python/ipd_windows_python_idx.json", "r") as f:
    window_idxs_f = json.load(f)
# Keys are parsed as floats and scaled to integer percentages (presumably fractions
# such as "0.05" - confirm against the JSON file). Round before converting: a product
# like 0.29 * 100 evaluates to 28.999999999999996, which a bare int() truncates to 28.
window_idxs = {int(round(float(k) * 100)): v for k, v in window_idxs_f.items()}
print(window_idxs.keys())


dict_keys([5, 15, 25, 35, 45, 55, 65, 75, 85, 95])


In [50]:
# Run the CDrec benchmark over every resample fold and every missing-data window.
# Result layout: fold_scores_cdrec[fold][percent_missing][window_position] -> list of per-instance errors.
fold_scores_cdrec = evaluate_folds_cdrec(Xs, ys, resample_fold_idxs, window_idxs)

Evaluating fold 0/29...
Training class distribution: [0.50746269 0.49253731]
Testing class distribution: [0.49854227 0.50145773]
Computing CDrec on fold 0...
Imputing 5% missing data over 15 windows...
Imputing 15% missing data over 15 windows...
Imputing 25% missing data over 15 windows...
Imputing 35% missing data over 15 windows...
Imputing 45% missing data over 13 windows...
Imputing 55% missing data over 11 windows...
Imputing 65% missing data over 8 windows...
Imputing 75% missing data over 6 windows...
Imputing 85% missing data over 4 windows...
Imputing 95% missing data over 1 windows...
Evaluating fold 1/29...
Training class distribution: [0.49253731 0.50746269]
Testing class distribution: [0.49951409 0.50048591]
Computing CDrec on fold 1...
Imputing 5% missing data over 15 windows...
Imputing 15% missing data over 15 windows...
Imputing 25% missing data over 15 windows...
Imputing 35% missing data over 15 windows...
Imputing 45% missing data over 13 windows...
Imputing 55% mi

KeyboardInterrupt: 

In [46]:
# Pool the per-instance errors of fold 0 at 5% missing across all of its windows
# into one flat array. The window count is taken from the results themselves
# rather than hard-coded, and the array is built with a single hstack call
# instead of being regrown on every iteration.
f1 = np.hstack([fold_scores_cdrec[0][5][i] for i in range(len(fold_scores_cdrec[0][5]))])
