In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from datetime import datetime, timedelta
import pandas as pd
import math
import numpy as np
import random
from tqdm import trange

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

from math import sqrt
from pandas import read_csv, DataFrame
from scipy import stats

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

def prep_data(data, covariates, data_start, train = True):

    time_len = data.shape[0]  # 32136

    input_size = window_size-stride_size  # (192 - 24)
    
    windows_per_series = np.full((num_series), (time_len-input_size) // stride_size)
    if train: windows_per_series -= (data_start+stride_size-1) // stride_size

    total_windows = np.sum(windows_per_series)
    x_input = np.zeros((total_windows, window_size, 1 + num_covariates + 1), dtype='float32')
    label = np.zeros((total_windows, window_size), dtype='float32')
    v_input = np.zeros((total_windows, 2), dtype='float32')
    #cov = 3: ground truth + age + day_of_week + hour_of_day + num_series
    #cov = 4: ground truth + age + day_of_week + hour_of_day + month_of_year + num_series
    count = 0
    if not train:  # for test
        covariates = covariates[-time_len:]
    for series in trange(num_series):  #穷举series
        
        cov_age = stats.zscore(np.arange(total_time-data_start[series]))  # 序列中的第几个，位置变量
        if train:
            covariates[data_start[series]:time_len, 0] = cov_age[:time_len-data_start[series]]  # 序列中的第几个，位置变量（矫正当前序列）
        else:
            covariates[:, 0] = cov_age[-time_len:]
        
        for i in range(windows_per_series[series]):  # 穷举window
            if train:
                window_start = stride_size*i+data_start[series]
            else:
                window_start = stride_size*i
            window_end = window_start + window_size

            x_input[count, 1:, 0] = data[window_start:window_end-1, series]
            x_input[count, :, 1:1+num_covariates] = covariates[window_start:window_end, :]
            x_input[count, :, -1] = series  # 序列标签
            label[count, :] = data[window_start:window_end, series]
            nonzero_sum = (x_input[count, 1:input_size, 0]!=0).sum()  # 个数
            if nonzero_sum == 0:
                v_input[count, 0] = 0
            else:
                v_input[count, 0] = np.true_divide(x_input[count, 1:input_size, 0].sum(),nonzero_sum)+1
                x_input[count, :, 0] = x_input[count, :, 0]/v_input[count, 0]
                if train:
                    label[count, :] = label[count, :]/v_input[count, 0]
            count += 1
    prefix = os.path.join(save_path, 'train_' if train else 'test_')
    np.save(prefix+'data_'+save_name, x_input)
    np.save(prefix+'v_'+save_name, v_input)
    np.save(prefix+'label_'+save_name, label)

def gen_covariates(times, num_covariates):
    covariates = np.zeros((times.shape[0], num_covariates))
    for i, input_time in enumerate(times):
        covariates[i, 1] = input_time.weekday()
        covariates[i, 2] = input_time.hour
        covariates[i, 3] = input_time.month
    for i in range(1,num_covariates):
        covariates[:,i] = stats.zscore(covariates[:,i])
    return covariates[:, :num_covariates]

def visualize(data, week_start):  
    x = np.arange(window_size)
    f = plt.figure()
    plt.plot(x, data[week_start:week_start+window_size], color='b')
    f.savefig("visual.png")
    plt.close()

In [2]:
global save_path
name = 'LD2011_2014.txt'
save_name = 'elect'
window_size = 192  # encoder + decoder?
stride_size = 24
num_covariates = 4
train_start = '2011-01-01 00:00:00'
train_end = '2014-08-31 23:00:00'
test_start = '2014-08-25 00:00:00' # need additional 7 days as given input e.g., 1 - 20训练，14 - 27测试
test_end = '2014-09-07 23:00:00'
pred_days = 7
given_days = 7

save_path = os.path.join('data', save_name)
if not os.path.exists(save_path):
    os.makedirs(save_path)
csv_path = os.path.join(save_path, name)
if not os.path.exists(csv_path):
    zipurl = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip'
    with urlopen(zipurl) as zipresp:
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            zfile.extractall(save_path)

In [3]:
data_frame = pd.read_csv(csv_path, sep=";", index_col=0, parse_dates=True, decimal=',')

In [4]:
data_frame = data_frame.resample('1H',label = 'left',closed = 'right').sum()[train_start:test_end]

In [5]:
data_frame.fillna(0, inplace=True)
covariates = gen_covariates(data_frame[train_start:test_end].index, num_covariates)

In [6]:
train_data = data_frame[train_start:train_end].values
test_data = data_frame[test_start:test_end].values

In [7]:
data_start = (train_data!=0).argmax(axis=0)  #find first nonzero value in each time series

In [None]:
total_time = data_frame.shape[0] #32304
num_series = data_frame.shape[1] #370
prep_data(train_data, covariates, data_start)
prep_data(test_data, covariates, data_start, train=False)

In [None]:
# total_time = data_frame.shape[0] #32304
# num_series = data_frame.shape[1] #370

# data = test_data
# train= False
# time_len = data.shape[0]  # 32136

# input_size = window_size-stride_size  # (192 - 24)

# windows_per_series = np.full((num_series), (time_len-input_size) // stride_size)
# if train: windows_per_series -= (data_start+stride_size-1) // stride_size

# total_windows = np.sum(windows_per_series)
# x_input = np.zeros((total_windows, window_size, 1 + num_covariates + 1), dtype='float32')
# label = np.zeros((total_windows, window_size), dtype='float32')
# v_input = np.zeros((total_windows, 2), dtype='float32')
# #cov = 3: ground truth + age + day_of_week + hour_of_day + num_series
# #cov = 4: ground truth + age + day_of_week + hour_of_day + month_of_year + num_series
# count = 0
# if not train:  # for test
#     covariates = covariates[-time_len:]
# for series in trange(num_series):  #穷举series

#     cov_age = stats.zscore(np.arange(total_time-data_start[series]))  # 序列中的第几个，位置变量
#     if train:
#         covariates[data_start[series]:time_len, 0] = cov_age[:time_len-data_start[series]]  # 序列中的第几个，位置变量（矫正当前序列）
#     else:
#         covariates[:, 0] = cov_age[-time_len:]

#     for i in range(windows_per_series[series]):  # 穷举window
#         if train:
#             window_start = stride_size*i+data_start[series]
#         else:
#             window_start = stride_size*i
#         window_end = window_start + window_size

#         x_input[count, 1:, 0] = data[window_start:window_end-1, series]
#         x_input[count, :, 1:1+num_covariates] = covariates[window_start:window_end, :]
#         x_input[count, :, -1] = series  # 序列标签
#         label[count, :] = data[window_start:window_end, series]
#         nonzero_sum = (x_input[count, 1:input_size, 0]!=0).sum()  # 个数
#         if nonzero_sum == 0:
#             v_input[count, 0] = 0
#         else:
#             v_input[count, 0] = np.true_divide(x_input[count, 1:input_size, 0].sum(),nonzero_sum)+1
#             x_input[count, :, 0] = x_input[count, :, 0]/v_input[count, 0]
#             if train:
#                 label[count, :] = label[count, :]/v_input[count, 0]
#         count += 1
# prefix = os.path.join(save_path, 'train_' if train else 'test_')
# np.save(prefix+'data_'+save_name, x_input)
# np.save(prefix+'v_'+save_name, v_input)
# np.save(prefix+'label_'+save_name, label)

In [1]:
from __future__ import division
import numpy as np
import os
import logging
from torch.utils.data import DataLoader, Dataset, Sampler
import torch

logger = logging.getLogger('DeepAR.Data')

class TrainDataset(Dataset):
    def __init__(self, data_path, data_name, num_class):
        self.data = np.load(os.path.join(data_path, f'train_data_{data_name}.npy'))  # f-string字符串，可直接代入数值
        self.label = np.load(os.path.join(data_path, f'train_label_{data_name}.npy'))
        self.train_len = self.data.shape[0]
        logger.info(f'train_len: {self.train_len}')
        logger.info(f'building datasets from {data_path}...')

    def __len__(self):
        return self.train_len

    def __getitem__(self, index):
        return (self.data[index,:,:-1],int(self.data[index,0,-1]), self.label[index])

class TestDataset(Dataset):
    def __init__(self, data_path, data_name, num_class):
        self.data = np.load(os.path.join(data_path, f'test_data_{data_name}.npy'))
        self.v = np.load(os.path.join(data_path, f'test_v_{data_name}.npy'))
        self.label = np.load(os.path.join(data_path, f'test_label_{data_name}.npy'))
        self.test_len = self.data.shape[0]
        logger.info(f'test_len: {self.test_len}')
        logger.info(f'building datasets from {data_path}...')

    def __len__(self):
        return self.test_len

    def __getitem__(self, index):
        return (self.data[index,:,:-1],int(self.data[index,0,-1]),self.v[index],self.label[index])

class WeightedSampler(Sampler):
    def __init__(self, data_path, data_name, replacement=True):
        v = np.load(os.path.join(data_path, f'train_v_{data_name}.npy'))
        self.weights = torch.as_tensor(np.abs(v[:,0]) / np.sum(np.abs(v[:,0])), dtype=torch.double)
        logger.info(f'weights: {self.weights}')
        self.num_samples = self.weights.shape[0]
        logger.info(f'num samples: {self.num_samples}')
        self.replacement = replacement

    def __iter__(self):
        return iter(torch.multinomial(self.weights, self.num_samples, self.replacement).tolist())

    def __len__(self):
        return self.num_samples

In [2]:
from torch.utils.data.sampler import RandomSampler
sampler = WeightedSampler('/Users/kaishuai/Desktop/ProbabilisticLoadForecasting/forecasting/probabilistic/data/elect', 'elect')
test_set = TestDataset('/Users/kaishuai/Desktop/ProbabilisticLoadForecasting/forecasting/probabilistic/data/elect', 'elect', 370)
test_loader = DataLoader(test_set, batch_size=32, sampler=RandomSampler(test_set), num_workers=4)

In [3]:
sampler = WeightedSampler('/Users/kaishuai/Desktop/ProbabilisticLoadForecasting/forecasting/probabilistic/data/elect', 'elect')
train_set = TrainDataset('/Users/kaishuai/Desktop/ProbabilisticLoadForecasting/forecasting/probabilistic/data/elect', 'elect', 370)
train_loader = DataLoader(train_set, batch_size=32, sampler=sampler, num_workers=4)

In [4]:
for i, (test_batch, id_batch, v, labels) in enumerate(test_loader):
    break

In [20]:
test_batch.shape

torch.Size([192, 32, 5])

In [None]:
for t in range(params.test_predict_start):  # encoder
    # if z_t is missing, replace it by output mu from the last time step
    zero_index = (test_batch[t,:,0] == 0)
    if t > 0 and torch.sum(zero_index) > 0:
        test_batch[t,zero_index,0] = mu[zero_index]

    mu, sigma, hidden, cell = model(test_batch[t].unsqueeze(0), id_batch, hidden, cell)
    input_mu[:,t] = v_batch[:, 0] * mu + v_batch[:, 1]  # v_batch[:, 1] == 0, useless
    input_sigma[:,t] = v_batch[:, 0] * sigma

test_batch[params.test_predict_start, :, 0] = input_mu[:, params.test_predict_start-1]

# decoder
if sample:
    samples, sample_mu, sample_sigma = model.test(test_batch, v_batch, id_batch, hidden, cell, sampling=True)
    raw_metrics = utils.update_metrics(raw_metrics, input_mu, input_sigma, sample_mu, labels, params.test_predict_start, samples, relative = params.relative_metrics)
else:
    sample_mu, sample_sigma = model.test(test_batch, v_batch, id_batch, hidden, cell)
    raw_metrics = utils.update_metrics(raw_metrics, input_mu, input_sigma, sample_mu, labels, params.test_predict_start, relative = params.relative_metrics)


In [1]:
import torch
from torch import nn

In [15]:
embeds = nn.Embedding(2, 5)

In [6]:
torch.manual_seed(1)
embeds.weight

Parameter containing:
tensor([[-0.1172,  1.0811, -0.5033, -1.0248,  0.4967],
        [-0.2623,  0.0701,  0.8654, -0.7001,  1.1395]], requires_grad=True)

In [16]:
s = torch.Tensor([[1.0,2,3,4]])

In [17]:
embeds(s)

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)