In [59]:
import numpy as np
import pandas as pd

# Load Dataset

In [60]:
train_dataset = np.load('./data/coord_train.npy')
# The test split must come from its own file: the original cell loaded
# coord_train.npy twice, so every "test" array was a copy of the training data.
# NOTE(review): './data/coord_test.npy' is the presumed name of the test file - confirm it exists.
test_dataset = np.load('./data/coord_test.npy')

In [61]:
# Show the shape of each split, one per line.
print(train_dataset.shape, test_dataset.shape, sep="\n")

(8760, 10, 20)
(8760, 10, 20)


# Sequence 데이터로 만들기

In [62]:
def sampler(data, lag=8, bias=0, step=1, temp=False):
    """Slice a time series into sliding-window samples and forecast labels.

    Sample ``i`` starts at row ``start = i + bias``. Its input window is
    ``data[start:start + lag]`` and its label is the single row
    ``data[start + lag + step - 1]``. Samples are produced for every start
    whose label row still exists in ``data``.

    args:
    - data : array-like with the time axis first; must be
             (# of data, height, width) when ``temp`` is False
    - lag : the length of sampling (rows per input window)
    - bias : row index at which the first window starts
    - step : (step)-ahead forecasting label
    - temp : when True, only the labels are built and returned
    return:
    - temp=False : tuple of
        data_x (# of sample, height, width, lag)
        data_y (# of sample, height, width, 1)
    - temp=True : data_y (# of sample, ...) with the row shape of ``data``
    raises:
    - ValueError : negative ``lag``/``bias``, a label that would lie before
                   the window start, or non-3-D ``data`` with ``temp=False``
    """
    data = np.asarray(data)

    # Distance from the first row of a window to its label row.
    label_offset = lag + (step - 1)
    if lag < 0 or bias < 0 or label_offset < 0:
        # Negative offsets would silently wrap around to the end of the series.
        raise ValueError(
            "lag and bias must be >= 0 and lag + step - 1 must be >= 0, "
            "got lag=%r, bias=%r, step=%r" % (lag, bias, step)
        )
    if not temp and data.ndim != 3:
        raise ValueError(
            "data must have shape (# of data, height, width) when temp=False, "
            "got shape %r" % (data.shape,)
        )

    # The last usable start is the one whose label is the final row of data.
    num_sample = max(len(data) - bias - label_offset, 0)
    starts = np.arange(bias, bias + num_sample)

    # Kept as an empty list in label-only mode so the summary prints (0,).
    data_x = []
    data_y = data[starts + label_offset]
    if not temp:
        # (sample, lag) index grid -> (sample, lag, height, width) windows.
        window_idx = starts[:, np.newaxis] + np.arange(lag)
        # Move the time axis last: (sample, height, width, lag).
        data_x = np.transpose(data[window_idx], [0, 2, 3, 1])
        # Labels get a trailing singleton axis to match the window layout.
        data_y = np.expand_dims(data_y, axis=-1)

    print("Sampler Return", np.shape(data_x), np.shape(data_y))

    if not temp:
        return data_x, data_y
    return data_y

## Hourly Data
- 24시간 후를 예측하는 hourly data

In [63]:
# Window settings for the samplers.
LAG, END_LAG = 8, 16
STEP = 1  # 6 (alternative value left by the author - TODO confirm)
BIAS = END_LAG - LAG  # 16 - 8 = 8

In [64]:
# 1-, 2- and 3-step-ahead training samples, all with a LAG-row window.
# The generator is consumed in order, so sampler runs for step 1, 2, 3 in turn.
(x_train_1, y_train_1), (x_train_2, y_train_2), (x_train_3, y_train_3) = (
    sampler(train_dataset, lag=LAG, bias=0, step=horizon) for horizon in (1, 2, 3)
)

# Hourly data predicting 24 hours ahead
x_train_24, y_train_24 = sampler(train_dataset, step=24, lag=24, bias=0)

Sampler Return (8752, 10, 20, 8) (8752, 10, 20, 1)
Sampler Return (8751, 10, 20, 8) (8751, 10, 20, 1)
Sampler Return (8750, 10, 20, 8) (8750, 10, 20, 1)
Sampler Return (8713, 10, 20, 24) (8713, 10, 20, 1)


In [65]:
# 1-, 2- and 3-step-ahead test samples, all with a LAG-row window.
# The generator is consumed in order, so sampler runs for step 1, 2, 3 in turn.
(x_test_1, y_test_1), (x_test_2, y_test_2), (x_test_3, y_test_3) = (
    sampler(test_dataset, lag=LAG, bias=0, step=horizon) for horizon in (1, 2, 3)
)

# 24-row window with a 24-step-ahead label.
x_test_24, y_test_24 = sampler(test_dataset, step=24, lag=24, bias=0)

Sampler Return (8752, 10, 20, 8) (8752, 10, 20, 1)
Sampler Return (8751, 10, 20, 8) (8751, 10, 20, 1)
Sampler Return (8750, 10, 20, 8) (8750, 10, 20, 1)
Sampler Return (8713, 10, 20, 24) (8713, 10, 20, 1)


## Daily Data
- 1일 후를 예측하는 daily data

In [78]:
# Display the raw training array; NumPy elides most of it in the repr.
# NOTE(review): -100 looks like a fill value for grid cells without data - confirm.
train_dataset

array([[[-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        [-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        [-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        ...,
        [-100. , -100. ,    1.5, ..., -100. , -100. , -100. ],
        [-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        [-100. , -100. , -100. , ..., -100. , -100. , -100. ]],

       [[-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        [-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        [-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        ...,
        [-100. , -100. ,    1.8, ..., -100. , -100. , -100. ],
        [-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        [-100. , -100. , -100. , ..., -100. , -100. , -100. ]],

       [[-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        [-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        [-100. , -100. , -100. , ..., -100. , -100. , -100. ],
        ...,
        [-10

In [None]:
# NOTE(review): this repeats the x_train_24 / y_train_24 call from the hourly-data
# section with the same arguments, although the cell sits under "Daily Data" -
# confirm whether a different daily sampling was intended.
x_train_24, y_train_24 = sampler(train_dataset, lag=24, bias=0, step=24)

In [66]:
# Write every training input/label array to its own .npz archive
# (positional np.savez, so each archive holds a single array).
train_archives = {
    './TGNet/data/x_train_1.npz': x_train_1,
    './TGNet/data/y_train_1.npz': y_train_1,
    './TGNet/data/x_train_2.npz': x_train_2,
    './TGNet/data/y_train_2.npz': y_train_2,
    './TGNet/data/x_train_3.npz': x_train_3,
    './TGNet/data/y_train_3.npz': y_train_3,
    './TGNet/data/x_train_24.npz': x_train_24,
    './TGNet/data/y_train_24.npz': y_train_24,
}
for npz_path, array in train_archives.items():
    np.savez(npz_path, array)

In [67]:
# Write every test input/label array to its own .npz archive
# (positional np.savez, so each archive holds a single array).
test_archives = {
    './TGNet/data/x_test_1.npz': x_test_1,
    './TGNet/data/y_test_1.npz': y_test_1,
    './TGNet/data/x_test_2.npz': x_test_2,
    './TGNet/data/y_test_2.npz': y_test_2,
    './TGNet/data/x_test_3.npz': x_test_3,
    './TGNet/data/y_test_3.npz': y_test_3,
    './TGNet/data/x_test_24.npz': x_test_24,
    './TGNet/data/y_test_24.npz': y_test_24,
}
for npz_path, array in test_archives.items():
    np.savez(npz_path, array)

In [16]:
# Check: reload one saved archive and look at the shape of its array
# x_train = np.load('./TGNet/data/x_train_1.npz')
# x_train['arr_0'].shape

(8752, 10, 20, 8)

## Make Temporal Information

In [68]:
print("Train shape: ", np.shape(train_dataset), ", Test shape: ", np.shape(test_dataset))

# Setting Some Parameters
# Row counts of each split. The test count is read from the test array itself
# (the original read train_dataset twice, which is only right when both
# splits happen to have the same length).
num_train, num_test = np.shape(train_dataset)[0], np.shape(test_dataset)[0]
# Total number of time steps across both splits.
num_row = num_train + num_test

Train shape:  (8760, 10, 20) , Test shape:  (8760, 10, 20)


In [69]:
### Initialize numpy array of temporal information (one-hot encoding)
# datasets_min_30 = np.zeros([num_row, 48]) # 30분 단위 시간 => 1시간 단위로 바꾸기

datasets_hour = np.zeros([num_row, 24]) # 1시간 단위의 시간 (0시~23시)
print(datasets_hour.shape)

# datasets_dow = np.zeros([num_row, 7]) # dayofweek: 불필요
# datasets_holiday = np.zeros([num_row, 1]) # 공휴일 여부: 불필요
# datasets_prev_holiday = np.zeros([num_row, 1]) # 공휴일 전날 여부: 불필요

(17520, 24)


In [70]:
# Set the hour-of-day column of every row to 1 in one vectorised assignment
# instead of a Python loop over all rows.
# Row i is treated as hour i % 24 - assumes row 0 is hour 0 and rows are
# consecutive hours; TODO confirm against the data source.
row_index = np.arange(num_row)
datasets_hour[row_index, row_index % 24] = 1
print(datasets_hour.shape)

(17520, 24)


In [71]:
def train_test_split(data, idx):
    """Cut ``data`` at position ``idx``.

    Returns a pair: everything before ``idx`` and everything from ``idx`` on.
    """
    head = data[:idx]
    tail = data[idx:]
    return head, tail

## Split Train&Test Period

In [72]:
# The first num_train rows of the hour table belong to the training period.
train_index = num_train
# The one-hot table is named `datasets_hour`; the original call passed the
# undefined name `dataset_hour`, which raises NameError on a fresh kernel.
hour_train, hour_test = train_test_split(datasets_hour, train_index)

In [73]:
# Shape of the training part of the hour one-hot table.
hour_train.shape

(8760, 24)

## Sampler

In [74]:
# Window settings, restated with the same values as in the hourly-data section.
END_LAG = 16
LAG = 8
BIAS = END_LAG - LAG
STEP = 1  # 6 (alternative value left by the author - TODO confirm)

In [75]:
# Label-only sampling (temp=True) of the hour one-hot rows.
# Each statement handles one lag/step setting and runs sampler on the
# training part first, then on the test part.
hour_train_y_1, hour_test_y_1 = (
    sampler(part, lag=8, step=1, temp=True) for part in (hour_train, hour_test)
)

hour_train_y_2, hour_test_y_2 = (
    sampler(part, lag=8, step=2, temp=True) for part in (hour_train, hour_test)
)

hour_train_y_3, hour_test_y_3 = (
    sampler(part, lag=8, step=3, temp=True) for part in (hour_train, hour_test)
)

hour_train_y_24, hour_test_y_24 = (
    sampler(part, lag=24, step=24, temp=True) for part in (hour_train, hour_test)
)

Sampler Return (0,) (8752, 24)
Sampler Return (0,) (8752, 24)
Sampler Return (0,) (8751, 24)
Sampler Return (0,) (8751, 24)
Sampler Return (0,) (8750, 24)
Sampler Return (0,) (8750, 24)
Sampler Return (0,) (8713, 24)
Sampler Return (0,) (8713, 24)


In [76]:
# Expose the hour labels under the temporal_* names used when saving.
temporal_train_1, temporal_test_1 = hour_train_y_1, hour_test_y_1
temporal_train_2, temporal_test_2 = hour_train_y_2, hour_test_y_2
temporal_train_3, temporal_test_3 = hour_train_y_3, hour_test_y_3
temporal_train_24, temporal_test_24 = hour_train_y_24, hour_test_y_24

# One line per setting: train shape followed by test shape.
print(temporal_train_1.shape, temporal_test_1.shape)
print(temporal_train_2.shape, temporal_test_2.shape)
print(temporal_train_3.shape, temporal_test_3.shape)
print(temporal_train_24.shape, temporal_test_24.shape)

(8752, 24) (8752, 24)
(8751, 24) (8751, 24)
(8750, 24) (8750, 24)
(8713, 24) (8713, 24)


In [77]:
# Write every temporal label array to its own .npz archive
# (positional np.savez, so each archive holds a single array).
temporal_archives = {
    './data/temporal_train_1.npz': temporal_train_1,
    './data/temporal_test_1.npz': temporal_test_1,
    './data/temporal_train_2.npz': temporal_train_2,
    './data/temporal_test_2.npz': temporal_test_2,
    './data/temporal_train_3.npz': temporal_train_3,
    './data/temporal_test_3.npz': temporal_test_3,
    './data/temporal_train_24.npz': temporal_train_24,
    './data/temporal_test_24.npz': temporal_test_24,
}
for npz_path, array in temporal_archives.items():
    np.savez(npz_path, array)

# Important Remark
For a fair comparison with the STDN setting, we use only 477 test samples.

In [None]:
# %mkdir NYC_taxi_dataset

In [None]:
# np.savez('./NYC_taxi_dataset/x_st_train.npz', start_train_x)
# np.savez('./NYC_taxi_dataset/x_st_test.npz', start_test_x)
# np.savez('./NYC_taxi_dataset/x_end_train.npz', end_train_x)
# np.savez('./NYC_taxi_dataset/x_end_test.npz', end_test_x)
# np.savez('./NYC_taxi_dataset/y_st_train.npz', start_train_y)
# np.savez('./NYC_taxi_dataset/y_st_test.npz', start_test_y)
# np.savez('./NYC_taxi_dataset/temporal_train.npz', temporal_train)
# np.savez('./NYC_taxi_dataset/temporal_test.npz', temporal_test)

## Coordinate Information

In [None]:
# NOTE(review): start_train_y / start_test_y are not defined anywhere in the
# visible notebook (presumably left over from the NYC taxi version) - confirm
# which arrays should be used here before running this cell.
# All axes of the label array except the last: sample count, grid height, grid width.
train_num, h, w = np.shape(start_train_y)[:-1]
# Number of test samples (length of the first axis).
test_num = np.shape(start_test_y)[0]

In [None]:
# Row index and column index of every cell of the (h, w) grid.
row_idx, col_idx = np.indices((h, w))
# Trailing singleton axis so the two index maps can be joined channel-wise.
coord_y = row_idx[..., np.newaxis]
coord_x = col_idx[..., np.newaxis]
# (h, w, 2): channel 0 is the row index, channel 1 the column index.
coord_xy = np.concatenate([coord_y, coord_x], axis=-1)
# Copy the same grid once per row counted in num_row: (num_row, h, w, 2).
coord_xy = np.repeat(coord_xy[np.newaxis], repeats=num_row, axis=0)
print(coord_xy.shape)

(2880, 10, 20, 2)


In [None]:
# Number of training label samples.
# NOTE(review): start_train_y is not defined in the visible notebook - confirm its source.
len(start_train_y)

1904

In [None]:
# Cut the repeated coordinate grid at the number of training labels,
# report both shapes and save each part to its own archive.
coord_train, coord_test = np.split(coord_xy, [len(start_train_y)])
print(coord_train.shape, coord_test.shape)
np.savez('./NYC_taxi_dataset/coord_train.npz', coord_train)
np.savez('./NYC_taxi_dataset/coord_test.npz', coord_test)

(1904, 10, 20, 2) (976, 10, 20, 2)


In [None]:
# NOTE(review): scratch arithmetic; presumably 48 half-hour slots x 20 days - confirm or remove.
48*20

960

In [None]:
# NOTE(review): scratch arithmetic; the 16 presumably corresponds to END_LAG - confirm or remove.
960-16

944