# train, valid, test loader

In [1]:
# import external libraries
import os
import sys
import random
import numpy as np
import pandas as pd
import torch
import math
import time

# import internal modules
sys.path.insert(1, '../src/')
from models.nn import MLP, TimeSeriesDataset
from utils.data_editor import lag, train_test_split

In [2]:
# read processed data
df = pd.read_csv("../data/processed/tidy_df.csv", index_col=[0, 1, 2])

# empty list for dataframes
y_test_list = []
y_hat_umlp = []

i = df.index.get_level_values(0).unique()[0]
print(i)

# y : "EPS"
y = df.loc[pd.IndexSlice[i, :, :], "EPS"]

# x, exogenous regressors : 'INV', 'AR', 'CAPX', 'GM', 'SA', 'ETR', 'LF'
#     x = df.loc[pd.IndexSlice[i, :, :], ['INV', 'AR', 'CAPX', 'GM', 'SA', 'ETR', 'LF']]

# Unlike statsmodel SARIMA package, NN needs to prepare lagged inputs manually if needed.
# y_lag and x_lag (lag 4 for now)
num_lag = 4
y_lag = lag(y, num_lag, drop_nan=False, reset_index=False)
#     x_lag = lag(x, num_lag, drop_nan=False, reset_index=False)

# Redefine data name as target (y) and feature (y_lag) (explanatory variable, predictor)
target = y
feature = y_lag

# save simple test data series
_, target_test = train_test_split(target, test_size=12)
_, feature_test = train_test_split(feature, test_size=12)

# drop nan caused by lag()
feature = feature.dropna(axis=0)
target = target[feature.index]

# setting torch
dtype = torch.float # double float problem in layer 
device = torch.device("cpu")

# Make data to torch.tensor
target_all_window = torch.tensor(target.values, dtype=dtype)
feature_all_window = torch.tensor(feature.values, dtype=dtype)
target_test_all_window = torch.tensor(target_test.values, dtype=dtype)
feature_test_all_window = torch.tensor(feature_test.values, dtype=dtype)

あらた


In [3]:
print("  target total all window size: ", target_all_window.size())
print(" feature total all window size: ", feature_all_window.size())
print("   target test all window size: ", target_test_all_window.size())
print("  feature test all window size: ", feature_test_all_window.size())

  target total all window size:  torch.Size([48])
 feature total all window size:  torch.Size([48, 4])
   target test all window size:  torch.Size([12])
  feature test all window size:  torch.Size([12, 4])


In [4]:
test_window_size = len(target_test_all_window)
print("test window size: ", test_window_size)
train_valid_window_size = len(target_all_window) - test_window_size
print("train-valid window size: ", train_valid_window_size)
valid_window_size = test_window_size
print("valid window size: ", valid_window_size)
train_window_size = train_valid_window_size - valid_window_size
print("train window size: ", train_window_size)

test window size:  12
train-valid window size:  36
valid window size:  12
train window size:  24


# Test, Train-Valid split before rolling sample

In [5]:
train_valid_all_window_dataset = TimeSeriesDataset(feature_all_window, target_all_window, train_valid_window_size)
print("length of train-valid all window dataset: ", len(train_valid_all_window_dataset))

train_valid_window_loader = torch.utils.data.DataLoader(train_valid_all_window_dataset, batch_size=1, shuffle=False)
for window, (feature_train_valid, target_train_valid) in enumerate(train_valid_window_loader):
    feature_train_valid = feature_train_valid[0] # extract single batch
    target_train_valid = target_train_valid[0] # extract single batch
        
#     if (window == 0) | (window == 1):
    print("")
    print("======================================")
    print("WINDOW: ", window)
    print("TRAIN-VALID FEATURE: ")
    print(feature_train_valid.round())
    print(feature_train_valid.size())
    print("TRAIN-VALID TARGET: ")
    print(target_train_valid.round())
    print(target_train_valid.size())
    
    # all "batch" dataset
    train_valid_dataset = TimeSeriesDataset(feature_train_valid, target_train_valid, train_window=None)
    print("    length of train_valid_dataset: ", len(train_valid_dataset))
    
    # 分け方は一応 full-psuedo で。(valid に rolling windowはしない)
    train_dataset = torch.utils.data.dataset.Subset(train_valid_dataset, list(range(0, train_window_size)))
    print("    length of train_dataset: ", len(train_dataset))
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_window_size, shuffle=False)
    for mini_batch, (feature_train, target_train) in enumerate(train_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    TRAIN FEATURE: ")
        print(feature_train.round())
        print(feature_train.size())
        print("    TRAIN TARGET: ")
        print(target_train.round())
        print(target_train.size())
        
    valid_dataset = torch.utils.data.dataset.Subset(train_valid_dataset, list(range(train_window_size, len(train_valid_dataset))))
    print("    length of valid_dataset: ", len(valid_dataset))
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=valid_window_size, shuffle=False)
    for mini_batch, (feature_valid, target_valid) in enumerate(valid_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    VALID FEATURE: ")
        print(feature_valid.round())
        print(feature_valid.size())
        print("    VALID TARGET: ")
        print(target_valid.round())
        print(target_valid.size())

test_all_window_dataset = TimeSeriesDataset(feature_test_all_window, target_test_all_window, None)
print("length of test all window dataset: ", len(test_all_window_dataset))
test_window_loader = torch.utils.data.DataLoader(test_all_window_dataset, batch_size=1, shuffle=False)
for window, (feature_test, target_test) in enumerate(test_window_loader):
#     feature_test = feature_test[0] # extract single batch
#     target_test = target_test[0] # extract single batch
        
#     if (window == 0) | (window == 1):
    print("")
    print("======================================")
    print("WINDOW: ", window)
    print("TEST FEATURE: ")
    print(feature_test.round())
    print(feature_test.size())
    print("TEST TARGET: ")
    print(target_test.round())
    print(target_test.size())

length of train-valid all window dataset:  12

WINDOW:  0
TRAIN-VALID FEATURE: 
tensor([[ -5.,   3.,   2.,   3.],
        [  7.,  -5.,   3.,   2.],
        [  7.,   7.,  -5.,   3.],
        [  5.,   7.,   7.,  -5.],
        [ -1.,   5.,   7.,   7.],
        [ -3.,  -1.,   5.,   7.],
        [  8.,  -3.,  -1.,   5.],
        [  9.,   8.,  -3.,  -1.],
        [ -1.,   9.,   8.,  -3.],
        [  4.,  -1.,   9.,   8.],
        [  8.,   4.,  -1.,   9.],
        [  6.,   8.,   4.,  -1.],
        [  4.,   6.,   8.,   4.],
        [  5.,   4.,   6.,   8.],
        [  7.,   5.,   4.,   6.],
        [  4.,   7.,   5.,   4.],
        [  7.,   4.,   7.,   5.],
        [  6.,   7.,   4.,   7.],
        [  5.,   6.,   7.,   4.],
        [  6.,   5.,   6.,   7.],
        [ 15.,   6.,   5.,   6.],
        [ -0.,  15.,   6.,   5.],
        [  3.,  -0.,  15.,   6.],
        [  9.,   3.,  -0.,  15.],
        [  3.,   9.,   3.,  -0.],
        [ 12.,   3.,   9.,   3.],
        [ 42.,  12.,   3.,   9.],
  

# single test data within each rolling window sample 

In [6]:
feature_test_all_window

tensor([[ 69.3400, 119.8800,  90.9200, 121.1700],
        [105.4200,  69.3400, 119.8800,  90.9200],
        [ 90.4400, 105.4200,  69.3400, 119.8800],
        [106.2400,  90.4400, 105.4200,  69.3400],
        [ 99.2200, 106.2400,  90.4400, 105.4200],
        [116.8500,  99.2200, 106.2400,  90.4400],
        [101.7600, 116.8500,  99.2200, 106.2400],
        [113.9300, 101.7600, 116.8500,  99.2200],
        [ 79.9000, 113.9300, 101.7600, 116.8500],
        [145.2300,  79.9000, 113.9300, 101.7600],
        [120.6800, 145.2300,  79.9000, 113.9300],
        [153.3500, 120.6800, 145.2300,  79.9000]])

In [7]:
target_test_all_window

tensor([105.4200,  90.4400, 106.2400,  99.2200, 116.8500, 101.7600, 113.9300,
         79.9000, 145.2300, 120.6800, 153.3500,  61.4034])

In [8]:
train_valid_all_window_dataset = TimeSeriesDataset(feature_all_window, target_all_window, train_valid_window_size)
print("length of train-valid all window dataset: ", len(train_valid_all_window_dataset))

train_valid_window_loader = torch.utils.data.DataLoader(train_valid_all_window_dataset, batch_size=1, shuffle=False)
for window, (feature_train_valid, target_train_valid) in enumerate(train_valid_window_loader):
    feature_train_valid = feature_train_valid[0] # extract single batch
    target_train_valid = target_train_valid[0] # extract single batch
        
#     if (window == 0) | (window == 1):
    print("")
    print("======================================")
    print("WINDOW: ", window)
    print("TRAIN-VALID FEATURE: ")
    print(feature_train_valid.round())
    print(feature_train_valid.size())
    print("TRAIN-VALID TARGET: ")
    print(target_train_valid.round())
    print(target_train_valid.size())
    
    # all "batch" dataset
    train_valid_dataset = TimeSeriesDataset(feature_train_valid, target_train_valid, train_window=None)
    print("    length of train_valid_dataset: ", len(train_valid_dataset))
    
    # 分け方は一応 full-psuedo で。(valid に rolling windowはしない)
    train_dataset = torch.utils.data.dataset.Subset(train_valid_dataset, list(range(0, train_window_size)))
    print("///// TRAIN /////")
    print("    length of train_dataset: ", len(train_dataset))
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_window_size, shuffle=False)
    for mini_batch, (feature_train, target_train) in enumerate(train_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    TRAIN FEATURE: ")
        print(feature_train.round())
        print(feature_train.size())
        print("    TRAIN TARGET: ")
        print(target_train.round())
        print(target_train.size())
        
    valid_dataset = torch.utils.data.dataset.Subset(train_valid_dataset, list(range(train_window_size, len(train_valid_dataset))))
    print("///// VALID /////")
    print("    length of valid_dataset: ", len(valid_dataset))
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=valid_window_size, shuffle=False)
    for mini_batch, (feature_valid, target_valid) in enumerate(valid_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    VALID FEATURE: ")
        print(feature_valid.round())
        print(feature_valid.size())
        print("    VALID TARGET: ")
        print(target_valid.round())
        print(target_valid.size())
        
    test_dataset = TimeSeriesDataset(feature_test_all_window[0].reshape(1, -1), target_test_all_window[0].reshape(-1), train_window=None)
    print("///// TEST /////")
    print("    length of test_dataset: ", len(test_dataset))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)
    for mini_batch, (feature_test, target_test) in enumerate(test_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    TEST FEATURE: ")
        print(feature_test.round())
        print(feature_test.size())
        print("    TEST TARGET: ")
        print(target_test.round())
        print(target_test.size())

length of train-valid all window dataset:  12

WINDOW:  0
TRAIN-VALID FEATURE: 
tensor([[ -5.,   3.,   2.,   3.],
        [  7.,  -5.,   3.,   2.],
        [  7.,   7.,  -5.,   3.],
        [  5.,   7.,   7.,  -5.],
        [ -1.,   5.,   7.,   7.],
        [ -3.,  -1.,   5.,   7.],
        [  8.,  -3.,  -1.,   5.],
        [  9.,   8.,  -3.,  -1.],
        [ -1.,   9.,   8.,  -3.],
        [  4.,  -1.,   9.,   8.],
        [  8.,   4.,  -1.,   9.],
        [  6.,   8.,   4.,  -1.],
        [  4.,   6.,   8.,   4.],
        [  5.,   4.,   6.,   8.],
        [  7.,   5.,   4.,   6.],
        [  4.,   7.,   5.,   4.],
        [  7.,   4.,   7.,   5.],
        [  6.,   7.,   4.,   7.],
        [  5.,   6.,   7.,   4.],
        [  6.,   5.,   6.,   7.],
        [ 15.,   6.,   5.,   6.],
        [ -0.,  15.,   6.,   5.],
        [  3.,  -0.,  15.,   6.],
        [  9.,   3.,  -0.,  15.],
        [  3.,   9.,   3.,  -0.],
        [ 12.,   3.,   9.,   3.],
        [ 42.,  12.,   3.,   9.],
  

In [37]:
# Scaling
from sklearn.preprocessing import StandardScaler

In [62]:
feature_train_valid.round()

tensor([[  6.,   8.,   4.,  -1.],
        [  4.,   6.,   8.,   4.],
        [  5.,   4.,   6.,   8.],
        [  7.,   5.,   4.,   6.],
        [  4.,   7.,   5.,   4.],
        [  7.,   4.,   7.,   5.],
        [  6.,   7.,   4.,   7.],
        [  5.,   6.,   7.,   4.],
        [  6.,   5.,   6.,   7.],
        [ 15.,   6.,   5.,   6.],
        [ -0.,  15.,   6.,   5.],
        [  3.,  -0.,  15.,   6.],
        [  9.,   3.,  -0.,  15.],
        [  3.,   9.,   3.,  -0.],
        [ 12.,   3.,   9.,   3.],
        [ 42.,  12.,   3.,   9.],
        [ 81.,  42.,  12.,   3.],
        [ 28.,  81.,  42.,  12.],
        [ 87.,  28.,  81.,  42.],
        [ 77.,  87.,  28.,  81.],
        [100.,  77.,  87.,  28.],
        [ 68., 100.,  77.,  87.],
        [121.,  68., 100.,  77.],
        [ 91., 121.,  68., 100.],
        [120.,  91., 121.,  68.],
        [ 69., 120.,  91., 121.],
        [105.,  69., 120.,  91.],
        [ 90., 105.,  69., 120.],
        [106.,  90., 105.,  69.],
        [ 99.,

In [63]:
TRAIN_WINDOW_SIZE = 30
feature_train_valid[:TRAIN_WINDOW_SIZE].round()

tensor([[  6.,   8.,   4.,  -1.],
        [  4.,   6.,   8.,   4.],
        [  5.,   4.,   6.,   8.],
        [  7.,   5.,   4.,   6.],
        [  4.,   7.,   5.,   4.],
        [  7.,   4.,   7.,   5.],
        [  6.,   7.,   4.,   7.],
        [  5.,   6.,   7.,   4.],
        [  6.,   5.,   6.,   7.],
        [ 15.,   6.,   5.,   6.],
        [ -0.,  15.,   6.,   5.],
        [  3.,  -0.,  15.,   6.],
        [  9.,   3.,  -0.,  15.],
        [  3.,   9.,   3.,  -0.],
        [ 12.,   3.,   9.,   3.],
        [ 42.,  12.,   3.,   9.],
        [ 81.,  42.,  12.,   3.],
        [ 28.,  81.,  42.,  12.],
        [ 87.,  28.,  81.,  42.],
        [ 77.,  87.,  28.,  81.],
        [100.,  77.,  87.,  28.],
        [ 68., 100.,  77.,  87.],
        [121.,  68., 100.,  77.],
        [ 91., 121.,  68., 100.],
        [120.,  91., 121.,  68.],
        [ 69., 120.,  91., 121.],
        [105.,  69., 120.,  91.],
        [ 90., 105.,  69., 120.],
        [106.,  90., 105.,  69.],
        [ 99.,

In [67]:
feature_train_valid[TRAIN_WINDOW_SIZE:].round()

tensor([[117.,  99., 106.,  90.],
        [102., 117.,  99., 106.],
        [114., 102., 117.,  99.],
        [ 80., 114., 102., 117.],
        [145.,  80., 114., 102.],
        [121., 145.,  80., 114.]])

In [64]:
feature_train_scaler = StandardScaler().fit(feature_train_valid[:TRAIN_WINDOW_SIZE])

In [68]:
# torch.tensor(feature_train_scaler.transform(feature_train).reshape(feature_train.shape), dtype=dtype)

In [80]:
# scaled train
torch.tensor(feature_train_scaler.transform(feature_train_valid[:TRAIN_WINDOW_SIZE]), dtype=dtype)

tensor([[-0.9061, -0.8164, -0.8418, -0.9051],
        [-0.9662, -0.8487, -0.7571, -0.7782],
        [-0.9283, -0.9097, -0.7903, -0.6925],
        [-0.8960, -0.8712, -0.8530, -0.7261],
        [-0.9669, -0.8384, -0.8134, -0.7895],
        [-0.8777, -0.9104, -0.7798, -0.7495],
        [-0.9090, -0.8199, -0.8537, -0.7154],
        [-0.9438, -0.8517, -0.7607, -0.7902],
        [-0.9088, -0.8870, -0.7934, -0.6961],
        [-0.7088, -0.8514, -0.8296, -0.7292],
        [-1.0499, -0.6485, -0.7931, -0.7659],
        [-0.9797, -0.9946, -0.5847, -0.7289],
        [-0.8454, -0.9234, -0.9402, -0.5180],
        [-0.9859, -0.7871, -0.8671, -0.8777],
        [-0.7826, -0.9297, -0.7271, -0.8038],
        [-0.0906, -0.7234, -0.8735, -0.6621],
        [ 0.8024, -0.0210, -0.6617, -0.8103],
        [-0.4089,  0.8853,  0.0596, -0.5959],
        [ 0.9347, -0.3441,  0.9904,  0.1341],
        [ 0.7111,  1.0195, -0.2721,  1.0760],
        [ 1.2328,  0.7926,  1.1282, -0.2016],
        [ 0.4961,  1.3221,  0.8952

In [82]:
# scaled val
torch.tensor(feature_train_scaler.transform(feature_train_valid[TRAIN_WINDOW_SIZE:]), dtype=dtype)

tensor([[1.6252, 1.3110, 1.5949, 1.3064],
        [1.2799, 1.7204, 1.4275, 1.6877],
        [1.5584, 1.3699, 1.8479, 1.5183],
        [0.7798, 1.6525, 1.4880, 1.9438],
        [2.2746, 0.8623, 1.7783, 1.5796],
        [1.7128, 2.3794, 0.9668, 1.8733]])

In [85]:
# overwrite memory
feature_train_valid[:TRAIN_WINDOW_SIZE] = torch.tensor(feature_train_scaler.transform(feature_train_valid[:TRAIN_WINDOW_SIZE]), dtype=dtype)
feature_train_valid[TRAIN_WINDOW_SIZE:] = torch.tensor(feature_train_scaler.transform(feature_train_valid[TRAIN_WINDOW_SIZE:]), dtype=dtype)

In [86]:
feature_train_valid.round()

tensor([[-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-1., -1., -1., -1.],
        [-0., -1., -1., -1.],
        [ 1., -0., -1., -1.],
        [-0.,  1.,  0., -1.],
        [ 1., -0.,  1.,  0.],
        [ 1.,  1., -0.,  1.],
        [ 1.,  1.,  1., -0.],
        [ 0.,  1.,  1.,  1.],
        [ 2.,  1.,  1.,  1.],
        [ 1.,  2.,  1.,  2.],
        [ 2.,  1.,  2.,  1.],
        [ 1.,  2.,  1.,  2.],
        [ 1.,  1.,  2.,  1.],
        [ 1.,  1.,  1.,  2.],
        [ 1.,  1.,  2.,  1.],
        [ 1.,  1.,  1.,  2.],
        [ 2.,  1.,  2.,  1.],
        [ 1.,  2.,  1.,  2.],
        [ 2.,  1.,  2.,  2.],
        [ 

In [94]:
feature_test_all_window

tensor([[ 69.3400, 119.8800,  90.9200, 121.1700],
        [105.4200,  69.3400, 119.8800,  90.9200],
        [ 90.4400, 105.4200,  69.3400, 119.8800],
        [106.2400,  90.4400, 105.4200,  69.3400],
        [ 99.2200, 106.2400,  90.4400, 105.4200],
        [116.8500,  99.2200, 106.2400,  90.4400],
        [101.7600, 116.8500,  99.2200, 106.2400],
        [113.9300, 101.7600, 116.8500,  99.2200],
        [ 79.9000, 113.9300, 101.7600, 116.8500],
        [145.2300,  79.9000, 113.9300, 101.7600],
        [120.6800, 145.2300,  79.9000, 113.9300],
        [153.3500, 120.6800, 145.2300,  79.9000]])

In [95]:
feature_test = feature_test_all_window[0].reshape(1, -1)
feature_test

tensor([[ 69.3400, 119.8800,  90.9200, 121.1700]])

In [102]:
feature_test = torch.tensor(feature_train_scaler.transform(feature_test), dtype=dtype)
feature_test

tensor([[-1.0361, -0.9515, -0.9093, -0.8267]])

In [101]:
feature_test_all_window

tensor([[ 69.3400, 119.8800,  90.9200, 121.1700],
        [105.4200,  69.3400, 119.8800,  90.9200],
        [ 90.4400, 105.4200,  69.3400, 119.8800],
        [106.2400,  90.4400, 105.4200,  69.3400],
        [ 99.2200, 106.2400,  90.4400, 105.4200],
        [116.8500,  99.2200, 106.2400,  90.4400],
        [101.7600, 116.8500,  99.2200, 106.2400],
        [113.9300, 101.7600, 116.8500,  99.2200],
        [ 79.9000, 113.9300, 101.7600, 116.8500],
        [145.2300,  79.9000, 113.9300, 101.7600],
        [120.6800, 145.2300,  79.9000, 113.9300],
        [153.3500, 120.6800, 145.2300,  79.9000]])

In [112]:
# read processed data
df = pd.read_csv("../data/processed/tidy_df.csv", index_col=[0, 1, 2])

# empty list for dataframes
y_test_list = []
y_hat_umlp = []

i = df.index.get_level_values(0).unique()[0]
print(i)

# y : "EPS"
y = df.loc[pd.IndexSlice[i, :, :], "EPS"]

# x, exogenous regressors : 'INV', 'AR', 'CAPX', 'GM', 'SA', 'ETR', 'LF'
#     x = df.loc[pd.IndexSlice[i, :, :], ['INV', 'AR', 'CAPX', 'GM', 'SA', 'ETR', 'LF']]

# Unlike statsmodel SARIMA package, NN needs to prepare lagged inputs manually if needed.
# y_lag and x_lag (lag 4 for now)
num_lag = 4
y_lag = lag(y, num_lag, drop_nan=False, reset_index=False)
#     x_lag = lag(x, num_lag, drop_nan=False, reset_index=False)

# Redefine data name as target (y) and feature (y_lag) (explanatory variable, predictor)
target = y
feature = y_lag

# save simple test data series
_, target_test = train_test_split(target, test_size=12)
_, feature_test = train_test_split(feature, test_size=12)

# drop nan caused by lag()
feature = feature.dropna(axis=0)
target = target[feature.index]

# setting torch
dtype = torch.float # double float problem in layer 
device = torch.device("cpu")

# Make data to torch.tensor
target_all_window = torch.tensor(target.values, dtype=dtype)
feature_all_window = torch.tensor(feature.values, dtype=dtype)
target_test_all_window = torch.tensor(target_test.values, dtype=dtype)
feature_test_all_window = torch.tensor(feature_test.values, dtype=dtype)

#########################################################################################################################
SCALE_X = True

train_valid_all_window_dataset = TimeSeriesDataset(feature_all_window, target_all_window, train_valid_window_size)
print("length of train-valid all window dataset: ", len(train_valid_all_window_dataset))

train_valid_window_loader = torch.utils.data.DataLoader(train_valid_all_window_dataset, batch_size=1, shuffle=False)
for window, (feature_train_valid, target_train_valid) in enumerate(train_valid_window_loader):
    feature_train_valid = feature_train_valid[0] # extract single batch
    target_train_valid = target_train_valid[0] # extract single batch
    feature_test = feature_test_all_window[window].reshape(1, -1)
    target_test = target_test_all_window[window].reshape(-1)
    
    if SCALE_X:
        feature_train_scaler = StandardScaler().fit(feature_train_valid[:TRAIN_WINDOW_SIZE])
        # overwrite memory
        feature_train_valid[:TRAIN_WINDOW_SIZE] = torch.tensor(feature_train_scaler.transform(feature_train_valid[:TRAIN_WINDOW_SIZE]), dtype=dtype)
        feature_train_valid[TRAIN_WINDOW_SIZE:] = torch.tensor(feature_train_scaler.transform(feature_train_valid[TRAIN_WINDOW_SIZE:]), dtype=dtype)
        feature_test = torch.tensor(feature_train_scaler.transform(feature_test), dtype=dtype)
#     if (window == 0) | (window == 1):
    print("")
    print("======================================")
    print("WINDOW: ", window)
    print("TRAIN-VALID FEATURE: ")
    print(feature_train_valid.round())
    print(feature_train_valid.size())
    print("TRAIN-VALID TARGET: ")
    print(target_train_valid.round())
    print(target_train_valid.size())
    
    # all "batch" dataset
    train_valid_dataset = TimeSeriesDataset(feature_train_valid, target_train_valid, train_window=None)
    print("    length of train_valid_dataset: ", len(train_valid_dataset))
    
    # 分け方は一応 full-psuedo で。(valid に rolling windowはしない)
    train_dataset = torch.utils.data.dataset.Subset(train_valid_dataset, list(range(0, train_window_size)))
    print("///// TRAIN /////")
    print("    length of train_dataset: ", len(train_dataset))
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_window_size, shuffle=False)
    for mini_batch, (feature_train, target_train) in enumerate(train_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    TRAIN FEATURE: ")
        print(feature_train.round())
        print(feature_train.size())
        print("    TRAIN TARGET: ")
        print(target_train.round())
        print(target_train.size())
        
    valid_dataset = torch.utils.data.dataset.Subset(train_valid_dataset, list(range(train_window_size, len(train_valid_dataset))))
    print("///// VALID /////")
    print("    length of valid_dataset: ", len(valid_dataset))
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=valid_window_size, shuffle=False)
    for mini_batch, (feature_valid, target_valid) in enumerate(valid_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    VALID FEATURE: ")
        print(feature_valid.round())
        print(feature_valid.size())
        print("    VALID TARGET: ")
        print(target_valid.round())
        print(target_valid.size())
        
    test_dataset = TimeSeriesDataset(feature_test, target_test, train_window=None)
    print("///// TEST /////")
    print("    length of test_dataset: ", len(test_dataset))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)
    for mini_batch, (feature_test, target_test) in enumerate(test_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    TEST FEATURE: ")
        print(feature_test.round())
        print(feature_test.size())
        print("    TEST TARGET: ")
        print(target_test.round())
        print(target_test.size())

あらた
length of train-valid all window dataset:  12

WINDOW:  0
TRAIN-VALID FEATURE: 
tensor([[-1., -0., -0., -0.],
        [-0., -1., -0., -0.],
        [-0., -0., -1., -0.],
        [-0., -0., -0., -1.],
        [-1., -0., -0.,  0.],
        [-1., -1., -0.,  0.],
        [-0., -1., -1., -0.],
        [-0., -0., -1., -1.],
        [-1., -0., -0., -1.],
        [-0., -1.,  0.,  0.],
        [-0., -0., -1.,  0.],
        [-0., -0., -0., -1.],
        [-0., -0., -0., -0.],
        [-0., -0., -0.,  0.],
        [-0., -0., -0.,  0.],
        [-0., -0., -0., -0.],
        [-0., -0., -0., -0.],
        [-0., -0., -0.,  0.],
        [-0., -0., -0., -0.],
        [-0., -0., -0.,  0.],
        [ 0., -0., -0.,  0.],
        [-1.,  0., -0., -0.],
        [-0., -1.,  0.,  0.],
        [-0., -0., -1.,  1.],
        [-0., -0., -0., -1.],
        [-0., -0.,  0., -0.],
        [ 1.,  0., -0.,  0.],
        [ 3.,  2.,  0., -0.],
        [ 1.,  5.,  2.,  1.],
        [ 4.,  1.,  5.,  5.],
        [ 3.,  5

In [110]:
# read processed data
df = pd.read_csv("../data/processed/tidy_df.csv", index_col=[0, 1, 2])

# empty list for dataframes
y_test_list = []
y_hat_umlp = []

i = df.index.get_level_values(0).unique()[0]
print(i)

# y : "EPS"
y = df.loc[pd.IndexSlice[i, :, :], "EPS"]

# x, exogenous regressors : 'INV', 'AR', 'CAPX', 'GM', 'SA', 'ETR', 'LF'
#     x = df.loc[pd.IndexSlice[i, :, :], ['INV', 'AR', 'CAPX', 'GM', 'SA', 'ETR', 'LF']]

# Unlike statsmodel SARIMA package, NN needs to prepare lagged inputs manually if needed.
# y_lag and x_lag (lag 4 for now)
num_lag = 4
y_lag = lag(y, num_lag, drop_nan=False, reset_index=False)
#     x_lag = lag(x, num_lag, drop_nan=False, reset_index=False)

# Redefine data name as target (y) and feature (y_lag) (explanatory variable, predictor)
target = y
feature = y_lag

# save simple test data series
_, target_test = train_test_split(target, test_size=12)
_, feature_test = train_test_split(feature, test_size=12)

# drop nan caused by lag()
feature = feature.dropna(axis=0)
target = target[feature.index]

# setting torch
dtype = torch.float # double float problem in layer 
device = torch.device("cpu")

# Make data to torch.tensor
target_all_window = torch.tensor(target.values, dtype=dtype)
feature_all_window = torch.tensor(feature.values, dtype=dtype)
target_test_all_window = torch.tensor(target_test.values, dtype=dtype)
feature_test_all_window = torch.tensor(feature_test.values, dtype=dtype)

#########################################################################################################################
SCALE_X = True

train_valid_all_window_dataset = TimeSeriesDataset(feature_all_window, target_all_window, train_valid_window_size)
print("length of train-valid all window dataset: ", len(train_valid_all_window_dataset))

train_valid_window_loader = torch.utils.data.DataLoader(train_valid_all_window_dataset, batch_size=1, shuffle=False)
for window, (feature_train_valid, target_train_valid) in enumerate(train_valid_window_loader):
    feature_train_valid = feature_train_valid[0] # extract single batch
    target_train_valid = target_train_valid[0] # extract single batch
    feature_test = feature_test_all_window[window].reshape(1, -1)
    target_test = target_test_all_window[window].reshape(-1)
    
    if SCALE_X:
        feature_train_scaler = StandardScaler().fit(feature_train_valid[:TRAIN_WINDOW_SIZE])
        # overwrite memory
        feature_train_valid[:TRAIN_WINDOW_SIZE] = torch.tensor(feature_train_scaler.transform(feature_train_valid[:TRAIN_WINDOW_SIZE]), dtype=dtype)
        feature_train_valid[TRAIN_WINDOW_SIZE:] = torch.tensor(feature_train_scaler.transform(feature_train_valid[TRAIN_WINDOW_SIZE:]), dtype=dtype)
        feature_test = torch.tensor(feature_train_scaler.transform(feature_test), dtype=dtype)
#     if (window == 0) | (window == 1):
    print("")
    print("======================================")
    print("WINDOW: ", window)
    print("TRAIN-VALID FEATURE: ")
    print(feature_train_valid.round())
    print(feature_train_valid.size())
    print("TRAIN-VALID TARGET: ")
    print(target_train_valid.round())
    print(target_train_valid.size())
    
    # all "batch" dataset
    train_valid_dataset = TimeSeriesDataset(feature_train_valid, target_train_valid, train_window=None)
    print("    length of train_valid_dataset: ", len(train_valid_dataset))
    
    # 分け方は一応 full-psuedo で。(valid に rolling windowはしない)
    train_dataset = torch.utils.data.dataset.Subset(train_valid_dataset, list(range(0, train_window_size)))
    print("///// TRAIN /////")
    print("    length of train_dataset: ", len(train_dataset))
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_window_size, shuffle=False)
    for mini_batch, (feature_train, target_train) in enumerate(train_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    TRAIN FEATURE: ")
        print(feature_train.round())
        print(feature_train.size())
        print("    TRAIN TARGET: ")
        print(target_train.round())
        print(target_train.size())
        
    valid_dataset = torch.utils.data.dataset.Subset(train_valid_dataset, list(range(train_window_size, len(train_valid_dataset))))
    print("///// VALID /////")
    print("    length of valid_dataset: ", len(valid_dataset))
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=valid_window_size, shuffle=False)
    for mini_batch, (feature_valid, target_valid) in enumerate(valid_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    VALID FEATURE: ")
        print(feature_valid.round())
        print(feature_valid.size())
        print("    VALID TARGET: ")
        print(target_valid.round())
        print(target_valid.size())
        
    test_dataset = TimeSeriesDataset(feature_test, target_test, train_window=None)
    print("///// TEST /////")
    print("    length of test_dataset: ", len(test_dataset))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)
    for mini_batch, (feature_test, target_test) in enumerate(test_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    TEST FEATURE: ")
        print(feature_test.round())
        print(feature_test.size())
        print("    TEST TARGET: ")
        print(target_test.round())
        print(target_test.size())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EPS_lag1,EPS_lag2,EPS_lag3,EPS_lag4
企業名,会計年度,四半期,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
あらた,2018,Q1,69.34,119.88,90.92,121.17
あらた,2018,Q2,105.42,69.34,119.88,90.92
あらた,2018,Q3,90.44,105.42,69.34,119.88
あらた,2018,Q4,106.24,90.44,105.42,69.34
あらた,2019,Q1,99.22,106.24,90.44,105.42
あらた,2019,Q2,116.85,99.22,106.24,90.44
あらた,2019,Q3,101.76,116.85,99.22,106.24
あらた,2019,Q4,113.93,101.76,116.85,99.22
あらた,2020,Q1,79.9,113.93,101.76,116.85
あらた,2020,Q2,145.23,79.9,113.93,101.76
