# train, valid, test loader

In [62]:
# import external libraries
import os
import sys
import random
import numpy as np
import pandas as pd
import torch
import math
import time

# import internal modules
sys.path.insert(1, '../src/')
from models.nn import MLP, TimeSeriesDataset
from utils.data_editor import lag, train_test_split

In [63]:
# Load the processed panel; CSV columns 0-2 become the row MultiIndex.
df = pd.read_csv("../data/processed/tidy_df.csv", index_col=[0, 1, 2])

# Result accumulators (not filled inside this cell).
y_test_list = []
y_hat_umlp = []

# Work on a single entry only: the first unique value of index level 0.
i = df.index.get_level_values(0).unique()[0]
print(i)

# Dependent variable: the "EPS" column for entry i.
y = df.loc[pd.IndexSlice[i, :, :], "EPS"]

# Exogenous regressors are switched off for now:
#     x = df.loc[pd.IndexSlice[i, :, :], ['INV', 'AR', 'CAPX', 'GM', 'SA', 'ETR', 'LF']]

# Unlike the statsmodels SARIMA package, the network needs its lagged inputs
# built by hand. Four lags of y for now; drop_nan=False keeps the leading NaN
# rows so they can be removed after the test hold-out below.
num_lag = 4
y_lag = lag(y, num_lag, drop_nan=False, reset_index=False)
#     x_lag = lag(x, num_lag, drop_nan=False, reset_index=False)

# From here on: target = y, feature = lagged y (the predictor).
target, feature = y, y_lag

# Hold out the test part (test_size=12) of both series; the first return value
# of the project helper train_test_split is discarded.
_, target_test = train_test_split(target, test_size=12)
_, feature_test = train_test_split(feature, test_size=12)

# Remove the rows that lag() left as NaN and align the target to what remains.
feature = feature.dropna(axis=0)
target = target[feature.index]

# torch settings: single precision (original note: "double float problem in layer"), CPU.
dtype = torch.float
device = torch.device("cpu")

# Convert all four pandas objects to tensors of the chosen dtype in one pass.
target_all_window, feature_all_window, target_test_all_window, feature_test_all_window = (
    torch.tensor(series.values, dtype=dtype)
    for series in (target, feature, target_test, feature_test)
)

あらた


In [64]:
# Report the torch.Size of each tensor produced by the data-preparation cell.
print("  target total all window size: ", target_all_window.shape)
print(" feature total all window size: ", feature_all_window.shape)
print("   target test all window size: ", target_test_all_window.shape)
print("  feature test all window size: ", feature_test_all_window.shape)

  target total all window size:  torch.Size([48])
 feature total all window size:  torch.Size([48, 4])
   target test all window size:  torch.Size([12])
  feature test all window size:  torch.Size([12, 4])


In [65]:
# Derive the four window lengths first, then report them.
# test:        number of held-out test observations
# valid:       same length as the test window
# train-valid: everything in the full tensor that is not test
# train:       train-valid minus the validation part
test_window_size = target_test_all_window.shape[0]
valid_window_size = test_window_size
train_valid_window_size = target_all_window.shape[0] - test_window_size
train_window_size = train_valid_window_size - valid_window_size

print("test window size: ", test_window_size)
print("train-valid window size: ", train_valid_window_size)
print("valid window size: ", valid_window_size)
print("train window size: ", train_window_size)

test window size:  12
train-valid window size:  36
valid window size:  12
train window size:  24


# Test, Train-Valid split before rolling sample

In [66]:
# Wrap the full feature/target tensors in the project TimeSeriesDataset, passing
# train_valid_window_size as its third argument.
# NOTE(review): presumably each dataset item is one rolling train-valid window -- confirm in models.nn.
train_valid_all_window_dataset = TimeSeriesDataset(feature_all_window, target_all_window, train_valid_window_size)
print("length of train-valid all window dataset: ", len(train_valid_all_window_dataset))

# batch_size=1, shuffle=False: items are delivered one at a time, in order.
train_valid_window_loader = torch.utils.data.DataLoader(train_valid_all_window_dataset, batch_size=1, shuffle=False)
for window, (feature_train_valid, target_train_valid) in enumerate(train_valid_window_loader):
    # Strip the leading batch axis added by the loader.
    feature_train_valid, target_train_valid = feature_train_valid[0], target_train_valid[0]

    # (A filter `if (window == 0) | (window == 1):` used to sit here; it is disabled,
    # so every window is printed.)
    print("", "======================================", sep="\n")
    print("WINDOW: ", window)
    print("TRAIN-VALID FEATURE: ", feature_train_valid.round(), feature_train_valid.size(), sep="\n")
    print("TRAIN-VALID TARGET: ", target_train_valid.round(), target_train_valid.size(), sep="\n")

    # Re-wrap the current window as its own dataset (train_window=None).
    train_valid_dataset = TimeSeriesDataset(feature_train_valid, target_train_valid, train_window=None)
    print("    length of train_valid_dataset: ", len(train_valid_dataset))

    # Split in the "full-pseudo" manner for now (no rolling window on the validation part):
    # the first train_window_size items are train, the remainder is valid.
    train_dataset = torch.utils.data.Subset(train_valid_dataset, list(range(train_window_size)))
    print("    length of train_dataset: ", len(train_dataset))
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_window_size, shuffle=False)
    for mini_batch, (feature_train, target_train) in enumerate(train_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    TRAIN FEATURE: ", feature_train.round(), feature_train.size(), sep="\n")
        print("    TRAIN TARGET: ", target_train.round(), target_train.size(), sep="\n")

    valid_dataset = torch.utils.data.Subset(train_valid_dataset, list(range(train_window_size, len(train_valid_dataset))))
    print("    length of valid_dataset: ", len(valid_dataset))
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=valid_window_size, shuffle=False)
    for mini_batch, (feature_valid, target_valid) in enumerate(valid_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    VALID FEATURE: ", feature_valid.round(), feature_valid.size(), sep="\n")
        print("    VALID TARGET: ", target_valid.round(), target_valid.size(), sep="\n")

# Test data, iterated separately from the train-valid windows.
# NOTE: this loop rebinds feature_test / target_test, which held the pandas
# hold-out objects from the data-preparation cell, to per-batch tensors.
test_all_window_dataset = TimeSeriesDataset(feature_test_all_window, target_test_all_window, None)
print("length of test all window dataset: ", len(test_all_window_dataset))
test_window_loader = torch.utils.data.DataLoader(test_all_window_dataset, batch_size=1, shuffle=False)
for window, (feature_test, target_test) in enumerate(test_window_loader):
    # The batch axis is deliberately kept here (the `[0]` extraction is disabled):
    #     feature_test = feature_test[0] # extract single batch
    #     target_test = target_test[0] # extract single batch
    print("", "======================================", sep="\n")
    print("WINDOW: ", window)
    print("TEST FEATURE: ", feature_test.round(), feature_test.size(), sep="\n")
    print("TEST TARGET: ", target_test.round(), target_test.size(), sep="\n")

length of train-valid all window dataset:  12

WINDOW:  0
TRAIN-VALID FEATURE: 
tensor([[ -5.,   3.,   2.,   3.],
        [  7.,  -5.,   3.,   2.],
        [  7.,   7.,  -5.,   3.],
        [  5.,   7.,   7.,  -5.],
        [ -1.,   5.,   7.,   7.],
        [ -3.,  -1.,   5.,   7.],
        [  8.,  -3.,  -1.,   5.],
        [  9.,   8.,  -3.,  -1.],
        [ -1.,   9.,   8.,  -3.],
        [  4.,  -1.,   9.,   8.],
        [  8.,   4.,  -1.,   9.],
        [  6.,   8.,   4.,  -1.],
        [  4.,   6.,   8.,   4.],
        [  5.,   4.,   6.,   8.],
        [  7.,   5.,   4.,   6.],
        [  4.,   7.,   5.,   4.],
        [  7.,   4.,   7.,   5.],
        [  6.,   7.,   4.,   7.],
        [  5.,   6.,   7.,   4.],
        [  6.,   5.,   6.,   7.],
        [ 15.,   6.,   5.,   6.],
        [ -0.,  15.,   6.,   5.],
        [  3.,  -0.,  15.,   6.],
        [  9.,   3.,  -0.,  15.],
        [  3.,   9.,   3.,  -0.],
        [ 12.,   3.,   9.,   3.],
        [ 42.,  12.,   3.,   9.],
  

# One test observation for each rolling-window sample

In [84]:
# Display the full test-feature tensor built in the data-preparation cell (one row per test observation).
feature_test_all_window

tensor([[ 69.3400, 119.8800,  90.9200, 121.1700],
        [105.4200,  69.3400, 119.8800,  90.9200],
        [ 90.4400, 105.4200,  69.3400, 119.8800],
        [106.2400,  90.4400, 105.4200,  69.3400],
        [ 99.2200, 106.2400,  90.4400, 105.4200],
        [116.8500,  99.2200, 106.2400,  90.4400],
        [101.7600, 116.8500,  99.2200, 106.2400],
        [113.9300, 101.7600, 116.8500,  99.2200],
        [ 79.9000, 113.9300, 101.7600, 116.8500],
        [145.2300,  79.9000, 113.9300, 101.7600],
        [120.6800, 145.2300,  79.9000, 113.9300],
        [153.3500, 120.6800, 145.2300,  79.9000]])

In [85]:
# Display the full test-target tensor built in the data-preparation cell (one value per test observation).
target_test_all_window

tensor([105.4200,  90.4400, 106.2400,  99.2200, 116.8500, 101.7600, 113.9300,
         79.9000, 145.2300, 120.6800, 153.3500,  61.4034])

In [87]:
# Rolling-window walk-through: for every train-valid window, print the window,
# its train / valid split, and the single test observation paired with it.
#
# Wrap the full feature/target tensors in the project TimeSeriesDataset, passing
# train_valid_window_size as its third argument.
# NOTE(review): presumably each dataset item is one rolling train-valid window -- confirm in models.nn.
train_valid_all_window_dataset = TimeSeriesDataset(feature_all_window, target_all_window, train_valid_window_size)
print("length of train-valid all window dataset: ", len(train_valid_all_window_dataset))

# batch_size=1, shuffle=False: items are delivered one at a time, in order.
train_valid_window_loader = torch.utils.data.DataLoader(train_valid_all_window_dataset, batch_size=1, shuffle=False)
for window, (feature_train_valid, target_train_valid) in enumerate(train_valid_window_loader):
    feature_train_valid = feature_train_valid[0] # extract single batch
    target_train_valid = target_train_valid[0] # extract single batch

    # (A filter `if (window == 0) | (window == 1):` is disabled; every window is printed.)
    print("")
    print("======================================")
    print("WINDOW: ", window)
    print("TRAIN-VALID FEATURE: ")
    print(feature_train_valid.round())
    print(feature_train_valid.size())
    print("TRAIN-VALID TARGET: ")
    print(target_train_valid.round())
    print(target_train_valid.size())

    # Re-wrap the current window as its own dataset (train_window=None).
    train_valid_dataset = TimeSeriesDataset(feature_train_valid, target_train_valid, train_window=None)
    print("    length of train_valid_dataset: ", len(train_valid_dataset))

    # Split in the "full-pseudo" manner for now (no rolling window on the validation part):
    # the first train_window_size items are train, the remainder is valid.
    train_dataset = torch.utils.data.dataset.Subset(train_valid_dataset, list(range(0, train_window_size)))
    print("///// TRAIN /////")
    print("    length of train_dataset: ", len(train_dataset))
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_window_size, shuffle=False)
    for mini_batch, (feature_train, target_train) in enumerate(train_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    TRAIN FEATURE: ")
        print(feature_train.round())
        print(feature_train.size())
        print("    TRAIN TARGET: ")
        print(target_train.round())
        print(target_train.size())

    valid_dataset = torch.utils.data.dataset.Subset(train_valid_dataset, list(range(train_window_size, len(train_valid_dataset))))
    print("///// VALID /////")
    print("    length of valid_dataset: ", len(valid_dataset))
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=valid_window_size, shuffle=False)
    for mini_batch, (feature_valid, target_valid) in enumerate(valid_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    VALID FEATURE: ")
        print(feature_valid.round())
        print(feature_valid.size())
        print("    VALID TARGET: ")
        print(target_valid.round())
        print(target_valid.size())

    # Pair this window with ITS OWN test observation. The index must follow
    # `window`; a fixed index 0 would evaluate every window against the first
    # test sample. reshape(1, -1) / reshape(-1) turn the single row / scalar
    # into a one-item feature matrix and target vector.
    test_dataset = TimeSeriesDataset(feature_test_all_window[window].reshape(1, -1), target_test_all_window[window].reshape(-1), train_window=None)
    print("///// TEST /////")
    print("    length of test_dataset: ", len(test_dataset))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)
    # NOTE: feature_test / target_test are rebound here from the pandas hold-out
    # objects of the data-preparation cell to per-batch tensors.
    for mini_batch, (feature_test, target_test) in enumerate(test_loader):
        print("--------------------------------")
        print("    MINI-BATCH: ", mini_batch)
        print("    TEST FEATURE: ")
        print(feature_test.round())
        print(feature_test.size())
        print("    TEST TARGET: ")
        print(target_test.round())
        print(target_test.size())

length of train-valid all window dataset:  12

WINDOW:  0
TRAIN-VALID FEATURE: 
tensor([[ -5.,   3.,   2.,   3.],
        [  7.,  -5.,   3.,   2.],
        [  7.,   7.,  -5.,   3.],
        [  5.,   7.,   7.,  -5.],
        [ -1.,   5.,   7.,   7.],
        [ -3.,  -1.,   5.,   7.],
        [  8.,  -3.,  -1.,   5.],
        [  9.,   8.,  -3.,  -1.],
        [ -1.,   9.,   8.,  -3.],
        [  4.,  -1.,   9.,   8.],
        [  8.,   4.,  -1.,   9.],
        [  6.,   8.,   4.,  -1.],
        [  4.,   6.,   8.,   4.],
        [  5.,   4.,   6.,   8.],
        [  7.,   5.,   4.,   6.],
        [  4.,   7.,   5.,   4.],
        [  7.,   4.,   7.,   5.],
        [  6.,   7.,   4.,   7.],
        [  5.,   6.,   7.,   4.],
        [  6.,   5.,   6.,   7.],
        [ 15.,   6.,   5.,   6.],
        [ -0.,  15.,   6.,   5.],
        [  3.,  -0.,  15.,   6.],
        [  9.,   3.,  -0.,  15.],
        [  3.,   9.,   3.,  -0.],
        [ 12.,   3.,   9.,   3.],
        [ 42.,  12.,   3.,   9.],
  