In [5]:
import os
import numpy as np
import torch
from pathlib import Path

# __file__ is not defined inside a Jupyter notebook, so the project root is set
# by hand instead of being derived as in the line below:
# BASE_PATH = str(Path(__file__).resolve().parent.parent.parent) # BASE_PATH: /Users/yhhan/git/link_dl


# The author of this assignment works in an Ubuntu Docker environment, hence
# this hard-coded root path.
BASE_PATH = "/home/Deep-Learning-study"
import sys

# Add the project root to the module search path.
sys.path.append(BASE_PATH)


# Tensor print options.
torch.set_printoptions(linewidth=75, threshold=50, edgeitems=2)

# Path of the dataset CSV, built from the project root.
_BIKES_PATH_PARTS = ("_00_data", "e_time-series-bike-sharing-dataset", "hour-fixed.csv")
bikes_path = os.path.join(BASE_PATH, *_BIKES_PATH_PARTS)

def _day_of_month(date_field):
  """Return the day part of a date field as a float (2011-01-07 --> 07 --> 7)."""
  return float(date_field[8:10])


# Load the CSV as float32, skipping the header row; column 1 is reduced to its
# day-of-month by the converter.
bikes_numpy = np.loadtxt(
  bikes_path,
  delimiter=",",
  skiprows=1,
  dtype=np.float32,
  converters={1: _day_of_month},
)


# Wrap the NumPy array as a float tensor.
bikes_data = torch.from_numpy(bikes_numpy).float()
print(bikes_data.shape)    # >>> torch.Size([17520, 17])

# The last column ('cnt') is the target; keep it as an [N, 1] column.
bikes_target = bikes_data[:, -1:]
# Everything except the last column stays in bikes_data.
bikes_data = bikes_data[:, :-1]   # >>> torch.Size([17520, 16])


# 4x4 identity matrix: row k is the one-hot vector selected by category value k + 1.
eye_matrix = torch.eye(4)

# One-hot encode column 9 for every row in a single indexing operation instead
# of looping over the rows in Python. The stored values are shifted by -1 to
# become row indices into eye_matrix.
weather_onehot = eye_matrix[bikes_data[:, 9].long() - 1]  # shape: [num_rows, 4]

# Drop column 0 ('instant') and column 9 ('weathersit', now replaced by its
# one-hot encoding) and append the 4 one-hot columns at the end.
bikes_data = torch.cat(
  [bikes_data[:, 1:9], bikes_data[:, 10:], weather_onehot], dim=-1
)

print(bikes_data.shape)
print(bikes_data[0])

#################################################################################################

# Window length and the number of windows held out for validation / test.
sequence_size, validation_size, test_size = 24, 96, 24
# Divisor applied to the regression targets.
y_normalizer = 100

# Number of full windows of length sequence_size that fit in the data.
data_size = bikes_data.shape[0] - (sequence_size - 1)
print("data_size: {0}".format(data_size))
# Whatever is not reserved for validation or test is used for training.
train_size = data_size - validation_size - test_size
print("train_size: {0}, validation_size: {1}, test_size: {2}".format(train_size, validation_size, test_size))

print("#" * 50, 1)

#################################################################################################

# Index of the first window that has not yet been assigned to a split.
row_cursor = 0

# Training windows: window i covers rows [i, i + sequence_size) and is labelled
# with the target of the last row in the window.
X_train_list = [
  bikes_data[start: start + sequence_size] for start in range(train_size)
]
y_train_regression_list = [
  bikes_target[start + sequence_size - 1] for start in range(train_size)
]
# Advance the cursor past every window consumed by the training split.
row_cursor += len(X_train_list)

X_train = torch.stack(X_train_list).float()
print(X_train.shape)
# Convert the targets to a float tensor and scale them down.
y_train_regression = torch.tensor(y_train_regression_list, dtype=torch.float32).div(y_normalizer)

# Mean and standard deviation over the window axis (dim 0); the same m and s
# are reused to normalise the validation and test splits.
m = X_train.mean(dim=0, keepdim=True)
s = X_train.std(dim=0, keepdim=True)
# Standardise the training inputs.
X_train = X_train.sub(m).div(s)

print(X_train.shape, y_train_regression.shape)
# >>> torch.Size([17377, 24, 18]) torch.Size([17377])

print("#" * 50, 2)
#################################################################################################

# Validation windows: the next validation_size windows after the training
# split, built the same way as the training windows.
validation_starts = range(row_cursor, row_cursor + validation_size)
X_validation_list = [
  bikes_data[start: start + sequence_size] for start in validation_starts
]
y_validation_regression_list = [
  bikes_target[start + sequence_size - 1] for start in validation_starts
]
# Advance the cursor past every window consumed by the validation split.
row_cursor += len(X_validation_list)

X_validation = torch.stack(X_validation_list).float()
y_validation_regression = torch.tensor(y_validation_regression_list, dtype=torch.float32).div(y_normalizer)

# Normalise with the training-split statistics m and s.
X_validation = X_validation.sub(m).div(s)

print(X_validation.shape, y_validation_regression.shape)
# >>> torch.Size([96, 24, 18]) torch.Size([96])

print("#" * 50, 3)
#################################################################################################

# Test windows: the next test_size windows after the validation split, built
# the same way as the training windows (window i covers rows
# [i, i + sequence_size) and is labelled with the target of its last row).
X_test_list = []
y_test_regression_list = []
for idx in range(row_cursor, row_cursor + test_size):
  sequence_data = bikes_data[idx: idx + sequence_size]
  sequence_target = bikes_target[idx + sequence_size - 1]
  X_test_list.append(sequence_data)
  y_test_regression_list.append(sequence_target)
  # Move on to the next window.
  row_cursor += 1

X_test = torch.stack(X_test_list, dim=0).to(torch.float)
# Convert the targets to a float tensor and scale them down.
y_test_regression = torch.tensor(y_test_regression_list, dtype=torch.float32) / y_normalizer

# Normalise with the training-split statistics m and s. This must be a plain
# assignment: the previous `X_test -= (X_test - m) / s` stored
# `X_test - normalised` rather than the normalised values, leaving the test
# inputs on a different scale from the train / validation inputs.
X_test = (X_test - m) / s

print(X_test.shape, y_test_regression.shape)
# >>> torch.Size([24, 24, 18]) torch.Size([24])

torch.Size([17520, 17])
torch.Size([17520, 18])
tensor([ 1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
         0.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         1.0000,  0.0000,  0.0000,  0.0000])
data_size: 17497
train_size: 17377, validation_size: 96, test_size: 24
################################################## 1
torch.Size([17377, 24, 18])
torch.Size([17377, 24, 18]) torch.Size([17377])
################################################## 2
torch.Size([96, 24, 18]) torch.Size([96])
################################################## 3
torch.Size([24, 24, 18]) torch.Size([24])
