In [2]:
import os
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split

# 주피터 노트북에서는 __file__ 변수가 정의되지 않기에 직접 설정할 것임
# BASE_PATH = str(Path(__file__).resolve().parent.parent.parent) # BASE_PATH: /Users/yhhan/git/link_dl


# 본 과제 제출자는 현재 우분투 도커 환경에서 작업중이므로 다음과 같이 경로 설정
BASE_PATH="/home/Deep-Learning-study"
import sys
sys.path.append(BASE_PATH)


# 자전거 공유 시스템 데이터 처리 클래스
class BikesDataset(Dataset):
  def __init__(self, train=True, test_days=1):
    self.train = train
    self.test_days = test_days

    # 데이터 경로 설정
    bikes_path = os.path.join(BASE_PATH, "_00_data", "e_time-series-bike-sharing-dataset", "hour-fixed.csv")

    # csv파일을 넘파이 배열로 로드
    bikes_numpy = np.loadtxt(
      fname=bikes_path, dtype=np.float32, delimiter=",", skiprows=1,
      converters={
        # 일 정보만 추출
        1: lambda x: float(x[8:10])  # 2011-01-07 --> 07 --> 7
      }
    )

    # 텐서로 변환
    bikes = torch.from_numpy(bikes_numpy)

    # 24시간 데이터 단위로 묶기
    daily_bikes = bikes.view(-1, 24, bikes.shape[1])  # daily_bikes.shape: torch.Size([730, 24, 17])
    self.daily_bikes_target = daily_bikes[:, :, -1].unsqueeze(dim=-1)


    # 입력 데이터에서 타겟 열 제외
    self.daily_bikes_data = daily_bikes[:, :, :-1]
    # 원핫 코딩 위한 아이덴티티 메트릭스 설정
    eye_matrix = torch.eye(4)


    # 날씨 정보에 원핫 인코딩 적용
    day_data_torch_list = []
    for daily_idx in range(self.daily_bikes_data.shape[0]):  # range(730)
      day = self.daily_bikes_data[daily_idx]  # day.shape: [24, 17]
      weather_onehot = eye_matrix[day[:, 9].long() - 1]
      day_data_torch = torch.cat(tensors=(day, weather_onehot), dim=1)  # day_torch.shape: [24, 21]
      day_data_torch_list.append(day_data_torch)


    # 리스트를 텐서로 변환
    self.daily_bikes_data = torch.stack(day_data_torch_list, dim=0)

    # 특정 열 제거
    self.daily_bikes_data = torch.cat(
      [self.daily_bikes_data[:, :, :9], self.daily_bikes_data[:, :, 10:]], dim=2
    )

    total_length = len(self.daily_bikes_data)
    self.train_bikes_data = self.daily_bikes_data[:total_length - test_days]
    self.train_bikes_targets = self.daily_bikes_target[:total_length - test_days]
    train_temperatures = self.train_bikes_data[:, :, 9]

    # 데이터 정규화
    train_temperatures_mean = torch.mean(train_temperatures)
    train_temperatures_std = torch.std(train_temperatures)
    self.train_bikes_data[:, :, 9] = \
      (self.train_bikes_data[:, :, 9] - torch.mean(train_temperatures_mean)) / torch.std(train_temperatures_std)

    assert len(self.train_bikes_data) == len(self.train_bikes_targets)

    self.test_bikes_data = self.daily_bikes_data[-test_days:]
    self.test_bikes_targets = self.daily_bikes_target[-test_days:]

    self.test_bikes_data[:, :, 9] = \
      (self.test_bikes_data[:, :, 9] - torch.mean(train_temperatures_mean)) / torch.std(train_temperatures_std)

    assert len(self.test_bikes_data) == len(self.test_bikes_targets)

  # 데이터 크기 반환
  def __len__(self):
    return len(self.train_bikes_data) if self.train is True else len(self.test_bikes_data)

  # 인덱스 해당하는 샘플 반환
  def __getitem__(self, idx):
    bike_feature = self.train_bikes_data[idx] if self.train is True else self.test_bikes_data[idx]
    bike_target = self.train_bikes_targets[idx] if self.train is True else self.test_bikes_targets[idx]
    return bike_feature, bike_target
  
  # 데이터셋 정보 출력
  def __str__(self):
    if self.train is True:
      str = "Data Size: {0}, Input Shape: {1}, Target Shape: {2}".format(
        len(self.train_bikes_data), self.train_bikes_data.shape, self.train_bikes_targets.shape
      )
    else:
      str = "Data Size: {0}, Input Shape: {1}, Target Shape: {2}".format(
        len(self.test_bikes_data), self.test_bikes_data.shape, self.test_bikes_targets.shape
      )
    return str


if __name__ == "__main__":
  train_bikes_dataset = BikesDataset(train=True, test_days=1)
  print(train_bikes_dataset)

  print("#" * 50, 1)

  # 데이터셋 분리 ( 8:2 )
  train_dataset, validation_dataset = random_split(train_bikes_dataset, [0.8, 0.2])

  print("[TRAIN]")
  # 학습 데이터셋 출력
  for idx, sample in enumerate(train_dataset):
    input, target = sample
    print("{0} - {1}: {2}".format(idx, input.shape, target.shape))


  # 데이터 로드
  train_data_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, drop_last=True)

  for idx, batch in enumerate(train_data_loader):
    input, target = batch
    print("{0} - {1}: {2}".format(idx, input.shape, target.shape))

  print("#" * 50, 2)

  print("[VALIDATION]")
  for idx, sample in enumerate(validation_dataset):
    input, target = sample
    print("{0} - {1}: {2}".format(idx, input.shape, target.shape))

  validation_data_loader = DataLoader(dataset=validation_dataset, batch_size=32)

  for idx, batch in enumerate(validation_data_loader):
    input, target = batch
    print("{0} - {1}: {2}".format(idx, input.shape, target.shape))

  print("#" * 50, 3)

  test_dataset = BikesDataset(train=False, test_days=1)
  print(test_dataset)

  print("[TEST]")
  for idx, sample in enumerate(test_dataset):
    input, target = sample
    print("{0} - {1}: {2}".format(idx, input.shape, target.shape))

  test_data_loader = DataLoader(dataset=test_dataset, batch_size=len(test_dataset))

  for idx, batch in enumerate(test_data_loader):
    input, target = batch
    print("{0} - {1}: {2}".format(idx, input.shape, target.shape))

Data Size: 729, Input Shape: torch.Size([729, 24, 19]), Target Shape: torch.Size([729, 24, 1])
################################################## 1
[TRAIN]
0 - torch.Size([24, 19]): torch.Size([24, 1])
1 - torch.Size([24, 19]): torch.Size([24, 1])
2 - torch.Size([24, 19]): torch.Size([24, 1])
3 - torch.Size([24, 19]): torch.Size([24, 1])
4 - torch.Size([24, 19]): torch.Size([24, 1])
5 - torch.Size([24, 19]): torch.Size([24, 1])
6 - torch.Size([24, 19]): torch.Size([24, 1])
7 - torch.Size([24, 19]): torch.Size([24, 1])
8 - torch.Size([24, 19]): torch.Size([24, 1])
9 - torch.Size([24, 19]): torch.Size([24, 1])
10 - torch.Size([24, 19]): torch.Size([24, 1])
11 - torch.Size([24, 19]): torch.Size([24, 1])
12 - torch.Size([24, 19]): torch.Size([24, 1])
13 - torch.Size([24, 19]): torch.Size([24, 1])
14 - torch.Size([24, 19]): torch.Size([24, 1])
15 - torch.Size([24, 19]): torch.Size([24, 1])
16 - torch.Size([24, 19]): torch.Size([24, 1])
17 - torch.Size([24, 19]): torch.Size([24, 1])
18 - tor

  (self.train_bikes_data[:, :, 9] - torch.mean(train_temperatures_mean)) / torch.std(train_temperatures_std)
  (self.test_bikes_data[:, :, 9] - torch.mean(train_temperatures_mean)) / torch.std(train_temperatures_std)
