# Train Dataset & Test Dataset 생성

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import warnings
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import math

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import platform

# Silence every library warning for the rest of the session.
warnings.filterwarnings('ignore')

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Notebook display of the selected device.
device

In [None]:
# Restrict this process to GPU 0, with devices numbered by PCI bus ID.
# NOTE(review): these variables are presumably only honoured if they are set
# before CUDA is first initialised; an earlier cell already calls
# torch.cuda.is_available(), so confirm the pinning really takes effect.
os.environ.update({
    'CUDA_DEVICE_ORDER': "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

In [None]:
# Experiment settings shared by the rest of the notebook.
CFG = dict(
    TRAIN_WINDOW_SIZE=90,  # days of history used as model input
    PREDICT_SIZE=21,       # days to forecast
    EPOCHS=10,
    LEARNING_RATE=1e-4,
    BATCH_SIZE=2048,
    SEED=41,
)

def seed_everything(seed):
    '''
    Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic
    mode so that runs with the same seed can reproduce each other.

    seed : integer seed applied to all generators
    '''
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this value only
    # reaches processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers all visible GPUs, not only the current one; it is a no-op when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Seeding alone does not make cuDNN reproducible: request deterministic
    # kernels and keep the auto-tuner off so algorithm choice cannot vary
    # between runs.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Directory that holds the input CSV files and the generated datasets.
PATH = '../data/'

# Fix all random seeds before any data is processed.
seed_everything(seed=CFG['SEED'])

## Dataset

In [None]:
# Load the four input tables from PATH.
origin_train = pd.read_csv(f'{PATH}train.csv')
sales = pd.read_csv(f'{PATH}sales.csv')
submit = pd.read_csv(f'{PATH}sample_submission.csv')
# train_fe.csv is read with low_memory=False (whole-file parsing instead of
# chunked parsing); presumably it is the feature-engineered training table.
train_enc = pd.read_csv(f'{PATH}train_fe.csv', low_memory=False)

In [None]:
# Notebook display: preview the frame loaded from train_fe.csv.
train_enc

In [None]:
# Label-encode the categorical columns in place.
# One encoder object is refitted for every column, so after the loop `le`
# only remembers the classes of the last column in `col_names`.
le = LabelEncoder()
col_names = ['대분류', '중분류', '소분류', '브랜드', '쇼핑몰', 'day_name']

for col in col_names:
    encoded = le.fit_transform(train_enc[col])
    train_enc[col] = encoded

In [None]:
# Turn `event` into a flag: every row whose value is not the string '0' is
# overwritten with 1; rows equal to '0' are left as they are.
event_idx = train_enc.query("event != '0'").index
event_flags = [1] * len(event_idx)
train_enc.loc[event_idx, 'event'] = event_flags

# Order rows by ID and then by date, and renumber the index from zero.
train_enc = train_enc.sort_values(by=['ID', 'date'])
train_enc = train_enc.reset_index(drop=True)

In [None]:
# Remove the price column, then display the frame for inspection.
train_enc = train_enc.drop(columns=['price'])
train_enc

In [None]:
# Keep the modelling columns in a fixed order, then discard `event`.
# Resulting layout: ID (0), eight feature columns (1-8), sales (9), date (10).
columns = [
    'ID',
    '대분류', '중분류', '소분류', '브랜드', '쇼핑몰',
    'day_name', 'quarter', 'keyword', 'event',
    'sales', 'date',
]
train_enc = train_enc[columns].drop(columns='event')
train_enc.head(3)

In [None]:
# Cut the sorted frame into three consecutive row ranges so each part can be
# converted to arrays separately.
first_cut, second_cut = 4623360, 9246240

train_1 = train_enc.iloc[:first_cut]
train_2 = train_enc.iloc[first_cut:second_cut]
train_3 = train_enc.iloc[second_cut:]

## Make Train & Test Dataset to Array

In [None]:
# Design history of this function:
#  - The first version was too slow to be usable in practice.
#  - pandas was suspected to be the bottleneck, but converting to NumPy alone
#    made no difference.
#  - The real cost turned out to be calling pandas.DataFrame.query() for every
#    window.
#  - Because the rows are ordered by ID and then by date, windows can be cut
#    out by position instead, which reduced the run time dramatically.

def make_train_data(data, train_size = CFG['TRAIN_WINDOW_SIZE'], predict_size = CFG['PREDICT_SIZE'], id_step = 2):
    '''
    Build sliding-window (input, target) pairs for training.

    data : long-format frame with one row per (ID, date), sorted by ID and then
           date, where every ID has a row for every date. Must contain the
           columns 'ID', 'date' and 'sales'; every other column is used as a
           feature and must be castable to float.
    train_size : number of days in each input window
    predict_size : number of days in each target window
    id_step : use every `id_step`-th ID (in frame order). The default of 2
              keeps every other ID; pass 1 to use all of them.

    Returns (input_data, target_data), both float16:
        input_data  : (n_ids_used * n_windows, train_size, n_features + 1),
                      feature columns in frame order followed by sales
        target_data : (n_ids_used * n_windows, predict_size), the sales of the
                      days that follow each input window
    Rows are grouped by ID, and within an ID ordered by window start date.
    Note that float16 cannot represent magnitudes above 65504.

    Raises ValueError when the frame is not a complete ID x date grid, when it
    has fewer dates than one full window, or when id_step is not positive.
    '''
    if id_step < 1:
        raise ValueError(f'id_step must be a positive integer, got {id_step}')

    window_size = train_size + predict_size
    num_id = data['ID'].nunique()
    num_date = data['date'].nunique()
    num_windows = num_date - window_size + 1

    # Positional slicing is only valid on a complete grid; check it instead of
    # relying on a hard-coded number of rows per ID.
    if len(data) != num_id * num_date:
        raise ValueError(
            f'expected {num_id} IDs x {num_date} dates = {num_id * num_date} rows, '
            f'got {len(data)}'
        )
    if num_windows < 1:
        raise ValueError(
            f'need at least {window_size} dates per ID to build a window, got {num_date}'
        )

    # Pick columns by name so the layout cannot drift when columns are added
    # or dropped upstream; sales goes last so it is both the final input
    # channel and the target.
    feature_cols = [col for col in data.columns if col not in ('ID', 'date', 'sales')]
    value_cols = feature_cols + ['sales']
    num_channels = len(value_cols)

    # (num_id, num_date, num_channels), thinned to the IDs that are kept.
    series = data[value_cols].to_numpy().astype(np.float16)
    series = series.reshape(num_id, num_date, num_channels)[::id_step]
    num_used = series.shape[0]

    # Allocate exactly the rows that get written, so nothing uninitialised is
    # ever returned.
    input_data = np.empty((num_used * num_windows, train_size, num_channels), dtype = np.float16)
    target_data = np.empty((num_used * num_windows, predict_size), dtype = np.float16)

    # 4-D / 3-D views of the same buffers: one window offset is written for all
    # IDs at once.
    input_view = input_data.reshape(num_used, num_windows, train_size, num_channels)
    target_view = target_data.reshape(num_used, num_windows, predict_size)

    for start in tqdm(range(num_windows)):
        split = start + train_size
        input_view[:, start] = series[:, start:split]
        target_view[:, start] = series[:, split:start + window_size, -1]  # sales

    return input_data, target_data

In [None]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
    '''
    Build the model input used to forecast the evaluation period.

    For every ID the last `train_size` days are taken as one input window.

    data : long-format frame with one row per (ID, date), sorted by ID and then
           date, where every ID has a row for every date. Must contain the
           columns 'ID', 'date' and 'sales'; every other column is used as a
           feature and must be castable to float.
    train_size : number of trailing days per ID to use as input
                 (same length as the training input window)

    Returns a float16 array of shape (n_ids, train_size, n_features + 1) whose
    channels are the feature columns in frame order followed by sales.

    Raises ValueError when the frame is not a complete ID x date grid or when
    train_size is not between 1 and the number of dates.
    '''
    num_id = data['ID'].nunique()
    num_date = data['date'].nunique()

    # Positional slicing is only valid on a complete grid; check it instead of
    # relying on a hard-coded number of rows per ID.
    if len(data) != num_id * num_date:
        raise ValueError(
            f'expected {num_id} IDs x {num_date} dates = {num_id * num_date} rows, '
            f'got {len(data)}'
        )
    if not 1 <= train_size <= num_date:
        raise ValueError(
            f'train_size must be between 1 and {num_date}, got {train_size}'
        )

    # Pick columns by name so the layout matches the training inputs
    # regardless of column positions: features first, sales last.
    feature_cols = [col for col in data.columns if col not in ('ID', 'date', 'sales')]
    value_cols = feature_cols + ['sales']

    series = data[value_cols].to_numpy().astype(np.float16)
    series = series.reshape(num_id, num_date, len(value_cols))

    # Trailing window of every ID, taken in one slice.
    test_input = np.ascontiguousarray(series[:, -train_size:, :])

    return test_input

In [None]:
# Part 1: training windows and the inference input for the first row range.
input_1, target_1 = make_train_data(data=train_1)
test_1 = make_predict_data(data=train_1)

In [None]:
# Part 2: training windows and the inference input for the second row range.
input_2, target_2 = make_train_data(data=train_2)
test_2 = make_predict_data(data=train_2)

In [None]:
# Part 3: training windows and the inference input for the last row range.
input_3, target_3 = make_train_data(data=train_3)
test_3 = make_predict_data(data=train_3)

In [None]:
# Persist every part as .npy files under <PATH>dataset/.
# np.save does not create missing directories, so make sure the target exists
# first; otherwise the long window build above is lost to a FileNotFoundError.
os.makedirs(PATH + 'dataset', exist_ok=True)

parts = [
    (input_1, target_1, test_1),
    (input_2, target_2, test_2),
    (input_3, target_3, test_3),
]

# Files are numbered from 1: train_input_<n>, train_target_<n>, test_input_<n>.
for part_no, (part_input, part_target, part_test) in enumerate(parts, start=1):
    np.save(PATH + f'dataset/train_input_{part_no}', part_input)
    np.save(PATH + f'dataset/train_target_{part_no}', part_target)
    np.save(PATH + f'dataset/test_input_{part_no}', part_test)