# Library import 

In [69]:
import os
import random
import math
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from omegaconf import OmegaConf, DictConfig

from tqdm.auto import tqdm
import warnings
import wandb
from datetime import datetime
import re
from typing import Tuple

warnings.filterwarnings("ignore")

In [70]:
def sanitize_filename(filename):
    # Remove characters that are not allowed in Windows file names
    # (e.g., : / \ ? * < > | ")
    filename = re.sub(r'[\\/:*?"<>|]', '_', filename)
    return filename

In [71]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Configuration

In [72]:
project_root = os.getcwd()
data_root = os.path.join(project_root, "data")

In [99]:
cfg_dict: dict = {
    "WINDOW_SIZE" : 30,
    "PREDICT_SIZE" : 21,
    "EPOCHS" : 20,
    "LEARNING_RATE" : 1e-3,
    "BATCH_SIZE" : 256,
    "NUM_WORKERS" : 0,
    "SEED" : 29,
    "input_size" : 5,
    "hidden_size" : 512,
    "output_size" : 21,
    "num_layers" : 3,
    "num_attention_heads" : 4,
    "feedforward_dim" : 25,
    "dropout_rate" : 0.2,
    "hidden_sizes" : [512, 256, 128, 64]
}

cfg = OmegaConf.create(cfg_dict)
print(OmegaConf.to_yaml(cfg))

WINDOW_SIZE: 30
PREDICT_SIZE: 21
EPOCHS: 20
LEARNING_RATE: 0.001
BATCH_SIZE: 256
NUM_WORKERS: 0
SEED: 29
input_size: 5
hidden_size: 512
output_size: 21
num_layers: 3
num_attention_heads: 4
feedforward_dim: 25
dropout_rate: 0.2
hidden_sizes:
- 512
- 256
- 128
- 64



### SET SEED

In [74]:
random.seed(cfg["SEED"])
os.environ["PYTHONHASHSEED"] = str(cfg["SEED"])
np.random.seed(cfg["SEED"])
torch.manual_seed(cfg["SEED"])
torch.cuda.manual_seed(cfg["SEED"])
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False # 실험시 False

# Data Load

In [75]:
train = pd.read_csv(data_root+"\\train.csv")
train.drop(["ID", "제품"], axis=1, inplace=True)

In [None]:
train.head()

In [76]:
# 숫자형 변수들의 min-max scaling을 수행하는 코드입니다.
numeric_cols = train.columns[4:]
# 칵 column의 min 및 max 계산
min_values = train[numeric_cols].min(axis=1)
max_values = train[numeric_cols].max(axis=1)
# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
ranges = max_values - min_values
ranges[ranges == 0] = 1
# min-max scaling 수행
train[numeric_cols] = (train[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
# max와 min 값을 dictionary 형태로 저장
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

In [77]:
encoder = LabelEncoder()
categorical_col = ["대분류", "중분류", "소분류", "브랜드"]

for col in categorical_col:
    train[col] = encoder.fit_transform(train[col])

In [78]:
def make_train_data(data, train_size=cfg["WINDOW_SIZE"], predict_size=cfg["PREDICT_SIZE"]):
    '''
    학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
    data : 일별 판매량
    train_size : 학습에 활용할 기간
    predict_size : 추론할 기간
    '''
    num_rows = len(data)
    window_size = train_size + predict_size
    
    input_data = np.empty((num_rows * (len(data.columns) - window_size + 1), train_size, len(data.iloc[0, :4]) + 1))
    target_data = np.empty((num_rows * (len(data.columns) - window_size + 1), predict_size))
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, 4:])
        
        for j in range(len(sales_data) - window_size + 1):
            window = sales_data[j : j + window_size]
            temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
            input_data[i * (len(data.columns) - window_size + 1) + j] = temp_data
            target_data[i * (len(data.columns) - window_size + 1) + j] = window[train_size:]
    
    return input_data, target_data

In [79]:
def make_predict_data(data, train_size=cfg["WINDOW_SIZE"]):
    '''
    평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
    data : 일별 판매량
    train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
    '''
    num_rows = len(data)
    
    input_data = np.empty((num_rows, train_size, len(data.iloc[0, :4]) + 1))
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, -train_size:])
        
        window = sales_data[-train_size : ]
        temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
        input_data[i] = temp_data
    
    return input_data

```python
class CustomDataset(Dataset):
    def __init__(self, data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE'], is_inference=False):
        self.data = data.values # convert DataFrame to numpy array
        self.train_size = train_size
        self.predict_size = predict_size
        self.window_size = self.train_size + self.predict_size
        self.is_inference = is_inference

    def __len__(self):
        if self.is_inference:
            return len(self.data)
        else:
            return self.data.shape[0] * (self.data.shape[1] - self.window_size - 3)

    def __getitem__(self, idx):
        if self.is_inference:
            # 추론 시
            encode_info = self.data[idx, :4]
            window = self.data[idx, -self.train_size:]
            input_data = np.column_stack((np.tile(encode_info, (self.train_size, 1)), window))
            return input_data
        else:
            # 학습 시
            row = idx // (self.data.shape[1] - self.window_size - 3)
            col = idx % (self.data.shape[1] - self.window_size - 3)
            encode_info = self.data[row, :4]
            sales_data = self.data[row, 4:]
            window = sales_data[col : col + self.window_size]
            input_data = np.column_stack((np.tile(encode_info, (self.train_size, 1)), window[:self.train_size]))
            target_data = window[self.train_size:]
            return input_data, target_data

```

In [80]:
train_input, train_target = make_train_data(train)
test_input = make_predict_data(train)

  0%|          | 0/15890 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

In [83]:
train_target.shape

(6562570, 21)

In [84]:
train

Unnamed: 0,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,1,6,37,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
1,2,7,43,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.111111,0.333333,0.222222,0.00000,0.00000,0.222222,0.000000
2,2,7,43,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
3,2,7,43,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
4,0,0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,2,7,41,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
15886,2,7,43,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.024390,0.000000,0.016260,0.03252,0.00813,0.008130,0.024390
15887,2,7,43,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
15888,2,7,43,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.142857


In [85]:
# Train / Validation Split
data_len = len(train_input)
val_input = train_input[-int(data_len*0.2):]
val_target = train_target[-int(data_len*0.2):]
train_input = train_input[:-int(data_len*0.2)]
train_target = train_target[:-int(data_len*0.2)]

In [86]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

((5250056, 30, 5),
 (5250056, 21),
 (1312514, 30, 5),
 (1312514, 21),
 (15890, 30, 5))

# DataSet

In [87]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])
    
    def __len__(self):
        return len(self.X)

In [100]:
train_dataset = CustomDataset(train_input, train_target)
train_dataloader = DataLoader(train_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input, val_target)
val_dataloader = DataLoader(val_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=False, num_workers=0)

In [101]:
for sample in train_dataloader:
    print(sample[0].shape)
    print(sample[1].shape)
    break

torch.Size([256, 30, 5])
torch.Size([256, 21])


# Define Model

In [90]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float=0.1, max_len:int =5000):
        '''
        트랜스포머의 경우 Linear층을 활용한 셀프 어텐션을 통해 연산이 되기 때문에 시퀀스 정보가 입력되지 않는다. 따라서 포지셔널 인코딩을 통하여 이를 입력시켜주어야함
        d_model : model 의 차원
        dropout : dropout 확률
        max_len : 최대 길이
        '''
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        # pe : Positional Encoding
        # 최대 길이와 모델의 차원대로 영행렬을 만들어주고 사인과 코사인을 활용하여 입력데이터에 대한 위치정보를 인코딩해준다.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # attention is all you need 논문에 제시된 계산식 활용
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # sin : 2i
        pe[:, 0::2] = torch.sin(position * div_term)
        # cos : 2i + 1
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        # register_buffer을 등록하면서 모델이 학습 중 가중치를 업데이트 하지 않고 초기값 그대로 반환
        # 어차피 위치 정보를 인코딩 하는 것이기 때문에 굳이 가중치 업데이트가 필요 없음
        # 물론 위의 방식이 아닌 다른 방식을 통하여 포지셔널 인코딩 (학습 및 nueral net 활용)을 한다면 register_buffer을 해제해야함
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int, nhead: int, num_layers: int, dropout: float=0.5):
        
        '''
        Transformer model
            - 흔히 알려진 구조 그대로 사용
            - 물론 시계열 예측의 경우 인코더만 사용한다는 얘기가 있어서 이부분 확인 필요
            - 가장 흔한 인코더 - 디코더 구조를 사용 
            - parameter
                - input_size : 입력 피처의 사이즈 [batch_size, window_size, input_size]로 구성, 메타 데이터를 활용하지 않는다면 input_size는 현재 5이나 변경될 수 있음
                - hidden_szie : 모델의 hidden_size -> 하이퍼 파라미터로 추후 변경 실험
                - output_size : 이번 태스크인 21일의 예측이기 때문에 상수로 존재
                - nhead : 트랜스포머 블록을 몇개 쌓을 것인지
                - num_layer : 층을 몇개로 할 것인지
                - dropout : dropout 확률
            - 트랜스포머 모델은 self attention만을 활용하기 때문에 시계열 layer가 아닌 Linear층 활용
            - 인코와 함께 입력값을 포지셔널 인코딩을 적용하고 그 값을 트랜스 포머 모델에 전달
            - nn.Transformer은 기본적으로 multi-head self attention이 구현되어있음
            - decoder의 경우 입력된 차원에서 Linear 층을 결합하여 output 출력 (21)
        '''
        super().__init__()
        self.encoder = nn.Linear(input_size, hidden_size)
        self.pos_encoder = PositionalEncoding(hidden_size, dropout)
        self.transformer = nn.Transformer(hidden_size, nhead, num_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, output_size)
        self.hidden_size = hidden_size

    def forward(self, src):
        src = self.encoder(src)
        src = self.pos_encoder(src)
        output = self.transformer(src, src)
        output = self.decoder(output[:, -1:, :])
        output = output.squeeze(1)
        return output

input_size = 5
hidden_size = 128 # hyperparameter
output_size = 21 
nhead = 8  # hyperparameter
num_layers = 4 # hyperparameter
dropout = 0.3 # hyperparameter

model = TransformerModel(feature_size, hidden_size, output_size, nhead, num_layers, dropout)

In [91]:
model

TransformerModel(
  (encoder): Linear(in_features=5, out_features=128, bias=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
          (linear1): Linear(in_features=128, out_features=2048, bias=True)
          (dropout): Dropout(p=0.3, inplace=False)
          (linear2): Linear(in_features=2048, out_features=128, bias=True)
          (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.3, inplace=False)
          (dropout2): Dropout(p=0.3, inplace=False)
        )
      )
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
 

# model compile

In [92]:
# Warmup Scheduler
class WarmupLR(optim.lr_scheduler.LambdaLR):

    def __init__(
        self,
        optimizer: optim.Optimizer,
        warmup_end_steps: int,
        last_epoch: int = -1,
    ):
        
        def wramup_fn(step: int):
            if step < warmup_end_steps:
                return float(step) / float(max(warmup_end_steps, 1))
            return 1.0
        
        super().__init__(optimizer, wramup_fn, last_epoch)


In [93]:
# set up gpu
gpu = 0

# define model
if gpu is not None:
    model.cuda(gpu)
model_name = type(model).__name__

# define loss
loss_function = nn.MSELoss()

# define optimizer
lr = cfg["LEARNING_RATE"]
optimizer = optim.Adam(model.parameters(), lr=lr)
optimizer_name = type(optimizer).__name__

# define scheduler
# scheduler = WarmupLR(optimizer, 1500)
scheduler = None
scheduler_name = type(scheduler).__name__ if scheduler is not None else "no"

max_epoch = cfg["EPOCHS"]

# define wandb
project_name = "LG_AIMERS_Sales_Forecast"
current_time = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
run_name = f"{current_time}_{model_name}_{optimizer_name}_optim_{lr}_with_{scheduler_name}"
run_name = sanitize_filename(run_name)
run_tags = [project_name]
# wandb.init(
#     project=project_name,
#     name=run_name,
#     tags=run_tags,
#     config={"lr": lr, "model_name": model_name, "optimizer_name": optimizer_name, "scheduler_name": scheduler_name},
#     reinit=True
# )
# wandb.watch(model)

# # set save model path
# run_dirname = "LG_AIMERS_Sales_Forecast"
# log_dir = os.path.join(project_root, "runs", run_dirname, run_name)
# log_model_path = os.path.join(log_dir, "models")
# os.makedirs(log_model_path, exist_ok=True)

# Train

In [94]:
clip_value = 1.0

In [103]:
def train(model, optimizer, train_dataloader, val_dataloader, device):
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = 9999999
    best_model = None
    
    for epoch in range(1, 2):
        model.train()
        train_loss = []
        
        for X, Y in tqdm(iter(train_dataloader)):
            X = X.to(device)
            Y = Y.to(device)
            
            # Foward
            optimizer.zero_grad()
            # get prediction
            output = model(X)
            
            loss = criterion(output, Y)
            
            # back propagation
            loss.backward()
            # Apply gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            # Perform LR scheduler Work
            if scheduler is not None:
                scheduler.step()
            
            train_loss.append(loss.item())
        
        val_loss = validation(model, val_dataloader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')
        
        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
            print('Model Saved')
        
        # # WandB logging
        # wandb.log({
        #     "Epoch": epoch,
        #     "Train Loss": np.mean(train_loss),
        #     "Validation Loss": val_loss,
        # })
        
    return best_model

def validation(model, val_dataloader, criterion, device):
    model.eval()
    val_loss = []
    
    with torch.no_grad():
        for X, Y in tqdm(iter(val_dataloader)):
            X = X.to(device)
            Y = Y.to(device)

            output = model(X)
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    return np.mean(val_loss)

In [104]:
infer_model = train(model, optimizer, train_dataloader, val_dataloader, device)

  0%|          | 0/20509 [00:00<?, ?it/s]

  0%|          | 0/5128 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.03148] Val Loss : [0.03063]
Model Saved


In [105]:
test_dataset = CustomDataset(test_input, None)
test_dataloader = DataLoader(test_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=False, num_workers=cfg["NUM_WORKERS"])

In [106]:
def inference(model, test_loader, device):
    predictions = []
    
    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)
            
            output = model(X)
            
            output = output.cpu().numpy()
            
            predictions.extend(output)
    
    return np.array(predictions)

In [107]:
pred = inference(infer_model, test_dataloader, device)

  0%|          | 0/63 [00:00<?, ?it/s]

In [108]:
# 추론 결과를 inverse scaling
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]
    
# 결과 후처리
pred = np.round(pred, 0).astype(int)

In [109]:
pred

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [3, 3, 3, ..., 4, 3, 3],
       ...,
       [2, 2, 2, ..., 2, 2, 2],
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [110]:
pred.shape

(15890, 21)

# Submission

In [111]:
submit = pd.read_csv(data_root + "/sample_submission.csv")
submit.iloc[:,1:] = pred
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,2,3,3,3,3,4,4,3,4,4,...,3,3,3,4,4,4,4,4,3,3
3,3,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
4,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4


In [113]:
submit.to_csv('./transformer.csv', index=False)

In [48]:
submit.shape

(15890, 22)

In [49]:
submit = pd.read_csv(data_root + "/sample_submission.csv")

In [50]:
submit.shape

(15890, 22)