# Library import 

In [1]:
import os
import random
import math
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from omegaconf import OmegaConf, DictConfig

from tqdm.auto import tqdm
import warnings
import wandb
from datetime import datetime
import re
from typing import Tuple

warnings.filterwarnings("ignore")

In [2]:
def sanitize_filename(filename):
    # Remove characters that are not allowed in Windows file names
    # (e.g., : / \ ? * < > | ")
    filename = re.sub(r'[\\/:*?"<>|]', '_', filename)
    return filename

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Configuration

In [4]:
project_root = os.getcwd()
data_root = os.path.join(project_root, "data")

In [39]:
cfg_dict: dict = {
    "WINDOW_SIZE" : 90,
    "PREDICT_SIZE" : 21,
    "EPOCHS" : 20,
    "LEARNING_RATE" : 1e-3,
    "BATCH_SIZE" : 9048,
    "NUM_WORKERS" : 0,
    "SEED" : 29,
    "input_size" : 5,
    "hidden_size" : 1024,
    "output_size" : 21,
    "num_layers" : 3,
    "num_attention_heads" : 4,
    "feedforward_dim" : 25,
    "dropout_rate" : 0.2,
    "hidden_sizes" : [512, 256, 128, 64]
}

cfg = OmegaConf.create(cfg_dict)
print(OmegaConf.to_yaml(cfg))

WINDOW_SIZE: 90
PREDICT_SIZE: 21
EPOCHS: 20
LEARNING_RATE: 0.001
BATCH_SIZE: 9048
NUM_WORKERS: 0
SEED: 29
input_size: 5
hidden_size: 1024
output_size: 21
num_layers: 3
num_attention_heads: 4
feedforward_dim: 25
dropout_rate: 0.2
hidden_sizes:
- 512
- 256
- 128
- 64



### SET SEED

In [6]:
random.seed(cfg["SEED"])
os.environ["PYTHONHASHSEED"] = str(cfg["SEED"])
np.random.seed(cfg["SEED"])
torch.manual_seed(cfg["SEED"])
torch.cuda.manual_seed(cfg["SEED"])
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False # 실험시 False

# Data Load

In [7]:
train = pd.read_csv("./data/train.csv")
train.drop(["ID", "제품"], axis=1, inplace=True)

In [8]:
# 숫자형 변수들의 min-max scaling을 수행하는 코드입니다.
numeric_cols = train.columns[4:]
# 칵 column의 min 및 max 계산
min_values = train[numeric_cols].min(axis=1)
max_values = train[numeric_cols].max(axis=1)
# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
ranges = max_values - min_values
ranges[ranges == 0] = 1
# min-max scaling 수행
train[numeric_cols] = (train[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
# max와 min 값을 dictionary 형태로 저장
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

In [9]:
# 범주형 데이터를 바꾸기 위한 레이블 인코딩
# 원핫인코딩, 카테고리 인코딩등 여러 방법 고려 필요

categorical_col =  ["대분류", "중분류", "소분류", "브랜드"]
for col in categorical_col:
    encoder = LabelEncoder()
    encoder.fit(train[col])
    train[col] = encoder.transform(train[col])

In [10]:
def make_train_data(data, train_size=cfg["WINDOW_SIZE"], predict_size=cfg["PREDICT_SIZE"]):
    '''
    학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
    data : 일별 판매량
    train_size : 학습에 활용할 기간
    predict_size : 추론할 기간
    '''
    num_rows = len(data)
    window_size = train_size + predict_size
    
    input_data = np.empty((num_rows * (len(data.columns) - window_size + 1), train_size, len(data.iloc[0, :4]) + 1))
    target_data = np.empty((num_rows * (len(data.columns) - window_size + 1), predict_size))
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, 4:])
        
        for j in range(len(sales_data) - window_size + 1):
            window = sales_data[j : j + window_size]
            temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
            input_data[i * (len(data.columns) - window_size + 1) + j] = temp_data
            target_data[i * (len(data.columns) - window_size + 1) + j] = window[train_size:]
    
    return input_data, target_data

In [11]:
def make_predict_data(data, train_size=cfg["WINDOW_SIZE"]):
    '''
    평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
    data : 일별 판매량
    train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
    '''
    num_rows = len(data)
    
    input_data = np.empty((num_rows, train_size, len(data.iloc[0, :4]) + 1))
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, -train_size:])
        
        window = sales_data[-train_size : ]
        temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
        input_data[i] = temp_data
    
    return input_data

In [12]:
train

Unnamed: 0,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,1,6,37,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
1,2,7,43,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.111111,0.333333,0.222222,0.00000,0.00000,0.222222,0.000000
2,2,7,43,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
3,2,7,43,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
4,0,0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,2,7,41,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
15886,2,7,43,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.024390,0.000000,0.016260,0.03252,0.00813,0.008130,0.024390
15887,2,7,43,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
15888,2,7,43,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.142857


In [13]:
train_input, train_target = make_train_data(train)
test_input = make_predict_data(train)

  0%|          | 0/15890 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

In [14]:
train_target.shape

(5609170, 21)

In [15]:
train

Unnamed: 0,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,1,6,37,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
1,2,7,43,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.111111,0.333333,0.222222,0.00000,0.00000,0.222222,0.000000
2,2,7,43,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
3,2,7,43,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
4,0,0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,2,7,41,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
15886,2,7,43,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.024390,0.000000,0.016260,0.03252,0.00813,0.008130,0.024390
15887,2,7,43,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
15888,2,7,43,3169,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.142857


In [16]:
# Train / Validation Split
data_len = len(train_input)
val_input = train_input[-int(data_len*0.2):]
val_target = train_target[-int(data_len*0.2):]
train_input = train_input[:-int(data_len*0.2)]
train_target = train_target[:-int(data_len*0.2)]

In [17]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

((4487336, 90, 5),
 (4487336, 21),
 (1121834, 90, 5),
 (1121834, 21),
 (15890, 90, 5))

# DataSet

In [18]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])
    
    def __len__(self):
        return len(self.X)

In [40]:
train_dataset = CustomDataset(train_input, train_target)
train_dataloader = DataLoader(train_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input, val_target)
val_dataloader = DataLoader(val_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=False, num_workers=0)

In [41]:
for sample in train_dataloader:
    print(sample[0].shape)
    x = sample[0]
    print(sample[1].shape)
    y = sample[1]
    break

torch.Size([9048, 90, 5])
torch.Size([9048, 21])


# Define Model

### Sota model

- SOTA : State Of The Art
- https://paperswithcode.com/sota 
- 동일한 데이터를 가지고 모델을 제작해서 가장 높은 리더보드를 제작한 모델을 SOTA 라고 한다.
- 현재 시계열 forecasting 모델 중 Sota 모델인 Linear, NLinear, DLinear를 구현하였음
    - https://paperswithcode.com/task/time-series-forecasting  
- --
- 논문 : Are Transformers Effecitve for Time Series Forecasting?
- 해당 논문에서는 실제로 트랜스포머 기반의 모델이 장기 시계열 예측에 효과적이지 않고 오히려 단순한 모델이 더 성능이 좋다는 것을 주장
- 총 3가지 모델을 제시하였고 3가지 모델 모두 제작하였음
- 간단한 선형 및 분해 모델로도 높은 성능을 낼 수 있다는 주장
- 실제로 NLiear 모델의 경우 가장 성능이 높음

- 가장 기초적인 Linear 모델

![Alt text](<제목 없음.png>)

In [94]:
class LTSF_Linear(nn.Module):

    def __init__(self, window_size: int, forcast_size: int, individual: bool, feature_size: int):
        '''
        Long Term Time Series Forecasting (LTSF) 과제에서 시간적 변화를 모델링 하는데 주 목적을 두기 때문에 시퀀스 정보가 예측에 있어 가장 중요하다.
        이러한 시퀀스 정보를 위하여 트랜스포머에서는 포지셔널 인코딩 기법을 사용하였다.
        하지만 포지셔널 인코딩 이후 진행되는 멀티 헤드 셀프 어텐션은 시퀀스에 대한 정보 손실이 있을 수 밖에 없다.
        트랜스포머가 주로 사용되는 NLP 분야에서는 문장내 시퀀스가 의미자체에 영향을 주지 않기 때문에 약간의 손실이 크게 작용하지 않는다.
        예 : 나는 소년 이다. 나는 이다 소년
        하지만 추세 및 주기성이 존재하는 시계열 데이터의 경우 이런 정보손실은 큰 문제를 발생 할 수 있다.
        -> 이러한 관점에서 트랜스포머 모델이 실제로 성능이 과장되었다고 생각하고 진행하였다고 함

        Args:
            window_size : (int) 입력 윈도우 사이즈
            forcast_size : (int) output_size 출력 사이즈, 이번 태스크의 경우 21로 상수
            individual : (bool) 각 feature의 개수별로 별도의 Linear 층을 가짐, feature를 독립적으로 처리할 때 사용
            feature_size : (int) 입력 feature의 개수, 메타데이터를 쌓지 않은 지금의 경우 [batch_size, window_size, feature_size] 로 볼 수 있음
        '''

        super().__init__()

        self.window_size = window_size
        self.forcast_size = forcast_size
        self.individual = individual
        self.channels = feature_size

        if self.individual:
            self.Linear = torch.nn.ModuleList()
            # ModuleList 파이썬의 리스트와 비슷, 레이어를 묶어서 반복문으로 처리
            for i in range(self.channels):
                # 총 입력 채널 개수만큼 반복하여 Linear층을 생성
                self.Linear.append(nn.Linear(self.window_size, self.forcast_size))
        else:
            self.Linear = nn.Linear(self.window_size, self.forcast_size)
    
    def forward(self, x):

        if self.individual:
            results = [self.Linear[i](x[:, :, i]).unsqueeze(2) for i in range(self.channels)]
            x = sum(results) / self.channels
        else:
            x = x.mean(dim=2)  # Feature 차원의 평균 계산
            x = self.Linear(x)
            x = x.unsqueeze(2)  # (batch_size, forcast_size, 1)

        return x

In [99]:
model = LTSF_Linear(window_size=90, forcast_size=21, individual=True, feature_size=5).to(device)

## DLinear

- Autoformer 및 FEDformer에서 사용되는 시계열 분해 방식을 사용한 모델
- 이동 평균값을 구하고 추세와 주기성 데이터로 분해하여 각각 선형 레이어를 통해 적용하여 결합 후 마지막 예측을 계산

![Alt text](image.png)

In [158]:
class moving_avg(nn.Module):

    def __init__(self, kernel_size: int, stride: int):

        super().__init__()
        '''
        이동 평균
        Args: 
            kernel_size : (int) 이동 평균 시퀀스를 만들 커널의 크기 (cnn의 개념과 유사)
            stride : (int) 시퀀스 건너 뛸지
        '''
        self.kernel_size = kernel_size
        self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        x = torch.cat([front, x, end], dim=1)
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)
        return x
    

class series_decomp(nn.Module):
    def __init__(self, kernel_size: int):

        '''
        시계열 분해
        이동 평균을 통해 만든 평균과 잔차를 계산해서 리턴
        Args:
            kernel_size : (int) 커널 사이즈 
           
        '''

        super().__init__()
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x):

        moving_mean = self.moving_avg(x)
        residual = x - moving_mean
        return moving_mean, residual
    
class LTSF_DLinear(nn.Module):

    def __init__(self, window_size: int, forcast_size: int, kernel_size: int, individual: bool, feature_size: int):
        '''
        Args:
            window_size : (int) 윈도우 사이즈, 몇일의 데이터를 예측할 것인지
            forcast_size : (int) 예측할 길이, 이번 태스크에서는 21일치로 상수값
            kernel_size : (int) 예측할 커널의 사이즈,
            individual : (bool) 각 특성을 독립적으로 계산할지 한번에 할지 True / False
            feature_size : (int) 입력 특성의 개수 [batch_size, window_size, feature_size]

        이 모델의 경우 위의 그림처럼 이동 평균을 통해 주기(Seasonal)와 추세(Trend)를 분해하여 각 분해된 주기와 추세를 선형 층을 통해 연산을 하고 합을 통해 결과를 예측
        위에서 제시하였던 NLinear 보다 더 높은 성능을 기대할 수 있음 
        '''
        super().__init__()
        self.window_size = window_size
        self.forcast_size = forcast_size
        self.decompsition = series_decomp(kernel_size)
        self.individual = individual
        self.channels = feature_size

        if self.individual:
            self.Linear_Seasonal = nn.ModuleList()
            self.Linear_Trend = nn.ModuleList()

            for i in range(self.channels):
                self.Linear_Trend.append(nn.Linear(self.window_size, self.forcast_size))
                self.Linear_Trend[i].weight = nn.Parameter((1/self.window_size)*torch.ones([self.forcast_size, window_size]))
                self.Linear_Seasonal.append(nn.Linear(self.window_size, self.forcast_size))
                self.Linear_Seasonal[i].weight = nn.Parameter((1/self.window_size)*torch.ones([self.forcast_size, window_size]))

        else:
            self.Linear_Trend = nn.Linear(self.window_size, self.forcast_size)
            self.Linear_Trend.weight = nn.Parameter((1/self.window_size) * torch.ones([self.forcast_size, self.window_size]))
            self.Linear_Seasonal = nn.Linear(self.window_size, self.forcast_size)
            self.Linear_Seasonal.weight = nn.Parameter((1/self.window_size) * torch.ones([self.forcast_size, self.window_size]))

    def forward(self, x):
        trend_init, seasonal_init = self.decompsition(x)
        trend_init, seasonal_init = trend_init.permute(0, 2, 1), seasonal_init.permute(0, 2, 1)

        if self.individual:
            trend_output = torch.zeros([trend_init.size(0), trend_init.size(1), self.forcast_size], dtype=trend_init.dtype).to(trend_init.device)
            seasonal_output = torch.zeros([seasonal_init.size(0), seasonal_init.size(1), self.forcast_size], dtype=seasonal_init.dtype).to(seasonal_init.device)

            for idx in range(self.channels):
                trend_output[:, idx, :] = self.Linear_Trend[idx](trend_init[:, idx, :])
                seasonal_output[:, idx, :] = self.Linear_Seasonal[idx](seasonal_init[:, idx, :])

        else:
            trend_output = self.Linear_Trend(trend_init)
            seasonal_output = self.Linear_Seasonal(seasonal_init)
        
        x = seasonal_output + trend_output
        x = x.mean(dim=1, keepdim=True)  # Feature 차원에 대한 평균 계산
        x = x.permute(0, 2, 1)
        return x

In [208]:
model = LTSF_DLinear(window_size=90, forcast_size=21, kernel_size=3, individual=True, feature_size=5)

## NLinear

- 상승하거나 하락하는 추세를 지녔을 경우 학습 데이터의 평균과 분산으로 데이터를 정규화한다면 평가 데이터에 분포 이동이 발생할 수 있다.
- 이럴 경우 학습된 모형의 예측 값은 분포에서 크게 벗어나기 때문에 예측 성능이 하락한다.
- 따라서 이를 개선하기 위하여 가장 마지막 값을 빼서 모델을 학습시키고 가장 마지막에 다시 그 값을 더해서 실제 값이 존재하는 분포로 이동시킨다
- 이러한 방법을 적용하면서 현재 시계열 예측 모델 1위를 유지중

In [192]:
class LSTF_NLinear(nn.Module):

    def __init__(self, window_size: int, forcast_size: int, individaul: bool, featrue_size: int):

        '''
        LSTM이나 트랜스포머보다 훨씬 빠른 속도로 학습 가능하다는 장점
        오컴의 면도날 이론 처럼 단순한게 가장 최고의 답일 수 있다

        Args:
            window_size : (int) 윈도우 사이즈
            forcast_size : (int) 예측 길이, 21 상수
            individual : (bool) 피처를 같이 볼지 따로 볼지
            feature_size : (int) feature_size, [batch_size, window_size, feature_size]
        '''

        super().__init__()
        self.window_size = window_size
        self.forcast_size = forcast_size
        self.individual = individaul
        self.channels = featrue_size
        if self.individual:
            self.Linear = nn.ModuleList()
            for i in range(self.channels):

                self.Linear.append(nn.Linear(self.window_size, self.forcast_size))
        else:
            self.Linear = nn.Linear(self.window_size, self.forcast_size)

    def forward(self, x):
        seq_last = x[:, -1, :].unsqueeze(1)  
        seq_last = seq_last.repeat(1, x.size(1), 1)  
        x = x - seq_last

        if self.individual:
            output = torch.zeros([x.size(0), self.forcast_size, x.size(2)], dtype=x.dtype).to(x.device)

            for i in range(self.channels):
                output[:, :, i] = self.Linear[i](x[:, :, i])
            
            x = output
        else:
            x = self.Linear(x.permute(0, 2, 1)).permute(0, 2, 1)
        
        x = x.mean(dim=2, keepdim=True)

        return x


In [193]:
model = LSTF_NLinear(window_size=90, forcast_size=21, individaul=False, featrue_size=5)

# model compile

In [33]:
# Warmup Scheduler
class WarmupLR(optim.lr_scheduler.LambdaLR):

    def __init__(
        self,
        optimizer: optim.Optimizer,
        warmup_end_steps: int,
        last_epoch: int = -1,
    ):
        
        def wramup_fn(step: int):
            if step < warmup_end_steps:
                return float(step) / float(max(warmup_end_steps, 1))
            return 1.0
        
        super().__init__(optimizer, wramup_fn, last_epoch)


In [34]:
import torch.optim.lr_scheduler as lr_scheduler

In [35]:
model_name = type(model).__name__

# define loss
loss_function = nn.MSELoss()
scheduler = None
# define optimizer
lr = cfg["LEARNING_RATE"]
optimizer = optim.Adam(model.parameters(), lr=lr)
# 기존 Adam에 Weigth decay를 적용한 옵티마이저로 더 안정적인 학습이 가능
# optimizer = optim.AdamW(model.parameters(), lr=lr)
optimizer_name = type(optimizer).__name__

# define scheduler
# 사용하지 않는 스케줄러는 주석 처리
# warmup 스케줄러 - 초기에는 작은 값으로 학습하다 학습이 안정화되면 초기 학습률로 전환하는 방법
# scheduler = WarmupLR(optimizer, 1500)
# StepLR 스케줄러  - 일정 스텝마다 학습률에 감마값을 곱하여 학습률을 조정
# step_size = 10
# gamma = 0.5
# scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

# # ExponentialLR 스케줄러 - 학습률의 곡선이 지수 함수 형태를 만들어 줌
# exponential_gamma = 0.95
# scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=exponential_gamma)

# # CosineAnnealingLR 스케줄러 - 코사인 그래프를 그리면서 학습률이 진동하는 방식, 단순히 감소가 아닌 진동하며 최적점을 찾아감
# scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)  # T_max는 주기의 반복 횟수

# # ReduceLROnPlateau 스케줄러 - 몇번 이상(patience)가 감소하지 않으면 학습률을 factor만큼 감소시킴 
# reduce_lr_patience = 5
# reduce_lr_factor = 0.1
# reduce_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=reduce_lr_patience, factor=reduce_lr_factor)

scheduler_name = type(scheduler).__name__ if scheduler is not None else "no"

# define wandb
# project_name = "LG_AIMERS_Sales_Forecast"
# current_time = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
# run_name = f"{current_time}_{model_name}_{optimizer_name}_optim_{lr}_with_{scheduler_name}"
# run_name = sanitize_filename(run_name)
# run_tags = [project_name]
# wandb.init(
#     project=project_name,
#     name=run_name,
#     tags=run_tags,
#     config={"lr": lr, "model_name": model_name, "optimizer_name": optimizer_name, "scheduler_name": scheduler_name},
#     reinit=True
# )
# wandb.watch(model)

# Train

In [36]:
clip_value = 1.0

In [45]:
def train(model, optimizer, train_dataloader, val_dataloader, device, patience=5):
    
    model.to(device)
    criterion = nn.MSELoss().to(device)
    # 초기 로스를 무한으로 설정
    best_loss = np.inf
    best_model = None
    # Early Stopping Counter
    counter = 0
    # Early Stopping Patience
    patience = 5
    best_model_state_dict = None
    
    for epoch in range(1, 2):
        model.train()
        train_loss = []
        for X, Y in tqdm(iter(train_dataloader)):
            X = X.to(device)
            Y = Y.to(device)
            Y = Y.unsqueeze(2)
            
            # Foward
            optimizer.zero_grad()

            # get prediction
            output = model(X)
            
            loss = criterion(output, Y)
            
            # back propagation
            loss.backward()
            # Apply gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            # Perform LR scheduler Work
            if scheduler is not None:
                scheduler.step()
            
            train_loss.append(loss.item())
        
        val_loss = validation(model, val_dataloader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')
        
        if best_loss > val_loss:
            best_loss = val_loss
            # 로스가 감소하였을 때 모델 갱신 및 저장
            best_model_state_dict = model.state_dict()
            torch.save(best_model_state_dict, "best_model.pth")
            counter = 0
            print('Model Saved')
        else:
            counter += 1
            print(f" Early Stopping count : {counter}")
            if counter >= patience:
                print("Early stopping.")
                break
        
        # # WandB logging
        # wandb.log({
        #     "Epoch": epoch,
        #     "Train Loss": np.mean(train_loss),
        #     "Validation Loss": val_loss,
        # })
        

    model.load_state_dict(best_model_state_dict)
    return model


def validation(model, val_dataloader, criterion, device):
    model.eval()
    val_loss = []
    
    with torch.no_grad():
        for X, Y in tqdm(iter(val_dataloader)):
            X = X.to(device)
            Y = Y.to(device)
            Y = Y.unsqueeze(2)
            
            output = model(X)
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    return np.mean(val_loss)

In [209]:
infer_model = train(model, optimizer, train_dataloader, val_dataloader, device)

  0%|          | 0/496 [00:00<?, ?it/s]

  0%|          | 0/124 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [87307.59594] Val Loss : [326993.54297]
Model Saved


In [200]:
test_dataset = CustomDataset(test_input, None)
test_dataloader = DataLoader(test_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=False, num_workers=cfg["NUM_WORKERS"])

In [201]:
def inference(model, test_loader, device):
    predictions = []
    
    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)
            
            output = model(X)
            
            output = output.squeeze(2)

            output = output.cpu().numpy()
            
            predictions.extend(output)
    
    return np.array(predictions)

In [202]:
pred = inference(infer_model, test_dataloader, device)

  0%|          | 0/2 [00:00<?, ?it/s]

In [203]:
# 추론 결과를 inverse scaling
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]
    
# 결과 후처리
pred = np.round(pred, 0).astype(int)
pred = np.clip(pred, 0, None)

In [204]:
pred.shape

(15890, 21)

# Submission

In [205]:
submit = pd.read_csv(data_root + "/sample_submission.csv")
print(submit.shape)
submit.iloc[:,1:] = pred
submit.head()

(15890, 22)


Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,1,1,0
2,2,0,1,0,0,2,2,3,0,0,...,0,3,0,0,0,3,0,4,3,0
3,3,0,2,0,0,4,4,7,0,0,...,0,6,0,0,0,5,1,7,5,0
4,4,0,2,0,0,3,2,4,0,0,...,0,3,0,0,0,4,0,4,3,0


In [206]:
submit.shape

(15890, 22)

In [207]:
submit.to_csv('./nlinear.csv', index=False)