# Library import

In [2]:
import os
import random
import math
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm
import warnings
from datetime import datetime
import re
from typing import Tuple

warnings.filterwarnings("ignore")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def sanitize_filename(filename):
    # Remove characters that are not allowed in Windows file names
    # (e.g., : / \ ? * < > | ")
    filename = re.sub(r'[\\/:*?"<>|]', '_', filename)
    return filename

In [6]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Configuration

In [7]:
cfg: dict = {
    "WINDOW_SIZE" : 40,
    "PREDICT_SIZE" : 21,
    "EPOCHS" : 20,
    "LEARNING_RATE" : 1e-3,
    "BATCH_SIZE" : 1024,
    "NUM_WORKERS" : 0,
    "SEED" : 15,
    "input_size" : 6,
    "hidden_size" : 1024,
    "output_size" : 21,
    "num_layers" : 3,
    "num_attention_heads" : 4,
    "feedforward_dim" : 25,
    "dropout_rate" : 0.2,
    "hidden_sizes" : [512, 256, 128, 64]
}

### SET SEED

In [8]:
random.seed(cfg["SEED"])
os.environ["PYTHONHASHSEED"] = str(cfg["SEED"])
np.random.seed(cfg["SEED"])
torch.manual_seed(cfg["SEED"])
torch.cuda.manual_seed(cfg["SEED"])
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False # 실험시 False

# Data Load

In [64]:
train = pd.read_csv("/content/drive/MyDrive/open/train.csv")
train.drop(["ID"], axis=1, inplace=True)

In [65]:
info = pd.read_csv("/content/drive/MyDrive/open/product_info.csv")

In [66]:
info["제품특성"] = info["제품특성"].apply(lambda x:x.split(":")[0])

In [67]:
train.drop("제품", axis=1, inplace=True)

In [70]:
train = pd.concat([train, info], axis=1)

In [72]:
train = train.drop(["제품"], axis=1)

In [73]:
col1=train.columns[-1:].to_list()
col2=train.columns[:-1].to_list()
new_col=col1+col2
train=train[new_col]

In [76]:
train = train.fillna("0")

In [78]:
# 숫자형 변수들의 min-max scaling을 수행하는 코드입니다.
numeric_cols = train.columns[6:]
# 칵 column의 min 및 max 계산
min_values = train[numeric_cols].min(axis=1)
max_values = train[numeric_cols].max(axis=1)
# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
ranges = max_values - min_values
ranges[ranges == 0] = 1
# min-max scaling 수행
train[numeric_cols] = (train[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
# max와 min 값을 dictionary 형태로 저장
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

In [80]:
# 범주형 데이터를 바꾸기 위한 레이블 인코딩
# 원핫인코딩, 카테고리 인코딩등 여러 방법 고려 필요

categorical_col =  ["제품특성","대분류", "중분류", "소분류", "브랜드", "쇼핑몰"]
for col in categorical_col:
    encoder = LabelEncoder()
    encoder.fit(train[col])
    train[col] = encoder.transform(train[col])

In [81]:
def make_train_data(data, train_size=cfg["WINDOW_SIZE"], predict_size=cfg["PREDICT_SIZE"]):
    '''
    학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
    data : 일별 판매량
    train_size : 학습에 활용할 기간
    predict_size : 추론할 기간
    '''
    num_rows = len(data)
    window_size = train_size + predict_size

    input_data = np.empty((num_rows * (len(data.columns) - window_size + 1), train_size, len(data.iloc[0, :6]) + 1))
    target_data = np.empty((num_rows * (len(data.columns) - window_size + 1), predict_size))

    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :6])
        sales_data = np.array(data.iloc[i, 6:])

        for j in range(len(sales_data) - window_size + 1):
            window = sales_data[j : j + window_size]
            temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
            input_data[i * (len(data.columns) - window_size + 1) + j] = temp_data
            target_data[i * (len(data.columns) - window_size + 1) + j] = window[train_size:]

    return input_data, target_data

In [82]:
def make_predict_data(data, train_size=cfg["WINDOW_SIZE"]):
    '''
    평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
    data : 일별 판매량
    train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
    '''
    num_rows = len(data)

    input_data = np.empty((num_rows, train_size, len(data.iloc[0, :6]) + 1))

    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :6])
        sales_data = np.array(data.iloc[i, -train_size:])

        window = sales_data[-train_size : ]
        temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
        input_data[i] = temp_data

    return input_data

In [84]:
train_input, train_target = make_train_data(train)
test_input = make_predict_data(train)

  0%|          | 0/28894 [00:00<?, ?it/s]

  0%|          | 0/28894 [00:00<?, ?it/s]

In [85]:
train_target.shape

(12279950, 21)

In [86]:
# Train / Validation Split
data_len = len(train_input)
val_input = train_input[-int(data_len*0.2):]
val_target = train_target[-int(data_len*0.2):]
train_input = train_input[:-int(data_len*0.2)]
train_target = train_target[:-int(data_len*0.2)]

In [87]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

((9823960, 40, 7),
 (9823960, 21),
 (2455990, 40, 7),
 (2455990, 21),
 (28894, 40, 7))

# DataSet

In [88]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])

    def __len__(self):
        return len(self.X)

In [89]:
train_dataset = CustomDataset(train_input, train_target)
train_dataloader = DataLoader(train_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input, val_target)
val_dataloader = DataLoader(val_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=False, num_workers=0)

In [90]:
for sample in train_dataloader:
    print(sample[0].shape)
    x = sample[0]
    print(sample[1].shape)
    y = sample[1]
    break

torch.Size([1024, 40, 7])
torch.Size([1024, 21])


# Define Model

In [98]:
class BaseModel(nn.Module):
    def __init__(self, input_size=7, hidden_size=768, output_size=21):
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.GRU(input_size, hidden_size, num_layers = 2, bidirectional=True, batch_first=True, dropout=0.3)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size*2, hidden_size//2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_size//2, output_size)
        )

        self.actv = nn.ReLU()

    def forward(self, x):
        # x shape: (B, TRAIN_WINDOW_SIZE, 6)
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size, x.device)

        # LSTM layer
        lstm_out, hidden = self.rnn(x, hidden)

        # Only use the last output sequence
        last_output = lstm_out[:, -1, :]

        # Fully connected layer
        output = self.actv(self.fc(last_output))

        return output.squeeze(1)

    def init_hidden(self, batch_size, device):
        # Initialize hidden state and cell state
        return torch.zeros(4, batch_size, self.hidden_size, device=device)

In [99]:
model = BaseModel()

# model compile

In [100]:
# Warmup Scheduler
class WarmupLR(optim.lr_scheduler.LambdaLR):

    def __init__(
        self,
        optimizer: optim.Optimizer,
        warmup_end_steps: int,
        last_epoch: int = -1,
    ):

        def wramup_fn(step: int):
            if step < warmup_end_steps:
                return float(step) / float(max(warmup_end_steps, 1))
            return 1.0

        super().__init__(optimizer, wramup_fn, last_epoch)


In [101]:
# set up gpu
gpu = 0

# define model
if gpu is not None:
    model.cuda(gpu)
model_name = type(model).__name__

# define loss
loss_function = nn.MSELoss()
# scheduler = None
# define optimizer
lr = cfg["LEARNING_RATE"]
optimizer = optim.Adam(model.parameters(), lr=lr)
# 기존 Adam에 Weigth decay를 적용한 옵티마이저로 더 안정적인 학습이 가능
# optimizer = optim.AdamW(model.parameters(), lr=lr)
optimizer_name = type(optimizer).__name__

# define scheduler
# 사용하지 않는 스케줄러는 주석 처리
# warmup 스케줄러 - 초기에는 작은 값으로 학습하다 학습이 안정화되면 초기 학습률로 전환하는 방법
scheduler = WarmupLR(optimizer, 1500)
# StepLR 스케줄러  - 일정 스텝마다 학습률에 감마값을 곱하여 학습률을 조정
# step_size = 10
# gamma = 0.5
# scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

# # ExponentialLR 스케줄러 - 학습률의 곡선이 지수 함수 형태를 만들어 줌
# exponential_gamma = 0.95
# scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=exponential_gamma)

# # CosineAnnealingLR 스케줄러 - 코사인 그래프를 그리면서 학습률이 진동하는 방식, 단순히 감소가 아닌 진동하며 최적점을 찾아감
# scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)  # T_max는 주기의 반복 횟수

# # ReduceLROnPlateau 스케줄러 - 몇번 이상(patience)가 감소하지 않으면 학습률을 factor만큼 감소시킴
# reduce_lr_patience = 5
# reduce_lr_factor = 0.1
# reduce_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=reduce_lr_patience, factor=reduce_lr_factor)

scheduler_name = type(scheduler).__name__ if scheduler is not None else "no"

# define wandb
# project_name = "LG_AIMERS_Sales_Forecast"
# current_time = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
# run_name = f"{current_time}_{model_name}_{optimizer_name}_optim_{lr}_with_{scheduler_name}"
# run_name = sanitize_filename(run_name)
# run_tags = [project_name]
# wandb.init(
#     project=project_name,
#     name=run_name,
#     tags=run_tags,
#     config={"lr": lr, "model_name": model_name, "optimizer_name": optimizer_name, "scheduler_name": scheduler_name},
#     reinit=True
# )
# wandb.watch(model)

# Train

In [102]:
clip_value = 1.0

In [103]:
def train(model, optimizer, train_dataloader, val_dataloader, device, patience=5):

    model.to(device)
    criterion = nn.MSELoss().to(device)
    # 초기 로스를 무한으로 설정
    best_loss = np.inf
    best_model = None
    # Early Stopping Counter
    counter = 0
    # Early Stopping Patience
    patience = 2
    best_model_state_dict = None

    for epoch in range(1, 20):
        model.train()
        train_loss = []
        for X, Y in tqdm(iter(train_dataloader)):
            X = X.to(device)
            Y = Y.to(device)

            # Foward
            optimizer.zero_grad()

            # get prediction
            output = model(X)

            loss = criterion(output, Y)

            # back propagation
            loss.backward()
            # Apply gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            # Perform LR scheduler Work
            if scheduler is not None:
                scheduler.step()

            train_loss.append(loss.item())

        val_loss = validation(model, val_dataloader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')

        if best_loss > val_loss:
            best_loss = val_loss
            # 로스가 감소하였을 때 모델 갱신 및 저장
            best_model_state_dict = model.state_dict()
            torch.save(best_model_state_dict, "best_model.pth")
            counter = 0
            print('Model Saved')
        else:
            counter += 1
            print(f" Early Stopping count : {counter}")
            if counter >= patience:
                print("Early stopping.")
                break

        # # WandB logging
        # wandb.log({
        #     "Epoch": epoch,
        #     "Train Loss": np.mean(train_loss),
        #     "Validation Loss": val_loss,
        # })


    model.load_state_dict(best_model_state_dict)
    return model


def validation(model, val_dataloader, criterion, device):
    model.eval()
    val_loss = []

    with torch.no_grad():
        for X, Y in tqdm(iter(val_dataloader)):
            X = X.to(device)
            Y = Y.to(device)

            output = model(X)
            loss = criterion(output, Y)

            val_loss.append(loss.item())
    return np.mean(val_loss)

In [None]:
infer_model = train(model, optimizer, train_dataloader, val_dataloader, device)

  0%|          | 0/9594 [00:00<?, ?it/s]

  0%|          | 0/2399 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.01503] Val Loss : [0.01469]
Model Saved


  0%|          | 0/9594 [00:00<?, ?it/s]

  0%|          | 0/2399 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [0.01519] Val Loss : [0.02983]
 Early Stopping count : 1


  0%|          | 0/9594 [00:00<?, ?it/s]

  0%|          | 0/2399 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [0.01558] Val Loss : [0.01456]
Model Saved


  0%|          | 0/9594 [00:00<?, ?it/s]

  0%|          | 0/2399 [00:00<?, ?it/s]

Epoch : [4] Train Loss : [0.01471] Val Loss : [0.01460]
 Early Stopping count : 1


  0%|          | 0/9594 [00:00<?, ?it/s]

In [None]:
test_dataset = CustomDataset(test_input, None)
test_dataloader = DataLoader(test_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=False, num_workers=cfg["NUM_WORKERS"])

In [None]:
def inference(model, test_loader, device):
    predictions = []

    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)

            output = model(X)

            output = output.squeeze(2)

            output = output.cpu().numpy()

            predictions.extend(output)

    return np.array(predictions)

In [None]:
pred = inference(infer_model, test_dataloader, device)

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# 추론 결과를 inverse scaling
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]

# 결과 후처리
pred = np.round(pred, 0).astype(int)
pred = np.clip(pred, 0, None)

In [None]:
pred.shape

(15890, 21)

# Submission

In [None]:
submit = pd.read_csv(data_root + "/sample_submission.csv")
print(submit.shape)
submit.iloc[:,1:] = pred
submit.head()

(15890, 22)


Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,1,1,0
2,2,0,1,0,0,2,2,3,0,0,...,0,3,0,0,0,3,0,4,3,0
3,3,0,2,0,0,4,4,7,0,0,...,0,6,0,0,0,5,1,7,5,0
4,4,0,2,0,0,3,2,4,0,0,...,0,3,0,0,0,4,0,4,3,0


In [None]:
submit.shape

(15890, 22)

In [None]:
submit.to_csv('./nlinear.csv', index=False)