In [None]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller

In [None]:
from datetime import datetime
import seaborn as sns

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from math import sqrt
from math import ceil

In [None]:
from torch.nn import DataParallel

import time
import json
import pickle

import warnings
warnings.filterwarnings('ignore')

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyperparameter Setting

In [None]:
# Global experiment configuration read by the windowing, training and inference cells.
CFG = {
    'TRAIN_WINDOW_SIZE':90, # length (in days) of the input window the model learns from
    'PREDICT_SIZE':21, # number of future days to predict
    'EPOCHS':5,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':2048,
    'SEED':41
}

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, the hash seed environment variable, NumPy and
    PyTorch (CPU and all CUDA devices), and configures cuDNN for
    reproducible kernels.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU and is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode auto-tunes kernels per run and can pick different
    # algorithms each time, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # fix the seed

# 1. 데이터 정의

In [None]:
# Load the raw tables from Google Drive. The 'ID' and '제품' columns are dropped
# from the two daily tables; the brand keyword table is kept as-is.
train_data = pd.read_csv('/content/drive/MyDrive/LG aimers/train.csv').drop(columns=['ID', '제품'])
price_data = pd.read_csv('/content/drive/MyDrive/LG aimers/sales.csv').drop(columns=['ID', '제품'])
brand_data = pd.read_csv('/content/drive/MyDrive/LG aimers/brand_keyword_cnt.csv')

In [None]:
# Copy of the daily columns of train_data with zeros replaced by 1, so the
# division below never divides by zero.
tmp_date = train_data.iloc[:, 4:].replace(0, 1)
# Divide every daily value of price_data by the matching train_data value.
# NOTE(review): presumably sales.csv holds revenue and train.csv quantities,
# which would make the result a per-unit price — confirm against the data spec.
price_data.iloc[:, 4:] = price_data.iloc[:, 4:] / tmp_date

In [None]:
# Left-join the brand keyword table onto the four leading columns of
# train_data, keyed on '브랜드'; brands missing from brand_data get NaN values.
# NOTE(review): assumes brand_data has one row per brand — duplicate brands
# would multiply rows and break row alignment with train_data; verify.
brand_data = pd.merge(train_data.iloc[:, :4], brand_data, how='left', left_on='브랜드', right_on='브랜드')

#### Label Encoding

In [None]:
# Label Encoding
# One encoder object is re-fitted on train_data for each categorical column and
# the same mapping is then applied to the price and brand tables.
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']

for col in categorical_columns:
    train_data[col] = label_encoder.fit(train_data[col]).transform(train_data[col])
    for frame in (price_data, brand_data):
        frame[col] = label_encoder.transform(frame[col])

#### Data Scaling

In [None]:
def normalization(data, n_meta_cols=4):
    """Min-max scale every row of the numeric (daily) columns to [0, 1].

    Scaling is done per row (one series at a time), not per column. Rows
    whose values are all equal have a range of 0; that range is replaced by 1
    so such rows scale to all zeros instead of dividing by zero.

    The input frame is not modified; a scaled copy is returned.

    Args:
        data: DataFrame whose first ``n_meta_cols`` columns are metadata and
            whose remaining columns are numeric values to scale.
        n_meta_cols: Number of leading columns left untouched (default 4).

    Returns:
        Tuple ``(scaled, scale_min_dict, scale_max_dict)`` where the two
        dicts map each row's index label to that row's original min / max.
    """
    # Work on a copy so re-running the calling cell cannot silently re-scale
    # data that has already been scaled in place.
    data = data.copy()
    numeric_cols = data.columns[n_meta_cols:]
    values = data[numeric_cols]

    # Per-row minimum and maximum
    min_values = values.min(axis=1)
    max_values = values.max(axis=1)

    # Per-row range (max - min); a zero range is replaced by 1
    ranges = max_values - min_values
    ranges[ranges == 0] = 1

    # Apply the min-max scaling
    data[numeric_cols] = values.subtract(min_values, axis=0).div(ranges, axis=0)

    # Keep min and max per row so predictions can be inverse-scaled later
    scale_min_dict = min_values.to_dict()
    scale_max_dict = max_values.to_dict()
    return (data, scale_min_dict, scale_max_dict)

In [None]:
# Scale both daily tables row by row and keep each row's min/max so that
# model outputs can be mapped back to the original units later.
train_data, train_scale_min_dict, train_scale_max_dict = normalization(train_data)
price_data, price_scale_min_dict, price_scale_max_dict = normalization(price_data)

#### Missing Imputation

In [None]:
# Index labels of the rows whose brand keyword data contains at least one NaN.
none_brand_idx = brand_data[brand_data.isnull().any(axis=1)].index

In [None]:
# Keep the rows that are about to be removed, one frame per table.
drop_train_data = train_data.loc[none_brand_idx]
drop_price_data = price_data.loc[none_brand_idx]
drop_brand_data = brand_data.loc[none_brand_idx]

In [None]:
# Remove the rows with missing brand data from all three tables. The original
# index labels are preserved (no reset_index), so later cells look rows up by label.
train_data = train_data.drop(none_brand_idx)
price_data = price_data.drop(none_brand_idx)
brand_data = brand_data.drop(none_brand_idx)

In [None]:
drop_train_data

Unnamed: 0,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
494,1,3,18,95,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.004883,0.002930,0.009766,0.003906,0.004883,0.009766,0.002930
1249,4,10,51,246,0.142857,0.428571,0.714286,0.714286,0.285714,0.285714,...,0.000000,0.000000,0.000000,0.000000,0.142857,0.285714,0.000000,0.000000,0.428571,0.000000
1261,1,1,5,250,1.000000,0.814815,0.759259,0.851852,0.944444,0.833333,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1262,1,1,5,250,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1477,0,0,2,303,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.064516,0.451613,0.645161,0.161290,0.451613,0.419355,0.193548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15713,1,6,39,3142,0.032258,0.032258,0.032258,0.010753,0.010753,0.010753,...,0.010753,0.010753,0.032258,0.032258,0.021505,0.000000,0.043011,0.021505,0.032258,0.010753
15714,1,6,37,3142,0.033333,0.033333,0.033333,0.033333,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
15764,0,0,0,3149,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
15765,0,0,0,3149,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


#### validation

In [None]:
def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE'], feature_size = 4):
    """Slice every row of ``data`` into sliding (input, target) windows.

    Args:
        data: DataFrame whose first ``feature_size`` columns are encoded
            metadata and whose remaining columns are the daily series.
        train_size: Number of days in each input window.
        predict_size: Number of days in each target window.
        feature_size: Number of leading metadata columns.

    Returns:
        ``(input_data, target_data, series_size)``: inputs of shape
        ``(rows * series_size, train_size, feature_size + 1)`` where the
        metadata is repeated on every time step and the last channel holds the
        series values, targets of shape ``(rows * series_size, predict_size)``,
        and the number of windows produced per row.
    """
    n_series = len(data)
    span = train_size + predict_size
    series_size = len(data.iloc[0, feature_size:]) - span + 1
    n_channels = len(data.iloc[0, :feature_size]) + 1

    input_data = np.empty((n_series * series_size, train_size, n_channels))
    target_data = np.empty((n_series * series_size, predict_size))

    for row in tqdm(range(n_series)):
        # Metadata is constant for the row, so tile it once per row.
        static_block = np.tile(np.array(data.iloc[row, :feature_size]), (train_size, 1))
        history = np.array(data.iloc[row, feature_size:])
        base = row * series_size

        for start in range(len(history) - span + 1):
            chunk = history[start : start + span]
            input_data[base + start] = np.column_stack((static_block, chunk[:train_size]))
            target_data[base + start] = chunk[train_size:]

    return input_data, target_data, series_size

In [None]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], feature_size=4):
    """Build one inference window per row from the last ``train_size`` days.

    Args:
        data: DataFrame whose first ``feature_size`` columns are encoded
            metadata and whose remaining columns are the daily series.
        train_size: Number of trailing days used as model input.
        feature_size: Number of leading metadata columns.

    Returns:
        Array of shape ``(rows, train_size, feature_size + 1)`` laid out like
        the inputs produced by ``make_train_data``.
    """
    n_series = len(data)
    n_channels = len(data.iloc[0, :feature_size]) + 1
    input_data = np.empty((n_series, train_size, n_channels))

    for row in tqdm(range(n_series)):
        static_block = np.tile(np.array(data.iloc[row, :feature_size]), (train_size, 1))
        # The trailing slice already holds at most train_size values.
        recent = np.array(data.iloc[row, -train_size:])
        input_data[row] = np.column_stack((static_block, recent))

    return input_data

In [None]:
# Bundle the three aligned tables under one name.
# NOTE(review): make_train_data / make_predict_data index their argument with
# .iloc, so they need a single DataFrame rather than this dict.
data = {
    'sales_data' : train_data,
    'brand_data' : brand_data,
    'price_data' : price_data
}

In [None]:
# The windowing helpers index their argument with .iloc, so they must receive
# the sales DataFrame itself; passing the whole dict raises AttributeError.
train_input, train_target, series_size = make_train_data(data['sales_data'])
test_input = make_predict_data(data['sales_data'])

AttributeError: ignored

In [None]:
# Cache the windowed arrays in the current working directory.
np.save('train_input.npy', train_input)
np.save('train_target.npy', train_target)
np.save('test_input.npy', test_input)

In [None]:
# Reload the cached arrays.
# NOTE(review): series_size is not cached, so it must still be computed by a
# make_train_data call before train_val_split can run.
train_input = np.load('train_input.npy')
train_target = np.load('train_target.npy')
test_input = np.load('test_input.npy')

In [None]:
# Build the sliding-window training arrays and the inference windows directly
# from the sales table; this overwrites any arrays loaded from the .npy cache.
train_input, train_target, series_size = make_train_data(train_data)
test_input = make_predict_data(train_data)

In [None]:
def train_val_split(data, input_data, target_data, series_size, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    """Hold out 20% of the window positions as validation folds.

    Window positions are sampled with ``random.sample``; every sampled
    position becomes one validation fold containing that window for every row
    of ``data``. The remaining positions form the training arrays, ordered by
    window position and then by row.

    Args:
        data: Frame the windows were built from (only its length is used).
        input_data: Windowed inputs laid out as ``row * series_size + window``.
        target_data: Windowed targets with the same layout.
        series_size: Number of windows per row.
        train_size: Input window length.
        predict_size: Target window length.

    Returns:
        ``(train_input, train_target, val_inputs, val_targets)`` where the
        last two are lists holding one array per validation window position.
    """
    val_index = sorted(random.sample(range(series_size),int(series_size*0.2)))

    num_rows = len(data)
    n_channels = input_data.shape[2]
    # Flat position of window 0 for every row; adding a window id selects
    # that window across all rows at once.
    row_offsets = np.arange(num_rows) * series_size

    val_inputs = []  # one entry per held-out window, in day order
    val_targets = []
    for win in tqdm(val_index):
        fold_x = np.empty((num_rows, train_size, n_channels))
        fold_y = np.empty((num_rows, predict_size))
        fold_x[:] = input_data[row_offsets + win]
        fold_y[:] = target_data[row_offsets + win]
        val_inputs.append(fold_x)
        val_targets.append(fold_y)

    train_series_size = series_size - len(val_index)

    train_input = np.empty((num_rows * train_series_size, train_size, n_channels))
    train_target = np.empty((num_rows * train_series_size, predict_size))

    # Fill the training arrays with every window that was not held out
    cursor = 0
    for win in tqdm(range(series_size)):
        if win in val_index:
            continue
        train_input[cursor : cursor + num_rows] = input_data[row_offsets + win]
        train_target[cursor : cursor + num_rows] = target_data[row_offsets + win]
        cursor += num_rows

    return train_input, train_target, val_inputs, val_targets

In [None]:
# Train / Validation Split
# Rebinds train_input / train_target to the training subset; val_inputs and
# val_targets are read as globals by train().
train_input, train_target, val_inputs, val_targets = train_val_split(train_data, train_input, train_target, series_size)

In [None]:
train_input.shape, train_target.shape, test_input.shape

In [None]:
val_inputs[0].shape, val_targets[0].shape

In [None]:
len(val_inputs), len(val_targets)

#### Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset over pre-built window arrays.

    Yields ``(input, target)`` tensor pairs when targets are supplied and
    input tensors alone when ``Y`` is None (inference).
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        sample = torch.Tensor(self.X[index])
        if self.Y is None:
            return sample
        return sample, torch.Tensor(self.Y[index])

In [None]:
# Shuffled mini-batches over the training windows.
train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

# 2. 모델 정의

#### ConvLSTM encoder–decoder (TensorFlow / Keras)

In [None]:
# NOTE(review): `tf` is not imported anywhere in the visible notebook, so this
# cell raises NameError as written — confirm whether the TensorFlow cells are
# still needed alongside the PyTorch Crossformer defined further down.
class ConvLSTMCell(tf.keras.Model):
    """Single convolutional LSTM cell.

    One Conv2D produces all four gate pre-activations (``4 * hidden_dim``
    filters) from the concatenation of the input and the previous hidden state.
    """
    def __init__(self, hidden_dim, kernel_size, bias):
        super(ConvLSTMCell, self).__init__()
        self.hidden_dim = hidden_dim

        self.kernel_size = kernel_size
        self.bias = bias

        # 'same' padding keeps the spatial size so the state shape is stable.
        self.conv = tf.keras.layers.Conv2D(
            filters = 4 * self.hidden_dim,
            kernel_size = self.kernel_size,
            padding = 'same',
            use_bias = self.bias
        )

    def call(self, input_tensor, cur_state):
        """Advance the cell one step and return ``(h_next, c_next)``."""
        h_cur, c_cur = cur_state
        # Concatenate input and hidden state along axis 3 (the last axis of the
        # [batch, height, width, channels] tensors built by init_hidden).
        combined = tf.concat([input_tensor, h_cur], axis=3)
        combined_conv = self.conv(combined)
        # Split the conv output into input / forget / output / candidate parts.
        cc_i, cc_f, cc_o, cc_g = tf.split(combined_conv, num_or_size_splits=4, axis=-1)
        i = tf.keras.activations.sigmoid(cc_i)
        f = tf.keras.activations.sigmoid(cc_f)
        o = tf.keras.activations.sigmoid(cc_o)
        g = tf.keras.activations.tanh(cc_g)

        c_next = f*c_cur+i*g
        h_next = o*tf.keras.activations.tanh(c_next)

        return h_next, c_next

    def init_hidden(self, batch_size, image_size):
        """Return zero-filled ``(h, c)`` of shape [batch, height, width, hidden_dim]."""
        height, width = image_size
        return (tf.zeros([batch_size, height, width, self.hidden_dim]),
                tf.zeros([batch_size, height, width, self.hidden_dim]))

In [None]:
# NOTE(review): this TensorFlow `Encoder` is shadowed by the PyTorch
# `class Encoder(nn.Module)` defined later in the notebook, and `tf` is not
# imported in the visible cells.
class Encoder(tf.keras.Model):
    """ConvLSTM encoder: an input cell followed by optional stacked cells.

    Returns the final ``(h, c)`` state of the last stacked cell, or of the
    input cell when ``enc_num_layers`` is None.
    """
    def __init__(self, hidden, enc_num_layers=1):
        super(Encoder, self).__init__()
        self.enc_num_layers = enc_num_layers
        self.encoder_input_convlstm = ConvLSTMCell(
            hidden_dim=hidden,
            kernel_size=(3, 3),
            bias=True
        )
        if self.enc_num_layers is not None:
            self.hidden_encoder_layers = [
                ConvLSTMCell(
                    hidden_dim=hidden,
                    kernel_size=(3, 3),
                    bias=True
                ) for _ in range(self.enc_num_layers)
            ]

    def call(self, enc_input):
        """Run the whole sequence; axis 1 of ``enc_input`` is the time axis."""
        h_t, c_t = self.init_hidden(enc_input, 'seq')
        if self.enc_num_layers is not None:
            # One zero state per stacked layer, shaped like the input cell's state.
            hidden_h_t = []
            hidden_c_t = []
            for i in range(self.enc_num_layers):
                hidden_h_t += [self.init_hidden(h_t, i)[0]]
                hidden_c_t += [self.init_hidden(h_t, i)[1]]

        seq_len = enc_input.shape[1]
        for t in range(seq_len):
            h_t, c_t = self.encoder_input_convlstm(
                input_tensor=enc_input[:, t, :, :, :],
                cur_state=[h_t, c_t]
            )
            input_tensor = h_t
            if self.enc_num_layers is not None:
                # Feed each layer's hidden state into the next stacked layer.
                for i in range(self.enc_num_layers):
                    hidden_h_t[i], hidden_c_t[i] = self.hidden_encoder_layers[i](
                        input_tensor=input_tensor,
                        cur_state=[hidden_h_t[i], hidden_c_t[i]]
                    )
                    input_tensor = hidden_h_t[i]

        if self.enc_num_layers is not None:
            return hidden_h_t[-1], hidden_c_t[-1]
        else:
            return h_t, c_t

    def init_hidden(self, input_tensor, seq):
        """Zero state for the input cell (``seq == 'seq'``) or for stacked layer ``seq``."""
        if seq == 'seq':
            # 5-D input: batch, time, height, width, channels
            b, seq_len, h, w, _ = input_tensor.shape
            h_t, c_t = self.encoder_input_convlstm.init_hidden(
                batch_size=b,
                image_size=(h, w)
            )
        else:
            # 4-D state tensor: batch, height, width, channels
            b, h, w, _ = input_tensor.shape
            h_t, c_t = self.hidden_encoder_layers[seq].init_hidden(
                batch_size=b,
                image_size=(h, w)
            )
        return h_t, c_t

In [None]:
# Smoke test of the ConvLSTM encoder.
# NOTE(review): at this point train_data is a pandas DataFrame; iterating a
# DataFrame yields column labels, so next(iter(train_data))[0] is not a
# 5-D batch — this cell looks carried over from another notebook; confirm.
sample_enc_input_data = next(iter(train_data))[0]
sample_encoder = Encoder(16, 1)
enc_output = sample_encoder(sample_enc_input_data)
enc_output[0].shape, enc_output[1].shape

In [None]:
# NOTE(review): this TensorFlow `Decoder` is shadowed by the PyTorch
# `class Decoder(nn.Module)` defined later in the notebook, and `tf` is not
# imported in the visible cells.
class Decoder(tf.keras.Model):
    """ConvLSTM decoder that unrolls ``future_len`` steps from the encoder state.

    Each step's hidden state goes through a sigmoid-activated 1-filter Conv2D
    to produce that step's output frame.
    """
    def __init__(self, hidden, dec_num_layers=1, future_len=12):
        super(Decoder, self).__init__()
        self.dec_num_layers = dec_num_layers
        self.future_len = future_len
        self.decoder_input_convlstm = ConvLSTMCell(
            hidden_dim=hidden,
            kernel_size=(3, 3),
            bias=True
        )
        if self.dec_num_layers is not None:
            self.hidden_decoder_layers = [
                ConvLSTMCell(
                    hidden_dim=hidden,
                    kernel_size=(3, 3),
                    bias=True
                ) for _ in range(dec_num_layers)
            ]

        self.decoder_output_layer = tf.keras.layers.Conv2D(
            filters=1,
            kernel_size=(3,3),
            padding='same',
            activation='sigmoid'
        )

    def call(self, enc_output):
        """Return the predicted frames stacked along axis 1 (the time axis)."""
        if self.dec_num_layers is not None:
            # Zero states for the stacked layers, shaped like the encoder's h.
            hidden_h_t = []
            hidden_c_t = []
            for i in range(self.dec_num_layers):
                hidden_h_t += [self.init_hidden(enc_output[0], i)[0]]
                hidden_c_t += [self.init_hidden(enc_output[0], i)[1]]

        outputs = []
        # The encoder's final hidden state is the first decoder input; the
        # decoder's own states start from zeros.
        input_tensor = enc_output[0]
        h_t, c_t = self.init_hidden(input_tensor, 'seq')
        for t in range(self.future_len):
            h_t, c_t=self.decoder_input_convlstm(
                input_tensor=input_tensor,
                cur_state=[h_t, c_t]
            )
            input_tensor = h_t
            if self.dec_num_layers is not None:
                for i in range(self.dec_num_layers):
                    hidden_h_t[i], hidden_c_t[i] = self.hidden_decoder_layers[i](
                        input_tensor=input_tensor,
                        cur_state=[hidden_h_t[i], hidden_c_t[i]]
                    )
                    # The last stacked hidden state is also the next step's input.
                    input_tensor=hidden_h_t[i]
                output = self.decoder_output_layer(hidden_h_t[-1])
            else:
                output = self.decoder_output_layer(h_t)
            outputs += [output]
        outputs = tf.stack(outputs, 1)

        return outputs

    def init_hidden(self, input_tensor, seq):
        """Zero state for the input cell (``seq == 'seq'``) or for stacked layer ``seq``."""
        if seq == 'seq':
            b, h, w, _ = input_tensor.shape
            h_t, c_t = self.decoder_input_convlstm.init_hidden(
                batch_size=b,
                image_size=(h, w)
            )
        else:
            b, h, w, _ = input_tensor.shape
            h_t, c_t = self.hidden_decoder_layers[seq].init_hidden(
                batch_size=b,
                image_size=(h, w)
            )
        return h_t, c_t

In [None]:
# Smoke test of the ConvLSTM decoder on the encoder output from the cell above.
sample_decoder = Decoder(16)
dec_output = sample_decoder(enc_output)
dec_output.shape

#### Cross Decoder

In [None]:
class DecoderLayer(nn.Module):
    '''
    The decoder layer of Crossformer, each layer will make a prediction at its scale

    NOTE(review): TwoStageAttentionLayer and AttentionLayer are not defined in
    the visible notebook; they must be defined before this class is instantiated.
    '''
    def __init__(self, seg_len, d_model, n_heads, d_ff=None, dropout=0.1, out_seg_num = 10, factor = 10):
        super(DecoderLayer, self).__init__()
        self.self_attention = TwoStageAttentionLayer(out_seg_num, factor, d_model, n_heads,
                                d_ff, dropout)
        self.cross_attention = AttentionLayer(d_model, n_heads, dropout = dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.MLP1 = nn.Sequential(nn.Linear(d_model, d_model),
                                nn.GELU(),
                                nn.Linear(d_model, d_model))
        self.linear_pred = nn.Linear(d_model, seg_len)

    def forward(self, x, cross):
        '''
        x: the output of last decoder layer, (b, ts_d, out_seg_num, d_model)
        cross: the output of the corresponding encoder layer, (b, ts_d, in_seg_num, d_model)

        Returns (dec_output, layer_predict) with shapes
        (b, ts_d, out_seg_num, d_model) and (b, ts_d * out_seg_num, seg_len).
        '''

        batch = x.shape[0]
        x = self.self_attention(x)
        # Native tensor reshapes are used instead of einops.rearrange, which
        # the notebook never imports.
        # (b, ts_d, out_seg_num, d_model) -> (b * ts_d, out_seg_num, d_model)
        x = x.flatten(0, 1)

        # (b, ts_d, in_seg_num, d_model) -> (b * ts_d, in_seg_num, d_model)
        cross = cross.flatten(0, 1)
        tmp = self.cross_attention(
            x, cross, cross,
        )
        x = x + self.dropout(tmp)
        y = x = self.norm1(x)
        y = self.MLP1(y)
        dec_output = self.norm2(x+y)

        # (b * ts_d, seg_dec_num, d_model) -> (b, ts_d, seg_dec_num, d_model)
        dec_output = dec_output.reshape(batch, -1, dec_output.shape[-2], dec_output.shape[-1])
        layer_predict = self.linear_pred(dec_output)
        # (b, out_d, seg_num, seg_len) -> (b, out_d * seg_num, seg_len)
        layer_predict = layer_predict.flatten(1, 2)

        return dec_output, layer_predict

class Decoder(nn.Module):
    '''
    The decoder of Crossformer, making the final prediction by adding up predictions at each scale
    '''
    def __init__(self, seg_len, d_layers, d_model, n_heads, d_ff, dropout,\
                router=False, out_seg_num = 10, factor=10):
        super(Decoder, self).__init__()

        self.router = router
        self.decode_layers = nn.ModuleList()
        for i in range(d_layers):
            self.decode_layers.append(DecoderLayer(seg_len, d_model, n_heads, d_ff, dropout, \
                                        out_seg_num, factor))

    def forward(self, x, cross):
        '''
        x: decoder input, (b, ts_d, out_seg_num, d_model)
        cross: sequence of encoder outputs, one entry per decoder layer

        Returns the summed per-layer predictions as (b, seg_num * seg_len, ts_d).
        '''
        final_predict = None

        ts_d = x.shape[1]
        for i, layer in enumerate(self.decode_layers):
            x, layer_predict = layer(x, cross[i])
            if final_predict is None:
                final_predict = layer_predict
            else:
                final_predict = final_predict + layer_predict

        # Equivalent of einops 'b (out_d seg_num) seg_len -> b (seg_num seg_len) out_d';
        # written with native ops because einops is never imported in the notebook.
        batch, _, seg_len = final_predict.shape
        final_predict = final_predict.reshape(batch, ts_d, -1, seg_len)  # b, out_d, seg_num, seg_len
        final_predict = final_predict.permute(0, 2, 3, 1)                # b, seg_num, seg_len, out_d
        final_predict = final_predict.reshape(batch, -1, ts_d)

        return final_predict

#### Cross Embed

In [None]:
class DSW_embedding(nn.Module):
    '''
    Dimension-segment-wise embedding: every dimension of the series is cut into
    segments of `seg_len` time steps and each segment is linearly projected to
    a `d_model` vector.
    '''
    def __init__(self, seg_len, d_model):
        super(DSW_embedding, self).__init__()
        self.seg_len = seg_len

        self.linear = nn.Linear(seg_len, d_model)

    def forward(self, x):
        '''
        x: (batch, ts_len, ts_dim) with ts_len a multiple of seg_len

        Returns a tensor of shape (batch, ts_dim, seg_num, d_model).
        Raises ValueError when ts_len is not divisible by seg_len.
        '''
        batch, ts_len, ts_dim = x.shape
        if ts_len % self.seg_len != 0:
            raise ValueError(
                f'sequence length {ts_len} is not a multiple of seg_len {self.seg_len}')
        seg_num = ts_len // self.seg_len

        # Equivalent of einops 'b (seg_num seg_len) d -> (b d seg_num) seg_len';
        # written with native ops because einops is never imported in the notebook.
        x_segment = x.reshape(batch, seg_num, self.seg_len, ts_dim).permute(0, 3, 1, 2)
        x_segment = x_segment.reshape(batch * ts_dim * seg_num, self.seg_len)
        x_embed = self.linear(x_segment)
        # '(b d seg_num) d_model -> b d seg_num d_model'
        x_embed = x_embed.reshape(batch, ts_dim, seg_num, x_embed.shape[-1])

        return x_embed

#### Cross Encoder

In [None]:
class SegMerging(nn.Module):
    '''
    Segment Merging Layer.
    Every `win_size' neighbouring segments of a dimension are concatenated and
    projected back to `d_model', giving a representation at a coarser scale.
    (win_size = 2 in the paper.)
    '''
    def __init__(self, d_model, win_size, norm_layer=nn.LayerNorm):
        super().__init__()
        self.d_model = d_model
        self.win_size = win_size
        self.linear_trans = nn.Linear(win_size * d_model, d_model)
        self.norm = norm_layer(win_size * d_model)

    def forward(self, x):
        """
        x: B, ts_d, L, d_model
        returns: B, ts_d, ceil(L / win_size), d_model
        """
        _, _, seg_num, _ = x.shape
        remainder = seg_num % self.win_size
        if remainder != 0:
            # Repeat the trailing segments so the count divides evenly.
            missing = self.win_size - remainder
            x = torch.cat((x, x[:, :, -missing:, :]), dim = -2)

        # Segment k of every window, for k = 0 .. win_size - 1, concatenated
        # on the feature axis: [B, ts_d, seg_num/win_size, win_size*d_model]
        strided = [x[:, :, offset::self.win_size, :] for offset in range(self.win_size)]
        merged = torch.cat(strided, -1)

        return self.linear_trans(self.norm(merged))

class scale_block(nn.Module):
    '''
    One encoder scale: an optional segment-merging layer followed by `depth'
    TSA layers (depth = 1 in the paper). Merging is skipped when win_size <= 1.
    '''
    def __init__(self, win_size, d_model, n_heads, d_ff, depth, dropout, \
                    seg_num = 10, factor=10):
        super(scale_block, self).__init__()

        self.merge_layer = SegMerging(d_model, win_size, nn.LayerNorm) if win_size > 1 else None

        self.encode_layers = nn.ModuleList()
        for _ in range(depth):
            tsa_layer = TwoStageAttentionLayer(seg_num, factor, d_model, n_heads, d_ff, dropout)
            self.encode_layers.append(tsa_layer)

    def forward(self, x):
        # The input must be 4-D; unpacking fails otherwise.
        _, ts_dim, _, _ = x.shape

        out = x if self.merge_layer is None else self.merge_layer(x)
        for tsa_layer in self.encode_layers:
            out = tsa_layer(out)

        return out

class Encoder(nn.Module):
    '''
    The Encoder of Crossformer: a stack of scale blocks. The first block keeps
    the segment resolution (win_size 1); each later block merges `win_size'
    segments.
    '''
    def __init__(self, e_blocks, win_size, d_model, n_heads, d_ff, block_depth, dropout,
                in_seg_num = 10, factor=10):
        super(Encoder, self).__init__()
        self.encode_blocks = nn.ModuleList()

        # (window, segment count) per block: finest scale first, then coarser ones.
        block_specs = [(1, in_seg_num)]
        block_specs += [(win_size, ceil(in_seg_num/win_size**level)) for level in range(1, e_blocks)]
        for block_win, block_seg_num in block_specs:
            self.encode_blocks.append(scale_block(block_win, d_model, n_heads, d_ff, block_depth, dropout,
                                                  block_seg_num, factor))

    def forward(self, x):
        # Index 0 is the raw embedding; each block output is appended after it.
        encode_x = [x]
        for block in self.encode_blocks:
            x = block(x)
            encode_x.append(x)

        return encode_x

#### CrossFormer

In [None]:
class Crossformer(nn.Module):
    '''
    Crossformer forecaster: dimension-segment-wise embedding, hierarchical
    encoder and a decoder that sums per-scale predictions.

    forward takes x_seq of shape (batch, in_len, data_dim) and returns
    (batch, out_len, data_dim).
    '''
    def __init__(self, data_dim, in_len, out_len, seg_len, win_size=4,
                factor=10, d_model=512, d_ff = 1024, n_heads=8, e_layers=3,
                dropout=0.0, baseline = False, device=torch.device('cuda:0')):
        super(Crossformer, self).__init__()
        self.data_dim = data_dim
        self.in_len = in_len
        self.out_len = out_len
        self.seg_len = seg_len
        self.merge_win = win_size

        self.baseline = baseline

        self.device = device

        # Padding so that lengths not divisible by the segment length are handled
        self.pad_in_len = ceil(1.0 * in_len / seg_len) * seg_len
        self.pad_out_len = ceil(1.0 * out_len / seg_len) * seg_len
        self.in_len_add = self.pad_in_len - self.in_len

        # Embedding
        self.enc_value_embedding = DSW_embedding(seg_len, d_model)
        self.enc_pos_embedding = nn.Parameter(torch.randn(1, data_dim, (self.pad_in_len // seg_len), d_model))
        self.pre_norm = nn.LayerNorm(d_model)

        # Encoder
        self.encoder = Encoder(e_layers, win_size, d_model, n_heads, d_ff, block_depth = 1, \
                                    dropout = dropout,in_seg_num = (self.pad_in_len // seg_len), factor = factor)

        # Decoder: one layer per encoder output (raw embedding + e_layers blocks)
        self.dec_pos_embedding = nn.Parameter(torch.randn(1, data_dim, (self.pad_out_len // seg_len), d_model))
        self.decoder = Decoder(seg_len, e_layers + 1, d_model, n_heads, d_ff, dropout, \
                                    out_seg_num = (self.pad_out_len // seg_len), factor = factor)


    def forward(self, x_seq):
        if (self.baseline):
            # Per-dimension mean over time, added back to the prediction.
            base = x_seq.mean(dim = 1, keepdim = True)
        else:
            base = 0
        batch_size = x_seq.shape[0]
        if (self.in_len_add != 0):
            # Left-pad with copies of the first time step up to pad_in_len.
            x_seq = torch.cat((x_seq[:, :1, :].expand(-1, self.in_len_add, -1), x_seq), dim = 1)

        x_seq = self.enc_value_embedding(x_seq)
        x_seq = x_seq + self.enc_pos_embedding
        x_seq = self.pre_norm(x_seq)

        enc_out = self.encoder(x_seq)

        # Tile the learned decoder embedding over the batch. Tensor.repeat is
        # used instead of einops.repeat, which the notebook never imports.
        dec_in = self.dec_pos_embedding.repeat(batch_size, 1, 1, 1)
        predict_y = self.decoder(dec_in, enc_out)

        # Drop the padded tail so exactly out_len steps are returned.
        return base + predict_y[:, :self.out_len, :]

In [None]:
# Parameter settings
# NOTE(review): data_dim sizes the Crossformer position embeddings, which are
# added to the embedded input, so it must equal the last axis of train_input.
# make_train_data builds feature_size + 1 = 5 channels, not 7, and the
# train/validation/inference cells read channel -3 of the output — confirm
# which feature layout is intended before training.

data_dim = 7
in_len = CFG['TRAIN_WINDOW_SIZE']
out_len = CFG['PREDICT_SIZE']
seg_len = 6
win_size = 2
factor = 10
d_model = 256    # hidden_size
d_ff = 512       # dimension of MLP in transformer

In [None]:
model = Crossformer(data_dim, in_len, out_len, seg_len, win_size, factor, d_model, d_ff)
# Use DataParallel only when several GPUs are actually visible; hard-coding
# device ids 0-3 fails on machines with fewer GPUs (e.g. a single-GPU Colab).
n_gpus = torch.cuda.device_count()
if n_gpus > 1:
    model = nn.DataParallel(model, device_ids = list(range(n_gpus)))
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])

# 3. 모델 학습

In [None]:
def train(model, optimizer, train_loader, device):
    """Train ``model`` and return a snapshot of the best-scoring epoch.

    After every epoch the model is scored with ``validation`` on each fold of
    the global ``val_inputs`` / ``val_targets`` lists; the mean score (higher
    is better) decides which epoch is kept.

    Args:
        model: Network to optimise; moved to ``device``.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_loader: Iterable of ``(X, Y)`` training batches.
        device: Device used for training and validation.

    Returns:
        A deep copy of the model taken at the best validation score, or the
        model as it stands after the last epoch if no epoch produced a
        comparable score (e.g. there are no validation folds).
    """
    import copy  # local import: only needed for the best-model snapshot

    model.to(device)
    criterion = nn.MSELoss().to(device)
    # Start below any reachable score so the first scored epoch is always kept.
    best_score = float('-inf')
    best_model = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in tqdm(train_loader):
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()

            output = model(X)
            # Channel -3 of the model output is compared against the target.
            loss = criterion(output[:, :, -3], Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_score = []
        for i in tqdm(range(len(val_inputs))):
            val_dataset = CustomDataset(val_inputs[i], val_targets[i])
            val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
            val_score.append(validation(model, val_loader, device))

        # Compare on the mean fold score; the raw list cannot be ordered
        # against a number.
        mean_val_score = np.mean(val_score) if val_score else float('nan')

        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{mean_val_score:.5f}]')

        if best_score < mean_val_score:
            best_score = mean_val_score
            # Snapshot the weights; keeping a plain reference would track
            # later epochs too and always return the final model.
            best_model = copy.deepcopy(model)
            print('Model Saved')

    return best_model if best_model is not None else model

In [None]:
# Map every encoded '대분류' value to the list of train_data index labels that
# belong to it; PSFA reads this dict to score each category separately.
indexs_bigcat={}
for bigcat in train_data['대분류'].unique():
    indexs_bigcat[bigcat] = list(train_data.loc[train_data['대분류']==bigcat].index)

indexs_bigcat.keys()

In [None]:
def PSFA(pred, target):
    """Score predictions against targets per category and per day.

    Starts at 1 and subtracts, for each of the 5 categories and 21 days, the
    sales-weighted relative error divided by (21 * 5). Reads the globals
    ``train_data`` and ``indexs_bigcat``.

    Args:
        pred: Array of predicted values, indexed ``[row position, day]``.
        target: Array of actual values with the same layout.

    Returns:
        The resulting score (higher is better).
    """
    score = 1
    # Translate index labels into row positions of pred / target.
    position_of = {label: pos for pos, label in enumerate(train_data.index)}
    for cat in range(5):
        ids = list(pd.Series(indexs_bigcat[cat]).map(position_of))
        for day in range(21):
            actual = target[ids, day]      # actual sales on this day
            forecast = pred[ids, day]      # predicted sales on this day
            total_sell = np.sum(actual)    # total sales on this day

            # Where both actual and predicted are 0 the error counts as 0
            denominator = np.maximum(actual, forecast)
            diffs = np.where(denominator!=0, np.abs(actual - forecast) / denominator, 0)

            # Each item's share of the day's total; uniform 1 / len(ids)
            # when nothing was sold.
            sell_weights = actual / total_sell if total_sell != 0 else np.ones_like(actual) / len(ids)

            # Skip the update whenever diffs contains NaN
            if np.isnan(diffs).any():
                continue
            score -= np.sum(diffs * sell_weights) / (21 * 5)

    return score

In [None]:
def validation(model, val_loader, device):
    """Score ``model`` on one validation fold with PSFA.

    Predictions (channel -3 of the model output) and targets are collected
    over the loader, mapped back to the original scale with the global
    per-row min/max dicts, rounded to integers and passed to ``PSFA``.

    Args:
        model: Network to evaluate; switched to eval mode.
        val_loader: Loader yielding ``(X, Y)`` batches in train_data row order.
        device: Device on which the forward pass runs.

    Returns:
        The PSFA score of the fold.
    """
    model.eval()
    pred_rows, target_rows = [], []

    with torch.no_grad():
        for X, Y in val_loader:
            X = X.to(device)
            Y = Y.to(device)

            target_rows.extend(Y.cpu().numpy())
            pred_rows.extend(model(X)[:, :, -3].cpu().numpy())

    pred = np.array(pred_rows)
    target = np.array(target_rows)

    # Inverse-scale the predictions and targets row by row
    for i, idx in enumerate(train_data.index):
        row_min = train_scale_min_dict[idx]
        row_range = train_scale_max_dict[idx] - row_min
        # Negative predictions are clipped to 0.
        pred[i, :] = np.maximum(0, pred[i, :] * row_range + row_min)
        target[i, :] = target[i, :] * row_range + row_min

    # Post-processing: round to whole units
    pred = np.round(pred, 0).astype(int)
    target = np.round(target, 0).astype(int)

    return PSFA(pred, target)

In [None]:
# Train and keep whatever train() returns as the model used for inference.
infer_model = train(model, optimizer, train_loader, device)

In [None]:
# Pickle the whole model object (not just its state_dict) to disk.
# NOTE(review): the file name says "epoch10" while CFG['EPOCHS'] is 5 — confirm.
torch.save(infer_model, 'transformer_model_epoch10.pth')

In [None]:
# Reload the pickled model onto the current device.
# NOTE(review): unpickling a full model executes arbitrary code from the file
# and needs the model classes to be defined first; only load trusted files.
infer_model = torch.load('transformer_model_epoch10.pth', map_location=device)

# 4. 제출

In [None]:
# Inference loader: no targets, and no shuffling so predictions stay in row order.
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    """Run the model over ``test_loader`` and collect its predictions.

    Args:
        model: Trained network; switched to eval mode before predicting.
        test_loader: Iterable of input batches (no targets).
        device: Device on which the forward pass runs.

    Returns:
        NumPy array with one row per sample, holding channel -3 of the model
        output for every predicted step.
    """
    # Make sure dropout / normalisation layers behave deterministically even
    # if the model was last left in training mode.
    model.eval()
    predictions = []

    with torch.no_grad():
        for X in tqdm(test_loader):
            X = X.to(device)

            output = model(X)

            # Move the model output to the CPU and convert it to a numpy array
            output = output.cpu().numpy()

            predictions.extend(output[:, :, -3])

    return np.array(predictions)

In [None]:
# Scaled predictions for every row of train_data, in row order.
pred = inference(infer_model, test_loader, device)

In [None]:
# Inverse-scale the predictions with each row's stored min and max
for i, idx in enumerate(train_data.index):
    x = pred[i, :] * (train_scale_max_dict[idx] - train_scale_min_dict[idx]) + train_scale_min_dict[idx]
    # Negative values are clipped to 0
    pred[i, :] = np.maximum(0, x)

# Post-processing: round to whole units
pred = np.round(pred, 0).astype(int)

In [None]:
# Write the predictions into the sample submission, skipping its first column.
# NOTE(review): .iloc is positional, so this relies on train_data's index
# labels being the original row positions of the CSV — confirm.
submit = pd.read_csv('./data/sample_submission.csv')
submit.iloc[train_data.index, 1:] = pred
submit

In [None]:
# Rows dropped for missing brand data were never predicted; copy their values
# from a previously generated baseline submission instead.
baseline_submit = pd.read_csv('baseline_submit.csv')
submit.iloc[none_brand_idx,1:] = baseline_submit.iloc[none_brand_idx,1:]

In [None]:
# Write the final submission file without the DataFrame index.
submit.to_csv('transformer_model_epoch10_submit.csv', index=False)