In [1]:
# Mount Google Drive at /content/drive (Google Colab only) so the CSV read
# further down can be reached through the ./drive/MyDrive/... path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:
# Central configuration read by the data-preparation functions and the model.
CFG = {
    'TRAIN_WINDOW_SIZE':28, # number of past days fed to the model as input
    'SUB_PREDICT_SIZE':1, # target steps per training sample and model output_size (currently 1)
    'PREDICT_SIZE':21, # length of the validation target / final forecast horizon in days
    'EPOCHS':10,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':2048,
    'HIDDEN_SIZE':256, # range noted by the author: 16 to 256
    'ATTENTION_HEAD_SIZE':4, # range noted by the author: 1 to 4
    'LSTM_LAYERS':1,
    'DROPOUT':0.3, # range noted by the author: 0.1 to 0.3
    'SEED':1212,
    # For '*_cat' keys each entry is (num_embeddings, embedding_dim) for one nn.Embedding;
    # for '*_cont' keys the last element of each entry is the width of an nn.Linear(1, width).
    'EMBEDDING_LINEAR_DIM_DICT':{'static_cat':[(5,5),(11,11),(10,10)],
                               'X_cat':[(7,7)],
                               'X_future_cat':[(7,7)],
                               'static_cont':[],
                               'X_cont':[[32]], # range noted by the author: 8 to 64
                               'X_future_cont':[]}
}

In [5]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick algorithms per run, which can
    # change results between runs; it must be off for reproducible training.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the random seed for the whole notebook

In [6]:
# Read the training CSV from the mounted Drive and drop the 'ID', '제품', '소분류'
# and '브랜드' columns; the remaining columns are three categorical columns
# followed by one column per date.
train_data = pd.read_csv('./drive/MyDrive/lgaimers/open/train.csv').drop(columns=['ID', '제품','소분류','브랜드'])
train_data

Unnamed: 0,대분류,중분류,쇼핑몰,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,...,2023-04-15,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24
0,B002-C001-0002,B002-C002-0007,S001-00001,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,B002-C001-0003,B002-C002-0008,S001-00001,0,0,0,0,0,0,0,...,2,0,2,0,2,2,1,0,0,0
2,B002-C001-0003,B002-C002-0008,S001-00001,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,B002-C001-0003,B002-C002-0008,S001-00001,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,B002-C001-0003,B002-C002-0008,S001-00010,0,0,0,0,0,0,0,...,10,6,4,4,4,0,0,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28889,B002-C001-0003,B002-C002-0008,S001-00001,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28890,B002-C001-0003,B002-C002-0008,S001-00001,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28891,B002-C001-0003,B002-C002-0008,S001-00001,0,0,0,0,0,0,0,...,2,4,1,0,0,0,4,0,2,0
28892,B002-C001-0003,B002-C002-0008,S001-00001,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Number of rows (one per series) in the training table.
n_row = len(train_data)
print(n_row)

28894


In [8]:
# Keep only the date columns (everything after the first three columns) and
# transpose, so each row is a date and each column is one original row/series.
t_train_data = train_data.iloc[:,3:].T
t_train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28884,28885,28886,28887,28888,28889,28890,28891,28892,28893
2022-01-01,0,0,0,0,0,0,0,0,18,0,...,0,0,0,0,0,0,0,0,0,0
2022-01-02,0,0,0,0,0,0,0,0,12,0,...,0,0,0,0,0,0,0,0,0,0
2022-01-03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2022-01-04,0,0,0,0,0,0,0,14,6,0,...,0,0,0,0,0,0,0,0,0,0
2022-01-05,0,0,0,0,0,0,0,42,40,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Min-max scale every column of the transposed table, i.e. each series is
# scaled independently over time. The fitted scaler is kept so predictions
# can be mapped back to the original units later.
scaler = MinMaxScaler()

scale_df = pd.DataFrame(
    data=scaler.fit_transform(t_train_data),
    index=t_train_data.index,
    columns=t_train_data.columns,
)
scale_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28884,28885,28886,28887,28888,28889,28890,28891,28892,28893
2022-01-01,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00,0.029900,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000
2022-01-02,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00,0.019934,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000
2022-01-03,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000
2022-01-04,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.28,0.009967,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000
2022-01-05,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.84,0.066445,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-20,0.0,0.111111,0.0,0.0,0.000000,0.0,0.0,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.121212,0.0,0.0,0.0,0.00000,0.0,0.035714
2023-04-21,0.0,0.055556,0.0,0.0,0.000000,0.0,0.0,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.015152,0.0,0.0,0.0,0.01626,0.0,0.000000
2023-04-22,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.035714
2023-04-23,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00,0.000000,0.0,...,0.0,0.0,0.0,0.015152,0.0,0.0,0.0,0.00813,0.0,0.071429


In [10]:
# Label-encode the three categorical columns in place (string codes -> integers).
# The same encoder object is re-fitted for every column, so after the loop it
# only remembers the classes of the last column.
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '쇼핑몰']

for col in categorical_columns:
    train_data[col] = label_encoder.fit_transform(train_data[col])

In [11]:
# Rebuild train_data as: encoded categorical columns + scaled values
# (scale_df transposed back to one row per series, one column per date).
train_data = pd.concat([train_data[categorical_columns],scale_df.T],axis=1)
train_data.head()

Unnamed: 0,대분류,중분류,쇼핑몰,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,...,2023-04-15,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24
0,1,6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.111111,0.0,0.111111,0.0,0.111111,0.111111,0.055556,0.0,0.0,0.0
2,2,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,7,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017606,0.010563,0.007042,0.007042,0.007042,0.0,0.0,0.0,0.0,0.014085


In [12]:
# Position within a 7-day cycle (0..6) for each of the 479 + 21 day indices.
day_array = np.arange(479 + 21) % 7

In [13]:
def make_train_val_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], sub_predict_size = CFG['SUB_PREDICT_SIZE'], predict_size=CFG['PREDICT_SIZE'], val_ratio=0.2):
    """Slice every series into sliding windows and split them into train/validation sets.

    A window covers ``train_size + predict_size`` consecutive days. Windows whose
    start index is below the validation cut-off become training samples with a
    target of ``sub_predict_size`` days; the remaining windows become validation
    samples with a target of ``predict_size`` days.

    Args:
        data: DataFrame whose first three columns are the encoded static
            categories and whose remaining columns are the daily values.
        train_size: Number of input days per sample.
        sub_predict_size: Target length of a training sample.
        predict_size: Target length of a validation sample.
        val_ratio: Fraction of window start positions reserved for validation.

    Returns:
        A tuple of ten arrays: (static_cat, input_cont, input_cat, future_cat,
        target) for training followed by the same five for validation.
    """
    num_rows = len(data)
    window_size = train_size + predict_size

    # The number of daily columns is fixed to 479 here, matching day_array.
    num_sales_data = 479
    num_windows = num_sales_data - window_size + 1
    val_start_idx = int(num_windows * (1 - val_ratio))

    encode_data = np.array(data.iloc[:,:3])
    sales = np.array(data.iloc[:,3:])

    field_names = ('static_cat', 'input_cont', 'input_cat', 'future_cat', 'target')
    train_parts = {name: [] for name in field_names}
    val_parts = {name: [] for name in field_names}

    for start in tqdm(range(num_windows)):
        # Everything that depends only on the window position is computed once
        # per window instead of once per series.
        is_train = start < val_start_idx
        parts = train_parts if is_train else val_parts
        horizon = sub_predict_size if is_train else predict_size
        past_days = day_array[start : start + train_size]
        future_days = day_array[start + train_size : start + train_size + horizon]

        for row in range(num_rows):
            window = sales[row][start : start + window_size]

            parts['static_cat'].append(encode_data[row])
            parts['input_cont'].append(window[:train_size])
            parts['input_cat'].append(past_days)
            parts['future_cat'].append(future_days)
            parts['target'].append(window[train_size : train_size + horizon])

    return tuple(
        np.array(parts[name])
        for parts in (train_parts, val_parts)
        for name in field_names
    )

In [14]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    '''
    Build the model inputs used to forecast the test period.

    data : DataFrame whose first three columns are the encoded static
           categories, followed by the daily values
    train_size : number of most recent days used as the input window
                 (same length as the training window)
    predict_size : number of future days covered by the future day features

    Returns (static_cat, input_cont, input_cat, future_cat) arrays with one
    row per series.
    '''
    # The day features do not depend on the series, so slice them once.
    past_days = day_array[-train_size-predict_size : -predict_size]
    future_days = day_array[-predict_size : ]

    static_rows, history_rows, past_day_rows, future_day_rows = [], [], [], []

    for row in tqdm(range(len(data))):
        static_rows.append(np.array(data.iloc[row, :3]))
        history_rows.append(np.array(data.iloc[row, -train_size:]))
        past_day_rows.append(past_days)
        future_day_rows.append(future_days)

    return np.array(static_rows),np.array(history_rows),np.array(past_day_rows),np.array(future_day_rows)

In [15]:
# Sliding-window samples for training and validation.
(
    train_static_cat_data,
    train_input_cont_data,
    train_input_cat_data,
    train_future_cat_data,
    train_target_data,
    valid_static_cat_data,
    val_input_cont_data,
    val_input_cat_data,
    val_future_cat_data,
    val_target_data,
) = make_train_val_data(train_data)

# Inputs for the final forecast (most recent window of every series).
(
    static_cat_data,
    input_cont_data,
    input_cat_data,
    future_cat_data,
) = make_predict_data(train_data)

  0%|          | 0/431 [00:00<?, ?it/s]

  0%|          | 0/28894 [00:00<?, ?it/s]

In [16]:
# Shapes of all generated arrays (train, validation, prediction), in that order.
train_static_cat_data.shape, train_input_cont_data.shape, train_input_cat_data.shape, train_future_cat_data.shape, train_target_data.shape, valid_static_cat_data.shape, val_input_cont_data.shape, val_input_cat_data.shape, val_future_cat_data.shape, val_target_data.shape, static_cat_data.shape, input_cont_data.shape, input_cat_data.shape, future_cat_data.shape

((9939536, 3),
 (9939536, 28),
 (9939536, 28),
 (9939536, 1),
 (9939536, 1),
 (2513778, 3),
 (2513778, 28),
 (2513778, 28),
 (2513778, 21),
 (2513778, 21),
 (28894, 3),
 (28894, 28),
 (28894, 28),
 (28894, 21))

In [17]:
def _add_feature_axis(array):
    """Append a trailing feature axis to a 2-D (samples, time) array.

    Arrays that are not 2-D are returned unchanged, so running this cell a
    second time does not keep stacking extra axes.
    """
    if array.ndim == 2:
        return np.expand_dims(array, axis=-1)
    return array


# Temporal inputs become (samples, time, 1): one feature per time step.
train_input_cont_data = _add_feature_axis(train_input_cont_data)
train_input_cat_data = _add_feature_axis(train_input_cat_data)
train_future_cat_data = _add_feature_axis(train_future_cat_data)

val_input_cont_data = _add_feature_axis(val_input_cont_data)
val_input_cat_data = _add_feature_axis(val_input_cat_data)
val_future_cat_data = _add_feature_axis(val_future_cat_data)

input_cont_data = _add_feature_axis(input_cont_data)
input_cat_data = _add_feature_axis(input_cat_data)
future_cat_data = _add_feature_axis(future_cat_data)

In [18]:
# Shapes again after the trailing feature axis was added to the temporal inputs.
train_static_cat_data.shape, train_input_cont_data.shape, train_input_cat_data.shape, train_future_cat_data.shape, train_target_data.shape, valid_static_cat_data.shape, val_input_cont_data.shape, val_input_cat_data.shape, val_future_cat_data.shape, val_target_data.shape, static_cat_data.shape, input_cont_data.shape, input_cat_data.shape, future_cat_data.shape

((9939536, 3),
 (9939536, 28, 1),
 (9939536, 28, 1),
 (9939536, 1, 1),
 (9939536, 1),
 (2513778, 3),
 (2513778, 28, 1),
 (2513778, 28, 1),
 (2513778, 21, 1),
 (2513778, 21),
 (28894, 3),
 (28894, 28, 1),
 (28894, 28, 1),
 (28894, 21, 1))

In [19]:
class CustomDataset(Dataset):
    """Dataset that serves the optional feature groups of one sample as a dict.

    Every argument is an indexable collection (e.g. a NumPy array) whose first
    axis enumerates samples, or ``None`` when that feature group is absent.

    Args:
        static_cat: Static categorical features.
        static_cont: Static continuous features.
        X_cat: Categorical features of the input window.
        X_cont: Continuous features of the input window.
        X_future_cat: Categorical features of the forecast window.
        X_future_cont: Continuous features of the forecast window.
        Y: Targets; when given, ``__getitem__`` returns ``(sample, target)``.
    """

    # Order in which feature groups are inserted into the sample dict.
    _FEATURE_FIELDS = ('static_cat', 'static_cont', 'X_cat', 'X_cont', 'X_future_cat', 'X_future_cont')

    def __init__(self, static_cat=None, static_cont=None, X_cat=None, X_cont=None, X_future_cat=None, X_future_cont=None, Y=None):
        self.static_cat = static_cat
        self.static_cont = static_cont
        self.X_cat = X_cat
        self.X_cont = X_cont
        self.X_future_cat = X_future_cat
        self.X_future_cont = X_future_cont
        self.Y = Y

    def __len__(self):
        """Return the number of samples.

        ``X_cont`` is consulted first and ``X_cat`` second, as before; the other
        groups are only used as a fallback. Groups that are ``None`` are skipped
        instead of raising ``TypeError``. Returns 0 when nothing is populated.
        """
        candidates = (
            self.X_cont, self.X_cat,
            self.X_future_cat, self.X_future_cont,
            self.static_cat, self.static_cont,
            self.Y,
        )
        for values in candidates:
            if values is not None and len(values) > 0:
                return len(values)
        return 0

    def __getitem__(self, idx):
        """Return the feature dict of sample ``idx`` (plus the target if ``Y`` is set).

        Every present group is converted with ``torch.Tensor``, i.e. to float32.
        """
        sample = {}
        for name in self._FEATURE_FIELDS:
            values = getattr(self, name)
            if values is not None:
                sample[name] = torch.Tensor(values[idx])

        if self.Y is not None:
            return sample, torch.Tensor(self.Y[idx])

        return sample

In [20]:
# Training samples are reshuffled every epoch; validation keeps its order.
train_dataset = CustomDataset(
    static_cat=train_static_cat_data,
    X_cont=train_input_cont_data,
    X_cat=train_input_cat_data,
    X_future_cat=train_future_cat_data,
    Y=train_target_data,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=True,
    num_workers=4,
)

val_dataset = CustomDataset(
    static_cat=valid_static_cat_data,
    X_cont=val_input_cont_data,
    X_cat=val_input_cat_data,
    X_future_cat=val_future_cat_data,
    Y=val_target_data,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    num_workers=4,
)

In [21]:
class GatedLinearUnit(nn.Module):
    """Linear projection followed by a gated linear unit.

    The input optionally goes through dropout, is projected to
    ``2 * hidden_size`` features and is then halved by ``F.glu`` along the last
    dimension, so the output has ``hidden_size`` features.

    Args:
        input_size: Size of the last dimension of the input.
        hidden_size: Size of the last dimension of the output.
        dropout: Dropout probability applied to the input; ``None`` disables it.
    """

    def __init__(self, input_size, hidden_size, dropout = None):
        super().__init__()
        self.dropout = None if dropout is None else nn.Dropout(dropout)
        self.fc = nn.Linear(input_size, 2 * hidden_size)
        self.init_weights()

    def init_weights(self):
        """Zero all biases and Xavier-initialise the projection weight."""
        for param_name, param in self.named_parameters():
            if "bias" in param_name:
                torch.nn.init.zeros_(param)
                continue
            if "fc" in param_name:
                torch.nn.init.xavier_uniform_(param)

    def forward(self, x):
        projected = self.fc(x if self.dropout is None else self.dropout(x))
        return F.glu(projected, dim=-1)

In [22]:
class AddNorm(nn.Module):
    """Residual addition followed by layer normalisation.

    When the skip tensor's last dimension differs from the main tensor's, the
    skip tensor is linearly interpolated along that dimension to match.

    Args:
        input_size: Size of the last dimension that is normalised.
    """

    def __init__(self, input_size):
        super().__init__()
        self.norm = nn.LayerNorm(input_size)

    def interpolate(self,x,out_size):
        """Resize the last dimension of a 2-D tensor to ``out_size`` by linear interpolation."""
        as_channels = x.unsqueeze(1)  # F.interpolate expects (N, C, L)
        resized = F.interpolate(as_channels, out_size, mode="linear", align_corners=True)
        return resized.squeeze(1)

    def forward(self, x, skip):
        target_width = x.size(-1)
        if skip.size(-1) != target_width:
            if skip.dim() == 2:  # (B, hidden_size)
                skip = self.interpolate(skip, target_width)
            elif skip.dim() == 3:  # (B, time_length, hidden_size)
                # Fold time into the batch axis, resize, then restore (B, time_length, width).
                flattened = skip.contiguous().view(-1, skip.size(-1))
                resized = self.interpolate(flattened, target_width)
                skip = resized.view(skip.size(0), -1, resized.size(-1))

        return self.norm(x + skip)

In [23]:
class GateAddNorm(nn.Module):
    """Gated linear unit followed by a residual add-and-normalise step.

    Args:
        input_size: Size of the last dimension of the gated input.
        hidden_size: Output width of the gate and size normalised by ``AddNorm``.
        dropout: Dropout probability used inside the gate (``None`` disables it).
    """

    def __init__(self, input_size, hidden_size, dropout):
        super().__init__()

        self.glu = GatedLinearUnit(
            input_size=input_size,
            hidden_size=hidden_size,
            dropout=dropout,
        )
        self.add_norm = AddNorm(hidden_size)

    def forward(self, x, skip):
        # ``skip`` is the tensor from earlier in the network that is added back in.
        return self.add_norm(self.glu(x), skip)

In [24]:
class GatedResidualNetwork(nn.Module):
    """Two-layer feed-forward block with an optional context input, gating and a residual path.

    ``fc1`` -> (+ projected context) -> ELU -> ``fc2`` -> ``GateAddNorm`` with the
    original input as the skip connection.

    Args:
        input_size: Size of the last dimension of the input.
        hidden_size: Width of the two inner linear layers.
        output_size: Size of the last dimension of the output.
        dropout: Dropout probability used by the gate.
        context_size: Size of the optional context vector; ``None`` means the
            block has no context projection.
    """

    def __init__(self,input_size, hidden_size, output_size, dropout = 0.1, context_size=None):
        super().__init__()

        self.input_size = input_size
        self.output_size = output_size
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.dropout = dropout

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.elu = nn.ELU()
        if context_size is not None:
            self.context = nn.Linear(context_size, hidden_size, bias=False)
        self.fc2 = nn.Linear(hidden_size, hidden_size)

        # Initialise before the gate is created: the gate initialises itself.
        self.init_weights()

        self.gate_norm = GateAddNorm(
            input_size=hidden_size,
            hidden_size=output_size,
            dropout=dropout,
        )

    def init_weights(self):
        """Zero biases, Kaiming-initialise fc1/fc2 and Xavier-initialise the context projection."""
        for param_name, param in self.named_parameters():
            if "bias" in param_name:
                torch.nn.init.zeros_(param)
                continue
            if "fc1" in param_name or "fc2" in param_name:
                torch.nn.init.kaiming_normal_(param, a=0, mode="fan_in", nonlinearity="leaky_relu")
                continue
            if "context" in param_name:
                torch.nn.init.xavier_uniform_(param)

    def forward(self, x, context=None):
        hidden = self.fc1(x)
        if context is not None:
            hidden = hidden + self.context(context)
        hidden = self.fc2(self.elu(hidden))
        # The untouched input ``x`` is the residual fed to the gate.
        return self.gate_norm(hidden, x)

In [25]:
class VariableSelectionNetwork(nn.Module):
    """Weight and combine several embedded input variables into one hidden vector.

    Each variable is transformed by its own ``GatedResidualNetwork``. When more
    than one variable is present, a further GRN over the concatenated inputs
    produces softmax weights (one per variable) that are used to sum the
    per-variable outputs.

    Args:
        input_embbedding_linear_list: Width of each variable after its
            embedding/linear layer, e.g. ``[5, 11, 25, 100, 8, 8]``.
        hidden_size: Output width.
        dropout: Dropout probability passed to every GRN.
        context_size: Width of the optional context passed to the weighting GRN.
    """

    def __init__(self, input_embbedding_linear_list, hidden_size=16, dropout=0.1, context_size=None):
        super().__init__()

        self.input_embbedding_linear_list = input_embbedding_linear_list
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.context_size = context_size

        self.num_inputs = len(self.input_embbedding_linear_list)
        self.input_size_total = sum(self.input_embbedding_linear_list)

        if self.num_inputs > 1:
            # One output per variable: the (pre-softmax) importance of that variable.
            # context_size=None simply builds the GRN without a context projection.
            self.flattened_grn = GatedResidualNetwork(
                input_size=self.input_size_total,
                hidden_size=min(self.hidden_size, self.num_inputs),
                output_size=self.num_inputs,
                dropout=self.dropout,
                context_size=self.context_size,
            )

        self.single_variable_grns = nn.ModuleList([
            GatedResidualNetwork(
                width,
                min(width, self.hidden_size),
                output_size=self.hidden_size,
                dropout=self.dropout,
            )
            for width in self.input_embbedding_linear_list
        ])

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, context = None):
        # x is a list with one tensor per variable, e.g. shapes
        # [(B,5),(B,11),(B,25),(B,100),(B,8),(B,8)] for static inputs; temporal
        # inputs carry an extra time dimension in the middle.
        if self.num_inputs <= 1:
            # A single variable needs no selection.
            return self.single_variable_grns[0](x[0])  # (B, hidden_size)

        per_variable = torch.stack(
            [self.single_variable_grns[i](x[i]) for i in range(self.num_inputs)],
            dim=-1,
        )  # (B, hidden_size, num_inputs)

        # The weighting GRN handles context=None itself.
        weights = self.flattened_grn(torch.cat(x, dim=-1), context)  # (B, num_inputs)
        weights = self.softmax(weights).unsqueeze(-2)  # (B, 1, num_inputs)

        return (per_variable * weights).sum(dim=-1)  # (B, hidden_size)

In [26]:
class ScaledDotProductAttention(nn.Module):
    """Dot-product attention over batched 3-D tensors.

    Args:
        dropout: Dropout probability applied to the attention weights;
            ``None`` disables it.
        scale: When true, scores are divided by ``sqrt(key_dim)``.
    """

    def __init__(self, dropout = None, scale = True):
        super(ScaledDotProductAttention, self).__init__()
        self.dropout = nn.Dropout(p=dropout) if dropout is not None else None
        self.softmax = nn.Softmax(dim=2)
        self.scale = scale

    def forward(self, q, k, v, mask=None):
        """Return ``(output, attention_weights)``.

        ``q`` is (B, Lq, d), ``k`` is (B, Lk, d) and ``v`` is (B, Lk, dv), as
        required by ``torch.bmm``. Positions where ``mask`` is true are
        excluded from the softmax.
        """
        scores = torch.bmm(q, k.transpose(1, 2))  # query-key overlap

        if self.scale:
            key_dim_sqrt = torch.as_tensor(k.size(-1), dtype=scores.dtype, device=scores.device).sqrt()
            scores = scores / key_dim_sqrt

        if mask is not None:
            scores = scores.masked_fill(mask, -1e9)

        weights = self.softmax(scores)
        if self.dropout is not None:
            weights = self.dropout(weights)

        return torch.bmm(weights, v), weights

In [27]:
class InterpretableMultiHeadAttention(nn.Module):
    """Multi-head attention in which all heads share one value projection.

    Each head has its own query and key projection; head outputs are averaged
    (not concatenated) and then projected back to ``d_model``.

    Args:
        n_head: Number of attention heads.
        d_model: Width of the inputs and of the output.
        dropout: Dropout probability applied to each head and to the output.
    """

    def __init__(self, n_head, d_model, dropout = 0.0):
        super(InterpretableMultiHeadAttention, self).__init__()

        self.n_head = n_head
        self.d_model = d_model
        self.d_k = self.d_q = self.d_v = d_model // n_head
        self.dropout = nn.Dropout(p=dropout)

        self.v_layer = nn.Linear(self.d_model, self.d_v)
        self.q_layers = nn.ModuleList([nn.Linear(self.d_model, self.d_q) for _ in range(self.n_head)])
        self.k_layers = nn.ModuleList([nn.Linear(self.d_model, self.d_k) for _ in range(self.n_head)])
        self.attention = ScaledDotProductAttention()
        self.w_h = nn.Linear(self.d_v, self.d_model, bias=False)

        self.init_weights()

    def init_weights(self):
        """Zero every bias and Xavier-initialise every other parameter."""
        for param_name, param in self.named_parameters():
            if "bias" in param_name:
                torch.nn.init.zeros_(param)
            else:
                torch.nn.init.xavier_uniform_(param)

    def forward(self, q, k, v, mask=None):
        """Return ``(outputs, attn)``; ``attn`` stacks the per-head weights along dim 2."""
        shared_values = self.v_layer(v)

        heads, attns = [], []
        for query_proj, key_proj in zip(self.q_layers, self.k_layers):
            head, attn = self.attention(query_proj(q), key_proj(k), shared_values, mask)
            heads.append(self.dropout(head))
            attns.append(attn)

        attn = torch.stack(attns, dim=2)

        if self.n_head > 1:
            outputs = torch.mean(torch.stack(heads, dim=2), dim=2)
        else:
            outputs = heads[0]

        outputs = self.dropout(self.w_h(outputs))

        return outputs, attn

In [28]:
class TemporalFusionTransformer(nn.Module):
    """Temporal Fusion Transformer style forecaster.

    Pipeline: embed/project raw inputs -> variable selection (static, encoder,
    decoder) -> static context vectors -> LSTM encoder/decoder with gated skip
    connections -> static enrichment -> interpretable multi-head attention ->
    position-wise feed-forward -> linear output head.

    Args:
        embedding_linear_dim_dict: Dict with the keys ``static_cat``,
            ``static_cont``, ``X_cat``, ``X_cont``, ``X_future_cat`` and
            ``X_future_cont``. For ``*_cat`` keys each entry is
            ``(num_embeddings, embedding_dim)``; for ``*_cont`` keys the last
            element of each entry is the width of an ``nn.Linear(1, width)``.
        batch_size: Fallback batch size, only used if ``forward`` receives no
            input tensors at all.
        input_size: Number of encoder (history) time steps.
        output_size: Number of decoder (future) time steps.
        hidden_size: Width of all hidden representations.
        attention_head_size: Number of attention heads.
        lstm_layers: Number of layers in each LSTM.
        dropout: Dropout probability used throughout the network.
    """

    def __init__(self, embedding_linear_dim_dict, batch_size,input_size=91,output_size=21, hidden_size=16, attention_head_size=4, lstm_layers = 1, dropout = 0.1):
        super().__init__()

        # Kept as a public attribute for backward compatibility. forward() no
        # longer relies on it, because the model may live on another device.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size
        self.attention_head_size = attention_head_size
        self.lstm_layers = lstm_layers
        self.dropout = dropout

        self.embeddings = nn.ModuleDict()
        self.linears = nn.ModuleDict()

        # One embedding per categorical variable, one Linear(1, width) per continuous variable.
        for key, values in embedding_linear_dim_dict.items():
            if '_cat' in key:
                self.embeddings[key] = nn.ModuleList([nn.Embedding(dim[0], dim[1]) for dim in values])
            elif '_cont' in key:
                self.linears[key] = nn.ModuleList([nn.Linear(1, dim[-1]) for dim in values])


        static_list = embedding_linear_dim_dict['static_cat']+embedding_linear_dim_dict['static_cont']
        static_list = [item[-1] for item in static_list]

        self.static_variable_selection = VariableSelectionNetwork(static_list, hidden_size=self.hidden_size, dropout=self.dropout)

        encoder_list = embedding_linear_dim_dict['X_cat']+embedding_linear_dim_dict['X_cont']
        encoder_list = [item[-1] for item in encoder_list]

        self.encoder_variable_selection = VariableSelectionNetwork(encoder_list, hidden_size=self.hidden_size, dropout=self.dropout, context_size = self.hidden_size)

        decoder_list = embedding_linear_dim_dict['X_future_cat']+embedding_linear_dim_dict['X_future_cont']
        decoder_list = [item[-1] for item in decoder_list]

        self.decoder_variable_selection = VariableSelectionNetwork(decoder_list, hidden_size=self.hidden_size, dropout=self.dropout, context_size = self.hidden_size)

        # static encoders
        # for variable selection
        self.static_context_variable_selection = GatedResidualNetwork(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            output_size=self.hidden_size,
            dropout=self.dropout,
        )

        # for hidden state of the lstm
        self.static_context_initial_hidden_lstm = GatedResidualNetwork(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            output_size=self.hidden_size,
            dropout=self.dropout,
        )

        # for cell state of the lstm
        self.static_context_initial_cell_lstm = GatedResidualNetwork(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            output_size=self.hidden_size,
            dropout=self.dropout,
        )

        # for post lstm static enrichment
        self.static_context_enrichment = GatedResidualNetwork(
            self.hidden_size,
            self.hidden_size,
            self.hidden_size,
            self.dropout
        )

        # lstm encoder (history) and decoder (future) for local processing
        self.lstm_encoder = nn.LSTM(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            num_layers=self.lstm_layers,
            dropout=self.dropout if self.lstm_layers > 1 else 0,
            batch_first=True,
        )

        self.lstm_decoder = nn.LSTM(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            num_layers=self.lstm_layers,
            dropout=self.dropout if self.lstm_layers > 1 else 0,
            batch_first=True,
        )

        # skip connection for lstm
        self.post_lstm_gate_encoder = GatedLinearUnit(self.hidden_size, self.hidden_size, dropout=self.dropout)
        self.post_lstm_gate_decoder = GatedLinearUnit(self.hidden_size, self.hidden_size, dropout=self.dropout)
        self.post_lstm_add_norm_encoder = AddNorm(self.hidden_size)
        self.post_lstm_add_norm_decoder = AddNorm(self.hidden_size)

        # static enrichment and processing past LSTM
        self.static_enrichment = GatedResidualNetwork(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            output_size=self.hidden_size,
            dropout=self.dropout,
            context_size=self.hidden_size,
        )

        # attention for long-range processing
        self.multihead_attn = InterpretableMultiHeadAttention(
            d_model=self.hidden_size, n_head=self.attention_head_size, dropout=self.dropout
        )

        self.post_attn_gate_norm = GateAddNorm(
            self.hidden_size, self.hidden_size, dropout=self.dropout,
        )

        self.pos_wise_ff = GatedResidualNetwork(
            self.hidden_size, self.hidden_size, self.hidden_size, dropout=self.dropout
        )

        # output processing -> no dropout at this late stage
        self.pre_output_gate_norm = GateAddNorm(self.hidden_size, self.hidden_size, dropout=None)

        self.output_layer = nn.Linear(self.hidden_size, 1)

    def _parameter_device(self):
        """Return the device the model's weights currently live on."""
        return self.output_layer.weight.device

    def apply_embedding(self, input_cat, key_prefix):
        """Embed each categorical column of ``input_cat`` with its own embedding.

        ``input_cat`` is (B, n_features) or (B, L, n_features); the result is a
        list with one embedded tensor per feature.
        """
        result = []
        embeddings_list = self.embeddings[key_prefix]
        for i, embed in enumerate(embeddings_list):
            embedded_input = embed(input_cat[:, i].long()) if len(input_cat.shape) == 2 else embed(input_cat[:, :, i].long())
            result.append(embedded_input)
        return result

    def apply_linear(self, input_cont, key_prefix):
        """Project each continuous column of ``input_cont`` with its own linear layer.

        ``input_cont`` is (B, n_features) or (B, L, n_features); the result is a
        list with one projected tensor per feature.
        """
        result = []
        linears_list = self.linears[key_prefix]
        for i, linear in enumerate(linears_list):
            linear_output = linear(input_cont[:, i:i+1] if len(input_cont.shape) == 2 else input_cont[:, :, i:i+1])
            result.append(linear_output)
        return result

    def forward(self,x):
        """Run the network on a dict of input tensors.

        Expected entries (all optional):
            x['static_cat']    (B, number of static categorical features)
            x['static_cont']   (B, number of static continuous features)
            x['X_cat']         (B, input_length, number of categorical features)
            x['X_cont']        (B, input_length, number of continuous features)
            x['X_future_cat']  (B, output_length, number of future categorical features)
            x['X_future_cont'] (B, output_length, number of future continuous features)

        Returns:
            Tensor of shape (B, number of decoder steps, 1).
        """
        static_cat = self.apply_embedding(x['static_cat'], 'static_cat') if 'static_cat' in x else []
        static_cont = self.apply_linear(x['static_cont'], 'static_cont') if 'static_cont' in x else []
        X_cat = self.apply_embedding(x['X_cat'], 'X_cat') if 'X_cat' in x else []
        X_cont = self.apply_linear(x['X_cont'], 'X_cont') if 'X_cont' in x else []
        X_future_cat = self.apply_embedding(x['X_future_cat'], 'X_future_cat') if 'X_future_cat' in x else []
        X_future_cont = self.apply_linear(x['X_future_cont'], 'X_future_cont') if 'X_future_cont' in x else []

        # Take the batch size from the first input group that is present.
        for item in (static_cat,static_cont,X_cat,X_cont,X_future_cat,X_future_cont):
            if item:
                self.batch_size=item[0].size(0)
                break

        # Zero placeholders must be created where the weights are, not on a
        # device guessed at construction time (the model may be on the CPU even
        # when CUDA is available, or may have been moved with .to()).
        device = self._parameter_device()

        if static_cat+static_cont == []:
            static_embedding = torch.zeros((self.batch_size, self.hidden_size), dtype=torch.float32, device=device)
        else:
            static_input = static_cat + static_cont
            static_embedding = self.static_variable_selection(static_input)
        # static_embedding : (B, hidden)

        # Static context vectors, repeated over every encoder + decoder time step where needed.
        static_context_input = self.static_context_variable_selection(static_embedding).unsqueeze(1).expand(-1,self.input_size+self.output_size,-1)
        input_hidden = self.static_context_initial_hidden_lstm(static_embedding).expand(self.lstm_layers,-1,-1)
        input_cell = self.static_context_initial_cell_lstm(static_embedding).expand(self.lstm_layers,-1,-1)
        static_context_enrichment_input = self.static_context_enrichment(static_embedding).unsqueeze(1).expand(-1,self.input_size+self.output_size,-1)

        if X_cat+X_cont == []:
            X_embedding = torch.zeros((self.batch_size, self.input_size, self.hidden_size), dtype=torch.float32, device=device)
        else:
            X_input = X_cat + X_cont
            X_embedding = self.encoder_variable_selection(X_input,static_context_input[:,:self.input_size])

        if X_future_cat+X_future_cont == []:
            X_future_embedding = torch.zeros((self.batch_size, self.output_size, self.hidden_size), dtype=torch.float32, device=device)
        else:
            X_future_input = X_future_cat + X_future_cont
            X_future_embedding = self.decoder_variable_selection(X_future_input,static_context_input[:,self.input_size:])

        encoder_output, (hidden,cell) = self.lstm_encoder(
            X_embedding,
            (input_hidden.contiguous(),input_cell.contiguous()),
        )

        # The decoder starts from the encoder's final state.
        decoder_output, _ = self.lstm_decoder(
            X_future_embedding,
            (hidden,cell),
        )

        # skip connection over lstm
        lstm_output_encoder = self.post_lstm_gate_encoder(encoder_output)
        lstm_output_encoder = self.post_lstm_add_norm_encoder(lstm_output_encoder, X_embedding)

        lstm_output_decoder = self.post_lstm_gate_decoder(decoder_output)
        lstm_output_decoder = self.post_lstm_add_norm_decoder(lstm_output_decoder, X_future_embedding)

        lstm_output = torch.cat([lstm_output_encoder, lstm_output_decoder], dim=1)


        attn_input = self.static_enrichment(
            lstm_output, static_context_enrichment_input
        )

        # Attention
        attn_output, attn_output_weights = self.multihead_attn(
            q=attn_input[:, self.input_size:],  # query only for predictions
            k=attn_input,
            v=attn_input,
        )

        # skip connection over attention
        attn_output = self.post_attn_gate_norm(attn_output, attn_input[:, self.input_size:])

        output = self.pos_wise_ff(attn_output)

        # skip connection over temporal fusion decoder (not LSTM decoder despite the LSTM output contains
        # a skip from the variable selection network)
        output = self.pre_output_gate_norm(output, lstm_output[:, self.input_size:])

        output = self.output_layer(output)
        return output

In [29]:
def get_lr(optimizer):
    """Return the learning rate stored in the optimizer's first parameter group.

    The result is ``None`` when the optimizer exposes no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

In [30]:
def initialize_weights(module):
    """Re-initialise the parameters of one layer; intended for ``model.apply(initialize_weights)``.

    Handled layer types:
        * ``nn.Conv2d``      - Kaiming-normal weights (``fan_out``, ReLU gain), zero bias.
        * ``nn.Linear``      - Kaiming-normal weights (``fan_in``, leaky-ReLU gain), zero bias.
        * ``nn.BatchNorm2d`` - weight set to 1, bias set to 0.
        * ``nn.Embedding``   - Xavier-normal weights.

    Every other module type is left untouched. Parameters that are ``None``
    (``bias=False`` layers, ``affine=False`` batch norm) are skipped instead of
    being passed to ``nn.init``, which would raise.
    """
    if isinstance(module, nn.Conv2d):
        # For Conv2d the mode is applied per channel; Conv2d usually has many
        # output channels, so fan_out is used.
        nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
        if module.bias is not None:
            nn.init.constant_(module.bias, 0)
    elif isinstance(module, nn.Linear):
        nn.init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='leaky_relu')
        if module.bias is not None:
            nn.init.constant_(module.bias, 0)
    elif isinstance(module, nn.BatchNorm2d):
        # With affine=False both parameters are None, so guard each one like the
        # bias checks in the branches above.
        if module.weight is not None:
            nn.init.constant_(module.weight, 1)
        if module.bias is not None:
            nn.init.constant_(module.bias, 0)
    elif isinstance(module, nn.Embedding):
        nn.init.xavier_normal_(module.weight)

In [31]:
def PSFA(pred,true,day=21):
    """Compute the PSFA score of scaled predictions, averaged over the five '대분류' categories.

    For every category code ``c`` in ``0..4`` the score is::

        1 - (1 / day) * sum( |p - t| / max(p, t)  *  t / column_sum(t) )

    where ``p`` / ``t`` are the un-scaled, rounded predictions / targets of the
    rows belonging to that category and ``column_sum(t)`` is the per-column
    (per-day) total of the targets in the category. A term whose denominator
    is zero contributes 0.

    Relies on two notebook globals: ``scaler`` (its ``inverse_transform`` is
    applied to the transposed arrays) and ``train_data`` (its '대분류' column is
    compared against the category codes to select rows, so it must have one
    entry per row of ``pred``).

    Args:
        pred: 2-D array of scaled predictions, one row per series, one column per day.
        true: 2-D array of scaled targets with the same shape as ``pred``.
        day: divisor applied to the summed error of each category (default 21).

    Returns:
        The mean of the five per-category scores.
    """
    n_categories = 5

    # Undo the scaling; the scaler works on the transposed layout.
    pred = scaler.inverse_transform(pred.T).T
    true = scaler.inverse_transform(true.T).T

    pred = np.round(pred, 0).astype(int)
    true = np.round(true, 0).astype(int)

    score_list = []
    for i in range(n_categories):
        idx = (train_data['대분류']==i).to_numpy()
        i_pred = pred[idx]
        i_true = true[idx]

        # Relative error per cell; np.divide(where=...) leaves 0 where the
        # denominator is 0 without evaluating (and warning about) the division.
        max_values = np.maximum(i_pred, i_true)
        left_values = np.divide(
            np.abs(i_pred - i_true), max_values,
            out=np.zeros(max_values.shape, dtype=float),
            where=max_values != 0,
        )

        # Share of each row in the category's per-column total of the targets.
        row_sum = np.sum(i_true, axis=0)
        right_values = np.divide(
            i_true, row_sum,
            out=np.zeros(i_true.shape, dtype=float),
            where=row_sum != 0,
        )

        total_value = np.sum(left_values * right_values) / day
        score_list.append(1 - total_value)

    return np.mean(score_list)

In [32]:
# Build the Temporal Fusion Transformer from the CFG hyper-parameters and place it
# on the selected device in one expression (Module.to returns the module itself).
model = TemporalFusionTransformer(
    input_size=CFG['TRAIN_WINDOW_SIZE'],
    output_size=CFG['SUB_PREDICT_SIZE'],
    batch_size=CFG['BATCH_SIZE'],
    hidden_size=CFG['HIDDEN_SIZE'],
    lstm_layers=CFG['LSTM_LAYERS'],
    attention_head_size=CFG['ATTENTION_HEAD_SIZE'],
    dropout=CFG['DROPOUT'],
    embedding_linear_dim_dict=CFG['EMBEDDING_LINEAR_DIM_DICT'],
).to(device)

# Run the custom initialiser over every sub-module; as the last expression of the
# cell this also displays the resulting module tree.
model.apply(initialize_weights)

TemporalFusionTransformer(
  (embeddings): ModuleDict(
    (static_cat): ModuleList(
      (0): Embedding(5, 5)
      (1): Embedding(11, 11)
      (2): Embedding(10, 10)
    )
    (X_cat): ModuleList(
      (0): Embedding(7, 7)
    )
    (X_future_cat): ModuleList(
      (0): Embedding(7, 7)
    )
  )
  (linears): ModuleDict(
    (static_cont): ModuleList()
    (X_cont): ModuleList(
      (0): Linear(in_features=1, out_features=32, bias=True)
    )
    (X_future_cont): ModuleList()
  )
  (static_variable_selection): VariableSelectionNetwork(
    (flattened_grn): GatedResidualNetwork(
      (fc1): Linear(in_features=26, out_features=3, bias=True)
      (elu): ELU(alpha=1.0)
      (fc2): Linear(in_features=3, out_features=3, bias=True)
      (gate_norm): GateAddNorm(
        (glu): GatedLinearUnit(
          (dropout): Dropout(p=0.3, inplace=False)
          (fc): Linear(in_features=3, out_features=6, bias=True)
        )
        (add_norm): AddNorm(
          (norm): LayerNorm((3,), eps

In [33]:
def to_device(data, device):
    """Move ``data`` to ``device``, descending into (possibly nested) dicts.

    A dict is rebuilt as a new dict with the same keys and every value moved
    recursively; anything else is moved through its own ``.to(device)`` method.
    """
    if not isinstance(data, dict):
        return data.to(device)

    moved = {}
    for name, item in data.items():
        moved[name] = to_device(item, device)
    return moved

In [34]:
# Mean-squared-error objective, placed on the training device.
criterion = nn.MSELoss()
criterion = criterion.to(device)

# Adam over every model parameter, using the learning rate from CFG.
optimizer = optim.Adam(model.parameters(), lr=CFG['LEARNING_RATE'])

In [35]:
# Training driver: for each epoch run one optimisation pass over train_loader, then an
# autoregressive validation pass over val_loader, and checkpoint the weights whenever the
# averaged validation PSFA improves. The final weights are saved once the loop ends.
best_loss = 9999999  # not read by this loop; kept so the name stays defined
best_model = None    # not read by this loop; kept so the name stays defined
best_psfa = 0.0

sub_size = CFG['SUB_PREDICT_SIZE']         # days produced by one forward pass
n_range = CFG['PREDICT_SIZE']//sub_size    # forward passes needed to cover the full horizon

for epoch in range(1, CFG['EPOCHS']+1):
    # ---- training pass ----
    model.train()
    train_loss = []
    for idx, (X, Y) in enumerate(tqdm(train_loader)):
        X = to_device(X,device)
        Y = to_device(Y,device)

        optimizer.zero_grad()

        output = model(X)
        output = output.squeeze(-1)
        loss = criterion(output, Y)

        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

        if idx%100==0:
            print(f'now_loss:{loss.item():.5f} mean_loss:{np.mean(train_loss):.5f}')

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0

    psfa = 0.0
    len_val = len(val_loader)

    pred_list = []
    true_list = []

    with torch.no_grad():
        for X, Y in tqdm(val_loader):
            X = to_device(X,device)
            Y = to_device(Y,device)

            # Work on a copy so the rolling window below never alters the batch itself.
            temp_X = {key: val.clone() for key, val in X.items()}
            total_preds = []

            for r in range(n_range):
                # Known future categorical inputs for the days predicted in this pass.
                future_cat = X['X_future_cat'][:,r*sub_size:(r+1)*sub_size,:]
                temp_X['X_future_cat'] = future_cat

                output = model(temp_X)

                total_preds.append(output.squeeze(-1))

                # Slide the input window forward: drop the oldest steps and append the
                # future categoricals / the model's own predictions at the end.
                temp_X['X_cat'] = torch.roll(temp_X['X_cat'],shifts=-sub_size,dims=1)
                temp_X['X_cat'][:,-sub_size:,:] = future_cat

                temp_X['X_cont'] = torch.roll(temp_X['X_cont'],shifts=-sub_size,dims=1)
                temp_X['X_cont'][:,-sub_size:,:] = output

            preds_total_days = torch.cat(total_preds,dim=1)

            loss = criterion(preds_total_days, Y)

            pred_list.extend(preds_total_days.cpu().numpy())
            true_list.extend(Y.cpu().numpy())

            val_loss+=loss.item()

    # Score the predictions in consecutive chunks of n_row rows.
    # NOTE(review): n_row is defined outside this cell - presumably the number of series
    # per validation window; confirm it matches len(train_data), which PSFA indexes with.
    for i in range(0,len(pred_list),n_row):
        psfa+=PSFA(np.array(pred_list[i:i+n_row]),np.array(true_list[i:i+n_row]),day=CFG['PREDICT_SIZE'])

    val_loss = val_loss/len_val
    val_psfa = psfa/(len(pred_list)/n_row)

    # Report the averaged PSFA (the value used for model selection), not the raw sum.
    print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] Val PSFA : [{val_psfa:.6f}]')

    if val_psfa>best_psfa:
        best_psfa=val_psfa
        torch.save(model.state_dict(),'./drive/MyDrive/lgaimers/pth_file/best_model.pth')
        print('Model Saved')

torch.save(model.state_dict(),'./drive/MyDrive/lgaimers/pth_file/last_model.pth')

  0%|          | 0/4854 [00:00<?, ?it/s]

now_loss:2.47922 mean_loss:2.47922
now_loss:0.01742 mean_loss:0.25804
now_loss:0.01496 mean_loss:0.13755
now_loss:0.01273 mean_loss:0.09621
now_loss:0.01133 mean_loss:0.07521
now_loss:0.01022 mean_loss:0.06251
now_loss:0.00983 mean_loss:0.05393
now_loss:0.00901 mean_loss:0.04782
now_loss:0.01041 mean_loss:0.04321
now_loss:0.01022 mean_loss:0.03961
now_loss:0.00873 mean_loss:0.03671
now_loss:0.01083 mean_loss:0.03433
now_loss:0.01074 mean_loss:0.03233
now_loss:0.00950 mean_loss:0.03063


KeyboardInterrupt: ignored

In [None]:
# Persist the model's current weights to last_model.pth (the same target path that the
# training cell writes to after its loop finishes).
torch.save(model.state_dict(),'./drive/MyDrive/lgaimers/pth_file/last_model.pth')

In [None]:
# Wrap the prepared test tensors in a dataset / loader; shuffle=False keeps the batches in
# dataset order.
test_dataset = CustomDataset(static_cat = static_cat_data, X_cont= input_cont_data, X_cat=input_cat_data, X_future_cat = future_cat_data)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
# Restore the best checkpoint. map_location remaps the stored tensors onto `device`, so a
# checkpoint written on the GPU can still be loaded when only the CPU is available.
model.load_state_dict(torch.load('./drive/MyDrive/lgaimers/pth_file/best_model.pth', map_location=device))

In [None]:
def inference(model, test_loader, device, sub_predict_size=None, predict_size=None):
    """Autoregressively forecast ``predict_size`` steps for every batch of ``test_loader``.

    Each forward pass predicts ``sub_predict_size`` steps. After a pass, the input
    window is slid forward along dim 1: the oldest steps of ``X_cat`` / ``X_cont``
    are dropped and the matching slice of ``X_future_cat`` / the model's own output
    is written at the end, so the next pass conditions on the previous predictions.

    The step and horizon default to ``CFG['SUB_PREDICT_SIZE']`` and
    ``CFG['PREDICT_SIZE']`` so that inference uses the same step the model was
    built with (its ``output_size``) and that the validation loop uses, instead of
    a hard-coded 7-step / 3-round schedule.

    Args:
        model: the network; called with the batch dict and put in eval mode here.
        test_loader: iterable of batches, each a dict of tensors containing at
            least ``'X_cat'``, ``'X_cont'`` and ``'X_future_cat'``.
        device: device the batches are moved to.
        sub_predict_size: steps predicted per forward pass (default: CFG value).
        predict_size: total number of steps to forecast (default: CFG value).

    Returns:
        ``np.ndarray`` with one row per sample, rows in loader order, holding the
        concatenated predictions of all passes.

    Raises:
        ValueError: if ``sub_predict_size`` is not positive or does not divide
            ``predict_size``.
    """
    if sub_predict_size is None:
        sub_predict_size = CFG['SUB_PREDICT_SIZE']
    if predict_size is None:
        predict_size = CFG['PREDICT_SIZE']
    if sub_predict_size <= 0 or predict_size % sub_predict_size != 0:
        raise ValueError(
            f'sub_predict_size ({sub_predict_size}) must be positive and divide predict_size ({predict_size})'
        )
    n_rounds = predict_size // sub_predict_size

    model.eval()
    predictions = []

    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = to_device(X,device)

            # Work on a copy so the rolling window never alters the batch itself.
            temp_X = {key: val.clone() for key, val in X.items()}
            total_preds = []

            for r in range(n_rounds):
                # Known future categorical inputs for the steps predicted in this pass.
                future_cat = X['X_future_cat'][:,r*sub_predict_size:(r+1)*sub_predict_size,:]
                temp_X['X_future_cat'] = future_cat

                output = model(temp_X)

                total_preds.append(output.squeeze(-1))

                temp_X['X_cat'] = torch.roll(temp_X['X_cat'],shifts=-sub_predict_size,dims=1)
                temp_X['X_cat'][:,-sub_predict_size:,:] = future_cat

                temp_X['X_cont'] = torch.roll(temp_X['X_cont'],shifts=-sub_predict_size,dims=1)
                temp_X['X_cont'][:,-sub_predict_size:,:] = output

            preds_total_days = torch.cat(total_preds,dim=1)

            # Move the concatenated model output to the CPU and convert it to a numpy array.
            predictions.extend(preds_total_days.cpu().numpy())

    return np.array(predictions)

In [None]:
# Run the autoregressive inference loop over the test loader; the result is a numpy array.
pred = inference(model, test_loader, device)

In [None]:
# Display the shape of the prediction array as a quick sanity check.
pred.shape

In [None]:
# Undo the scaling. The scaler is applied to the transposed predictions
# (presumably it was fitted with one column per series - TODO confirm).
pred_T = scaler.inverse_transform(pred.T)
pred_T

In [None]:
# Transpose back to the orientation returned by inference.
# NOTE: this rebinds `pred`, so re-running the inverse-transform cell after this one would
# apply the inverse transform to values that are already un-scaled.
pred=pred_T.T
pred

In [None]:
# Round every prediction to the nearest whole number and cast to int.
pred = np.round(pred, 0).astype(int)
pred

In [None]:
# Load the sample submission file as the output template and preview its first rows.
submit = pd.read_csv('./drive/MyDrive/lgaimers/open/sample_submission.csv')
submit.head()

In [None]:
# Overwrite every column except the first with the predictions, then preview the result.
# (The first column is presumably the row ID - TODO confirm; `pred` must match the shape
# of the remaining columns.)
submit.iloc[:,1:] = pred
submit.head()

In [None]:
# Write the filled-in submission to CSV without the DataFrame index column.
submit.to_csv('./drive/MyDrive/lgaimers/tft_best_colab_sub1_submit.csv', index=False)