## Import

In [None]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.seasonal import seasonal_decompose

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
device

device(type='cuda')

## Hyperparameter Setting

In [None]:
CFG = {
    'TRAIN_WINDOW_SIZE':35, # length of the input window: number of past days fed to the model
    'PREDICT_SIZE':21, # forecast horizon: number of future days to predict
    'EPOCHS':10,
    'LEARNING_RATE':0.001,
    'BATCH_SIZE':1024,
    'SEED':41
}

In [None]:
def seed_everything(seed):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to Python's ``random``, ``PYTHONHASHSEED``,
            NumPy and PyTorch (CPU and all CUDA devices).
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently does nothing when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the reproducibility this function is meant to give.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed 고정

### 데이터 불러오기

In [None]:
cd /content/drive/MyDrive/Aimers3

/content/drive/MyDrive/Aimers3


In [None]:
train_data = pd.read_csv('./train.csv').drop(columns=['ID', '제품'])

### 데이터 전처리

In [None]:
# Reload the raw table, keeping 'ID' this time (only the product-name column is dropped).
train_data = pd.read_csv('./train.csv').drop(columns=['제품'])

# Percentage of zero-sales days per row; columns from position 5 onward are
# treated as the daily sales values (presumably the first five are ID/category
# columns - verify against train.csv).
zero_ratios = (train_data.iloc[:, 5:] == 0).mean(axis=1) * 100
ids_with_low_zeros = zero_ratios[zero_ratios <= 70].index

train_df_copy = train_data.copy()
sales_columns = train_df_copy.columns[5:]  # loop-invariant, computed once

# `row_label` instead of `id` so the builtin `id` is not shadowed.
for row_label in tqdm(ids_with_low_zeros):
    sales_series = train_df_copy.loc[row_label, sales_columns]

    # Series starts at zero: treat every zero as missing and back-fill.
    if sales_series.iloc[0] == 0:
        sales_series = sales_series.replace({0: np.nan}).bfill()

    # Series ends at zero: treat every zero as missing and forward-fill.
    if sales_series.iloc[-1] == 0:
        sales_series = sales_series.replace({0: np.nan}).ffill()

    # NOTE(review): zeros are only converted to NaN when the first or last value
    # is zero; rows with purely interior zeros pass through interpolate()
    # unchanged. Confirm that this is intended.
    interpolated_sales = sales_series.interpolate()

    train_df_copy.loc[row_label, sales_columns] = interpolated_sales.values

train_data = train_df_copy.copy()

  0%|          | 0/7642 [00:00<?, ?it/s]

In [None]:
# Min-max scale the numeric columns, separately for each row.
numeric_cols = train_data.columns[5:]
# Compute the min and max of each row (axis=1 reduces across the columns)
min_values = train_data[numeric_cols].min(axis=1)
max_values = train_data[numeric_cols].max(axis=1)
# Compute each row's range (max - min); replace a range of 0 with 1 to avoid dividing by zero
ranges = max_values - min_values
ranges[ranges == 0] = 1
# Apply the min-max scaling
train_data[numeric_cols] = (train_data[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
# Store each row's min and max as dictionaries keyed by the row index
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

In [None]:
# Integer-encode the ID and category columns; the single encoder is refit on each column in turn.
label_encoder = LabelEncoder()
categorical_columns = ['ID','대분류', '중분류', '소분류', '브랜드']

for col in categorical_columns:
    train_data[col] = label_encoder.fit_transform(train_data[col])

In [None]:
train_data.drop(['대분류', '중분류', '소분류', '브랜드'],axis=1,inplace=True)
train_data

Unnamed: 0,ID,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.250000,0.125000,0.125000,0.125000,0.125000,0.125000
2,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,3,0.025641,0.025641,0.025641,0.025641,0.025641,0.025641,0.025641,0.025641,0.025641,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,4,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,15885,0.006452,0.006452,0.006452,0.006452,0.006452,0.006452,0.006452,0.006452,0.006452,...,0.006452,0.006452,0.006452,0.006452,0.006452,0.006452,0.006452,0.006452,0.006452,0.006452
15886,15886,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,...,0.016393,0.016393,0.016393,0.016393,0.008197,0.008197,0.024590,0.000000,0.000000,0.016393
15887,15887,0.080000,0.080000,0.080000,0.080000,0.080000,0.080000,0.080000,0.080000,0.080000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
15888,15888,0.076923,0.076923,0.076923,0.076923,0.076923,0.076923,0.076923,0.076923,0.076923,...,0.076923,0.076923,0.076923,0.076923,0.076923,0.076923,0.076923,0.076923,0.076923,0.076923


In [None]:

def make_train_data(data,train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    """Build sliding-window (input, target) training samples from each row of `data`.

    Column 0 of `data` is repeated along every time step as a per-row code and
    the remaining columns are treated as the daily sales series.

    Args:
        data: DataFrame whose first column is the row code and whose other
            columns are the values to window over.
        train_size: Number of consecutive days used as model input.
        predict_size: Number of following days used as the target.

    Returns:
        Tuple ``(input_data, target_data)`` of float arrays with shapes
        ``(num_rows * n_windows, train_size, 2)`` and
        ``(num_rows * n_windows, predict_size)``, where
        ``n_windows = n_sales_columns - (train_size + predict_size) + 1``
        (clamped at 0 when the series is shorter than one window).
    """
    n_code_cols = 1  # leading columns that hold the per-row code, not sales
    num_rows = len(data)
    window_size = train_size + predict_size

    # Count windows over the sales columns only. Using len(data.columns) here
    # would also count the code column, allocating one extra slot per row that
    # the loop never writes - np.empty would leave it as uninitialised memory.
    n_sales_cols = data.shape[1] - n_code_cols
    n_windows = max(n_sales_cols - window_size + 1, 0)

    input_data = np.empty((num_rows * n_windows, train_size, n_code_cols + 1))
    target_data = np.empty((num_rows * n_windows, predict_size))

    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :n_code_cols])
        sales_data = np.array(data.iloc[i, n_code_cols:])
        base = i * n_windows  # first output slot belonging to row i

        for j in range(n_windows):
            window = sales_data[j : j + window_size]
            # Feature 0..n_code_cols-1: row code (constant over time); last feature: sales.
            temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
            input_data[base + j] = temp_data
            target_data[base + j] = window[train_size:]

    return input_data, target_data

In [None]:

def make_predict_data(data,train_size=CFG['TRAIN_WINDOW_SIZE']):
    """Build the inference inputs: one sample per row from its last `train_size` columns.

    Args:
        data: DataFrame whose first column is the row code and whose trailing
            columns are the daily sales values.
        train_size: Number of most recent days to feed the model (same length
            as the training input window).

    Returns:
        Float array of shape ``(len(data), train_size, 2)``; feature 0 is the
        row code repeated along time, feature 1 is the sales value.
    """
    n_series = len(data)
    n_features = len(data.iloc[0, :1]) + 1
    input_data = np.empty((n_series, train_size, n_features))

    for row_idx in tqdm(range(n_series)):
        code = np.array(data.iloc[row_idx, :1])
        # Most recent `train_size` values of this row.
        recent_sales = np.array(data.iloc[row_idx, -train_size:])[:train_size]
        repeated_code = np.tile(code, (train_size, 1))
        input_data[row_idx] = np.column_stack((repeated_code, recent_sales))

    return input_data

In [None]:
train_input, train_target = make_train_data(train_data)
test_input = make_predict_data(train_data)

  0%|          | 0/15890 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

In [None]:
# Train / Validation Split
data_len = len(train_input)
# val_input = train_input[-int(data_len*0.2):]
# val_target = train_target[-int(data_len*0.2):]
# train_input = train_input[:-int(data_len*0.2)]
# train_target = train_target[:-int(data_len*0.2)]

In [None]:
train_input.shape, train_target.shape, #val_input.shape, val_target.shape, test_input.shape

((6435450, 35, 2), (6435450, 21))

In [None]:
train_input.shape, train_target.shape, #val_input.shape, val_target.shape, test_input.shape

((6483120, 35, 1), (6483120, 21))

In [None]:
train_input.shape, train_target.shape, #val_input.shape, val_target.shape, test_input.shape

((6483120, 35, 1), (6483120, 21))

In [None]:
train_input.shape, train_target.shape, #val_input.shape, val_target.shape, test_input.shape

((6483120, 35, 5), (6483120, 21))

In [None]:
train_input.shape, train_target.shape, #val_input.shape, val_target.shape, test_input.shape

((5609170, 90, 5), (5609170, 21))

### 모델 선언

In [None]:
class BaseModel(nn.Module):
    """Bidirectional GRU with an ID embedding and additive attention pooling.

    The embedded ID is concatenated to every time step of the input sequence,
    the sequence is encoded by a bidirectional GRU, the GRU outputs are pooled
    with softmax attention over time, and a small MLP followed by ReLU produces
    `output_size` non-negative values.
    """

    def __init__(self, input_size=1, hidden_size=512, output_size=CFG['PREDICT_SIZE'], embedding_dim=50, n_ids=15890):
        super().__init__()

        # Learned vector per ID.
        self.embedding = nn.Embedding(n_ids, embedding_dim)

        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            input_size + embedding_dim,
            hidden_size,
            batch_first=True,
            bidirectional=True,
        )

        # Scores each time step of the (forward + backward) GRU output.
        self.attn_weights = nn.Linear(hidden_size * 2, 1)

        # Prediction head.
        self.fc = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_size // 2, output_size),
        )

        self.actv = nn.ReLU()

    def forward(self, x, ids):
        """Return the forecast for a batch.

        Args:
            x: Input sequences; assumed (batch, seq_len, input_size) - TODO confirm.
            ids: Integer IDs; assumed (batch, 1) so the embedding can be
                repeated along the time axis - TODO confirm.
        """
        seq_len = x.shape[1]
        # Tile the ID embedding along time and append it to every step.
        id_features = self.embedding(ids).repeat(1, seq_len, 1)
        gru_in = torch.cat([x, id_features], dim=-1)

        h0 = self.init_hidden(gru_in.size(0), gru_in.device)
        gru_out, _ = self.gru(gru_in, h0)

        # Softmax over the time dimension, then weighted sum of GRU outputs.
        scores = self.attn_weights(gru_out)
        weights = F.softmax(scores, dim=1)
        context = (weights * gru_out).sum(dim=1)

        prediction = self.actv(self.fc(context))
        return prediction.squeeze(1)

    def init_hidden(self, batch_size, device):
        """Return a zero initial hidden state for the two GRU directions."""
        state_shape = (2, batch_size, self.hidden_size)
        return torch.zeros(state_shape, device=device)


### 모델 학습

## Run !!

In [None]:
# Modifying the CustomDataset to return ID and sales data separately
class CustomDatasetWithID(Dataset):
    """Dataset that splits each sample into its sales features and its ID.

    `X` is indexed as ``X[index, time, feature]`` with feature 0 holding the ID
    and the remaining features holding the sales values. `Y` holds the targets,
    or is ``None`` for inference.
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        # Everything after feature 0 is sales data.
        sales = torch.Tensor(self.X[index, :, 1:])
        # The ID is constant along the sequence, so the first step is enough.
        series_id = torch.LongTensor([self.X[index, 0, 0]])

        if self.Y is None:
            return sales, series_id
        return sales, series_id, torch.Tensor(self.Y[index])

# Update the DataLoader with the new dataset
train_dataset_with_id = CustomDatasetWithID(train_input, train_target)
train_loader_with_id = DataLoader(train_dataset_with_id, batch_size=CFG['BATCH_SIZE'], shuffle=True, num_workers=0)


# Modify the training function to handle the separated data
def train_with_ids(model, optimizer, train_loader, device):
    """Train `model` for ``CFG['EPOCHS']`` epochs and return it.

    Args:
        model: Module called as ``model(x_data, id_data)``.
        optimizer: Optimizer already bound to the model's parameters.
        train_loader: Iterable yielding ``(x_data, id_data, y_data)`` batches.
        device: Device the model and every batch are moved to.

    Returns:
        The same `model` object, trained in place.
    """
    model.to(device)
    criterion = nn.MSELoss().to(device)  # mean squared error loss

    for epoch in range(1, CFG['EPOCHS'] + 1):
        model.train()
        train_loss = []
        for x_data, id_data, y_data in tqdm(train_loader):
            x_data, id_data, y_data = x_data.to(device), id_data.to(device), y_data.to(device)

            optimizer.zero_grad()

            output = model(x_data, id_data)
            loss = criterion(output, y_data)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        # Mean of the per-batch losses for this epoch.
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}]')
    return model

# Test the new training function
model_test = BaseModel()
optimizer = torch.optim.AdamW(params=model_test.parameters(), lr=CFG["LEARNING_RATE"])
infer_model_with_id = train_with_ids(model_test, optimizer, train_loader_with_id, device)


  0%|          | 0/6285 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.01800]


  0%|          | 0/6285 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

## 모델 추론

In [None]:
test_dataset = CustomDatasetWithID(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    """Run `model` over `test_loader` and return all predictions as one array.

    Args:
        model: Module called as ``model(x_data, id_data)``.
        test_loader: Iterable yielding ``(x_data, id_data)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        NumPy array with one row per sample, in loader order.
    """
    model.eval()
    collected = []

    # No gradients are needed for prediction.
    with torch.no_grad():
        for x_data, id_data in tqdm(test_loader):
            batch_out = model(x_data.to(device), id_data.to(device))
            # Extending with a 2-D array appends its rows one by one.
            collected.extend(batch_out.cpu().numpy())

    return np.array(collected)


In [None]:
# Use the model returned by train_with_ids; no variable named `model` is
# defined in the visible cells of this notebook.
pred = inference(infer_model_with_id, test_loader, device)

  0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
# Undo the per-row min-max scaling on the predictions
for idx in range(len(pred)):
    row_min = scale_min_dict[idx]
    row_max = scale_max_dict[idx]
    pred[idx, :] = pred[idx, :] * (row_max - row_min) + row_min

# Post-processing: round to the nearest whole number and cast to int
pred = np.round(pred, 0).astype(int)

In [None]:
pred.shape

(15890, 21)

## Submission

In [None]:
submit = pd.read_csv('./sample_submission.csv')
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
submit.iloc[:,1:] = pred
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,2,2,2,2,2,2,2,3,...,2,2,2,2,2,2,2,2,2,2
2,2,1,1,1,1,1,2,1,2,2,...,2,3,2,2,2,2,2,3,3,3
3,3,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,2,2,2
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
submit.to_csv('gru_nofeature_35_att_emb.csv',index=False)

In [None]:
submit[submit["ID"] == 224]

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
224,224,43,31,29,27,25,27,28,29,29,...,26,27,28,28,28,28,25,25,26,27
