In [1]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
# Global hyperparameters read by the dataset, model and training cells below.
CFG = {
    'TRAIN_WINDOW_SIZE':90, # train on 90 days of history
    'PREDICT_SIZE':21, # predict the following 21 days
    'EPOCHS': 1,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':4096,
    'SEED':41
}

In [3]:
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy's global generator and PyTorch (CPU and all CUDA devices), and
    configures cuDNN for reproducible execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover every GPU, not only the current one
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN auto-tune and pick different kernels from run to
    # run, which defeats `deterministic = True`; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # fix the seed

In [4]:
# Load the training table without the 'ID' and '제품' columns.
# Later cells treat the first five remaining columns as categorical and the rest as sales values.
train_data = pd.read_csv('train.csv').drop(columns=['ID', '제품'])

In [5]:
# Brand keyword mention counts, used later as a weight.
# The '브랜드' column is dropped and missing counts are treated as 0.
keyword = pd.read_csv('brand_keyword_cnt.csv').drop(columns=['브랜드'])
keyword = keyword.fillna(0)

In [6]:
# Rescale the keyword mention counts of every row into the [0, 1] range
# (row-wise min-max normalisation), computed for the whole table at once.
row_min = keyword.min(axis=1)
row_span = keyword.max(axis=1) - row_min

# A row whose values are all equal has a zero span. Dividing by 1 instead keeps
# such rows at exactly 0 (x - min == 0), matching the intended "all zeros" result
# without a division by zero.
safe_span = row_span.where(row_span != 0, 1)

# Building a new float frame (instead of writing row by row through .iloc) avoids
# assigning float results into integer-typed columns.
keyword = keyword.sub(row_min, axis=0).div(safe_span, axis=0)

  0%|          | 0/3822 [00:00<?, ?it/s]

In [7]:
# Shifted sigmoid used to turn normalised keyword counts into weights.
def sigmoid(x):
    """Return the logistic function of ``x`` lowered by 0.2 and rounded to 2 decimals.

    Note that this is not the plain sigmoid: the result is ``1 / (1 + e^-x) - 0.2``,
    rounded with the built-in ``round``.

    Args:
        x: A number, or an object that supports NumPy ufuncs and ``round``.

    Returns:
        The shifted, rounded value in the same container type as ``x``.
    """
    logistic = 1 / (1 + np.exp(-x))
    shifted = logistic - 0.2
    return round(shifted, 2)

In [8]:
# Apply the shifted sigmoid to the normalised keyword table to obtain keyword weights.
sigmoid_df = keyword.transform(sigmoid)

In [9]:
# Brand name of every training row (read again from train.csv); used to look up keyword weights.
matching = pd.read_csv('./train.csv')['브랜드']

In [10]:
# Convert each row's brand name into an integer code.
# NOTE(review): these codes are later used as row labels of sigmoid_df, which presumably
# requires brand_keyword_cnt.csv to list brands in the encoder's order — confirm.
label_encoder = LabelEncoder()
label_encoder.fit(matching)
matching = label_encoder.transform(matching)
print(matching)
print(len(matching))

[   0    1    1 ... 2894 2894 2894]
28894


In [11]:
# Full training table, loaded again; its per-row total is the denominator of the price weight below.
qty = pd.read_csv('./train.csv')

In [12]:
# Sales table without the 'ID' and '제품' columns; missing values are treated as 0.
sales_w = pd.read_csv('./sales.csv').drop(columns=['ID', '제품'])
sales_w = sales_w.fillna(0)

In [13]:
# Per-row totals over the value columns only.
# `qty` is the raw train.csv and still contains the string columns 'ID' and '제품'
# (they were only removed from `sales_w`), so they must be dropped here as well;
# otherwise the row-wise sum runs over non-numeric data.
# errors='ignore' lets the same list be used for `sales_w`, where they are already gone.
non_value_columns = ['ID', '제품', '대분류', '소분류', '중분류', '브랜드', '쇼핑몰']
qty_data = qty.drop(columns=non_value_columns, errors='ignore').sum(axis=1)
sales_data = sales_w.drop(columns=non_value_columns, errors='ignore').sum(axis=1)

In [14]:
# Row total of sales.csv divided by row total of train.csv (the next cell treats this as an average price).
# Rows whose train.csv total is 0 yield inf or NaN here; the next cell maps both to a weight of 0.
sales_weights = sales_data / qty_data

In [15]:
# Take the PMC as 100 and, for average prices of 100 or more, use the reciprocal as the weight
# (the assumption being that more expensive products see less demand); otherwise the weight is 0.
sales_weights = sales_weights.apply(lambda avg_price: 1 / avg_price if avg_price >= 100 else 0)

In [None]:
# Apply the weights to the training data: every sales value v becomes
# round(v + v * mul), where mul = price weight of the row + keyword weight of the
# row's brand for the same column position.
# The whole table is computed at once instead of one scalar .loc read/write per
# cell, which is the same arithmetic without millions of Python-level lookups.
# NOTE: this cell is not idempotent — re-running it applies the weights again.
sales_columns = train_data.columns[5:]

# Keyword weights: row chosen by the encoded brand of each training row, columns
# matched by position (first sales column <-> first keyword column, and so on).
keyword_weights = sigmoid_df.loc[matching, sigmoid_df.columns[:len(sales_columns)]].to_numpy()
price_weights = sales_weights.to_numpy().reshape(-1, 1)  # one weight per row, broadcast across columns
mul = price_weights + keyword_weights

sales_values = train_data[sales_columns].to_numpy(dtype=float)
# np.round, like the built-in round(), rounds halves to the nearest even integer.
train_data[sales_columns] = np.round(sales_values + sales_values * mul).astype(np.int64)

  0%|          | 0/28894 [00:00<?, ?it/s]

In [None]:
# Display the weighted training table for inspection.
train_data

In [None]:
# Select only the sales columns (everything from the 6th column on).
data_subset = train_data.iloc[:, 5:]

# Per-row maximum and minimum.
max_values = data_subset.max(axis=1)
min_values = data_subset.min(axis=1)

# Rows whose range is 0 cannot be divided; they are handled separately.
diff = max_values - min_values
mask = diff != 0

# Row-wise min-max normalisation. The result is built as a separate float frame
# and assigned to whole columns: writing floats into a subset of rows of
# integer-typed columns via .loc is deprecated in pandas.
scaled = data_subset.astype(float).sub(min_values, axis=0).div(diff.where(mask, 1), axis=0)
scaled.loc[~mask] = 0.0  # constant rows become all zeros
train_data[data_subset.columns] = scaled

# Keep each row's max/min (keyed by row index) so predictions can be un-scaled later.
scale_max_dict = max_values.to_dict()
scale_min_dict = min_values.to_dict()

In [None]:
# Label Encoding: replace every categorical column with integer codes.
# A single encoder object is refitted for each column in turn, so after the loop
# it holds the classes of the last column only.
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류', '브랜드', '쇼핑몰']

for column_name in categorical_columns:
    train_data[column_name] = label_encoder.fit_transform(train_data[column_name])

In [None]:
class CustomDataset(Dataset):
    """Sliding-window dataset over a table of 5 leading feature columns plus a value series.

    Each row of ``data`` is read as: columns 0-4 are per-row features, the remaining
    columns are the series. In training mode every row contributes one sample per
    window position; in inference mode every row contributes exactly one sample
    built from its last ``train_size`` values.

    Args:
        data: Table exposing ``.values`` (converted to a NumPy array).
        train_size: Number of time steps in the model input.
        predict_size: Number of time steps in the target.
        is_inference: When True, samples have no target.
    """

    def __init__(self, data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE'], is_inference=False):
        self.data = data.values  # DataFrame -> numpy array
        self.train_size = train_size
        self.predict_size = predict_size
        self.window_size = train_size + predict_size
        self.is_inference = is_inference

    def _windows_per_row(self):
        """Number of window start positions in one row (columns - window - 4)."""
        return self.data.shape[1] - self.window_size - 4

    def _attach_row_features(self, row_features, history):
        """Repeat the row features for every time step and append the history as the last column."""
        repeated = np.tile(row_features, (self.train_size, 1))
        return np.column_stack((repeated, history))

    def __len__(self):
        if not self.is_inference:
            return self.data.shape[0] * self._windows_per_row()
        return len(self.data)

    def __getitem__(self, idx):
        if self.is_inference:
            # Inference: use the most recent `train_size` values of the row.
            row_features = self.data[idx, :5]
            history = self.data[idx, -self.train_size:]
            return self._attach_row_features(row_features, history)

        # Training: map the flat index to (row, window start).
        row, col = divmod(idx, self._windows_per_row())
        row_features = self.data[row, :5]
        series = self.data[row, 5:]
        window = series[col : col + self.window_size]
        input_data = self._attach_row_features(row_features, window[:self.train_size])
        target_data = window[self.train_size:]
        return input_data, target_data

In [None]:
# Create the CustomDataset instance
dataset = CustomDataset(train_data)

# Size of the whole dataset
total_size = len(dataset)

# Sizes of the two splits (80% train / 20% validation)
train_size = int(total_size * 0.8)
val_size = total_size - train_size

# Split the dataset with random_split
# NOTE(review): windows taken from the same row overlap, so a random split places
# near-identical windows in both sets; validation loss may look optimistic — confirm this is acceptable.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create the DataLoader instances
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [None]:
# Declaration of the model that stacks an LSTM model and a GRU model into an ensemble
class LSTMModel(nn.Module): # LSTM Model
    """Single-layer LSTM followed by a two-layer MLP head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: LSTM hidden width; the head's hidden layer is half of it.
        output_size: Number of values produced per sample.
    """

    def __init__(self, input_size=5, hidden_size=512, output_size=CFG['PREDICT_SIZE']):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        # NOTE(review): registered but never applied in forward().
        self.layer_norm = nn.LayerNorm(hidden_size)
        head_width = hidden_size // 2
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, head_width),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(head_width, output_size),
        )

    def forward(self, x):
        """Run the LSTM over ``x`` and map its output at the last time step through the head."""
        sequence_out, _ = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

class GRUModel(nn.Module): # GRU Model
    """Single-layer GRU followed by a two-layer MLP head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: GRU hidden width; the head's hidden layer is half of it.
        output_size: Number of values produced per sample.
    """

    def __init__(self, input_size=5, hidden_size=512, output_size=CFG['PREDICT_SIZE']):
        super().__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        head_width = hidden_size // 2
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, head_width),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(head_width, output_size),
        )

    def forward(self, x):
        """Run the GRU over ``x`` and map its output at the last time step through the head."""
        sequence_out, _ = self.gru(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# Standalone instances built with the default sizes.
# NOTE(review): nothing else in this notebook references them; the ensemble below builds its own sub-models.
lstm_model = LSTMModel()
gru_model = GRUModel()

class StackingEnsembleModel(nn.Module): # Stacking Ensemble Model
    """Combine an LSTMModel and a GRUModel with a small MLP on their concatenated outputs.

    Args:
        input_size: Number of features per time step, passed to both sub-models.
        hidden_size: Hidden width of both sub-models and of the combining MLP.
        output_size: Number of values produced per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm_model = LSTMModel(input_size, hidden_size, output_size)
        self.gru_model = GRUModel(input_size, hidden_size, output_size)
        # The combiner sees both sub-model outputs side by side, hence 2 * output_size inputs.
        stacked_width = output_size * 2
        self.ensemble_fc = nn.Sequential(
            nn.Linear(stacked_width, hidden_size),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, x):
        """Feed ``x`` to both sub-models and merge their predictions."""
        branch_outputs = (self.lstm_model(x), self.gru_model(x))
        stacked = torch.cat(branch_outputs, dim=1)
        return self.ensemble_fc(stacked)

output_size = CFG['PREDICT_SIZE']
# The recurrent layers must be built for the feature count the dataset actually
# produces. CustomDataset stacks the 5 leading feature columns with the sales
# value, i.e. 6 features per time step; a hard-coded input_size=5 makes the
# LSTM/GRU reject the first batch. Reading the width from a real sample keeps
# the two in sync.
input_size = dataset[0][0].shape[-1]
ensemble_model = StackingEnsembleModel(input_size=input_size, hidden_size=256, output_size=output_size)
print(ensemble_model)

In [None]:
def train(model, optimizer, train_loader, val_loader, device):
    """Train ``model`` with MSE loss and return it holding the best validation weights.

    Runs ``CFG['EPOCHS']`` epochs. After each epoch the validation loss is
    computed; whenever it improves, a copy of the model's weights is stored.
    At the end those weights are loaded back into ``model``.

    Args:
        model: Module to train (moved to ``device``).
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: Iterable of ``(X, Y)`` training batches.
        val_loader: Iterable of ``(X, Y)`` validation batches.
        device: Device on which to run.

    Returns:
        ``model`` itself, with the weights of the epoch that had the lowest
        validation loss. If no epoch ever improved on the initial bound (for
        example every loss was NaN), the final weights are kept.
    """
    import copy  # local import: only needed for the weight snapshot

    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = float('inf')
    best_state = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.float().to(device)
            Y = Y.float().to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')

        if best_loss > val_loss:
            best_loss = val_loss
            # Snapshot the weights. Keeping a reference to `model` would not
            # preserve anything, because later epochs keep updating that same object.
            best_state = copy.deepcopy(model.state_dict())
            print('Model Saved')

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [None]:
def validation(model, val_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    Switches the model to evaluation mode and runs without gradient tracking.

    Args:
        model: Module to evaluate.
        val_loader: Iterable of ``(X, Y)`` batches.
        criterion: Loss callable taking ``(output, target)``.
        device: Device the batches are moved to.

    Returns:
        Mean of the per-batch loss values.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for features, targets in tqdm(iter(val_loader)):
            features = features.float().to(device)
            targets = targets.float().to(device)
            batch_loss = criterion(model(features), targets)
            batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

In [None]:
# `device` is used by the training and inference calls but was never defined;
# pick the GPU when one is available and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ensemble_model
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
infer_model = train(model, optimizer, train_loader, val_loader, device)

In [None]:
# Inference inputs are built from the training table itself: in inference mode the
# dataset yields one sample per row made from that row's last TRAIN_WINDOW_SIZE values.
# shuffle=False keeps predictions in row order.
test_dataset = CustomDataset(data=train_data, is_inference=True)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [None]:
def inference(model, test_loader, device):
    """Run ``model`` over every batch of ``test_loader`` and collect the outputs.

    Args:
        model: Trained module; switched to evaluation mode here.
        test_loader: Iterable of input batches (no targets).
        device: Device the batches are moved to.

    Returns:
        NumPy array with one prediction row per input sample, in loader order.
    """
    predictions = []

    # Put the model in evaluation mode explicitly so Dropout is disabled
    # regardless of the mode the caller left the model in.
    model.eval()

    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.float().to(device)

            output = model(X)

            # Move the model output to the CPU and convert it to a numpy array
            output = output.cpu().numpy()

            predictions.extend(output)

    return np.array(predictions)

In [None]:
# Predict the next PREDICT_SIZE values for every row with the trained model.
pred = inference(infer_model, test_loader, device)

In [None]:
# Inverse-scale the predictions with each row's stored min/max.
# NOTE(review): the min/max were taken after the sales values had been inflated by the
# price/keyword weights, so the un-scaled predictions are on that inflated scale — confirm this is intended.
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]

# Post-processing: round to whole numbers and cast to int
pred = np.round(pred, 0).astype(int)

In [None]:
# Check the shape of the prediction array before writing it into the submission.
pred.shape

In [None]:
# Load the submission template.
submit = pd.read_csv('./sample_submission.csv')
submit.head()

In [None]:
# Overwrite every column after the first with the predictions (row order follows train_data).
submit.iloc[:,1:] = pred
submit.head()

In [None]:
# Write the submission file without the DataFrame index.
submit.to_csv('final.csv', index=False)