### 구글 드라이브 연동

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 라이브러리 불러오기

In [2]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

### gpu 이용

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

###하이퍼파라미터 설정

In [4]:
CFG = {
    'TRAIN_WINDOW_SIZE':90, # 90일치로 학습
    'PREDICT_SIZE':21, # 21일치 예측
    'EPOCHS':10,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':2048,
    'SEED':41
}

###시드 고정

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

###Feature Engineering

###데이터 불러오기

In [6]:
train_data = pd.read_csv('/content/drive/MyDrive/lg_aimers/open/train.csv')
brand_keyword_data = pd.read_csv('/content/drive/MyDrive/lg_aimers/open/brand_keyword_cnt.csv')
sales_data = pd.read_csv('/content/drive/MyDrive/lg_aimers/open/sales.csv')

### 브랜드 키워드 feature 추가

In [7]:
for date_col in brand_keyword_data.columns[1:]:
    # 새로운 이름의 columns 생성
    new_col_name = f"{date_col}_brand_cnt"

    # 데이터 병합
    train_data = pd.merge(train_data, brand_keyword_data[['브랜드', date_col]],
                          on='브랜드', how='left')

    # column 이름 재설정
    train_data.rename(columns={date_col: new_col_name}, inplace=True)


In [8]:
# 새로운 변수에 추가
train_data_brand = train_data

In [9]:
# train csv파일 다시 불러오기
train_data = pd.read_csv('/content/drive/MyDrive/lg_aimers/open/train.csv')

### 제품별 평균금액 feature 추가

In [10]:
# 숫자형 데이터가 아닌 coulums들을 drop
train_quantities = train_data.drop(columns=[ '제품', '대분류', '중분류', '소분류', '브랜드'])

# 판매금액을 판매량으로 나누어 평균금액 구하기
daily_prices = sales_data.set_index('ID').divide(train_quantities.set_index('ID'))

In [11]:
average_prices = daily_prices.mean(axis=1).reset_index(name='평균금액')

In [12]:
train_data_brand_price = pd.merge(train_data_brand, average_prices, on='ID', how='left')

In [13]:
brand_index = train_data_brand.columns.get_loc('브랜드')

train_data_brand_price.insert(brand_index + 1, '평균금액임시', train_data_brand_price['평균금액'])

train_data_brand_price.drop(columns=['평균금액'], inplace=True)

train_data_brand_price.rename(columns={'평균금액임시': '평균금액'}, inplace=True)

  train_data_brand_price.insert(brand_index + 1, '평균금액임시', train_data_brand_price['평균금액'])


In [14]:
# 다음 코드 파일에서 다른 방향으로 전처리하기 위해 병합한 파일을 불러오기 위한'train_data_brand_price.csv'저장해두기!
train_data_brand_price.to_csv('/content/drive/MyDrive/lg_aimers/open/train_data_brand_price.csv', index=False)

In [15]:
# train_data 라는 변수명으로 저장
train_data = train_data_brand_price.drop(columns=['ID', '제품'])

### 결측치 확인 및 처리

---
전처리의 방향: 결측치를 0으로 처리

In [None]:
# NaN value를 0으로 처리
train_data.fillna(0, inplace=True)

In [None]:
# 결측치 확인
nan_per_column = train_data.isna().sum()

total_nan = nan_per_column.sum()

print(nan_per_column)
print("Total number of NaN values:", total_nan)

대분류             0
중분류             0
소분류             0
브랜드             0
평균금액            0
               ..
2023-03-31_y    0
2023-04-01_y    0
2023-04-02_y    0
2023-04-03_y    0
2023-04-04_y    0
Length: 923, dtype: int64
Total number of NaN values: 0


###데이터 전처리

**average price-> min-max**

In [None]:
# 평균금액 column min-max scaling
numeric_cols = train_data.columns[4:5]
print(numeric_cols)
# 각 column의 min 및 max 계산
min_values = train_data[numeric_cols].min(axis=0)
max_values = train_data[numeric_cols].max(axis=0)
# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
ranges = max_values - min_values
ranges[ranges == 0] = 1
# min-max scaling 수행
train_data[numeric_cols] = (train_data[numeric_cols].subtract(min_values, axis=1)).div(ranges, axis=1)
# max와 min 값을 dictionary 형태로 저장
scale_min_dict_avg_price = min_values.to_dict()
scale_max_dict_avg_price = max_values.to_dict()

Index(['평균금액'], dtype='object')


**sales count-> min-max**

In [None]:
# 판매량 column min-max scaling
numeric_cols = train_data.columns[5:464]
print(numeric_cols)
# 칵 column의 min 및 max 계산
min_values = train_data[numeric_cols].min(axis=1)
max_values = train_data[numeric_cols].max(axis=1)
# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
ranges = max_values - min_values
ranges[ranges == 0] = 1
# min-max scaling 수행
train_data[numeric_cols] = (train_data[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
# max와 min 값을 dictionary 형태로 저장
scale_min_dict_sale = min_values.to_dict()
scale_max_dict_sale = max_values.to_dict()

Index(['2022-01-01_x', '2022-01-02_x', '2022-01-03_x', '2022-01-04_x',
       '2022-01-05_x', '2022-01-06_x', '2022-01-07_x', '2022-01-08_x',
       '2022-01-09_x', '2022-01-10_x',
       ...
       '2023-03-26_x', '2023-03-27_x', '2023-03-28_x', '2023-03-29_x',
       '2023-03-30_x', '2023-03-31_x', '2023-04-01_x', '2023-04-02_x',
       '2023-04-03_x', '2023-04-04_x'],
      dtype='object', length=459)


**brand_cnt->min-max**

In [None]:
# 브랜드 언급량 column min-max scaling
numeric_cols = train_data.columns[464:]
print(numeric_cols)
# 칵 column의 min 및 max 계산
min_values = train_data[numeric_cols].min(axis=1)
max_values = train_data[numeric_cols].max(axis=1)
# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
ranges = max_values - min_values
ranges[ranges == 0] = 1
# min-max scaling 수행
train_data[numeric_cols] = (train_data[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
# max와 min 값을 dictionary 형태로 저장
scale_min_dict_brand = min_values.to_dict()
scale_max_dict_brand = max_values.to_dict()

Index(['2022-01-01_y', '2022-01-02_y', '2022-01-03_y', '2022-01-04_y',
       '2022-01-05_y', '2022-01-06_y', '2022-01-07_y', '2022-01-08_y',
       '2022-01-09_y', '2022-01-10_y',
       ...
       '2023-03-26_y', '2023-03-27_y', '2023-03-28_y', '2023-03-29_y',
       '2023-03-30_y', '2023-03-31_y', '2023-04-01_y', '2023-04-02_y',
       '2023-04-03_y', '2023-04-04_y'],
      dtype='object', length=459)


###Label Encoding


In [None]:
# Label Encoding
label_encoder = LabelEncoder()
categorical_columns = ['대분류','중분류', '소분류', '브랜드']

for col in categorical_columns:
    label_encoder.fit(train_data[col])
    train_data[col] = label_encoder.transform(train_data[col])


###make_train_data/make_predict_data 설정

In [None]:
def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    '''
    학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
    data : 일별 판매량
    train_size : 학습에 활용할 기간
    predict_size : 추론할 기간
    '''
    # 메모리 부족방지를 위한 STEP_SIZE
    STEP_SIZE = 2
    num_rows = len(data)
    window_size = train_size + predict_size
    adjusted_size = (len(data.columns) - window_size + 1) // STEP_SIZE
                                                                       # -1 하나?마나?
    input_data = np.empty((num_rows * adjusted_size, train_size, len(data.iloc[0, :5]) +  2))
    target_data = np.empty((num_rows * adjusted_size, predict_size))

    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :5])
        sales_data = np.array(data.iloc[i, 5:464])
        brand_data = np.array(data.iloc[i, 464:])
        #print(sales_data)
        #print(brand_data)
        for j in range(0,len(sales_data) - window_size + 1,STEP_SIZE):
            sales_window = sales_data[j : j + window_size]
            brand_window = brand_data[j : j + train_size]
            temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)),brand_window, sales_window[:train_size]))
            input_data[i * adjusted_size  + j //STEP_SIZE] = temp_data
            target_data[i * adjusted_size + j //STEP_SIZE] = sales_window[train_size:]

    return input_data, target_data

In [None]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
    '''
    평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
    data : 일별 판매량
    train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
    '''
    num_rows = len(data)

    input_data = np.empty((num_rows, train_size, len(data.iloc[0, :5]) + 2))

    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :5])
        sales_data = np.array(data.iloc[i, 464-train_size:464])
        brand_data = np.array(data.iloc[i, -train_size:])

        #window = sales_data[-train_size : ]
        temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), brand_data,sales_data))
        input_data[i] = temp_data

    return input_data

In [None]:
train_input, train_target = make_train_data(train_data)
test_input = make_predict_data(train_data)

  0%|          | 0/15890 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

In [None]:
print(train_input.shape)
print(train_target.shape)
print(test_input.shape)
print(len(train_input))

(6451340, 90, 7)
(6451340, 21)
(15890, 90, 7)
6451340


###Train / Validation Split

In [None]:
#Train / Validation Split
data_len = len(train_input)
val_input = train_input[-int(data_len*0.2):]
val_target = train_target[-int(data_len*0.2):]
train_input = train_input[:-int(data_len*0.2)]
train_target = train_target[:-int(data_len*0.2)]

In [None]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

((5161072, 90, 7),
 (5161072, 21),
 (1290268, 90, 7),
 (1290268, 21),
 (15890, 90, 7))

###사용자 정의 데이터셋

In [None]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])

    def __len__(self):
        return len(self.X)

In [None]:
train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input, val_target)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

###DeepLSTMModel 정의

In [None]:
# num_layer 기능으로 층 입력 기능 추가

class DeepLSTMModel(nn.Module):
    def __init__(self, input_size=7, hidden_size=512, output_size=CFG['PREDICT_SIZE'], num_layers=2):
        super(DeepLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # 멀티 레이어 LSTM
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, dropout=0.2)

        # 풀리 커넥티드 layers
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            nn.Dropout(0.5),  # dropout 기능 추가
            nn.Linear(hidden_size//2, output_size)
        )

        self.actv = nn.ReLU()

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size, x.device)

        lstm_out, hidden = self.lstm(x, hidden)
        last_output = lstm_out[:, -1, :]
        output = self.actv(self.fc(last_output))
        return output.squeeze(1)

    def init_hidden(self, batch_size, device):
        return (torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device),
                torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device))


In [None]:
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = 9999999
    best_model = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        train_mae = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')

        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
            print('Model Saved')
    return best_model


In [None]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []

    with torch.no_grad():
        for X, Y in tqdm(iter(val_loader)):
            X = X.to(device)
            Y = Y.to(device)

            output = model(X)
            loss = criterion(output, Y)

            val_loss.append(loss.item())
    return np.mean(val_loss)

### 모델 학습
---
optimizer: Adam

learning rate: 1e-4

In [None]:
model = DeepLSTMModel()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
infer_model = train(model, optimizer, train_loader, val_loader, device)


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.01307] Val Loss : [0.01309]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [0.01180] Val Loss : [0.00835]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [0.00809] Val Loss : [0.00850]


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [4] Train Loss : [0.00777] Val Loss : [0.00977]


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [5] Train Loss : [0.00767] Val Loss : [0.00830]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [6] Train Loss : [0.00759] Val Loss : [0.00766]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [7] Train Loss : [0.00747] Val Loss : [0.00742]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [8] Train Loss : [0.00744] Val Loss : [0.00795]


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [9] Train Loss : [0.00739] Val Loss : [0.00729]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [10] Train Loss : [0.00738] Val Loss : [0.00730]


### 모델 저장

In [None]:
#모델을 파일명 "model1_v100.pth"으로 저장
save_path = "/content/drive/MyDrive/lg_aimers/open/model1_v100.pth"
torch.save(infer_model.state_dict(), save_path)

### Validation 데이터로 예측

In [None]:
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    predictions = []

    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)

            output = model(X)

            # 모델 출력인 output을 CPU로 이동하고 numpy 배열로 변환
            output = output.cpu().numpy()

            predictions.extend(output)

    return np.array(predictions)

In [None]:
pred = inference(infer_model, test_loader, device)

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# 추론 결과를 inverse scaling
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :] * (scale_max_dict_sale[idx] - scale_min_dict_sale[idx]) + scale_min_dict_sale[idx]

# 결과 후처리
pred = np.round(pred, 0).astype(int)

In [None]:
pred.shape

(15890, 21)

In [None]:
submit = pd.read_csv('/content/drive/MyDrive/lg_aimers/open/sample_submission.csv')
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
submit.iloc[:,1:] = pred
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


###예측 결과 파일로 'model1_v100.csv'저장

In [None]:
submit.to_csv('/content/drive/MyDrive/lg_aimers/open/model1_v100.csv', index=False)