###구글 드라이브 연동

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 라이브러리 불러오기

In [None]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

### gpu 이용

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

###하이퍼파라미터 설정

In [None]:
# Global hyper-parameters shared by the data pipeline and the training loop.
CFG = dict(
    TRAIN_WINDOW_SIZE=90,  # number of past days fed to the model
    PREDICT_SIZE=21,       # number of future days to predict
    EPOCHS=10,
    LEARNING_RATE=1e-4,
    BATCH_SIZE=2048,
    SEED=41,
)

###시드 고정

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # NOTE: exported for child processes; it is not expected to change string
    # hashing in the interpreter that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN autotune and pick algorithms per run, which
    # defeats the deterministic setting above, so it must be disabled.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed 고정

###Feature Engineering

###데이터 불러오기

---
model1_v100.ipynb 코드에서 메타 데이터(brand_keyword_cnt,sales,train)를 병합하여 'train_data_brand_price.csv' 저장한 파일을 불러와서 이전과 다른 방향으로 전처리 진행


In [None]:
# train_data_brand_price.csv 불러와서 train_data에 저장
train_data = pd.read_csv('/content/drive/MyDrive/lg_aimers/open/train_data_brand_price.csv')

### 결측치 확인

In [None]:
# Count NaN values for each column
nan_per_column = train_data.isna().sum()

# Get the total number of NaN values across the entire DataFrame
total_nan = nan_per_column.sum()

print(nan_per_column)
print("Total number of NaN values:", total_nan)

ID                0
제품                0
대분류               0
중분류               0
소분류               0
               ... 
2023-03-31_y    208
2023-04-01_y    208
2023-04-02_y    208
2023-04-03_y    208
2023-04-04_y    208
Length: 925, dtype: int64
Total number of NaN values: 95557


###평균금액 결측치 처리

---
전처리의 방향: 제품의 가장 세분화된 분류인 '소분류'기준으로 제품끼리 특성이 가장 비슷하다 생각하여 평균금액도 비슷할 것이라는 가정에서 시작! 평균금액의 결측치를 가지고 있는 제품을 그 제품이 속한 소분류의 평균금액의 평균값으로 대체.


**'평균금액' 열 결측치 개수 확인**

In [None]:
missing_values = train_data['평균금액'].isnull().sum()
print("평균금액 열의 결측치 갯수:", missing_values)

평균금액 열의 결측치 갯수: 85


**평균금액이 결측치 인 행 가져와서 filtered_data에 저장**

In [None]:
filtered_data = train_data[train_data['평균금액'].isnull()]
filtered_data.head()

Unnamed: 0,ID,제품,대분류,중분류,소분류,브랜드,평균금액,2022-01-01_x,2022-01-02_x,2022-01-03_x,...,2023-03-26_y,2023-03-27_y,2023-03-28_y,2023-03-29_y,2023-03-30_y,2023-03-31_y,2023-04-01_y,2023-04-02_y,2023-04-03_y,2023-04-04_y
143,143,B002-00066-00006,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00066,,0,0,0,...,0.14505,0.42065,0.62373,0.2901,0.31911,0.26109,0.15955,0.31911,0.69625,0.21758
240,240,B002-00072-00007,B002-C001-0002,B002-C002-0004,B002-C003-0023,B002-00072,,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
407,407,B002-00107-00049,B002-C001-0002,B002-C002-0005,B002-C003-0025,B002-00107,,0,0,0,...,45.040846,31.840453,32.449674,29.998205,29.432418,31.767854,41.225737,36.786929,32.174071,40.239358
513,513,B002-00126-00009,B002-C001-0002,B002-C002-0004,B002-C003-0023,B002-00126,,0,0,0,...,0.20307,0.21758,0.13054,0.14505,0.07252,0.0,0.08703,0.0,0.10153,0.0
696,696,B002-00172-00001,B002-C001-0002,B002-C002-0002,B002-C003-0011,B002-00172,,0,0,0,...,0.580207,0.652737,0.522195,0.536701,0.638231,0.580207,0.377134,0.522195,0.725268,0.841304


**소분류 기준으로 평균금액 열의 데이터 값의 평균을 계산하고 average_by_category에 저장**

In [None]:
average_by_category = train_data.groupby('소분류')['평균금액'].mean()

print(average_by_category)

소분류
B002-C003-0001    22015.302309
B002-C003-0002    25125.923540
B002-C003-0003    27133.871611
B002-C003-0004    26227.318933
B002-C003-0005    65677.958390
B002-C003-0006    12516.392570
B002-C003-0007    10126.190099
B002-C003-0008    10762.005191
B002-C003-0009     4550.438399
B002-C003-0010     4686.747769
B002-C003-0011     5882.114887
B002-C003-0012     7734.065678
B002-C003-0013     7960.218143
B002-C003-0014     8934.237575
B002-C003-0015    10166.658242
B002-C003-0016     7951.328711
B002-C003-0017     8020.199145
B002-C003-0018    10915.517470
B002-C003-0019     5762.093889
B002-C003-0020    54437.746795
B002-C003-0021     5020.408774
B002-C003-0022     5502.270218
B002-C003-0023     2869.667357
B002-C003-0024    13664.882044
B002-C003-0025     2507.935022
B002-C003-0026     7326.233731
B002-C003-0027     5181.195953
B002-C003-0028     9053.751067
B002-C003-0029     9265.205271
B002-C003-0030    17238.788191
B002-C003-0031     9476.771460
B002-C003-0032     6416.445626
B002

**average_by_category를 딕셔너리로 저장**

In [None]:
average_by_category_dict = average_by_category.to_dict()
print(average_by_category_dict)

{'B002-C003-0001': 22015.302308914386, 'B002-C003-0002': 25125.92353990331, 'B002-C003-0003': 27133.87161129201, 'B002-C003-0004': 26227.318932779137, 'B002-C003-0005': 65677.95839028708, 'B002-C003-0006': 12516.392569570853, 'B002-C003-0007': 10126.190098787514, 'B002-C003-0008': 10762.005190686537, 'B002-C003-0009': 4550.438398536554, 'B002-C003-0010': 4686.747768501604, 'B002-C003-0011': 5882.1148869440885, 'B002-C003-0012': 7734.065678413316, 'B002-C003-0013': 7960.218142968754, 'B002-C003-0014': 8934.237575114688, 'B002-C003-0015': 10166.658241508561, 'B002-C003-0016': 7951.32871113439, 'B002-C003-0017': 8020.199145363169, 'B002-C003-0018': 10915.517469752138, 'B002-C003-0019': 5762.0938885574315, 'B002-C003-0020': 54437.74679497229, 'B002-C003-0021': 5020.408774179904, 'B002-C003-0022': 5502.270218218317, 'B002-C003-0023': 2869.667356889615, 'B002-C003-0024': 13664.882043825584, 'B002-C003-0025': 2507.9350222055914, 'B002-C003-0026': 7326.2337305805695, 'B002-C003-0027': 5181.195

**결측치 행의 '평균금액' 데이터를 소분류 기준 '평균금액'의 데이터의 평균값으로 대체하기**

---
결측치 행을 따로 출력하여 저장한 'filtered_data'에 대체 값을 넣으면서 결측치를 처리하였다.


In [None]:
# Fill the missing average price of every row in filtered_data with the mean
# average price of the sub-category ('소분류') that row belongs to.
# Work on an explicit copy so the assignment never targets a slice of
# train_data (avoids pandas' chained-assignment warning).
filtered_data = filtered_data.copy()
# Sub-categories absent from the dict map to NaN; fall back to the existing
# value for those rows so they are left untouched, as before.
category_mean = filtered_data['소분류'].map(average_by_category_dict)
filtered_data['평균금액'] = category_mean.fillna(filtered_data['평균금액'])

**따로 출력하고 결측치를 처리한 'filtered_data'를 train_data와 비교하여 train_data의 '평균금액' 결측치 값에 덮어쓰기**

In [None]:
# Write the imputed average prices back into train_data.
# filtered_data was sliced from train_data, so its index labels identify
# exactly the rows that had a missing price. Assigning by label avoids the
# per-row scan over product names and cannot touch any other row.
train_data.loc[filtered_data.index, '평균금액'] = filtered_data['평균금액']

**결측치 재확인**

In [None]:
missing_values = train_data['평균금액'].isnull().sum()
print("평균금액 열의 결측치 갯수:", missing_values)

평균금액 열의 결측치 갯수: 0


### 브랜드 키워드 데이터 전처리

---
전처리의 방향: 결측치를 0으로 처리


**결측치 0값으로 처리**

In [None]:
# NaN value를 0으로 처리
train_data.fillna(0, inplace=True)

**최종 결측치 확인**

In [None]:
# 결측치 확인
nan_per_column = train_data.isna().sum()

total_nan = nan_per_column.sum()

print(nan_per_column)
print("Total number of NaN values:", total_nan)

ID              0
제품              0
대분류             0
중분류             0
소분류             0
               ..
2023-03-31_y    0
2023-04-01_y    0
2023-04-02_y    0
2023-04-03_y    0
2023-04-04_y    0
Length: 925, dtype: int64
Total number of NaN values: 0


**train_data에서 'ID', '제품' 열 제거 후 저장**

In [None]:
train_data = train_data.drop(columns=['ID','제품'])
train_data.head(3)

Unnamed: 0,대분류,중분류,소분류,브랜드,평균금액,2022-01-01_x,2022-01-02_x,2022-01-03_x,2022-01-04_x,2022-01-05_x,...,2023-03-26_y,2023-03-27_y,2023-03-28_y,2023-03-29_y,2023-03-30_y,2023-03-31_y,2023-04-01_y,2023-04-02_y,2023-04-03_y,2023-04-04_y
0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,5856.896552,0,0,0,0,0,...,0.31911,0.39164,0.37713,0.49318,0.07252,0.2901,0.31911,0.23208,0.33362,0.44966
1,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,23808.744939,0,0,0,0,0,...,10.26979,11.96692,10.64693,10.41485,10.48738,9.48651,9.28343,10.42935,11.15462,11.38671
2,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,12058.598993,0,0,0,0,0,...,10.26979,11.96692,10.64693,10.41485,10.48738,9.48651,9.28343,10.42935,11.15462,11.38671


###데이터 전처리

**average price-> min-max**

In [None]:
# Min-max scale the average-price column (column-wise statistics).
numeric_cols = train_data.columns[4:5]
print(numeric_cols)
# Per-column minimum and maximum.
min_values = train_data[numeric_cols].min(axis='index')
max_values = train_data[numeric_cols].max(axis='index')
# Per-column range (max - min); a zero range is replaced by 1 to avoid
# dividing by zero.
ranges = max_values - min_values
ranges = ranges.mask(ranges == 0, 1)
# Apply (x - min) / range to the selected column.
train_data[numeric_cols] = train_data[numeric_cols].sub(min_values, axis='columns').div(ranges, axis='columns')
# Keep min and max as dictionaries keyed by column name.
scale_min_dict_avg_price, scale_max_dict_avg_price = min_values.to_dict(), max_values.to_dict()

Index(['평균금액'], dtype='object')


**sales count-> min-max**

In [None]:
# Min-max scale the daily sales columns, one product (row) at a time.
numeric_cols = train_data.columns[5:464]
print(numeric_cols)
# Per-row minimum and maximum over the sales columns.
min_values = train_data[numeric_cols].min(axis='columns')
max_values = train_data[numeric_cols].max(axis='columns')
# Per-row range (max - min); a zero range is replaced by 1 to avoid
# dividing by zero.
ranges = max_values - min_values
ranges = ranges.mask(ranges == 0, 1)
# Apply (x - row_min) / row_range to every sales value.
train_data[numeric_cols] = train_data[numeric_cols].sub(min_values, axis='index').div(ranges, axis='index')
# Keep min and max as dictionaries keyed by row label; they are needed
# later to invert the scaling of the predictions.
scale_min_dict_sale, scale_max_dict_sale = min_values.to_dict(), max_values.to_dict()

Index(['2022-01-01_x', '2022-01-02_x', '2022-01-03_x', '2022-01-04_x',
       '2022-01-05_x', '2022-01-06_x', '2022-01-07_x', '2022-01-08_x',
       '2022-01-09_x', '2022-01-10_x',
       ...
       '2023-03-26_x', '2023-03-27_x', '2023-03-28_x', '2023-03-29_x',
       '2023-03-30_x', '2023-03-31_x', '2023-04-01_x', '2023-04-02_x',
       '2023-04-03_x', '2023-04-04_x'],
      dtype='object', length=459)


**brand_cnt->min-max**

In [None]:
# Min-max scale the brand keyword-count columns, one product (row) at a time.
numeric_cols = train_data.columns[464:]
print(numeric_cols)
# Per-row minimum and maximum over the brand columns.
min_values = train_data[numeric_cols].min(axis='columns')
max_values = train_data[numeric_cols].max(axis='columns')
# Per-row range (max - min); a zero range is replaced by 1 to avoid
# dividing by zero.
ranges = max_values - min_values
ranges = ranges.mask(ranges == 0, 1)
# Apply (x - row_min) / row_range to every brand value.
train_data[numeric_cols] = train_data[numeric_cols].sub(min_values, axis='index').div(ranges, axis='index')
# Keep min and max as dictionaries keyed by row label.
scale_min_dict_brand, scale_max_dict_brand = min_values.to_dict(), max_values.to_dict()

Index(['2022-01-01_y', '2022-01-02_y', '2022-01-03_y', '2022-01-04_y',
       '2022-01-05_y', '2022-01-06_y', '2022-01-07_y', '2022-01-08_y',
       '2022-01-09_y', '2022-01-10_y',
       ...
       '2023-03-26_y', '2023-03-27_y', '2023-03-28_y', '2023-03-29_y',
       '2023-03-30_y', '2023-03-31_y', '2023-04-01_y', '2023-04-02_y',
       '2023-04-03_y', '2023-04-04_y'],
      dtype='object', length=459)


###Label Encoding



In [None]:
# Integer-encode the categorical columns. A single encoder object is refitted
# for each column, so after the loop it only holds the mapping of the last one.
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']
label_encoder = LabelEncoder()

for col in categorical_columns:
    train_data[col] = label_encoder.fit_transform(train_data[col])


###make_train_data/make_predict_data 설정

In [None]:
def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE'],
                    meta_cols=5, sales_end=464, step_size=2):
    '''
    Build (input window, target window) pairs by sliding over each row.

    Column layout read from `data`: the first `meta_cols` columns are static
    per-row features, columns `meta_cols:sales_end` are the daily sales, and
    columns `sales_end:` are the daily brand values.

    data : DataFrame with the layout above (all values numeric)
    train_size : number of days in each input window
    predict_size : number of days in each target window
    meta_cols : number of leading static feature columns
    sales_end : column position where sales end and brand values begin
    step_size : stride between consecutive windows (limits memory use)

    Returns (input_data, target_data):
      input_data  : (rows * windows_per_row, train_size, meta_cols + 2),
                    last axis = static features, brand value, sales value
      target_data : (rows * windows_per_row, predict_size), the sales that
                    follow each input window
    '''
    window_size = train_size + predict_size
    num_rows = len(data)

    # Convert once to NumPy instead of calling DataFrame.iloc for every row.
    meta = data.iloc[:, :meta_cols].to_numpy(dtype=float)
    sales = data.iloc[:, meta_cols:sales_end].to_numpy(dtype=float)
    brand = data.iloc[:, sales_end:].to_numpy(dtype=float)

    # The number of windows must come from the number of SALES days. Deriving
    # it from len(data.columns) over-allocates the np.empty buffers and leaves
    # the surplus slots filled with uninitialised memory.
    starts = range(0, sales.shape[1] - window_size + 1, step_size)
    windows_per_row = len(starts)

    input_data = np.empty((num_rows * windows_per_row, train_size, meta_cols + 2))
    target_data = np.empty((num_rows * windows_per_row, predict_size))

    for i in tqdm(range(num_rows)):
        for k, j in enumerate(starts):
            slot = i * windows_per_row + k
            # Static features are broadcast over every time step.
            input_data[slot, :, :meta_cols] = meta[i]
            input_data[slot, :, meta_cols] = brand[i, j:j + train_size]
            input_data[slot, :, meta_cols + 1] = sales[i, j:j + train_size]
            target_data[slot] = sales[i, j + train_size:j + window_size]

    return input_data, target_data

In [None]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], meta_cols=5, sales_end=464):
    '''
    Build the model input used to forecast beyond the last observed day.

    For every row, takes the last `train_size` sales values (the columns
    ending at `sales_end`) and the last `train_size` columns of the frame
    (brand values), and pairs them with the row's static features.

    data : DataFrame laid out as [static features | sales | brand values]
    train_size : number of most recent days to feed the model
    meta_cols : number of leading static feature columns
    sales_end : column position where sales end and brand values begin

    Returns an array of shape (rows, train_size, meta_cols + 2) whose last
    axis is ordered as static features, brand value, sales value.
    '''
    num_rows = len(data)

    meta = data.iloc[:, :meta_cols].to_numpy(dtype=float)
    sales = data.iloc[:, sales_end - train_size:sales_end].to_numpy(dtype=float)
    brand = data.iloc[:, -train_size:].to_numpy(dtype=float)

    input_data = np.empty((num_rows, train_size, meta_cols + 2))
    # Static features are broadcast over every time step.
    input_data[:, :, :meta_cols] = meta[:, None, :]
    input_data[:, :, meta_cols] = brand
    input_data[:, :, meta_cols + 1] = sales

    return input_data

In [None]:
train_input, train_target = make_train_data(train_data)
test_input = make_predict_data(train_data)

  0%|          | 0/15890 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

In [None]:
print(train_input.shape)
print(train_target.shape)
print(test_input.shape)
print(len(train_input))

(6451340, 90, 7)
(6451340, 21)
(15890, 90, 7)
6451340


###Train / Validation Split

In [None]:
# Train / validation split: the last 20% of the samples become validation.
# The split position is computed once. Slicing with `-val_size` would be
# wrong when val_size == 0, because x[-0:] is the whole array and x[:-0]
# is empty.
data_len = len(train_input)
val_size = int(data_len * 0.2)
split_at = data_len - val_size
val_input, val_target = train_input[split_at:], train_target[split_at:]
train_input, train_target = train_input[:split_at], train_target[:split_at]

In [None]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

((5161072, 90, 7),
 (5161072, 21),
 (1290268, 90, 7),
 (1290268, 21),
 (15890, 90, 7))

###사용자 정의 데이터셋

In [None]:
class CustomDataset(Dataset):
    """Index-based dataset over an input array and an optional target array.

    When ``Y`` is ``None`` (inference), items are the input tensor only;
    otherwise items are ``(input, target)`` tensor pairs.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        # torch.Tensor(...) yields a float32 tensor from the indexed sample.
        features = torch.Tensor(self.X[index])
        if self.Y is None:
            return features
        return features, torch.Tensor(self.Y[index])

In [None]:
train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input, val_target)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

###DeepLSTMModel 정의

In [None]:
# num_layer 기능으로 층 입력 기능 추가

class DeepLSTMModel(nn.Module):
    """Multi-layer LSTM followed by a two-layer fully connected head.

    The head reads the LSTM output at the last time step and emits
    ``output_size`` values, passed through a final ReLU so they are never
    negative.
    """

    def __init__(self, input_size=7, hidden_size=512, output_size=CFG['PREDICT_SIZE'], num_layers=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Stacked LSTM; dropout is applied between the LSTM layers.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, dropout=0.2)

        # Fully connected head: hidden -> hidden/2 -> output.
        half_hidden = hidden_size // 2
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, half_hidden),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(half_hidden, output_size),
        )

        self.actv = nn.ReLU()

    def init_hidden(self, batch_size, device):
        """Return zero-filled (h0, c0) states for a batch on ``device``."""
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = torch.zeros(state_shape, device=device)
        c0 = torch.zeros(state_shape, device=device)
        return (h0, c0)

    def forward(self, x):
        """Map a batch-first input sequence to ``output_size`` predictions."""
        initial_state = self.init_hidden(x.size(0), x.device)
        lstm_out, _ = self.lstm(x, initial_state)

        # Only the representation of the final time step feeds the head.
        final_step = lstm_out[:, -1, :]
        prediction = self.actv(self.fc(final_step))
        # squeeze(1) only changes the shape when output_size == 1.
        return prediction.squeeze(1)




In [None]:
def train(model, optimizer, train_loader, val_loader, device):
    """Train ``model`` for ``CFG['EPOCHS']`` epochs and return the best snapshot.

    After every epoch the model is evaluated with ``validation``; whenever the
    validation loss improves, a deep copy of the model is kept.

    Args:
        model: Network to optimise (moved to ``device`` in place).
        optimizer: Optimiser bound to ``model``'s parameters.
        train_loader: Iterable of ``(X, Y)`` training batches.
        val_loader: Iterable of ``(X, Y)`` validation batches.
        device: Device the batches and the model are moved to.

    Returns:
        A copy of the model as it was at the epoch with the lowest validation
        loss, or ``None`` if no epoch improved on the initial value (e.g. the
        loss was always NaN or there were no epochs).
    """
    import copy  # local import so this cell does not depend on an extra top-level import

    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = float('inf')
    best_model = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')

        if best_loss > val_loss:
            best_loss = val_loss
            # Snapshot the weights. Storing `model` itself would only keep a
            # reference, which later epochs keep mutating, so the "best"
            # model would always end up being the last-epoch model.
            best_model = copy.deepcopy(model)
            print('Model Saved')

    return best_model

In [None]:
def validation(model, val_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    Switches the model to eval mode and runs without gradient tracking.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for features, targets in tqdm(iter(val_loader)):
            features = features.to(device)
            targets = targets.to(device)

            batch_loss = criterion(model(features), targets)
            batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

###모델 실행

---
optimizer: Adam

learning rate: 1e-4


In [None]:
model = DeepLSTMModel()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
infer_model = train(model, optimizer, train_loader, val_loader, device)



  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.01294] Val Loss : [0.01310]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [0.01083] Val Loss : [0.00793]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [0.00794] Val Loss : [0.00802]


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [4] Train Loss : [0.00773] Val Loss : [0.00751]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [5] Train Loss : [0.00769] Val Loss : [0.00752]


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [6] Train Loss : [0.00754] Val Loss : [0.00746]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [7] Train Loss : [0.00746] Val Loss : [0.00734]
Model Saved


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [8] Train Loss : [0.00742] Val Loss : [0.00830]


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [9] Train Loss : [0.00738] Val Loss : [0.00735]


  0%|          | 0/2521 [00:00<?, ?it/s]

  0%|          | 0/631 [00:00<?, ?it/s]

Epoch : [10] Train Loss : [0.00735] Val Loss : [0.00734]
Model Saved


###모델 저장

In [None]:
#모델을 파일명 "model2_a100.pth"으로 저장
save_path = "/content/drive/MyDrive/lg_aimers/open/model2_a100.pth"
torch.save(infer_model.state_dict(), save_path)

###Test 데이터로 예측

In [None]:
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and return all predictions.

    Args:
        model: Trained network (moved to ``device`` in place).
        test_loader: Iterable yielding input batches only (no targets).
        device: Device the batches and the model are moved to.

    Returns:
        NumPy array with the per-batch outputs concatenated along axis 0;
        an empty array when the loader yields no batches.
    """
    model.to(device)
    # Make the mode explicit: in train mode the dropout layers stay active
    # and predictions become random.
    model.eval()
    predictions = []

    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)

            output = model(X)

            # Move the batch output to the CPU and convert it to NumPy.
            predictions.append(output.cpu().numpy())

    if not predictions:
        return np.array(predictions)
    # One concatenate instead of extending a Python list row by row.
    return np.concatenate(predictions, axis=0)

In [None]:
pred = inference(infer_model, test_loader, device)

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# Undo the per-row min-max scaling of the predictions.
# The scale dictionaries are keyed by train_data's row labels; this loop uses
# the position in `pred` as the key (assumes a default 0..N-1 index - TODO confirm).
for idx in range(len(pred)):
    row_min = scale_min_dict_sale[idx]
    row_span = scale_max_dict_sale[idx] - row_min
    pred[idx, :] = pred[idx, :] * row_span + row_min

# Post-processing: round to whole units and cast to integers.
pred = np.round(pred).astype(int)

In [None]:
pred.shape

(15890, 21)

In [None]:
submit = pd.read_csv('/content/drive/MyDrive/lg_aimers/open/sample_submission.csv')
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
submit.iloc[:,1:] = pred
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


###예측 결과 파일로 'model2_a100.csv'저장

In [None]:
submit.to_csv('/content/drive/MyDrive/lg_aimers/open/model2_a100.csv', index=False)

###최댓값 앙상블 적용

---
예측 결과 값인 'model2_a100.csv'의 public score: 0.5488993372/private score: 0.537894118이 나왔고 전략적으로 더 높은 점수를 얻기 위해 두번째로 점수가 잘 나온 예측값과 **최댓값 앙상블**을 이용하여 새로운 예측값을 생성하려고 하였다.


**두번째로 잘 나온 예측값 'model1_v100.csv' 데이터 가져오기**

In [None]:
#df_a 변수에 저장
df_a = pd.read_csv('/content/drive/MyDrive/lg_aimers/open/model1_v100.csv')

**'model2_a100.csv'를 불러와서 df_b에 저장 또는 df_b=submit 코드 실행**

In [None]:
#df_b 변수에 저장
#df_b=submit
df_b = pd.read_csv('/content/drive/MyDrive/lg_aimers/open/model2_a100.csv')

**최댓값 앙상블**

---
최종 예측값 제출 파일은 'final_submit.csv'이고 제출 당시 public score: 0.5509346291 / private score: 0.54025 이었습니다.


In [None]:
# Max ensemble: for every ID and forecast day keep the larger of the two
# predictions. The first column is 'ID'; all remaining columns are forecasts.
value_cols = df_a.columns[1:]

# Align df_b to df_a by ID instead of relying on both files having the same
# row order; a missing ID raises a KeyError rather than mixing up rows.
aligned_b = df_b.set_index('ID').loc[df_a['ID'], value_cols]

# Vectorised element-wise maximum (replaces a Python-level max() per cell).
# NOTE: unlike Python's max(), np.maximum propagates NaN values.
df_c = df_a.copy()
df_c[value_cols] = np.maximum(df_a[value_cols].to_numpy(), aligned_b.to_numpy())

# Save the ensembled predictions as a CSV file.
output_csv_path = '/content/drive/MyDrive/lg_aimers/open/final_submit.csv'
df_c.to_csv(output_csv_path, index=False)

# 새 DataFrame(최댓값 앙상블을 적용한)의 처음 몇 개 행을 표시합니다.
output_csv_path, df_c.head()
