## Import

In [28]:
import random
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

import tqdm
from tqdm import tqdm
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings(action='ignore')



## Fixed Random-Seed

In [29]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [30]:
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

In [31]:
train_df.head()

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0


## Data Pre-Processing

In [32]:
#datetime 형식으로 변환
#.dt 접근자를 통해 년, 월, 일 등의 시계열 특성을 추출할 수 있음
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'], format='%Y-%m-%d')
test_df['timestamp'] = pd.to_datetime(test_df['timestamp'], format='%Y-%m-%d')

In [33]:
train_df=train_df.drop("ID",axis=1)
test_df=test_df.drop("ID",axis=1)

In [34]:
train_df=train_df.replace('TG','감귤')
train_df=train_df.replace('BC','브로콜리')
train_df=train_df.replace('RD','무')
train_df=train_df.replace('CR','당근')
train_df=train_df.replace('CB','양배추')

test_df=test_df.replace('TG','감귤')
test_df=test_df.replace('BC','브로콜리')
test_df=test_df.replace('RD','무')
test_df=test_df.replace('CR','당근')
test_df=test_df.replace('CB','양배추')

In [35]:
#시계열 특성을 학습에 반영하기 위해 timestamp를 월, 일, 시간으로 나눕니다
train_df['year'] = train_df['timestamp'].dt.year
train_df['month'] = train_df['timestamp'].dt.month
train_df['day'] = train_df['timestamp'].dt.day

test_df['year'] = test_df['timestamp'].dt.year
test_df['month'] = test_df['timestamp'].dt.month
test_df['day'] = test_df['timestamp'].dt.day

In [36]:
# 연 기준 며칠 째인지
train_df['day_of_year'] = train_df['timestamp'].dt.dayofyear
test_df['day_of_year'] = test_df['timestamp'].dt.dayofyear

# 요일 0 = monday, 6 = sunday
train_df['day_of_week'] = train_df['timestamp'].dt.dayofweek
test_df['day_of_week'] = test_df['timestamp'].dt.dayofweek

# 주말 여부
train_df['holiday'] = train_df.apply(lambda x : 0 if x['day_of_week'] < 5 else 1, axis = 1)
test_df['holiday'] = test_df.apply(lambda x : 0 if x['day_of_week'] < 5 else 1, axis = 1)

 ## price outlier에 대한 normalize

In [37]:
# 한글 컬럼이름은 작업의 편의를 위해 바꿔줌
train_df.rename(columns = {'price(원/kg)' : 'price'}, inplace = True)

train_df.groupby('item')['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
감귤,15230.0,3177.442022,2805.240092,0.0,992.25,2273.5,5532.75,20909.0
당근,10661.0,273.118938,632.929933,0.0,0.0,0.0,0.0,3882.0
무,12184.0,187.073539,358.150266,0.0,0.0,0.0,383.0,5715.0
브로콜리,13707.0,910.196615,1422.148301,0.0,0.0,0.0,2032.5,8750.0
양배추,7615.0,152.187131,345.053036,0.0,0.0,0.0,0.0,2500.0


In [38]:
QT = 0.99995   # quantile 변수 99.995% - 2만개중 1개 수준의 아웃라이어

# Outlier는 삭제.
dict_it = {'브로콜리':0.0, '양배추':0.0, '당근':0.0, '무':0.0, '감귤':0.0}
for it in dict_it.keys():
    dict_it[it] = train_df[train_df['item'] == it]['price'].quantile(QT)

print(dict_it)

{'브로콜리': 7893.375000001015, '양배추': 2500.0, '당근': 3845.7560000000376, '무': 4746.451499999639, '감귤': 16503.250300000185}


In [39]:
for i in range(len(train_df)):
    item = train_df.loc[i, 'item']
    if train_df.loc[i, 'price'] > dict_it[item]:
        train_df.loc[i, 'price'] = dict_it[item]

In [40]:
train_df.groupby('item')['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
감귤,15230.0,3177.152741,2803.638171,0.0,992.25,2273.5,5532.75,16503.2503
당근,10661.0,273.115539,632.910644,0.0,0.0,0.0,0.0,3845.756
무,12184.0,186.994046,357.028943,0.0,0.0,0.0,383.0,4746.4515
브로콜리,13707.0,910.134119,1421.822545,0.0,0.0,0.0,2032.5,7893.375
양배추,7615.0,152.187131,345.053036,0.0,0.0,0.0,0.0,2500.0


예외 상황 제거

In [41]:
train_df[train_df['price'] == 0]['supply(kg)'].value_counts()

supply(kg)
0.0      35452
64.0         1
129.0        1
80.0         1
Name: count, dtype: int64

In [42]:
train_df[(train_df['price'] == 0) & (train_df['supply(kg)']!=0)]

Unnamed: 0,timestamp,item,corporation,location,supply(kg),price,year,month,day,day_of_year,day_of_week,holiday
825,2021-04-05,감귤,A,J,64.0,0.0,2021,4,5,95,0,0
9266,2019-05-09,감귤,D,J,129.0,0.0,2019,5,9,129,3,0
31199,2021-01-09,양배추,E,J,80.0,0.0,2021,1,9,9,5,1


In [43]:
train_df = train_df.drop([825, 9266, 31199], axis=0).reset_index(drop=True)

In [44]:
#학습에 사용하지 않을 변수들을 제거합니다
train_x = train_df.drop(columns=['timestamp', 'supply(kg)', 'price'])
train_y = train_df['price']

test_x = test_df.drop(columns=['timestamp'])

In [45]:
#질적 변수들을 수치화합니다
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    le = LabelEncoder()
    train_x[i]=le.fit_transform(train_x[i])
    test_x[i]=le.transform(test_x[i])
    #test 데이터에 대해서 fit하는 것은 data leakage에 해당(따라서 훈련 데이터셋에서 학습한 변환을 그대로 적용)

    if(i == 'item'):
      print(le.inverse_transform([0, 1, 2, 3, 4]))
    if(i == 'location'):
      print(le.inverse_transform([0,1]))
    if(i == 'corporation'):
      print(le.inverse_transform([0,1,2,3,4]))

print('Done.')

['감귤' '당근' '무' '브로콜리' '양배추']
['A' 'B' 'C' 'D' 'E']
['J' 'S']
Done.


In [46]:
train_x

Unnamed: 0,item,corporation,location,year,month,day,day_of_year,day_of_week,holiday
0,0,0,0,2019,1,1,1,1,0
1,0,0,0,2019,1,2,2,2,0
2,0,0,0,2019,1,3,3,3,0
3,0,0,0,2019,1,4,4,4,0
4,0,0,0,2019,1,5,5,5,1
...,...,...,...,...,...,...,...,...,...
59389,2,5,0,2023,2,27,58,0,0
59390,2,5,0,2023,2,28,59,1,0
59391,2,5,0,2023,3,1,60,2,0
59392,2,5,0,2023,3,2,61,3,0


In [47]:
train_y

0           0.0
1           0.0
2        1728.0
3        1408.0
4        1250.0
          ...  
59389     468.0
59390     531.0
59391     574.0
59392     523.0
59393     529.0
Name: price, Length: 59394, dtype: float64

In [48]:
test_x

Unnamed: 0,item,corporation,location,year,month,day,day_of_year,day_of_week,holiday
0,0,0,0,2023,3,4,63,5,1
1,0,0,0,2023,3,5,64,6,1
2,0,0,0,2023,3,6,65,0,0
3,0,0,0,2023,3,7,66,1,0
4,0,0,0,2023,3,8,67,2,0
...,...,...,...,...,...,...,...,...,...
1087,2,5,0,2023,3,27,86,0,0
1088,2,5,0,2023,3,28,87,1,0
1089,2,5,0,2023,3,29,88,2,0
1090,2,5,0,2023,3,30,89,3,0


## PyTorch를 사용한 LSTM 모델

In [49]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

1. 데이터 준비

In [50]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.utils.data import DataLoader, TensorDataset

# DataFrame을 NumPy 배열로 변환
train_x_values = train_x.values
train_y_values = train_y.values.reshape(-1, 1)
test_x_values = test_x.values

# 데이터 정규화
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

train_x_scaled = scaler_x.fit_transform(train_x_values)
train_y_scaled = scaler_y.fit_transform(train_y_values)
test_x_scaled = scaler_x.transform(test_x_values)

# 시퀀스 생성 함수
def create_sequences(X, y, time_steps=10):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys)

time_steps = 10
X_train, y_train = create_sequences(train_x_scaled, train_y_scaled, time_steps)
X_test, _ = create_sequences(test_x_scaled, test_x_scaled, time_steps)

# PyTorch Tensor로 변환
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# DataLoader 생성
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [51]:
pd.DataFrame(test_x_scaled)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,0.0,1.0,0.181818,0.100000,0.169863,0.833333,1.0
1,0.0,0.0,0.0,1.0,0.181818,0.133333,0.172603,1.000000,1.0
2,0.0,0.0,0.0,1.0,0.181818,0.166667,0.175342,0.000000,0.0
3,0.0,0.0,0.0,1.0,0.181818,0.200000,0.178082,0.166667,0.0
4,0.0,0.0,0.0,1.0,0.181818,0.233333,0.180822,0.333333,0.0
...,...,...,...,...,...,...,...,...,...
1087,0.5,1.0,0.0,1.0,0.181818,0.866667,0.232877,0.000000,0.0
1088,0.5,1.0,0.0,1.0,0.181818,0.900000,0.235616,0.166667,0.0
1089,0.5,1.0,0.0,1.0,0.181818,0.933333,0.238356,0.333333,0.0
1090,0.5,1.0,0.0,1.0,0.181818,0.966667,0.241096,0.500000,0.0


2. 모델 정의

In [52]:
import torch.nn as nn

# 모델 정의
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# 하이퍼파라미터 설정
input_size = len(train_x.columns)
output_size = 1
hidden_size = 64
num_layers = 2
num_epochs = 10
batch_size = 256
learning_rate = 0.001
dropout = 0.2



In [53]:
# 시퀀스 생성 함수 수정
def create_sequences_for_prediction(X, time_steps=7):
    Xs = []
    for i in range(len(X) - time_steps + 1):
        Xs.append(X[i:(i + time_steps)])
    return np.array(Xs)

In [54]:
import torch.optim as optim

# 장치 설정 (GPU가 사용 가능한 경우 GPU를 사용, 그렇지 않으면 CPU 사용)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 모델 초기화
model = LSTMModel(input_size, hidden_size, num_layers, output_size, dropout=dropout).to(device)

model.to(device)

# 손실 함수 및 옵티마이저 설정
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 모델 학습
for epoch in range(num_epochs):
    model.train()
    for i, (inputs, targets) in enumerate(train_loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

Epoch [1/10], Loss: 0.002653013914823532
Epoch [2/10], Loss: 0.0031616874039173126
Epoch [3/10], Loss: 0.014942098408937454
Epoch [4/10], Loss: 0.003561234101653099
Epoch [5/10], Loss: 0.004409718327224255
Epoch [6/10], Loss: 0.0020570557098835707
Epoch [7/10], Loss: 0.0034885460045188665
Epoch [8/10], Loss: 0.006056013982743025
Epoch [9/10], Loss: 0.0023418827913701534
Epoch [10/10], Loss: 0.0006747228908352554


In [55]:
# 모델 평가 (테스트 데이터에 대한 예측)
model.eval()
with torch.no_grad():
    train_predict = model(X_train_tensor.to(device)).cpu().numpy()

    # 예측값 역정규화
    train_predict_inverse = scaler_y.inverse_transform(train_predict)

    # RMSE 계산
    from sklearn.metrics import mean_squared_error
    import math
    train_rmse = math.sqrt(mean_squared_error(train_y[time_steps:], train_predict_inverse))
    print('Train RMSE:', train_rmse)

     # test_x 데이터의 마지막 9개 시점을 가져와서 예측에 사용
    last_10_timesteps = test_x_scaled[-1:]
    test_x_scaled_extended = np.vstack([test_x_scaled, np.repeat(last_10_timesteps, 9, axis=0)])

    X_test_pred = create_sequences_for_prediction(test_x_scaled_extended, time_steps=10)
    X_test_pred_tensor = torch.tensor(X_test_pred, dtype=torch.float32)

    test_predict = model(X_test_pred_tensor.to(device)).cpu().numpy()
    test_predict_inverse = scaler_y.inverse_transform(test_predict)


# 예측 결과를 데이터프레임으로 변환
test_predict_df = pd.DataFrame(test_predict_inverse, columns=test_x.columns[:1])

Train RMSE: 967.9109422096004


In [56]:
test_predict_df

Unnamed: 0,item
0,3593.530273
1,3719.513428
2,3792.937988
3,3587.553467
4,3066.939697
...,...
1087,384.216248
1088,382.936493
1089,378.440002
1090,378.223602


In [57]:
# 예측값을 submission 데이터프레임에 담기
# submission 파일 불러오기
submission = pd.read_csv('/content/sample_submission.csv')

# 예측값을 데이터프레임에 담기
submission['answer'] = test_predict_df.values.flatten()

# 파일 저장
submission.to_csv('./LSTM_submission.csv', index=False)
print(submission)


                   ID       answer
0     TG_A_J_20230304  3593.530273
1     TG_A_J_20230305  3719.513428
2     TG_A_J_20230306  3792.937988
3     TG_A_J_20230307  3587.553467
4     TG_A_J_20230308  3066.939697
...               ...          ...
1087  RD_F_J_20230327   384.216248
1088  RD_F_J_20230328   382.936493
1089  RD_F_J_20230329   378.440002
1090  RD_F_J_20230330   378.223602
1091  RD_F_J_20230331   376.412903

[1092 rows x 2 columns]
