## Import

In [30]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset


In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter Setting

In [4]:
CFG = {
    'TRAIN_WINDOW_SIZE':7*24*60, # 7일치로 학습 : 7*24*60
    'PREDICT_SIZE':1*60, # 1시간 예측 : 1*60
    'EPOCHS':10,
    'LEARNING_RATE':1e-6,
    'BATCH_SIZE':1024,
    'SEED':41
}

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

### 데이터 불러오기(0에 알아서 숫자 대입)

In [11]:
# train_df1 = pd.read_csv('./data/airdeep_df1_831.csv').drop(columns=['Unnamed: 0'], axis=1)
# train_df2 = pd.read_csv('./data/airdeep_df2_831.csv').drop(columns=['Unnamed: 0'], axis=1)
# train_df3 = pd.read_csv('./data/airdeep_df2_831.csv').drop(columns=['Unnamed: 0'], axis=1)

train_df1 = pd.read_csv('./data/5_27to8_19data_1.csv', encoding='euc-kr').drop(columns=['지점', '풍향(deg)', '풍속(m/s)'], axis=1)
# holiday_raw = pd.read_csv('./data/holiday.csv')

In [14]:
train_df1.head()

Unnamed: 0,일시,pm10_Average,pm2.5_Average,co2_Average,tvoc_Average,기온(°C),누적강수량(mm),습도(%)
0,2023-05-27 00:00:00,7.0,6.5,438.0,150.5,18.6,0.0,75.5
1,2023-05-27 00:01:00,8.0,8.0,439.0,151.0,18.6,0.0,75.5
2,2023-05-27 00:02:00,10.5,9.5,438.5,154.0,18.6,0.0,75.5
3,2023-05-27 00:03:00,7.0,6.0,436.0,151.0,18.6,0.0,75.8
4,2023-05-27 00:04:00,9.0,7.0,438.0,152.0,18.6,0.0,75.7


In [15]:
train_df1.info()
# train_df2.info()
# train_df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133921 entries, 0 to 133920
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   일시             133921 non-null  object 
 1   pm10_Average   133921 non-null  float64
 2   pm2.5_Average  133921 non-null  float64
 3   co2_Average    133921 non-null  float64
 4   tvoc_Average   133921 non-null  float64
 5   기온(°C)         133921 non-null  float64
 6   누적강수량(mm)      133921 non-null  float64
 7   습도(%)          133921 non-null  float64
dtypes: float64(7), object(1)
memory usage: 8.2+ MB


### 데이터 전처리

scaling - train, price

In [16]:
'''train scaling'''

# 숫자형 변수들의 min-max scaling을 수행하는 코드입니다.
numeric_cols = train_df1.columns[1:]

# 칵 column의 min 및 max 계산
min_values = train_df1[numeric_cols].min(axis=1)
max_values = train_df1[numeric_cols].max(axis=1)

# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
ranges = max_values - min_values
ranges[ranges == 0] = 1

# min-max scaling 수행
train_df1[numeric_cols] = (train_df1[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)

# max와 min 값을 dictionary 형태로 저장
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

In [17]:
train_df1.head()

Unnamed: 0,일시,pm10_Average,pm2.5_Average,co2_Average,tvoc_Average,기온(°C),누적강수량(mm),습도(%)
0,2023-05-27 00:00:00,0.015982,0.01484,1.0,0.343607,0.042466,0.0,0.172374
1,2023-05-27 00:01:00,0.018223,0.018223,1.0,0.343964,0.042369,0.0,0.171982
2,2023-05-27 00:02:00,0.023945,0.021665,1.0,0.351197,0.042417,0.0,0.172178
3,2023-05-27 00:03:00,0.016055,0.013761,1.0,0.34633,0.042661,0.0,0.173853
4,2023-05-27 00:04:00,0.020548,0.015982,1.0,0.347032,0.042466,0.0,0.172831


# train/predict 데이터셋 생성

### 코드 분석(주석 참고)

### 데이터셋 생성

In [28]:
# 데이터셋 생성 함수
def build_dataset(data, seq_length=CFG['PREDICT_SIZE'] + CFG['TRAIN_WINDOW_SIZE']):
    dataX = []
    dataY = []
    for i in range(0, len(data)-seq_length):
        _x = data[i:i+seq_length, :]
        _y = data[i+seq_length, [-1]]
        # print(_x, "-->",_y)
        dataX.append(_x)
        dataY.append(_y)

    return np.array(dataX), np.array(dataY)

In [29]:
train_input, train_target = build_dataset(np.array(train_df1))
# val_input, val_target = build_dataset(np.array(train_df1))

MemoryError: Unable to allocate 74.8 GiB for an array with shape (123781, 10140, 8) and data type object

In [None]:
# 텐서로 변환
train_input_tensor = torch.FloatTensor(train_input)
train_target_tensor = torch.FloatTensor(train_target)

In [None]:
# 텐서 형태로 데이터 정의
dataset = TensorDataset(train_input_tensor, train_target_tensor)

# 데이터로더는 기본적으로 2개의 인자를 입력받으며 배치크기는 통상적으로 2의 배수를 사용
dataloader = DataLoader(dataset,
                        batch_size=CFG['BATCH_SIZE'],
                        shuffle=True,  
                        drop_last=True)

In [None]:
# Train / Validation Split
data_len = len(train_input)
val_input = train_input[-int(data_len*0.2):]
val_target = train_target[-int(data_len*0.2):]
train_input = train_input[:-int(data_len*0.2)]
train_target = train_target[:-int(data_len*0.2)]

In [None]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape

((1070112, 90, 9), (1070112, 21), (267528, 90, 9), (267528, 21), (3768, 90, 9))

### Custom Dataset

In [None]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])
    
    def __len__(self):
        return len(self.X)

In [None]:
train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input, val_target)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

### 모델 선언

In [None]:
class BaseModel(nn.Module):
    def __init__(self, input_size=7, hidden_size=512, output_size=CFG['PREDICT_SIZE']):
        super(BaseModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_size//2, output_size)
        )
            
        self.actv = nn.ReLU()
    
    def forward(self, x):
        # x shape: (B(batch_size=4096), TRAIN_WINDOW_SIZE=90, 5)
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size, x.device)
        
        # LSTM layer
        lstm_out, hidden = self.lstm(x, hidden)
        
        # Only use the last output sequence
        last_output = lstm_out[:, -1, :]
        
        # Fully connected layer
        output = self.actv(self.fc(last_output))
        
        return output.squeeze(1)
    
    def init_hidden(self, batch_size, device):
        # Initialize hidden state and cell state
        return (torch.zeros(1, batch_size, self.hidden_size, device=device),
                torch.zeros(1, batch_size, self.hidden_size, device=device))

### 모델 학습

In [None]:
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = 9999999
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        train_mae = []
        
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            optimizer.zero_grad()
            
            output = model(X)
            loss = criterion(output, Y)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
        
        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')
        
        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
            print('Model Saved')
            
    return best_model

In [None]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    
    with torch.no_grad():
        for X, Y in tqdm(iter(val_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            output = model(X)
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    return np.mean(val_loss)

## Run !!

In [None]:
model = BaseModel()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
infer_model = train(model, optimizer, train_loader, val_loader, device)

  0%|          | 0/1046 [00:00<?, ?it/s]