<a href="https://colab.research.google.com/github/jjooki/TIL/blob/main/bike_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive and change into the project folder so
# that the paths built from the working directory in later cells resolve.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/dacon/bike

Mounted at /content/drive
/content/drive/MyDrive/dacon/bike


In [None]:
import random
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

# Notebooks do not define __file__, so every path is anchored explicitly to the
# current working directory (the project folder the first cell changes into).
base_path = Path.cwd().resolve()
train = pd.read_csv(base_path / 'data/train.csv')
submission = pd.read_csv(base_path / 'data/sample_submission.csv')

# Target columns present in both the training and the submission files.
regions = ['광진구', '동대문구', '성동구', '중랑구']
metric = 'mae'

train

Unnamed: 0,일시,광진구,동대문구,성동구,중랑구
0,20180101,0.592,0.368,0.580,0.162
1,20180102,0.840,0.614,1.034,0.260
2,20180103,0.828,0.576,0.952,0.288
3,20180104,0.792,0.542,0.914,0.292
4,20180105,0.818,0.602,0.994,0.308
...,...,...,...,...,...
1456,20211227,3.830,3.416,2.908,2.350
1457,20211228,4.510,3.890,3.714,2.700
1458,20211229,4.490,3.524,3.660,2.524
1459,20211230,4.444,3.574,3.530,2.506


In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Display the device that was selected above.
device

device(type='cuda')

In [None]:
# Hyper-parameters shared by the windowing, training and inference cells.
CFG = dict(
    X_RANGE=334,  # each target day is learned / predicted from the X_RANGE preceding days
    EPOCHS=200,
    LEARNING_RATE=1e-2,
    BATCH_SIZE=128,
    SEED=42,
)

In [None]:
import copy

def make_ymd_feature(data):
    """Split the YYYYMMDD '일시' column into scaled year / month / day features.

    Args:
        data: DataFrame with an '일시' column (YYYYMMDD) and the four region
            columns. It is not modified; the work is done on a deep copy.

    Returns:
        A new DataFrame with the columns
        ['년', '월', '일', '광진구', '동대문구', '성동구', '중랑구'], where the
        year is divided by 1000, the month by 12 and the day by 31.
    """
    output_columns = ['년', '월', '일', '광진구', '동대문구', '성동구', '중랑구']

    featured = data.copy(deep=True)
    date_text = featured['일시'].astype(str)
    featured['일시'] = date_text

    # Fixed character positions of the YYYYMMDD string: year, month, day.
    featured['년'] = date_text.str[:4].astype(int) / 1000.
    featured['월'] = date_text.str[4:6].astype(int) / 12.
    featured['일'] = date_text.str[6:8].astype(int) / 31.

    return featured[output_columns]

In [None]:
def get_x_y_data(df, infer=False):
    """Build sliding-window samples from a feature frame.

    For every row position i >= CFG['X_RANGE'], the input is the block of the
    CFG['X_RANGE'] preceding rows (all seven feature columns) and the target
    is row i's four region values.

    Args:
        df: DataFrame containing the columns
            ['년', '월', '일', '광진구', '동대문구', '성동구', '중랑구'].
        infer: When True only the inputs are returned.

    Returns:
        x_data (list of float arrays of shape (X_RANGE, 7)) when ``infer`` is
        True, otherwise the tuple (x_data, y_data) where y_data is a list of
        float arrays of shape (4,).
    """
    feature_cols = ['년', '월', '일', '광진구', '동대문구', '성동구', '중랑구']
    target_cols = ['광진구', '동대문구', '성동구', '중랑구']
    window = CFG['X_RANGE']

    # Convert once up front instead of doing a label-based .loc lookup per row;
    # positional slicing also removes the dependency on a 0..n-1 index.
    features = df[feature_cols].to_numpy(dtype=float)
    targets = df[target_cols].to_numpy(dtype=float)

    x_data = []
    y_data = []
    for i in tqdm(range(window, len(df))):
        # Copy so every sample owns its memory rather than viewing `features`.
        x_data.append(features[i - window:i].copy())
        if not infer:
            y_data.append(targets[i].copy())

    if infer:
        return x_data
    return x_data, y_data

In [None]:
# Validation split: 20210101 ~ 20211231. The slice starts X_RANGE rows before
# row 1096 because predicting 20210101 needs the preceding X_RANGE days as input.
val = train.iloc[1096 - CFG['X_RANGE']:].reset_index(drop=True)
# Training split: 20180101 ~ 20201231 (the first 1096 rows).
train = train.iloc[:1096].reset_index(drop=True)

In [None]:
# Display the training frame after the split.
train

Unnamed: 0,일시,광진구,동대문구,성동구,중랑구
0,20180101,0.592,0.368,0.580,0.162
1,20180102,0.840,0.614,1.034,0.260
2,20180103,0.828,0.576,0.952,0.288
3,20180104,0.792,0.542,0.914,0.292
4,20180105,0.818,0.602,0.994,0.308
...,...,...,...,...,...
1091,20201227,3.528,2.604,3.258,2.038
1092,20201228,4.542,3.588,4.506,2.480
1093,20201229,3.694,3.054,3.222,2.118
1094,20201230,2.366,1.812,2.012,1.174


In [None]:
# Replace the raw date column with scaled year / month / day features in both splits.
train, val = (make_ymd_feature(frame) for frame in (train, val))

In [None]:
# Build sliding-window inputs and targets from the training frame.
train_x, train_y = get_x_y_data(train)

  0%|          | 0/1091 [00:00<?, ?it/s]

In [None]:
# Build sliding-window inputs and targets from the validation frame.
val_x, val_y = get_x_y_data(val)

  0%|          | 0/365 [00:00<?, ?it/s]

In [None]:
class toTensor(Dataset):
    """Dataset that serves window samples (and optional targets) as tensors.

    Args:
        X: Indexable collection of input samples.
        y: Indexable collection of targets aligned with ``X``, or None for an
            unlabeled dataset. Defaults to None so an inference dataset can be
            built without passing a placeholder.
    """

    def __init__(self, X, y=None):
        self.X = X
        self.y = y

    def __getitem__(self, index):
        """Return (input, target) tensors, or only the input when unlabeled."""
        if self.y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.y[index])
        else:
            return torch.Tensor(self.X[index])

    def __len__(self):
        """Number of samples, taken from the inputs."""
        return len(self.X)

In [None]:
# Both loaders share the batch size and run in the main process; only the
# training loader shuffles.
loader_kwargs = {'batch_size': CFG['BATCH_SIZE'], 'num_workers': 0}

train_tensor = toTensor(train_x, train_y)
train_loader = DataLoader(train_tensor, shuffle=True, **loader_kwargs)

val_tensor = toTensor(val_x, val_y)
val_loader = DataLoader(val_tensor, shuffle=False, **loader_kwargs)

In [None]:
class BaseModel(nn.Module):
    """Four-layer LSTM followed by a small MLP head with four outputs.

    The input is expected batch-first with 7 features per time step; the
    output has one value per region column.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=7,
            hidden_size=256,
            num_layers=4,
            batch_first=True,
        )

        head_layers = [
            nn.Linear(in_features=256, out_features=128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=4),
        ]
        self.multioutput_reg = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the LSTM over ``x`` and regress from its last time step."""
        sequence_out, _ = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.multioutput_reg(last_step)

In [None]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train ``model`` with L1 loss and return the best-validating snapshot.

    Args:
        model: Network to train; it is moved to ``device``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: Iterable of (X, Y) training batches.
        val_loader: Iterable of (X, Y) validation batches.
        scheduler: Optional scheduler stepped with the validation MAE after
            every epoch, or None.
        device: Device the batches are moved to.

    Returns:
        A deep copy of the model taken at the epoch with the lowest validation
        MAE, or None if no epoch produced a value below the initial sentinel.
    """
    import copy  # local import keeps this cell runnable on its own

    model.to(device)
    criterion = nn.L1Loss().to(device)

    best_loss = 9999999
    best_model = None

    for epoch in range(1, CFG['EPOCHS'] + 1):
        model.train()
        train_loss = []
        for X, Y in train_loader:
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        _train_loss = np.mean(train_loss)

        val_mae = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{_train_loss:.5f}] Val MAE : [{val_mae:.5f}]')

        if scheduler is not None:
            scheduler.step(val_mae)

        if best_loss > val_mae:
            best_loss = val_mae
            # Snapshot the weights: keeping a plain reference would follow the
            # model through later epochs and always end up as the last epoch.
            best_model = copy.deepcopy(model)
    return best_model

In [None]:
def validation(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` and return the mean batch loss.

    The model is switched to eval mode and gradients are disabled while every
    (X, Y) batch is moved to ``device`` and scored with ``criterion``.

    Returns:
        The unweighted mean of the per-batch loss values.
    """
    model.eval()
    with torch.no_grad():
        batch_losses = [
            criterion(model(features.to(device)), targets.to(device)).item()
            for features, targets in val_loader
        ]
    return np.mean(batch_losses)

In [None]:
# Seed the Python, NumPy and PyTorch RNGs with the configured seed before the
# model is created, so weight initialisation and batch shuffling start from a
# fixed state.
random.seed(CFG['SEED'])
np.random.seed(CFG['SEED'])
torch.manual_seed(CFG['SEED'])

model = BaseModel()

optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# The scheduler is stepped with the validation MAE, where lower is better, so
# it must watch for a plateau in the minimum ('min'); with 'max' the learning
# rate is halved whenever the MAE fails to get *worse*.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=4,threshold_mode='abs',min_lr=1e-8, verbose=True)
best_model = train(model, optimizer, train_loader, val_loader, scheduler, device) # trained on data from 2018.01.01 to 2020.12.31

Epoch : [1] Train Loss : [2.93666] Val MAE : [4.33689]
Epoch : [2] Train Loss : [1.40875] Val MAE : [3.77486]
Epoch : [3] Train Loss : [1.30871] Val MAE : [3.54277]
Epoch : [4] Train Loss : [1.22130] Val MAE : [2.40812]
Epoch : [5] Train Loss : [1.03123] Val MAE : [2.56708]
Epoch : [6] Train Loss : [1.06956] Val MAE : [2.16181]
Epoch 00006: reducing learning rate of group 0 to 5.0000e-03.
Epoch : [7] Train Loss : [0.99835] Val MAE : [2.31106]
Epoch : [8] Train Loss : [0.97845] Val MAE : [2.76836]
Epoch : [9] Train Loss : [1.00749] Val MAE : [1.56494]
Epoch : [10] Train Loss : [0.95972] Val MAE : [1.54772]
Epoch : [11] Train Loss : [0.94794] Val MAE : [1.59305]
Epoch 00011: reducing learning rate of group 0 to 2.5000e-03.
Epoch : [12] Train Loss : [0.93937] Val MAE : [1.69249]
Epoch : [13] Train Loss : [0.92452] Val MAE : [1.85762]
Epoch : [14] Train Loss : [0.90702] Val MAE : [1.56977]
Epoch : [15] Train Loss : [0.89983] Val MAE : [1.61063]
Epoch : [16] Train Loss : [0.88097] Val MAE :

In [None]:
# Date features for the submission rows, preceded by the last X_RANGE rows of
# the validation frame so the first predicted day has a full input window.
test = make_ymd_feature(submission)
test = pd.concat([val.iloc[-CFG['X_RANGE']:], test]).reset_index(drop=True)

In [None]:
# Display the combined history + submission frame.
test

Unnamed: 0,년,월,일,광진구,동대문구,성동구,중랑구
0,2.021,1.000000,0.870968,3.830,3.416,2.908,2.350
1,2.021,1.000000,0.903226,4.510,3.890,3.714,2.700
2,2.021,1.000000,0.935484,4.490,3.524,3.660,2.524
3,2.021,1.000000,0.967742,4.444,3.574,3.530,2.506
4,2.021,1.000000,1.000000,3.616,3.210,2.620,2.146
...,...,...,...,...,...,...,...
334,2.022,0.916667,0.838710,0.000,0.000,0.000,0.000
335,2.022,0.916667,0.870968,0.000,0.000,0.000,0.000
336,2.022,0.916667,0.903226,0.000,0.000,0.000,0.000
337,2.022,0.916667,0.935484,0.000,0.000,0.000,0.000


In [None]:
def inference(model, df, device):
    """Fill in the region columns of ``df`` one row at a time.

    Starting at row CFG['X_RANGE'], each row is predicted from the
    CFG['X_RANGE'] rows before it and written back into ``df``, so later
    windows contain earlier predictions. ``df`` is modified in place.

    Args:
        model: Trained network; moved to ``device`` and put in eval mode.
        df: Frame with the seven feature columns and a 0..n-1 index.
        device: Device used for the forward passes.

    Returns:
        DataFrame holding the four region columns for rows CFG['X_RANGE']
        onward, re-indexed from 0.
    """
    feature_cols = ['년', '월', '일', '광진구', '동대문구', '성동구', '중랑구']
    target_cols = ['광진구', '동대문구', '성동구', '중랑구']
    window = CFG['X_RANGE']

    model.to(device)
    model.eval()
    for row in tqdm(range(window, len(df))):
        # .loc slices by label and includes both ends: rows row-window .. row-1.
        history = df.loc[row - window:row - 1, feature_cols]
        batch = torch.Tensor(np.array(history).astype(float)).unsqueeze(0).to(device)
        with torch.no_grad():
            prediction = model(batch)[0]

        df.loc[row, target_cols] = prediction.cpu().numpy()

    forecast = df.loc[window:, target_cols]
    return forecast.reset_index().drop(columns=['index'])

In [None]:
# Run the forecast over the test frame with the model returned by train().
preds = inference(best_model, test, device)

  0%|          | 0/334 [00:00<?, ?it/s]

In [None]:
# Copy each region's predictions into the submission frame, rounded to 4 decimals.
for region in ['광진구', '동대문구', '성동구', '중랑구']:
    submission[region] = preds[region].round(4)

In [None]:
# Display the filled-in submission frame.
submission

Unnamed: 0,일시,광진구,동대문구,성동구,중랑구
0,20220101,4.3270,2.5140,3.7023,1.7828
1,20220102,4.5405,2.6253,3.8845,1.8733
2,20220103,4.5242,2.6175,3.8717,1.8671
3,20220104,4.4706,2.5899,3.8265,1.8421
4,20220105,4.4987,2.6053,3.8528,1.8554
...,...,...,...,...,...
329,20221126,3.6171,2.1508,3.0407,1.4860
330,20221127,3.6174,2.1510,3.0410,1.4861
331,20221128,3.6179,2.1513,3.0416,1.4863
332,20221129,3.6187,2.1518,3.0424,1.4866


## Submission

In [None]:
save_path = base_path / 'submission'
# Create the output folder on first use instead of failing when it is missing.
save_path.mkdir(parents=True, exist_ok=True)

# Pick a file name: the first export is 'submission_bike.csv', later exports
# get a numeric suffix based on how many matching files already exist.
existing_names = {entry.name for entry in save_path.iterdir()}
if 'submission_bike.csv' in existing_names:
    count = sum('submission_bike' in name for name in existing_names)
    suffix = count + 1
    # If earlier exports were deleted the count can point at a name that is
    # still taken; advance until the name is free so nothing is overwritten.
    while f"submission_bike{suffix}.csv" in existing_names:
        suffix += 1
    filename = f"submission_bike{suffix}.csv"
else:
    filename = 'submission_bike.csv'

# Export submission file
submission.to_csv(save_path / filename, index=False)