## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import re

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter Setting

In [3]:
CFG = {
    'X_RANGE':24, # 예측하려는 시점을 X_RANGE 만큼의 이전 일자의 데이터로부터 예측 및 학습
    'EPOCHS':10,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':128,
    'SEED':41
}

## Fixed RandomSeed

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Data Load

In [5]:
train_df = pd.read_csv('./train.csv')

In [6]:
train_df

Unnamed: 0,일시,광진구,동대문구,성동구,중랑구
0,20180101,0.592,0.368,0.580,0.162
1,20180102,0.840,0.614,1.034,0.260
2,20180103,0.828,0.576,0.952,0.288
3,20180104,0.792,0.542,0.914,0.292
4,20180105,0.818,0.602,0.994,0.308
...,...,...,...,...,...
1456,20211227,3.830,3.416,2.908,2.350
1457,20211228,4.510,3.890,3.714,2.700
1458,20211229,4.490,3.524,3.660,2.524
1459,20211230,4.444,3.574,3.530,2.506


In [7]:
temporary = pd.DataFrame()

In [8]:
import numpy as np
from scipy import interpolate
for loc in train_df.columns[1:]:
    ans = []
    for i in range(len(train_df)-1):
        x = np.array(range(24))
        y = np.linspace(train_df.loc[i,loc], train_df.loc[i+1,loc], 24)
        sp = interpolate.interp1d(x,y,kind='cubic')

        xs = np.linspace(x[0], x[-1], 24)
        ys = sp(xs)
        ys = list(ys)
        ys.pop()
        ans.extend(ys)
    temporary[loc] = ans

In [9]:
ans = []
for i in range(len(train_df)-1):
    x = np.array(range(24))
    y = np.linspace(train_df.loc[i,'일시'], train_df.loc[i+1,'일시'], 24)
    sp = interpolate.interp1d(x,y,kind='cubic')

    xs = np.linspace(x[0], x[-1], 24)
    ys = sp(xs)
    ys = list(ys)
    ys.pop()
    ans.extend(ys)
temporary['일시'] = ans
temporary

Unnamed: 0,광진구,동대문구,성동구,중랑구,일시
0,0.592000,0.368000,0.580000,0.162000,2.018010e+07
1,0.602783,0.378696,0.599739,0.166261,2.018010e+07
2,0.613565,0.389391,0.619478,0.170522,2.018010e+07
3,0.624348,0.400087,0.639217,0.174783,2.018010e+07
4,0.635130,0.410783,0.658957,0.179043,2.018010e+07
...,...,...,...,...,...
33575,3.796000,3.289130,2.817826,2.224261,2.021123e+07
33576,3.760000,3.273304,2.778261,2.208609,2.021123e+07
33577,3.724000,3.257478,2.738696,2.192957,2.021123e+07
33578,3.688000,3.241652,2.699130,2.177304,2.021123e+07


In [10]:
len(train_df)*23-23

33580

In [11]:
train_df = temporary

## Data Visualization

In [12]:
def get_view_df(df, target):
    viewd = df[['일시', target]]
    viewd = viewd.reset_index(drop=True)
    viewd = viewd.reset_index()
    print(target)
    sns.set(rc = {'figure.figsize':(30,8)})
    sns.lineplot(x='index', y=target, data=viewd)
    plt.show()

## Train / Validation Split

In [13]:
val_df = train_df.iloc[31000-CFG['X_RANGE']:].reset_index().drop(columns='index') # 20210101 ~ 20211231
# 위의 1096-CFG['X_RANGE']의 이유는 20210101을 예측하기 위해서는 이전 X_RANGE만큼의 일수가 필요하므로
train_df = train_df.iloc[:31000].reset_index().drop(columns='index') # 20180101 ~ 20201231

## Data Pre-processing

In [14]:
def get_x_y_data(df, infer=False):
    # x_range : x일전까지의 데이터를 통해 예측
    x_data = []
    y_data = []
    for i in tqdm(range(CFG['X_RANGE'], len(df))):
        x_data.append(np.array(df.loc[i-CFG['X_RANGE']:i-1, ['광진구', '동대문구', '성동구', '중랑구']]).astype(float))
        y_data.append(np.array(df.loc[i, ['광진구', '동대문구', '성동구', '중랑구']]).astype(float))
    if infer:
        return x_data
    else:
        return x_data, y_data

In [15]:
def make_ymd_feature(df):
    df['일시'] = df['일시'].astype(str)
#     df['년'] = df['일시'].str[:4].astype(int)/1000.
#     df['월'] = df['일시'].str[4:6].astype(int)/12.
#     df['일'] = df['일시'].str[6:8].astype(int)/31.
#     df['시'] = df['일시'].str[8:13].astype(int)/10000.
    df = df[['광진구', '동대문구', '성동구', '중랑구']]
    return df

In [16]:
train_df = make_ymd_feature(train_df)
val_df = make_ymd_feature(val_df)

In [17]:
train_x, train_y = get_x_y_data(train_df)

  0%|          | 0/30976 [00:00<?, ?it/s]

In [18]:
val_x, val_y = get_x_y_data(val_df)

  0%|          | 0/2580 [00:00<?, ?it/s]

## CustomDataset

In [19]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        else:
            return torch.Tensor(self.X[index])

    def __len__(self):
        return len(self.X)

In [20]:
train_dataset = CustomDataset(train_x, train_y)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_x, val_y)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

## Define Model

In [21]:
class BaseModel(nn.Module):
    def __init__(self):
        super(BaseModel, self).__init__()
        self.lstm = nn.LSTM(input_size=4, hidden_size=256, num_layers=10, batch_first=True, dropout=0.1)
        
        self.multioutput_reg = nn.Sequential(
            nn.Linear(in_features=256, out_features=128), 
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=128), 
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64), 
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=4),
        )
        
    def forward(self, x):
        hidden, _ = self.lstm(x)
        output = self.multioutput_reg(hidden[:,-1,:])
        return output

## Train

In [22]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    criterion = nn.L1Loss().to(device)
    
    best_loss = 9999999
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in iter(train_loader):
            X = X.to(device)
            Y = Y.to(device)
            
            optimizer.zero_grad()
            
            output = model(X)
            loss = criterion(output, Y)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
        
        _train_loss = np.mean(train_loss)
        
        val_mae = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{_train_loss:.5f}] Val MAE : [{val_mae:.5f}]')
        
        if scheduler is not None:
            scheduler.step(val_mae)
            
        if best_loss > val_mae:
            best_loss = val_mae
            best_model = model 
    return best_model

In [23]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    with torch.no_grad():
        for X, Y in iter(val_loader):
            X = X.to(device)
            Y = Y.to(device)
            
            output = model(X)
            
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    
    _val_loss = np.mean(val_loss)
    return _val_loss    

## Run!!

In [24]:
model = BaseModel()

In [25]:
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4,threshold_mode='abs',min_lr=1e-8, verbose=True)
best_model = train(model, optimizer, train_loader, val_loader, scheduler, device) # 2018.01.01 ~ 2020.12.31 까지 데이터로 학습

Epoch : [1] Train Loss : [1.73352] Val MAE : [2.14432]
Epoch : [2] Train Loss : [0.71281] Val MAE : [1.09840]
Epoch : [3] Train Loss : [0.57710] Val MAE : [0.78618]
Epoch : [4] Train Loss : [0.42748] Val MAE : [0.62822]
Epoch : [5] Train Loss : [0.38665] Val MAE : [0.77889]
Epoch : [6] Train Loss : [0.38707] Val MAE : [0.52647]
Epoch 00006: reducing learning rate of group 0 to 5.0000e-04.
Epoch : [7] Train Loss : [0.31334] Val MAE : [0.34716]
Epoch : [8] Train Loss : [0.28477] Val MAE : [0.52240]
Epoch : [9] Train Loss : [0.26193] Val MAE : [0.28537]
Epoch : [10] Train Loss : [0.24956] Val MAE : [0.31691]
Epoch : [11] Train Loss : [0.24306] Val MAE : [0.23454]
Epoch 00011: reducing learning rate of group 0 to 2.5000e-04.
Epoch : [12] Train Loss : [0.22643] Val MAE : [0.23989]
Epoch : [13] Train Loss : [0.21762] Val MAE : [0.22805]
Epoch : [14] Train Loss : [0.20750] Val MAE : [0.38112]
Epoch : [15] Train Loss : [0.19787] Val MAE : [0.29921]
Epoch : [16] Train Loss : [0.19998] Val MAE :

## Inference

In [57]:
test_df = pd.read_csv('./sample_submission.csv')

Unnamed: 0,일시,광진구,동대문구,성동구,중랑구
0,20220101,0,0,0,0
1,20220102,0,0,0,0
2,20220103,0,0,0,0
3,20220104,0,0,0,0
4,20220105,0,0,0,0
...,...,...,...,...,...
329,20221126,0,0,0,0
330,20221127,0,0,0,0
331,20221128,0,0,0,0
332,20221129,0,0,0,0


In [58]:
test_df = make_ymd_feature(test_df)
test_df = pd.concat([val_df[CFG['X_RANGE']*(-1):], test_df]).reset_index().drop(columns='index')

Unnamed: 0,광진구,동대문구,성동구,중랑구
0,4.446,3.571826,3.535652,2.506783
1,4.444,3.574000,3.530000,2.506000
2,4.408,3.558174,3.490435,2.490348
3,4.372,3.542348,3.450870,2.474696
4,4.336,3.526522,3.411304,2.459043
...,...,...,...,...
353,0.000,0.000000,0.000000,0.000000
354,0.000,0.000000,0.000000,0.000000
355,0.000,0.000000,0.000000,0.000000
356,0.000,0.000000,0.000000,0.000000


In [60]:
342*23-23

7843

In [59]:
test_df = pd.DataFrame(columns=['광진구', '동대문구', '성동구', '중랑구'],index=range(342*23-23))
test_df = pd.concat([val_df[CFG['X_RANGE']*(-1):], test_df]).reset_index().drop(columns='index')
test_df = test_df.fillna(0)
test_df

Unnamed: 0,광진구,동대문구,성동구,중랑구
0,4.446,3.571826,3.535652,2.506783
1,4.444,3.574000,3.530000,2.506000
2,4.408,3.558174,3.490435,2.490348
3,4.372,3.542348,3.450870,2.474696
4,4.336,3.526522,3.411304,2.459043
...,...,...,...,...
7862,0.000,0.000000,0.000000,0.000000
7863,0.000,0.000000,0.000000,0.000000
7864,0.000,0.000000,0.000000,0.000000
7865,0.000,0.000000,0.000000,0.000000


In [61]:
def inference(model, df, device):
    model.to(device)
    model.eval()
    for i in tqdm(range(CFG['X_RANGE'], len(df))):
        X = torch.Tensor(np.array(df.loc[i-CFG['X_RANGE']:i-1, ['광진구', '동대문구', '성동구', '중랑구']]).astype(float)).unsqueeze(0)
        X = X.to(device)
        with torch.no_grad():
            model_pred = model(X)[0]
        
        model_pred = model_pred.cpu().numpy()
        df.loc[i, ['광진구', '동대문구', '성동구', '중랑구']] = model_pred
    return df.loc[CFG['X_RANGE']:, ['광진구', '동대문구', '성동구', '중랑구']].reset_index().drop(columns=['index'])

In [62]:
preds = inference(best_model, test_df, device)

  0%|          | 0/7843 [00:00<?, ?it/s]

## Submission

In [32]:
submit = pd.read_csv('./sample_submission.csv')
submit.head()
# submission 을 *23 으로 늘린 후  해당위치값만 다시 submission

Unnamed: 0,일시,광진구,동대문구,성동구,중랑구
0,20220101,0,0,0,0
1,20220102,0,0,0,0
2,20220103,0,0,0,0
3,20220104,0,0,0,0
4,20220105,0,0,0,0


In [33]:
submit['광진구'] = preds['광진구']
submit['동대문구'] = preds['동대문구']
submit['성동구'] = preds['성동구']
submit['중랑구'] = preds['중랑구']

In [34]:
submit.to_csv('./submit2.csv', index=False)

In [68]:
preds.head(500)

Unnamed: 0,광진구,동대문구,성동구,중랑구
0,3.592053,3.098008,2.811573,2.230042
1,3.612767,3.139088,2.828345,2.259726
2,3.660545,3.208189,2.870644,2.309330
3,3.723818,3.292754,2.928246,2.369124
4,3.812488,3.398592,3.015283,2.443658
...,...,...,...,...
495,11.513114,10.369221,9.486783,7.061199
496,11.513114,10.369221,9.486783,7.061199
497,11.513114,10.369221,9.486782,7.061199
498,11.513115,10.369221,9.486782,7.061198


In [69]:
pred

NameError: name 'pred' is not defined