In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler

In [2]:
# 라이브러리 임포트
import os
import time
import json
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
#from fbprophet import Prophet
import pickle
import copy
from tqdm.auto import tqdm

In [3]:
%%time
submit = json.load(open('./data/sample_submission/sample_submission.json', 'r', encoding='utf8')) 
df_dic = pickle.load(open('./mydata/df_dic.plk','rb'))
plc_lst = pickle.load(open('./mydata/plc_lst.plk','rb'))
fct_lst = ['pH', 'COD', 'SS', 'N', 'P', 'T']

CPU times: total: 750 ms
Wall time: 789 ms


In [4]:
# Fill missing values per frame: forward-fill first, then back-fill whatever is
# still missing at the start, and finally drop rows that could not be filled
# (a column that is entirely NaN).
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
for plc in plc_lst:
    df_dic[plc] = df_dic[plc].ffill().bfill().dropna()

In [5]:
%%time
''' create datetime '''
for plc in plc_lst:
    df_dic[plc]['date'] = df_dic[plc]['ds'].apply(lambda x : datetime.strptime(x , '%Y%m%d'))
    df_dic[plc]['year'] = pd.DatetimeIndex(df_dic[plc]['date']).year - 2010
    df_dic[plc]['month'] = pd.DatetimeIndex(df_dic[plc]['date']).month / 12
    df_dic[plc]['day'] = pd.DatetimeIndex(df_dic[plc]['date']).day / 31

CPU times: total: 6.03 s
Wall time: 6.04 s


---
# 데이터 만들기

In [35]:
def create_lag_feature(df_trg , lag_count , interval, factors=None):
    """Build lagged input features and next-row targets from one frame.

    For every n in 1..lag_count and every factor column, a column
    ``{col}_lag{n*interval}`` holding that column shifted down by
    ``n*interval`` rows is appended. The first ``lag_count*interval`` rows
    and any row containing NaN are then dropped.

    Parameters
    ----------
    df_trg : pandas.DataFrame
        Source frame. Must contain the factor columns plus 'ds' and 'date'
        (both are dropped from the inputs). It is not modified.
    lag_count : int
        Number of lag steps to generate per factor column.
    interval : int
        Row distance between consecutive lag steps.
    factors : sequence of str, optional
        Columns to lag and to use as targets. Defaults to the notebook-level
        ``fct_lst``.

    Returns
    -------
    (x, y) : tuple of pandas.DataFrame
        ``x`` holds every remaining column except 'ds'/'date', without the
        last row. ``y`` holds the factor columns of the following row,
        renamed with a ``y_`` prefix. Both have the same number of rows.
    """
    columns = fct_lst if factors is None else list(factors)

    # Build every shifted column first and concatenate once; concatenating
    # inside the loop re-copies the ever-growing frame on each iteration.
    lagged = [
        df_trg[col].shift(n * interval).rename(f"{col}_lag{n * interval}")
        for n in range(1, lag_count + 1)
        for col in columns
    ]
    df_n = pd.concat([df_trg] + lagged, axis=1)
    dftemp = df_n.iloc[lag_count * interval:].dropna()

    # Row i of x is paired with the factor values of row i+1, so the last row
    # (which has no successor) is dropped from x and removed from y by dropna.
    x = dftemp.iloc[:-1].drop(['ds', 'date'], axis=1)
    y = dftemp[columns].shift(-1).dropna()
    y.columns = ['y_' + name for name in y.columns]
    return x, y

In [36]:
%%time
lag_count = 120
interval = 5

xs = dict()
ys = dict()
for plc in tqdm(plc_lst):
    x ,y = create_lag_feature(df_dic[plc] , lag_count , interval)
    xs[plc] = x
    ys[plc] = y


CPU times: total: 2min 12s
Wall time: 2min 13s


In [37]:
# Save the lag-feature dicts to disk; the context manager guarantees the file
# is flushed and closed once the dump finishes.
with open('./mydata/lstm_xy.plk', 'wb') as fh:
    pickle.dump([xs, ys], fh)

---
## simple LSTM

In [68]:
import random
import numpy as np
import torch

# multivariate data preparation
from numpy import array
from numpy import hstack
from torch.optim.lr_scheduler import StepLR
#from pytorchtools import EarlyStopping

In [41]:
class MV_LSTM(torch.nn.Module):
    """Multivariate LSTM network.

    A single-layer LSTM (hidden size 10, batch_first) whose outputs for all
    time steps are flattened and passed through one linear layer.

    Parameters
    ----------
    n_features : int
        Number of input features per time step.
    seq_length : int
        Number of time steps per sample; fixes the linear layer's input size,
        so forward() only accepts sequences of exactly this length.
    output_dim : int
        Number of values predicted per sample.
    """

    def __init__(self,n_features,seq_length,output_dim):
        super(MV_LSTM, self).__init__()
        self.n_features = n_features
        self.seq_len = seq_length
        self.output_dim = output_dim
        self.n_hidden = 10 # number of hidden states
        self.n_layers = 1 # number of LSTM layers (stacked)

        self.l_lstm = torch.nn.LSTM(input_size = n_features,
                                 hidden_size = self.n_hidden,
                                 num_layers = self.n_layers,
                                 batch_first = True)
        # With batch_first=True the LSTM output is
        # (batch_size, seq_len, num_directions * hidden_size); it is flattened
        # to (batch_size, seq_len * hidden_size) before the linear layer.
        self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, output_dim)

    def init_hidden(self, batch_size):
        """Reset the hidden and cell state to zeros for a batch of `batch_size`.

        The states are created on the same device and with the same dtype as
        the model parameters, so the model works on CPU as well as on GPU.
        """
        reference = next(self.parameters())
        # Hidden-state layout is (num_layers, batch, hidden) even with batch_first=True.
        shape = (self.n_layers, batch_size, self.n_hidden)
        hidden_state = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        cell_state = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        self.hidden = (hidden_state, cell_state)

    def forward(self, x):
        """Run the network on `x` of shape (batch_size, seq_len, n_features).

        Returns a tensor of shape (batch_size, output_dim). The LSTM state
        returned by this call is stored in `self.hidden`; call init_hidden()
        before each batch to start from a zero state.
        """
        batch_size, seq_len, _ = x.size()

        # If init_hidden() was never called, pass None so the LSTM starts from
        # zeros instead of failing with AttributeError.
        hidden = getattr(self, 'hidden', None)
        lstm_out, self.hidden = self.l_lstm(x, hidden)
        # Keep the batch dimension and merge the rest; .contiguous() makes the
        # tensor safe to reshape with .view().
        flat = lstm_out.contiguous().view(batch_size,-1)
        # NOTE(review): the features are scaled by output_dim before the linear
        # layer. Kept as in the original; confirm whether this is intentional.
        return self.l_linear(flat*self.output_dim)

In [58]:
def split_sequences(xs,ys, n_steps, x_dim , y_dim):
    """Cut sliding windows of `n_steps` rows out of `xs` and pair each with a target row.

    Window k covers rows k .. k+n_steps-1 of `xs`; its target is the row of
    `ys` at the window's last position. `x_dim` and `y_dim` are accepted for
    interface compatibility but are not used.

    Returns a pair of numpy arrays (windows, targets).
    """
    total = len(xs)
    # Number of start positions whose window still fits inside xs; capped at
    # `total` so a non-positive n_steps yields one window per row.
    n_windows = min(total, total - n_steps + 1)
    windows = [xs.iloc[start:start + n_steps, :].values for start in range(n_windows)]
    targets = [ys.iloc[start + n_steps - 1, :].values for start in range(n_windows)]
    return array(windows), array(targets)

In [54]:
# 1개 지점으로 테스트

In [None]:
%%time
model_list = {}

for comname in plc_lst:
    x_single = xs[comname]
    y_single = ys[comname]
    
    '''Initialization '''
    n_features = x_single.shape[1] # this is number of parallel inputs
    n_timesteps = 100 # this is number of timesteps
    output_dim = y_single.shape[1]
    
    # convert dataset into input/output
    X, y = split_sequences(x_single,y_single, n_timesteps ,n_features, output_dim )
    print(X.shape, y.shape)
    
    # create NN
    mv_net = MV_LSTM(n_features,n_timesteps,output_dim)
    mv_net = mv_net.to('cuda')
    criterion = torch.nn.MSELoss() # reduction='sum' created huge loss value
    optimizer = torch.optim.Adam(mv_net.parameters(), lr=1e-1)
    scheduler = StepLR(optimizer, step_size = 10 , gamma = 0.9)
    
    train_episodes = 150
    batch_size = 64
    
    ''' Training '''
    mv_net.train()
    for t in range(train_episodes):
        for b in range(0,len(X),batch_size):
            inpt = X[b:b+batch_size,:,:]
            target = y[b:b+batch_size]    
            
            x_batch = torch.tensor(inpt,dtype=torch.float32).cuda()  
            y_batch = torch.tensor(target,dtype=torch.float32).cuda()
        
            mv_net.init_hidden(x_batch.size(0))
        #    lstm_out, _ = mv_net.l_lstm(x_batch,nnet.hidden)    
        #    lstm_out.contiguous().view(x_batch.size(0),-1)
            output = mv_net(x_batch) 
            loss = criterion(output, y_batch)  
            
            loss.backward()
            optimizer.step()        
            optimizer.zero_grad() 
        print('step : ' , t , 'loss : ' , loss.item())
    torch.save(mv_net, './model/' + '{}_model.pt'.format(col))

(2120, 100, 729) (2120, 6)
step :  0 loss :  86313.96875
step :  1 loss :  16272.15625
step :  2 loss :  14816.205078125
step :  3 loss :  2978.610107421875
step :  4 loss :  369.679443359375
step :  5 loss :  338.43511962890625
step :  6 loss :  135.877685546875
step :  7 loss :  148.70603942871094
step :  8 loss :  45.91547775268555
step :  9 loss :  41.12989807128906
step :  10 loss :  12.055002212524414
step :  11 loss :  35.92486572265625
step :  12 loss :  11.49665355682373
step :  13 loss :  56.28765106201172
step :  14 loss :  42.128658294677734
step :  15 loss :  83.24525451660156
step :  16 loss :  77.10880279541016
step :  17 loss :  60.84429168701172
step :  18 loss :  40.00377655029297
step :  19 loss :  12.76434326171875
step :  20 loss :  7.845407009124756
step :  21 loss :  6.6047210693359375
step :  22 loss :  6.997963905334473
step :  23 loss :  6.274617671966553
step :  24 loss :  5.6839599609375
step :  25 loss :  5.262387275695801
step :  26 loss :  4.8788948059082

---

In [64]:
# Set up data, model and optimizer for the first entry of plc_lst only.
x_single, y_single = xs[plc_lst[0]], ys[plc_lst[0]]

'''Initialization '''
n_timesteps = 30  # number of time steps per input window
# number of parallel input columns / number of predicted columns
n_features, output_dim = x_single.shape[1], y_single.shape[1]

# Turn the feature and target frames into (window, target) arrays.
X, y = split_sequences(x_single, y_single, n_timesteps, n_features, output_dim)
print(X.shape, y.shape)

# Build the network on the GPU together with its loss, optimizer and LR schedule.
mv_net = MV_LSTM(n_features, n_timesteps, output_dim).to('cuda')
criterion = torch.nn.MSELoss()  # reduction='sum' created huge loss value
optimizer = torch.optim.Adam(mv_net.parameters(), lr=1e-1)
scheduler = StepLR(optimizer, step_size=10, gamma=0.9)

train_episodes, batch_size = 100, 16

(2190, 30, 729) (2190, 6)


In [None]:
# Training loop for the single-site model configured in the previous cell.
mv_net.train()
for t in range(train_episodes):
    for b in range(0,len(X),batch_size):
        inpt = X[b:b+batch_size,:,:]
        target = y[b:b+batch_size]

        x_batch = torch.tensor(inpt,dtype=torch.float32).cuda()
        y_batch = torch.tensor(target,dtype=torch.float32).cuda()

        # Start every batch from a zero LSTM state sized to this batch
        # (the last batch may be smaller than batch_size).
        mv_net.init_hidden(x_batch.size(0))
        output = mv_net(x_batch)
        loss = criterion(output, y_batch)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Advance the LR schedule once per epoch; without this call StepLR never
    # decays the learning rate.
    scheduler.step()
    # Reports the loss of the last batch of the epoch only.
    print('step : ' , t , 'loss : ' , loss.item())