In [1]:
# 라이브러리 임포트
import os
import time
import json
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
#from fbprophet import Prophet
import pickle
import copy
from tqdm.auto import tqdm

In [2]:
from darts.models import *
from darts.models.forecasting import gradient_boosted_model
import darts.utils.timeseries_generation as tg
from darts.utils.likelihood_models import GaussianLikelihood
from darts.datasets import EnergyDataset
from darts.utils.missing_values import fill_missing_values
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.timeseries import TimeSeries
import warnings

warnings.filterwarnings("ignore")

  warn(f"Failed to load image Python extension: {e}")


In [3]:
%%time
submit = json.load(open('./data/sample_submission/sample_submission.json', 'r', encoding='utf8')) 
df_dic = pickle.load(open('./mydata/df_dic.plk','rb'))
plc_lst = pickle.load(open('./mydata/plc_lst.plk','rb'))
fct_lst = ['pH', 'COD', 'SS', 'N', 'P', 'T']

CPU times: user 769 ms, sys: 140 ms, total: 910 ms
Wall time: 910 ms


In [4]:
%%time
''' fillna : 단순 missing value 채움'''
for plc in plc_lst:
    df = df_dic[plc]
    df = df.fillna(method='ffill')
    df = df.fillna(method='bfill')    
    df_dic[plc] = df.dropna()
    
    if df_dic[plc].isna().any().any():
        print(plc)


CPU times: user 2.32 s, sys: 7.66 ms, total: 2.33 s
Wall time: 2.33 s


In [4]:
# Prophet 예측을 위한 dataframe 포맷 제작
future = pd.DataFrame([str(x)[:10] for x in list(pd.date_range(start='2018-02-01', end='2019-12-31', inclusive="both"))], columns=['ds'])
future['y'] = np.nan
#future

---
# 예측 시작

In [16]:
dfall_comm= pickle.load( open('./mydata/dfall_comm.plk','rb'))
df_grp = dfall_comm.groupby('loc')
df_temp = df_grp.get_group(i)

---
## tcn

In [5]:
from darts.models.forecasting import tcn_model

In [None]:
# 추론 : kalman
start_time = time.time()

for i in tqdm(plc_lst):
    plc_df = df_dic[i]
    
    for j in fct_lst: # 피쳐 리스트
        fct_df = plc_df[['ds', j]]   # 날짜 - 피쳐1 조합임.
        
        fct_df = fct_df.rename(columns={j:'y'}) # 피쳐1을 y로 바꾼다.
        fct_df['ds'] = fct_df.ds.apply(lambda x : datetime.strptime(x , '%Y%m%d'))
        fct_df = fct_df.dropna()
        fct_df['y'] = pd.to_numeric(fct_df.y)
        
        df_align = pd.DataFrame([ x for x in list(pd.date_range( end = '2018-01-31', periods = len(fct_df), inclusive="both"))], columns=['ds'])
        df_align['y'] = fct_df.y.values
        
        # TimeSeris 로 변환 하기 
        dfts = TimeSeries.from_dataframe(df_align,  time_col = 'ds' , value_cols  = ['y'], fill_missing_dates=True  , freq='D')
        
        diff = datetime(2018,2,1) - df_align.loc[0].ds
        if diff.days > 720:
            lag = 700
        else:
            lag = diff.days - 10
            
        my_model = tcn_model.TCNModel(
            input_chunk_length = 500,
            output_chunk_length = 1,
            n_epochs=50,
            kernel_size =3,
            num_filters = 12,
            dropout = 0.3,
            batch_size = 64,
            pl_trainer_kwargs  = {"accelerator": "gpu", "gpus": -1, "auto_select_gpus": True} ,
            dilation_base=2,
            weight_norm=True,
            random_state=2022,
            
            
        )
        
        my_model.fit(dfts)
        pred_series = my_model.predict( n = 699 )
        
        forecast = pred_series.pd_dataframe()
        forecast['ds'] = forecast.index
        forecast['ds'] = forecast['ds'].apply(lambda x : str(x).split(' ')[0].split('-')[0]+str(x).split(' ')[0].split('-')[1]+str(x).split(' ')[0].split('-')[2])
        forecast.columns = ['yhat', 'ds']
        forecast = forecast.reset_index(drop=True)
        
        for idx,k in enumerate(submit[i].keys()):
            submit[i][k][j] = np.round(forecast.yhat[idx],6)
      
        
end_time = time.time()
print('소요시간 :', (end_time - start_time)/60, 'minutes')

---

In [5]:
from darts.models.forecasting import nhits

In [6]:
# 추론 : rf
start_time = time.time()

for i in tqdm(plc_lst):
    plc_df = df_dic[i]
    
    for j in fct_lst: # 피쳐 리스트
        fct_df = plc_df[['ds', j]]   # 날짜 - 피쳐1 조합임.
        
        fct_df = fct_df.rename(columns={j:'y'}) # 피쳐1을 y로 바꾼다.
        fct_df['ds'] = fct_df.ds.apply(lambda x : datetime.strptime(x , '%Y%m%d'))
        fct_df = fct_df.dropna()
        fct_df['y'] = pd.to_numeric(fct_df.y)
        
        df_align = pd.DataFrame([ x for x in list(pd.date_range( end = '2018-01-31', periods = len(fct_df), inclusive="both"))], columns=['ds'])
        df_align['y'] = fct_df.y.values
        
        # TimeSeris 로 변환 하기 
        dfts = TimeSeries.from_dataframe(df_align,  time_col = 'ds' , value_cols  = ['y'], fill_missing_dates=True  , freq='D')
        
        diff = datetime(2018,2,1) - df_align.loc[0].ds
        if diff.days > 720:
            lag = 700
        else:
            lag = diff.days - 10
            
        my_model = nhits.NHiTS(
            input_chunk_length = lag,
            output_chunk_length = 1,

            n_epochs=20,
            nr_epochs_val_period=1,
            batch_size=256,
            torch_device_str="cuda",
            random_state=2022
        )
        
        my_model.fit(dfts)
        pred_series = my_model.predict( n = 699 )
        
        forecast = pred_series.pd_dataframe()
        forecast['ds'] = forecast.index
        forecast['ds'] = forecast['ds'].apply(lambda x : str(x).split(' ')[0].split('-')[0]+str(x).split(' ')[0].split('-')[1]+str(x).split(' ')[0].split('-')[2])
        forecast.columns = ['yhat', 'ds']
        forecast = forecast.reset_index(drop=True)
        
        for idx,k in enumerate(submit[i].keys()):
            submit[i][k][j] = np.round(forecast.yhat[idx],6)
      
        
end_time = time.time()
print('소요시간 :', (end_time - start_time)/60, 'minutes')

  0%|          | 0/545 [00:00<?, ?it/s]

[2022-06-14 21:56:13,940] INFO | darts.models.forecasting.nhits | (N-HiTS): Using automatic kernel pooling size: ((350,), (18,), (1,)).
[2022-06-14 21:56:13,940] INFO | darts.models.forecasting.nhits | (N-HiTS): Using automatic kernel pooling size: ((350,), (18,), (1,)).
2022-06-14 21:56:13 darts.models.forecasting.nhits INFO: (N-HiTS): Using automatic kernel pooling size: ((350,), (18,), (1,)).
[2022-06-14 21:56:13,940] INFO | darts.models.forecasting.nhits | (N-HiTS):  Using automatic downsampling coefficients: ((1,), (1,), (1,)).
[2022-06-14 21:56:13,940] INFO | darts.models.forecasting.nhits | (N-HiTS):  Using automatic downsampling coefficients: ((1,), (1,), (1,)).
2022-06-14 21:56:13 darts.models.forecasting.nhits INFO: (N-HiTS):  Using automatic downsampling coefficients: ((1,), (1,), (1,)).
[2022-06-14 21:56:13,941] INFO | darts.models.forecasting.torch_forecasting_model | Train dataset contains 2120 samples.
[2022-06-14 21:56:13,941] INFO | darts.models.forecasting.torch_forec

MisconfigurationException: GPUAccelerator can not run on your system since the accelerator is not available. The following accelerator(s) is available and can be passed into `accelerator` argument of `Trainer`: ['cpu'].

In [None]:
import torch 
torch.cuda.device_count()

In [None]:
torch.cuda.is_available()

In [None]:
%%time
# 사업장별 dataframe 제작
df_dic = {}
plc_lst = list(train.keys())
fct_lst = ['pH', 'COD', 'SS', 'N', 'P', 'T']

for i in plc_lst:
    plc_df = pd.DataFrame(columns={'ds', 'pH', 'COD', 'SS', 'N', 'P', 'T'})
    plc_df = plc_df[['ds', 'pH', 'COD', 'SS', 'N', 'P', 'T']]
    
    date_lst = list(train[i].keys())
    plc_df.loc[:, 'ds'] = date_lst

    for index, j in enumerate(date_lst):
        for k in fct_lst:
            try:
                plc_df.loc[index, k] = train[i][j][k]
            except:
                pass
                    
    df_dic[i] = plc_df