In [194]:
import itertools
import warnings
from datetime import datetime, timedelta # 시간 데이터 처리"

import matplotlib.pyplot as plt # 데이터 시각화
import numpy as np # 데이터 전처리
import pandas as pd # 데이터 전처리
from fbprophet import Prophet
from statsmodels.tsa.arima_model import ARIMA # ARIMA 모델
from tqdm import tqdm

# Suppress every warning for the rest of the session (this also hides
# deprecation warnings raised by the libraries used in this notebook).
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inside the notebook output.
%matplotlib inline

In [195]:
def fill_nan(df: pd.DataFrame) -> pd.DataFrame:
    """Rewrite every column except the first, span by span, in place.

    For each column the function walks from the first non-null row onwards.
    Each time it meets a non-null value at ``dt`` it looks at the span
    ``from_dt .. dt`` (both ends inclusive, where ``from_dt`` is the previous
    non-null row) and assigns ``from_val / length`` to every row of that span.
    ``from_val`` is the value read at the start of the span and ``length`` is
    the number of rows in the span minus one.

    The frame passed in is modified and the same object is returned.

    Raises
    ------
    IndexError
        If a processed column contains no non-null value (``.index[0]`` on an
        empty selection).

    NOTE(review): the value that is spread is the one at the START of the span,
    and the row at ``dt`` is overwritten as well. If the intent was to spread
    the reading that follows a gap over that gap, this does something else --
    confirm the intended behaviour before relying on it.
    NOTE(review): the column is written to while ``.items()`` is still
    iterating over a slice of it; whether the iterator sees those writes
    depends on pandas view/copy behaviour -- confirm.
    """
    # The first column is skipped; presumably it is a timestamp/id column -- TODO confirm.
    for col in df.columns[1:]:
        # Label of the first row that holds an actual value in this column.
        start_dt = df.loc[df.loc[:, col].isnull() == False].index[0]
        from_dt = start_dt
        from_val = df.loc[start_dt, col]

        for dt, val in df.loc[start_dt:, col].items():
            if not pd.isna(val):
                # Number of steps between the previous non-null row and this one
                # (label slices include both endpoints, hence the -1).
                length = len(df.loc[from_dt: dt, col]) - 1
                # length == 0 only when dt == from_dt (the very first row visited).
                if length != 0:
                    for i in df.loc[from_dt: dt, col].index:
                        df.loc[i, col] = from_val / length

                # Start the next span at this row, remembering the value read from the iterator.
                from_dt = dt
                from_val = val
    return df


def to_panel_format(df):
    """Reshape a wide meter table into a long "panel" table.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with a ``'Time'`` column and one column per meter ID.
        The frame is not modified (the previous implementation overwrote its
        ``'Time'`` column in place).

    Returns
    -------
    pandas.DataFrame
        One row per (meter, timestamp) pair that has no missing value, indexed
        by ``'time'`` (parsed with ``pd.to_datetime``) with columns
        ``'place_id'`` (the source column name) and ``'target'`` (the cell
        value). Rows are ordered meter by meter, in the original row order.
    """
    # Parse the timestamps on a derived frame so the caller's data stays untouched.
    wide = df.assign(Time=pd.to_datetime(df['Time']))

    # melt() stacks the meter columns one after another, which is the same
    # ordering the former column-by-column / row-by-row append loop produced.
    ret = wide.melt(id_vars='Time', var_name='place_id', value_name='target')
    ret = ret.rename(columns={'Time': 'time'})[['place_id', 'time', 'target']]

    ret = ret.dropna()  # drop rows with a missing value
    ret = ret.set_index('time')  # timestamps become the index
    return ret


def get_optimal_params(y, pdq=None):
    """Return the ARIMA ``(p, d, q)`` order with the lowest AIC on ``y``.

    Parameters
    ----------
    y : array-like
        Series handed to ``ARIMA`` for every candidate order.
    pdq : iterable of (p, d, q) tuples, optional
        Candidate orders to try. Defaults to every combination of 0 and 1 for
        p, d and q. (The function used to read a module-level ``pdq`` that is
        never defined, which raised ``NameError``.)

    Returns
    -------
    tuple
        The candidate whose fit reported the smallest finite AIC. On an exact
        tie the candidate tried last wins.

    Raises
    ------
    ValueError
        If no candidate could be fitted.
    """
    if pdq is None:
        pdq = itertools.product(range(0, 2), repeat=3)

    best_aic = None
    best_order = None
    for param in pdq:
        try:
            results_ARIMA = ARIMA(y, order=param).fit(disp=-1)
            aic = results_ARIMA.aic
        except Exception:
            # Best effort: some orders simply cannot be fitted on a given
            # series. Unlike a bare ``except``, KeyboardInterrupt still stops the run.
            continue

        if not np.isfinite(aic):
            continue
        if best_aic is None or aic <= best_aic:
            best_aic = aic
            best_order = param

    if best_order is None:
        raise ValueError('ARIMA could not be fitted with any candidate (p, d, q) order')
    return best_order

def prediction(df, columns=None):
    """Forecast hourly, daily and monthly consumption for every meter with ARIMA.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table indexed by timestamp with ``'place_id'`` and ``'target'``
        columns (the layout returned by ``to_panel_format``).
    columns : list of str, optional
        Output columns to keep, in order. Defaults to the columns of the
        notebook-level ``submission`` frame, which must then exist.

    Returns
    -------
    dict
        Maps each meter ID to a one-row DataFrame holding the selected columns.
        Available columns are ``X2018_7_1_<1..24>h``, ``X2018_7_<1..10>_d``,
        ``X2018_<7..11>_m`` and ``meter_id``.
    """
    p = d = q = range(0, 2)
    pdq = list(itertools.product(p, d, q))

    if columns is None:
        columns = submission.columns.tolist()
    columns = list(columns)

    # Days per month, July .. November 2018; the monthly forecasts are sums of
    # the daily forecast over these consecutive windows.
    month_lengths = (
        ('X2018_7_m', 31),
        ('X2018_8_m', 31),
        ('X2018_9_m', 30),
        ('X2018_10_m', 31),
        ('X2018_11_m', 30),
    )
    horizon_days = sum(n_days for _, n_days in month_lengths)  # 153

    ret = {}

    for key in df['place_id'].unique():
        # Resample only the numeric target so the string ID column is never summed.
        target = df.loc[df['place_id'] == key, 'target']
        temp_1h = target.resample('1h').sum()  # hourly totals
        temp_1day = target.resample('D').sum()  # daily totals

        # A fresh record per meter: nothing can leak from the previous meter.
        row = {}

        # Hourly forecast: the next 24 hours, using the AIC-optimal order.
        model = ARIMA(temp_1h, order=get_optimal_params(temp_1h, pdq))
        results_ARIMA = model.fit(disp=-1)
        fcst = results_ARIMA.forecast(24)
        for i in range(24):
            row['X2018_7_1_'+str(i+1)+'h'] = fcst[0][i]

        # Daily forecast: one fit and one forecast over the whole horizon; the
        # first 10 steps are the daily targets, the monthly sums use all of it.
        model = ARIMA(temp_1day, order=get_optimal_params(temp_1day, pdq))
        results_ARIMA = model.fit(disp=-1)
        daily_fcst = results_ARIMA.forecast(horizon_days)[0]
        for i in range(10):
            row['X2018_7_'+str(i+1)+'_d'] = daily_fcst[i]

        # Monthly forecast: sum the daily forecast over each month's window.
        start = 0
        for name, n_days in month_lengths:
            row[name] = np.sum(daily_fcst[start:start + n_days])
            start += n_days

        row['meter_id'] = key
        # Indexing (not ``columns=``) so an unknown column name raises KeyError.
        ret[key] = pd.DataFrame([row])[columns]
        print(key)

    print('---- Modeling Done ----')
    return ret

In [196]:
# train = pd.read_csv("train.csv")
# Wide consumption table: a 'Time' column plus one column per meter.
test = pd.read_csv("enc_test.csv")
# Submission frame, keyed by meter so forecasts can be written with .loc[meter, column].
submission = pd.read_csv("submission_1002.csv").set_index('meter_id')

In [197]:
# Parse the timestamps and use them as the row index in one pass.
test = test.assign(Time=pd.to_datetime(test['Time'])).set_index('Time')

In [198]:
# Keep only the observations from 2017-08-25 onwards.
test = test.loc['2017-08-25 00:00:00':]

# 1. Aggregate by Time

In [199]:
# Work on a copy so `test` keeps only the meter columns.
temp = test.copy()
# Hour of day (0-23) of every row, used below to split each meter into 24 series.
temp['hour'] = [stamp.hour for stamp in temp.index]

In [None]:
# One Prophet model per (meter, hour of day): each model is fitted on that
# meter's readings at hour `t` and predicts the same hour on the following day.
for col in tqdm(test.columns):
    for t in range(24):
        data = temp.loc[temp['hour'] == t, col].copy()
        df = pd.DataFrame({
            'ds': data.index,
            # `.values` replaces `Series.data`, which newer pandas versions no
            # longer provide; the raw array also keeps `df` on a 0..n-1 index.
            'y': data.values,
        })

        ## Set model setting
        m = Prophet(
            growth='linear',
            changepoint_range=0.95,
            changepoint_prior_scale=0.1,
        )
        m.fit(df)

        ## Make dataframe for get a predicted value
        # The series has one point per day, so the next step is one day ahead.
        # (freq='T' asked for one MINUTE after the last observation.)
        future = m.make_future_dataframe(periods=1, freq='D')

        ## Predict
        forecast = m.predict(future)

        ## Save the data
        # The only future row is the last one.
        # NOTE(review): the ARIMA baseline in this notebook names these columns
        # 'X2018_7_1_<n>h' (no underscore before 'h'). If the submission file
        # uses that form, this assignment creates new columns instead of
        # filling the existing ones -- confirm against submission.columns.
        submission.loc[col, 'X2018_7_1_' + str(t + 1) + '_h'] = forecast['yhat'].iloc[-1]

#     ## Show a graph
#     fig1 = m.plot(forecast)
#     plt.show()








  0%|          | 0/200 [00:00<?, ?it/s]INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling d

# 2. Aggregate by Day

In [None]:
# Same history window as before, then one total per meter per calendar day.
test = test.loc['2017-08-25 00:00:00':]
test_D = test.resample('D').sum()

In [None]:
# One Prophet model per meter on the daily totals; the 10 rows appended after
# the history are written to the X2018_7_<day>_d columns.
for col in tqdm(test_D.columns):
    df = pd.DataFrame({'ds': test_D.index, 'y': test_D[col]})

    ## Set model setting
    daily_settings = dict(
        growth='linear',
        changepoint_range=0.95,
        changepoint_prior_scale=0.1,
        yearly_seasonality=True,
    )
    m = Prophet(**daily_settings)
    m.fit(df)

    ## Make dataframe for get a predicted value
    future = m.make_future_dataframe(periods=10)

    ## Predict
    forecast = m.predict(future)

    # Rows 0 .. len(df)-1 are the fitted history; the forecasts follow.
    first_future_row = len(df)
    for i in range(10):
        day_column = 'X2018_7_{}_d'.format(i + 1)
        submission.loc[col, day_column] = forecast.at[first_future_row + i, 'yhat']

#     ## Show a graph
#     fig1 = m.plot(forecast)
#     plt.show()

# 3. Aggregate by Month

In [None]:
# Same history window as before, then one total per meter per calendar month.
test = test.loc['2017-08-25 00:00:00':]
test_M = test.resample('M').sum()

In [None]:
# One Prophet model per meter on the monthly totals; the 5 rows appended after
# the history are written to the X2018_<month>_m columns (months 7 .. 11).
# NOTE(review): the history starts on 2017-08-25, so the first monthly total
# presumably covers only part of August 2017 -- confirm this is acceptable.
for col in tqdm(test.columns):
    df = pd.DataFrame({'ds': test_M.index, 'y': test_M[col]})

    ## Set model setting
    monthly_settings = dict(
        growth='linear',
        changepoint_range=0.8,
        changepoint_prior_scale=0.5,
    )
    m = Prophet(**monthly_settings)
    m.fit(df)

    ## Make dataframe for get a predicted value
    future = m.make_future_dataframe(periods=5, freq='M')

    ## Predict
    forecast = m.predict(future)

    # Rows 0 .. len(df)-1 are the fitted history; the forecasts follow.
    first_future_row = len(df)
    for i in range(5):
        month_column = 'X2018_{}_m'.format(i + 7)
        submission.loc[col, month_column] = forecast.at[first_future_row + i, 'yhat']

#     ## Show a graph
#     fig1 = m.plot(forecast)
#     plt.show()

In [None]:
# test = to_panel_format(test) # 데이터 형식 변겅
# agg = prediction(test) # 예측

# output1 = pd.concat(agg, ignore_index=False)
# output2 = output1.reset_index().drop(['level_0','level_1'], axis=1)
# output2['id'] = output2['meter_id'].str.replace('X','').astype(int)
# output2 =  output2.sort_values(by='id', ascending=True).drop(['id'], axis=1).reset_index(drop=True)
# output2.to_csv('sub_baseline.csv', index=False)

In [None]:
# Turn the meter index back into a regular 'meter_id' column before writing.
submission_out = submission.reset_index()
submission_out.to_csv('sub_fbprophet.csv', index=False)

In [None]:
# Show the first five rows of the filled-in submission.
submission.head(n=5)