In [9]:
import pandas as pd
from pandas import datetime
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

import os
import warnings
warnings.filterwarnings('ignore')

plt.style.use('ggplot')

from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm

In [2]:
DATA_PATH = "../data" # 학습에 사용할 csv 파일이 저장된 폴더입니다.

df_brand = pd.read_csv(os.path.join(DATA_PATH, 'brand_keyword_cnt.csv'))
df_info = pd.read_csv(os.path.join(DATA_PATH, 'product_info.csv'))
df_sales = pd.read_csv(os.path.join(DATA_PATH, 'sales.csv'))
df_smp = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))
df_data = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))

In [3]:
m_data = df_data.melt(id_vars=['ID', '제품', '대분류', '중분류', '소분류', '브랜드', '쇼핑몰'],
                var_name='ds', value_name='y', ignore_index=True)
m_data

Unnamed: 0,ID,제품,대분류,중분류,소분류,브랜드,쇼핑몰,ds,y
0,SAMPLE_00000,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,S001-00001,2022-01-01,0
1,SAMPLE_00001,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,2022-01-01,0
2,SAMPLE_00002,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,2022-01-01,0
3,SAMPLE_00003,B002-00002-00003,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,2022-01-01,0
4,SAMPLE_00004,B002-00002-00004,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00010,2022-01-01,0
...,...,...,...,...,...,...,...,...,...
13840221,SAMPLE_28889,B002-03798-00046,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03798,S001-00001,2023-04-24,0
13840222,SAMPLE_28890,B002-03799-00002,B002-C001-0003,B002-C002-0008,B002-C003-0042,B002-03799,S001-00001,2023-04-24,0
13840223,SAMPLE_28891,B002-03799-00003,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,S001-00001,2023-04-24,0
13840224,SAMPLE_28892,B002-03799-00004,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,S001-00001,2023-04-24,0


In [4]:
m_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13840226 entries, 0 to 13840225
Data columns (total 9 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   ID      object
 1   제품      object
 2   대분류     object
 3   중분류     object
 4   소분류     object
 5   브랜드     object
 6   쇼핑몰     object
 7   ds      object
 8   y       int64 
dtypes: int64(1), object(8)
memory usage: 950.3+ MB


In [None]:
train = pd.DataFrame(m_data.loc[m_data['ID'] == 'SAMPLE_00000'].loc[:, ['ds', 'y']])
train['ds'] = pd.to_datetime(train['ds'])
train

In [None]:
# def make_train(df):
#         data_list = [] 
#         for code in tqdm(df['ID'].unique()):
#                 d = df[df['ID'] == code].reset_index().drop(['index','ID'], axis=1).sort_values('ds')
#                 data_list.append(d)

#         make = pd.DataFrame(data_list)
#         ds_data.index = make.ds
#         ts_data = ds_data.drop('ds', axis=1)
#         return ts_data
# ts_data = make_train(m_data)
# ts_data

In [None]:
# 1차 차분
ts_diff = ts - ts.shift()
plt.figure(figsize=(22,8))
plt.plot(ts_diff)
plt.title("differencing method")
plt.xlabel("ds")
plt.ylabel("differencing y")
plt.show

In [None]:
# 2차 차분 진행
ts_diff = ts - ts.shift()
plt.figure(figsize=(22,8))
plt.plot(ts_diff)
plt.title("differencing method")
plt.xlabel("ds")
plt.ylabel("differencing y")
plt.show

In [None]:
# 모델 학습과 예측 결과, 그래프 오류 보임
# fit model
model = ARIMA(ts, order=(1,1,1))
model_fit = model.fit() # 최신버전에서는 disp 지원하지 않음

# predict
start_index = datetime(2023, 1, 1)
end_index = datetime(2023, 4, 4)
forecast = model_fit.predict(start=start_index, end=end_index, typ='levels')

# visualization
plt.figure(figsize=(22,8))
plt.plot(train.ds, train.y, label = 'original')
plt.plt(forecast, label='predicted')
plt.xlabel("ds")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# 정상성을 만족하는 차분된 데이터로 ACF, PACF 그래프 그려서 p,q 결정

import statsmodels.api as sm

fig = plt.figure(figsize=(20,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(ts_diff[1:], lags=20, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(ts_diff[1:], lags=20, ax=ax2)

- 그래프 해석 : 둘 다 점점 작아지면 ARIMA에 적합
- p : ACF그래프 모양이 점점 작아지고, PACF그래프에서 p개 바 뒤에 그래프가 끊긴다면 p를 파라미터로 사용, 마이너스 값으로 갑자기 떨어질 경우에는 AR모델이 적합한 것이고 p값을 AR모델 파라미터로 사용하면 됨
- 절단점이란 어느 지점부터 색칠된 구간 안으로 데이터가 포함되는지를 말하는 지점
- 해당 그래프에서는 lag=2 일 것 같다. 그러면 lag-1인 ARMA(1) 모델을 이용하면 좋지 않을까 추측가능
- 또는 ARIMA(1,1,1) 모델을 활용할 수 있을 듯
- 실제 예측해야 하는 기간 : 23.04.25 ~ 23.05.15

In [None]:
# ACF그패프 정상성 판단 -> 0으로 빠르게 수렴

fig = plt.figure(figsize=(20,10))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resi, lags=20, ax=ax1)

In [None]:
forecast_df = pd.DataFrame(forecast)
forecast_df

In [5]:
def train_test_split(train, date :str):

    val = train[train.ds >= date]
    train = train[train.ds < date]

    val.index = val.ds
    val_ts = val.drop('ds', axis=1)

    train.index = train.ds
    train_ts = train.drop('ds', axis=1)

    return train_ts, val_ts

In [None]:
train = pd.DataFrame(m_data.loc[m_data['ID'] == 'SAMPLE_00000'].loc[:, ['ds', 'y']])
train['ds'] = pd.to_datetime(train['ds'])
train

In [None]:
model = ARIMA(samsung_train_df.price.values, order = (2,1,2))
model_fit = model.fit(trend = 'c', full_output = True, disp = True)
print(model_fit.summary())

In [14]:
data_list = [] 

for code in tqdm(m_data['ID'].unique()):
    
    # ID에 해당하는 데이터프레임 가져오기
    train = pd.DataFrame(m_data.loc[m_data['ID'] == code].loc[:, ['ds', 'y']])
    train['ds'] = pd.to_datetime(train['ds'])
    
    # train, validation 데이터 분리해서 timeseries로 가져오기
    train_ts, val_ts = train_test_split(train, '2023-01-01')

    # 1차 차분 진행
    train_ts_diff = train_ts - train_ts.shift()

    # fit model
    model = ARIMA(train_ts['y'], order=(1,1,1))
    model_fit = model.fit() # 최신버전에서는 disp 지원하지 않음
    
    # try:
    #     model_fit = statsmodels.api.tsa.ARIMA(train, order=(1,1,1)).fit()
    # except:
    #     pass
    
    # predict
    start_index = datetime(2023, 1, 1)
    end_index = datetime(2023, 5, 15)
    forecast = model_fit.predict(start=start_index, end=end_index, typ='levels') 
    

    # 예측값이 생성된 forecast는 dataframe 변경해서 0이하인 값은 0으로 대체하기
    forecast_df = pd.DataFrame(forecast)
    # forecast_df.predicted_mean = np.where(forecast_df.predicted_mean < 0, 0, forecast_df.predicted_mean)

    # 데이터프레임 형식으로 concat하기
    df = forecast_df.T
    df = df.round(0).astype(int)
    data_list.append(df)

predict_df = pd.concat(data_list, ignore_index=True)
predict_df


  0%|          | 0/28894 [00:00<?, ?it/s]


NotImplementedError: 
statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been removed in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and statsmodels.tsa.SARIMAX.

statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained. It also offers alternative specialized
parameter estimators.


In [8]:
data_list = [] 

for code in m_data['ID'].unique():
    
    # ID에 해당하는 데이터프레임 가져오기
    train = m_data.loc[m_data['ID'] == code].loc[:, ['ds', 'y']]
    
    # train, validation 데이터 분리해서 timeseries로 가져오기
    train_ts, val_ts = train_test_split(train, '2023-01-01')

    # 1차 차분 진행
    train_ts_diff = train_ts - train_ts.shift()

    # fit model
    model = ARIMA(train_ts, order=(1,1,1))
    model_fit = model.fit() # 최신버전에서는 disp 지원하지 않음

    # predict
    start_index = datetime(2023, 4, 25)
    end_index = datetime(2023, 5, 15)
    forecast = model_fit.predict(start=start_index, end=end_index, typ='levels') 

    # 예측값이 생성된 forecast는 dataframe 변경해서 0이하인 값은 0으로 대체하기
    forecast_df = pd.DataFrame(forecast)
    forecast_df.predicted_mean = np.where(forecast_df.predicted_mean < 0, 0, forecast_df.predicted_mean)

    # 데이터프레임 형식으로 concat하기
    df = forecast_df.T
    df = df.round(0).astype(int)
    data_list.append(df)

predict_df = pd.concat(data_list, ignore_index=True)
predict_df


NotImplementedError: 
statsmodels.tsa.arima_model.ARMA and statsmodels.tsa.arima_model.ARIMA have
been removed in favor of statsmodels.tsa.arima.model.ARIMA (note the .
between arima and model) and statsmodels.tsa.SARIMAX.

statsmodels.tsa.arima.model.ARIMA makes use of the statespace framework and
is both well tested and maintained. It also offers alternative specialized
parameter estimators.
