# 정리노트 - 시계열 셀프 모의고사(국제 철광석 시세)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the iron-ore price records. The code below relies on a datetime
# "date" column (it is accessed through the .dt accessor).
df = pd.read_json("data/ironOrePriceJSON.txt")
df["year"] = df["date"].dt.year.astype("str")
# Zero-pad the month to two digits in a single vectorized step, addressing
# the column by name rather than looping over rows with a positional
# column index (which silently breaks if the column layout changes).
df["month"] = df["date"].dt.month.astype("str").str.zfill(2)

# "YYYY-MM" key used to bucket the observations by calendar month.
df["ym"] = df["year"] + "-" + df["month"]
display(df)

In [None]:
# Average the price inside each "YYYY-MM" bucket.
df_gr = df.groupby(by="ym")
# Aggregate only the "price" column: df also carries the string columns
# "year" and "month", and taking the mean of those raises a TypeError on
# pandas >= 2.0, where numeric_only defaults to False. Selecting with a
# list keeps the result a single-column DataFrame, so mean_by_ym["price"]
# and mean_by_ym.values.squeeze() both work.
mean_by_ym = df_gr[["price"]].mean()
# Turn the "YYYY-MM" labels into a DatetimeIndex (first day of each month).
mean_by_ym.index = pd.to_datetime(mean_by_ym.index, format="%Y-%m")
display(mean_by_ym)

# Line chart of the monthly mean price over time.
plt.figure(dpi=150)
plt.plot(mean_by_ym.index, mean_by_ym["price"])
plt.show()

In [None]:
import statsmodels.api as sm

# Correlograms of the undifferenced monthly series: ACF on the left panel,
# PACF on the right, both up to lag 30.
raw_values = mean_by_ym.values.squeeze()
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
acf_axis, pacf_axis = ax
fig.suptitle("RawData")
sm.graphics.tsa.plot_acf(raw_values, ax=acf_axis, lags=30)
sm.graphics.tsa.plot_pacf(raw_values, ax=pacf_axis, lags=30)

In [None]:
# First-order differencing of the monthly mean price.
# diff() returns a new Series, so no defensive copy of mean_by_ym is needed;
# the leading NaN produced by the difference is dropped.
diff_data = mean_by_ym["price"].diff().dropna()
print(diff_data)
plt.plot(diff_data, "orange", label="diff_data(Stationary)")
plt.grid()
plt.legend()

# ACF (left) and PACF (right) of the differenced series, up to lag 30.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
# This figure shows the differenced series, not the raw data, so it is
# titled accordingly (the previous "RawData" title was a copy-paste slip).
fig.suptitle("Differenced Data")
sm.graphics.tsa.plot_acf(diff_data.values.squeeze(), lags=30, ax=ax[0])
sm.graphics.tsa.plot_pacf(diff_data.values.squeeze(), lags=30, ax=ax[1])

# The series appears stationary. Three or four lags spike out, but they can
# be ignored.

In [None]:
from sklearn.model_selection import train_test_split
import itertools
import warnings
warnings.filterwarnings("ignore")

# Chronological hold-out: shuffle=False preserves the time order, so the
# test set is taken from the end of the series (test_size=0.05).
train_data, test_data = train_test_split(
    mean_by_ym, shuffle=False, test_size=0.05
)

# Normally p, d, q would be read off the ACF/PACF plots above; here every
# candidate order is fitted instead and the candidates are compared by AIC.
p = range(0, 3)
d = range(1, 2)  # a single value: d is fixed at 1
q = range(0, 3)
# Cartesian product of the three ranges -> every (p, d, q) combination.
pdq = list(itertools.product(p, d, q))
aic = []
for candidate_order in pdq:
    candidate_fit = sm.tsa.ARIMA(train_data.values, order=candidate_order).fit()
    print(f"ARIMA Order : {candidate_order} -> AIC : {candidate_fit.aic}")
    aic.append({"order": candidate_order, "aic": candidate_fit.aic})
# Rank the candidates from lowest to highest AIC. reset_index() is called
# without drop=True, so the previous row labels stay as a leading "index"
# column (columns: index, order, aic).
result_by_aic = (
    pd.DataFrame(aic)
    .sort_values(by="aic", ascending=True)
    .reset_index()
)
result_by_aic

In [None]:
# Refit the candidate with the lowest AIC and inspect its summary.
# result_by_aic is sorted by ascending AIC, so its first row is the best
# candidate. The order is selected by column name instead of position, so
# it does not depend on the extra "index" column that reset_index() inserts.
model_opt = sm.tsa.ARIMA(train_data.values, order=result_by_aic["order"].iloc[0])
model_opt_fit = model_opt.fit()
model_opt_fit.summary()

In [None]:
import datetime
import numpy as np
from sklearn.metrics import r2_score

# Forecast as many steps as there are held-out test rows.
prediction_steps = len(test_data)
if hasattr(model_opt_fit, "get_forecast"):
    # State-space ARIMA results (the class exposed as sm.tsa.ARIMA in current
    # statsmodels): forecast() returns only the point forecasts, so the mean
    # and the confidence bounds are read from get_forecast().
    prediction = model_opt_fit.get_forecast(prediction_steps)
    predicted_value = prediction.predicted_mean
    conf_bounds = np.asarray(prediction.conf_int())
else:
    # Legacy ARIMA results: forecast() returns (mean, stderr, conf_int).
    prediction = model_opt_fit.forecast(prediction_steps)
    predicted_value, conf_bounds = prediction[0], prediction[2]
# Column 0 of the confidence bounds is the lower limit, column 1 the upper.
predicted_lb = conf_bounds[:, 0]
predicted_ub = conf_bounds[:, 1]
predict_index = list(test_data.index)
# Goodness of fit of the forecast against the held-out observations.
r2 = r2_score(test_data, predicted_value)

# Overlay the forecast and its interval on the full monthly series.
plt.figure(dpi=150)
plt.plot(mean_by_ym.index, mean_by_ym["price"], label="price")
plt.plot(predict_index, predicted_value, label = "Prediction")
plt.fill_between(predict_index, predicted_lb, predicted_ub, 
                 color="k", alpha=0.1, label="0.95 Prediction Interval")
plt.legend(loc="upper left")
# The selected order is looked up by column name rather than by position.
plt.suptitle("ARIMA {} Prediction Results (r2_score : {:4.2f})".format(
    result_by_aic["order"].iloc[0], r2))

In [None]:
# SARIMA - auto_arima performs the model selection in a single call.
from pmdarima import auto_arima

# Search space handed to auto_arima, split into non-seasonal and seasonal
# terms followed by the search/reporting options.
search_space = {
    # Non-seasonal part: d is pinned to 1; p and q start at 0, capped at 3.
    "start_p": 0,
    "d": 1,
    "start_q": 0,
    "max_p": 3,
    "max_d": 3,
    "max_q": 3,
    # Seasonal part: D is pinned to 1; P and Q start at 0, capped at 3.
    "start_P": 0,
    "D": 1,
    "start_Q": 0,
    "max_P": 3,
    "max_D": 3,
    "max_Q": 3,
    # NOTE(review): m is the seasonal period in observations. The series is
    # aggregated per month, so confirm that 4 (rather than 12) is intended.
    "m": 4,
    "seasonal": True,
    # Print each candidate while searching; ignore failing fits and warnings.
    "trace": True,
    "error_action": "ignore",
    "suppress_warnings": True,
    # stepwise=False turns off the stepwise search heuristic.
    "stepwise": False,
}
aa_model = auto_arima(train_data, **search_space)
aa_model.summary()

In [None]:
# Refit the (order, seasonal_order) chosen by auto_arima as a statsmodels
# SARIMAX model and inspect its summary.
sarima_params = aa_model.get_params()  # fetched once and reused below
model_opt = sm.tsa.SARIMAX(train_data.values, 
                       order=sarima_params["order"], 
                       seasonal_order=sarima_params["seasonal_order"])
model_opt_fit = model_opt.fit()
# A bare expression in the middle of a cell is not rendered, so the summary
# is displayed explicitly.
display(model_opt_fit.summary())

# Forecast as many steps as there are held-out test rows.
prediction = model_opt_fit.get_forecast(len(test_data))
predicted_value = prediction.predicted_mean
# conf_int() holds the lower limit in column 0 and the upper limit in
# column 1; it is computed once and sliced for both bounds.
conf_bounds = prediction.conf_int()
predicted_lb = conf_bounds[:,0]
predicted_ub = conf_bounds[:,1]
predict_index = list(test_data.index)
# Goodness of fit of the forecast against the held-out observations.
r2 = r2_score(test_data, predicted_value)

# Overlay the forecast and its interval on the full monthly series.
plt.figure(dpi=150)
plt.plot(mean_by_ym.index, mean_by_ym["price"], label="price")
plt.plot(predict_index, predicted_value, label = "Prediction")
plt.fill_between(predict_index, predicted_lb, predicted_ub, color="k", 
                 alpha=0.1, label="0.95 Prediction Interval")
plt.legend(loc="upper left")
plt.suptitle("SARIMA {} {}, Prediction Results (r2_score : {:4.2f})".format(
    sarima_params["order"], sarima_params["seasonal_order"], r2))