# 정리노트 - 시계열 셀프 모의고사(국제 금시세)

In [None]:
import json
import pandas as pd

# Load the raw export. The context manager guarantees the file handle is
# closed even if reading raises.
with open("data/goldPrice.json", "r", encoding="UTF-8") as f:
    str_data = f.read()
json_data = json.loads(str_data)

# Each output row is assembled from consecutive records: a "1USD" record starts
# a fresh row and stores its TRD_BAS_EXRT; the next non-USD record stores its
# own TRD_BAS_EXRT under "WON" (and overwrites NTC_DT), then the row is appended.
# NOTE(review): this assumes every non-USD record is preceded by a "1USD"
# record; otherwise `this_data` is undefined or stale — confirm against the file.
data_list = []
for item in json_data["vector"]["data"]:
    fields = item["LIST"]
    is_usd = fields["CUR_NM"]["_value"] == "1USD"
    if is_usd:
        this_data = {}
    for k, v in fields.items():
        if k == "NTC_DT":
            this_data[k] = v["_value"]
        elif k == "TRD_BAS_EXRT":
            if "TRD_BAS_EXRT" in this_data:
                # Second rate seen for this row: the non-USD quote.
                this_data["WON"] = float(v["_value"])
            else:
                this_data[k] = float(v["_value"])
    if not is_usd:
        data_list.append(this_data)

df = pd.DataFrame(data_list)
# NOTE(review): 31.1034768 is presumably grams per troy ounce and 0.9999 a
# purity factor, converting the quote ratio to USD per ounce — TODO confirm.
df["GOLD_PRICE"] = df["WON"] / df["TRD_BAS_EXRT"] * (31.1034768 / 0.9999)
df.rename(columns={"NTC_DT": "DATE"}, inplace=True)
# Only DATE and GOLD_PRICE are kept; `columns=` already implies the column axis.
df.drop(columns=["TRD_BAS_EXRT", "WON"], inplace=True)
df

In [None]:
# Parse the date strings, then derive the calendar month both as a Period
# (YM_PRD) and as a timestamp at the first day of that month (YM_DT).
df["DATE"] = pd.to_datetime(df["DATE"])
df["YM_PRD"] = df["DATE"].dt.to_period(freq="M")
# Vectorized conversion: replaces a per-row loop that wrote str(period) through
# hard-coded column positions and then re-parsed the strings. The default
# how="start" yields the same month-start timestamps.
df["YM_DT"] = df["YM_PRD"].dt.to_timestamp()
df

In [None]:
# Average the gold price per month (YM_DT is the month-start timestamp).
# GOLD_PRICE is selected explicitly: `df` also holds datetime and Period
# columns, and pandas >= 2.0 no longer drops non-numeric columns silently in
# GroupBy.mean(). The double brackets keep the result a one-column DataFrame
# indexed by YM_DT.
df_ym = df.groupby(by="YM_DT")[["GOLD_PRICE"]].mean()
display(df_ym)

In [None]:
import matplotlib.pyplot as plt

# Draw the monthly-average frame on a fresh figure with a single axes.
# (Pass dpi=150 to plt.subplots for a higher-resolution rendering.)
fig, ax = plt.subplots()
df_ym.plot(ax=ax)

In [None]:
import statsmodels.api as sm
# Additive decomposition of the monthly gold price into the components
# returned by seasonal_decompose, drawn as stacked panels.
# NOTE(review): period=1 means one observation per cycle, so presumably no
# real seasonal pattern can be extracted — confirm whether 12 (monthly data)
# was intended.
decomposition = sm.tsa.seasonal_decompose(
    df_ym["GOLD_PRICE"],
    period=1,
    model="additive",
)
fig = decomposition.plot()
# Enlarge the figure so the stacked panels stay readable.
fig.set_size_inches(10, 15)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Split chronologically: shuffle=False keeps row order, and test_size=0.2
# reserves one fifth of the rows for the test set.
train_data, test_data = train_test_split(df_ym, shuffle=False, test_size=0.2)

# ACF (left) and PACF (right) of the undifferenced training series.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
acf_ax, pacf_ax = ax
fig.suptitle("RawData")
train_values = train_data.values.squeeze()
sm.graphics.tsa.plot_acf(train_values, lags=30, ax=acf_ax)
sm.graphics.tsa.plot_pacf(train_values, lags=30, ax=pacf_ax)

# Typical signature of a non-stationary series: the ACF decays slowly.
# A stationary series would cut off abruptly instead.

In [None]:
# Take the first difference of the training series and drop the leading NaN
# that diff() produces for the first row.
diff_train_data = train_data["GOLD_PRICE"].diff().dropna()
print(diff_train_data)

plt.plot(diff_train_data, "orange", label="diff_train_data(Stationary)")
plt.grid()
plt.legend()

In [None]:
# ACF (left) and PACF (right) of the first-differenced training series.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
# The title previously read "RawData" (copied from the raw-series figure),
# which mislabelled this plot of the differenced data.
fig.suptitle("Differenced Data")
diff_values = diff_train_data.values.squeeze()
sm.graphics.tsa.plot_acf(diff_values, lags=30, ax=ax[0])
sm.graphics.tsa.plot_pacf(diff_values, lags=30, ax=ax[1])

# The differenced series looks stationary. Three or four lags stick out,
# but they can be ignored.

In [None]:
import itertools
import warnings
# Suppress all warnings for the rest of the session so the grid-search output
# stays readable.
warnings.filterwarnings("ignore")

# Candidate orders: p, q in {0, 1, 2}; d fixed at 1.
p = range(0, 3)
d = range(1, 2)
q = range(0, 3)
pdq = list(itertools.product(p, d, q))  # every (p, d, q) combination
# Seasonal orders reuse the same combinations with a seasonal period of 4.
# NOTE(review): the series is aggregated monthly — confirm that 4 (rather
# than 12) is the intended seasonal period.
seasonal_pdq = [(x[0], x[1], x[2], 4) for x in pdq]

aic = []
for i in pdq:
    for j in seasonal_pdq:
        # Some parameter combinations cannot be fitted; skip those instead of
        # aborting the search. `except Exception` (rather than a bare
        # `except:`) lets KeyboardInterrupt still stop a long-running search.
        try:
            model = sm.tsa.SARIMAX(df_ym.values, order=i, seasonal_order=j)
            model_fit = model.fit()
        except Exception as err:
            print("SARIMA Order : {}{} -> skipped ({})".format(i, j, err))
            continue
        print("SARIMA Order : {}{} -> AIC : {}".format(i, j, model_fit.aic))
        aic_dict = {"pdq": i, "s-pdq": j, "aic": model_fit.aic}
        aic.append(aic_dict)

if not aic:
    # Without this guard the sort below fails with an opaque KeyError.
    raise RuntimeError("No SARIMA candidate could be fitted; nothing to rank by AIC.")

# Rank candidates by AIC, best (lowest) first. reset_index() keeps the old row
# labels as a leading "index" column, so "pdq" sits at column position 1 and
# "s-pdq" at position 2.
result_by_aic = pd.DataFrame(aic)
result_by_aic.sort_values(by="aic", ascending=True, inplace=True)
result_by_aic.reset_index(inplace=True)
result_by_aic

In [None]:
# Refit the lowest-AIC candidate (row 0 of the ranked table) and show its
# summary. Column positions 1 and 2 hold the (p, d, q) order and the seasonal
# order respectively.
best_order = result_by_aic.iloc[0, 1]
best_seasonal_order = result_by_aic.iloc[0, 2]
model_opt = sm.tsa.SARIMAX(
    df_ym.values,
    order=best_order,
    seasonal_order=best_seasonal_order,
)
model_opt_fit = model_opt.fit()
model_opt_fit.summary()

In [None]:
# Extend a copy of the monthly frame with 24 month-start rows
# (2021-10 through 2023-09) whose GOLD_PRICE is a 0.0 placeholder.
df_ym2 = df_ym.copy()

dt_index = pd.date_range(start="2021-10-1", end="2023-9-1", freq="MS")
df_dt = pd.DataFrame({"YM_DT": dt_index, "GOLD_PRICE": 0.0}).set_index("YM_DT")
df_ym2 = pd.concat([df_ym2, df_dt])

# Show the index labels of the 24 appended rows.
df_ym2[-24:].index

In [None]:
import datetime
import numpy as np
from sklearn.metrics import r2_score

# Forecast the next 24 months with the selected model.
forecast_steps = 24
prediction = model_opt_fit.get_forecast(forecast_steps)
predicted_value = prediction.predicted_mean
# conf_int() returns the lower bound in column 0 and the upper bound in
# column 1 (the two names were previously swapped). Compute it once;
# np.asarray makes the positional slicing work for array or DataFrame output.
conf_bounds = np.asarray(prediction.conf_int())
predicted_lb = conf_bounds[:, 0]
predicted_ub = conf_bounds[:, 1]
# The last rows of df_ym2 supply the dates for the forecast horizon.
predict_index = list(df_ym2[-forecast_steps:].index)

# Month from which the forecast is considered to start; not used by the plot
# below, kept for an optional marker line.
forecast_start_date = datetime.datetime(2021, 9, 1)
forecast_start_date = np.datetime64(forecast_start_date)

plt.figure(dpi=150)
# Plot only the rows before the forecast horizon: the trailing rows of df_ym2
# are 0.0 placeholders, and drawing them made the observed line drop to zero
# across the forecast window.
observed = df_ym2.iloc[:-forecast_steps]
plt.plot(observed.index, observed["GOLD_PRICE"], label="GOLD_PRICE")
plt.plot(predict_index, predicted_value, label = "Prediction")
plt.fill_between(predict_index, predicted_lb, predicted_ub, color="k", alpha=0.1, label="0.95 Prediction Interval")
plt.legend(loc="upper left")
plt.suptitle("SARIMA {} {}, Prediction Results".format(result_by_aic.iloc[0,1], result_by_aic.iloc[0,2]))