## 47. 時系列分析（Time Series Analysis）

### <font color=blue>**1.** </font> 古典的方法

In [None]:
'''自己回帰モデル
（AR : AutoRegressive model）

移動平均モデル
（MA : Moving Average）

自己回帰移動平均モデル
（ARMA : Auto Regressive Moving Average）

自己回帰和分移動平均モデル
（ARIMA : Auto Regressive Integrated Moving Average）

季節的自己回帰和分移動平均モデル
（SARIMA : Seasonal AutoRegressive Integrated Moving Average）

外部変数付き季節的自己回帰和分移動平均モデル
（SARIMAX : Seasonal AutoRegressive Integrated Moving Average with eXogenous regressors）
'''

In [None]:
# 元データ : https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/AirPassengers.html

#### <font color=green>**1.1.** </font> サンプルコード　その１

In [None]:
## 出典 : https://logics-of-blue.com/python-time-series-analysis/

In [None]:
# 基本のライブラリを読み込む
import numpy as np
import pandas as pd
from scipy import stats

# グラフ描画
from matplotlib import pylab as plt
import seaborn as sns
sns.set()

# グラフを横長にする
#from matplotlib.pylab import rcParams
#rcParams['figure.figsize'] = 15, 6

# 統計モデル
import statsmodels.api as sm

# ワーニングを表示させない
import warnings
warnings.filterwarnings('ignore')

In [None]:
## 時系列データの読み込み

data = pd.read_csv('https://raw.githubusercontent.com/jiai-tus/FirstTerm/main/20210518/datasets/AirPassengers.csv', 
                   index_col='Month', 
                   parse_dates=True, 
                   dtype='float')
data.head()

In [None]:
# 乗客数の列を Series として取り出す（インデックスは読み込み時に日付形式へ変換済み）
ts = data['#Passengers'] 
ts.head()

In [None]:
## 時系列データの取り扱い
# プロット
plt.figure(figsize=(12,6))
plt.plot(ts)
plt.show()

In [None]:
# データの取得方法その1
ts['1949-01-01']

In [None]:
# データの取得方法その2
from datetime import datetime
ts[datetime(1949,1, 1)]

In [None]:
# 1949年のデータをすべて取ってくる
ts['1949']

In [None]:
# 差分系列
ts.diff().head()

In [None]:
# シフト
ts.shift().head()

In [None]:
# 対数差分系列
logDiff = np.log(ts) - np.log(ts.shift())

# NaNを取り除いてから表示
logDiff.dropna().head()

In [None]:
## 自己相関係数の推定
# 自己相関を求める
# 過去の値とどれくらい似ているか
ts_acf = sm.tsa.stattools.acf(ts, nlags=40)
ts_acf

In [None]:
# 偏自己相関
# 注目している時以外の要因を無視して計算された自己相関係数
ts_pacf = sm.tsa.stattools.pacf(ts, nlags=40, method='ols')
ts_pacf

In [None]:
# 自己相関のグラフ
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(ts, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(ts, lags=40, ax=ax2)

In [None]:
## ARIMAモデルの推定
# 和分過程なので、差分をとる
diff = ts.diff()
diff = diff.dropna()
diff.head()

In [None]:
# 差分系列のグラフ
plt.figure(figsize=(12,6))
plt.plot(diff)
plt.show()

In [None]:
# 差分系列への自動ARMA推定関数の実行
resDiff = sm.tsa.arma_order_select_ic(diff, ic='aic', trend='nc')
resDiff

In [None]:
# The automatic order selection above reported p=3, q=2 as best for the
# differenced series, so fit ARIMA(3, 1, 2) on the original series
# (d=1 makes the model take the first difference itself).
from statsmodels.tsa.arima_model import ARIMA
# `disp=False` turns off the optimizer's convergence output. The previous
# spelling `dist=False` is not a fit option; presumably it was accepted
# silently as an extra keyword, so the output was never suppressed.
ARIMA_3_1_2 = ARIMA(ts, order=(3, 1, 2)).fit(disp=False)
ARIMA_3_1_2.params

In [None]:
# 残差のチェック
# SARIMAじゃないので、周期性が残ってしまっている。。。
resid = ARIMA_3_1_2.resid
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)

In [None]:
## SARIMAモデルの推定
# SARIMAモデルを「決め打ち」で推定する

SARIMA_3_1_2_111 = sm.tsa.SARIMAX(ts, order=(3,1,2), seasonal_order=(1,1,1,12)).fit(method='bfgs', maxiter=300)
print(SARIMA_3_1_2_111.summary())

In [None]:
# 残差のチェック
residSARIMA = SARIMA_3_1_2_111.resid
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(residSARIMA.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(residSARIMA, lags=40, ax=ax2)

In [None]:
# 予測
pred = SARIMA_3_1_2_111.predict('1960-01-01', '1961-12-01')
print(pred)

In [None]:
# 実データと予測結果の図示
plt.figure(figsize=(12,6))
plt.plot(ts)
plt.plot(pred, "r")
plt.show()

In [None]:
## 総当たり法によるSARIMAモデル次数の決定
'''総当たりで、AICが最小となるSARIMAの次数を探す
ARIMA(p, d, q)、季節(sp, sd, sq)
各々、pは自己回帰モデルの次数：AR(p)、
     qは移動平均モデルの次数：MA(q)、
     dは差分をとる回数：I(d)
'''
max_p = 3
max_q = 3
max_d = 1
max_sp = 1
max_sq = 1
max_sd = 1

## 上記設定で3分くらい。増やすとアホみたいに時間かかる

pattern = max_p*(max_q + 1)*(max_d + 1)*(max_sp + 1)*(max_sq + 1)*(max_sd + 1)

modelSelection = pd.DataFrame(index=range(pattern), columns=["model", "aic"])
pattern

In [None]:
# Automatic SARIMA selection: fit every (p, d, q) x (sp, sd, sq) combination
# with a 12-period season and record the model label and its AIC in the
# pre-allocated `modelSelection` frame, one row per combination.

num = 0  # row of modelSelection that the next fitted model is written to

for p in range(1, max_p + 1):
    for d in range(0, max_d + 1):
        for q in range(0, max_q + 1):
            for sp in range(0, max_sp + 1):
                for sd in range(0, max_sd + 1):
                    for sq in range(0, max_sq + 1):
                        sarima = sm.tsa.SARIMAX(
                            ts, order=(p,d,q), 
                            seasonal_order=(sp,sd,sq,12), 
                            enforce_stationarity = False, 
                            enforce_invertibility = False
                        ).fit(method='bfgs', maxiter=300, disp=False)
                        # Write each cell with a single .at[row, column] access.
                        # The former chained form
                        #   modelSelection.iloc[num]["model"] = ...
                        # assigns into the temporary row returned by .iloc[num];
                        # pandas does not guarantee that such a write reaches the
                        # frame (and it never does under copy-on-write).
                        modelSelection.at[num, "model"] = f"order=({p},{d},{q}), season=({sp},{sd},{sq})"
                        modelSelection.at[num, "aic"] = sarima.aic
                        num = num + 1

In [None]:
modelSelection

In [None]:
# AIC最小モデル
modelSelection[modelSelection.aic == min(modelSelection.aic)]

In [None]:
# 参考：次数がすべて０だとエラーになる 
# sarima = sm.tsa.SARIMAX(ts, order=(0,0,0), seasonal_order=(0,0,0,12), enforce_stationarity = False).fit()

In [None]:
bestSARIMA = sm.tsa.SARIMAX(ts, order=(3,1,3), seasonal_order=(0,1,1,12), enforce_stationarity = False, enforce_invertibility = False).fit()

In [None]:
print(bestSARIMA.summary())

In [None]:
# 残差のチェック
residSARIMA = bestSARIMA.resid
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(residSARIMA, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(residSARIMA, lags=40, ax=ax2)

In [None]:
# 予測
bestPred = bestSARIMA.predict('1960-01-01', '1961-12-01')

# 実データと予測結果の図示
plt.figure(figsize=(12,6))
plt.plot(ts)
plt.plot(bestPred, "r")
plt.show()

#### <font color=green>**1.2.** </font> サンプルコード　その２

In [None]:
## 出典 : https://qiita.com/DS27/items/1e998a58488e76bfcbdc

In [None]:
# 必要なライブラリーのインポート
import pandas as pd
import numpy as np

# 統計モデル
import statsmodels.api as sm

from matplotlib import pylab as plt

In [None]:
# データの読み込み
df = pd.read_csv('https://raw.githubusercontent.com/jiai-tus/FirstTerm/main/20210518/datasets/AirPassengers.csv')

# float型に変換
df['#Passengers'] = df['#Passengers'].astype('float64')
df = df.rename(columns={'#Passengers': 'Passengers'})

# datetime型にしてインデックスにする
df.Month = pd.to_datetime(df.Month)
df = df.set_index("Month")

# データの中身を確認
df.head()

In [None]:
# データの可視化
plt.figure(figsize=(12,6))
plt.plot(df)
plt.show()

In [None]:
## 移動平均（注：以下で計算するのは平滑化のための単純移動平均（SMA）であり、時系列モデルとしての移動平均モデル（MA：Moving Average model）とは別物）
# 移動平均
df["3ma"]=df["Passengers"].rolling(3).mean().round(1)
df["5ma"]=df["Passengers"].rolling(5).mean().round(1)
df["7ma"]=df["Passengers"].rolling(7).mean().round(1)

# 可視化
plt.figure(figsize=(12,6))
plt.plot(df["Passengers"], label="Passengers")
plt.plot(df["3ma"], "k--", label="SMA(3)")
plt.plot(df["5ma"], "r--", label="SMA(5)")
plt.plot(df["7ma"], "g--", label="SMA(7)")
plt.xlabel("date")
plt.ylabel("quantity")
plt.legend()

plt.show()

In [None]:
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

In [None]:
## 自己回帰モデル（AR：AutoRegressive model）

# 自己相関を求める
df_acf = sm.tsa.stattools.acf(df["Passengers"], nlags=30, fft=False)

# 自己相関のグラフ
fig = sm.graphics.tsa.plot_acf(df["Passengers"], lags=30)

In [None]:
# 薄青の空間は、真に自己相関がない場合の信頼区間95%の範囲を示します
# つまり、この範囲外の値を持つlag地点に（統計的に）有意な自己相関があると分かります

In [None]:
# 偏自己相関を求める
df_pacf = sm.tsa.stattools.pacf(df["Passengers"], nlags=20, method='ols')

# 偏自己相関を可視化する
fig = sm.graphics.tsa.plot_pacf(df["Passengers"], lags=20)

In [None]:
res = sm.tsa.seasonal_decompose(df["Passengers"])
fig = res.plot()

In [None]:
def invert(data, diff, prepro):
  """Undo one step of preprocessing and return the value on the original scale.

  Parameters
  ----------
  data (np.array or np.float) : value(s) of the original series the change is applied to
  diff (np.array or np.float) : preprocessed value(s), i.e. the change to apply
  prepro (str) : preprocessing method ('diff', 'pct', 'logdiff')

  Returns
  -------
  The restored value(s), or None (after printing a message) when `prepro`
  is not one of the supported methods.
  """
  # Guard clause: report an unsupported method and give up.
  if prepro not in ('diff', 'pct', 'logdiff'):
    print('{}は対応していない前処理です'.format(prepro))
    return None

  if prepro == 'diff':
    # plain difference: add the change back
    restored = data + diff
  elif prepro == 'pct':
    # relative change: grow the level by that fraction
    restored = data * diff + data
  else:
    # log difference: multiply by the exponentiated change
    restored = np.exp(diff) * data
  return restored

In [None]:
def invert_predict(data, pred, start, prepro, period, span=None):
  """Convert forecasts of a preprocessed series back to the original scale.

  Every forecast is a change relative to the preceding level, so levels are
  rebuilt by chaining `invert`: the first step starts from an observed
  value, each later step starts from the previously rebuilt value.

  Parameters
  ----------
  data (array-like) : observed values of the original series, read by position
  pred (np.array) : forecasts of the preprocessed series; indexed as pred[i]
                    when span is None, otherwise as pred[i, j]
  start (int) : position in `data` of the first period to rebuild
  prepro (str) : preprocessing method ('diff', 'pct', 'logdiff')
  period (int) : number of periods forecast ahead
  span (int, optional) : number of separate forecast runs (rows of `pred`).
                    When given, run i starts from data[start - period + i].

  Returns
  -------
  np.array : rebuilt values, allocated with np.empty_like(pred)
  """
  # Read observations strictly by position. Integer indexing on a pandas
  # Series with a non-integer (e.g. datetime) index only works through a
  # deprecated positional fallback, so convert to an array once up front.
  observed = np.asarray(data)
  pred_inverted = np.empty_like(pred)
  if span is None:
    # Single forecast run: anchor on the observation just before `start`.
    pred_inverted[0] = invert(observed[start - 1], pred[0], prepro)
    for i in range(1, period):
      pred_inverted[i] = invert(pred_inverted[i - 1], pred[i], prepro)
  else:
    # Several runs: each row gets its own observed anchor, then chains.
    for i in range(span):
      pred_inverted[i, 0] = invert(observed[start - period + i], pred[i, 0], prepro)
      for j in range(1, period):
        pred_inverted[i, j] = invert(pred_inverted[i, j-1], pred[i, j], prepro)
  return pred_inverted

In [None]:
def plot_processed_series(data, pred, start, data_num, interval, prepro, axis):
  """Plot observed and forecast values of the preprocessed series.

  Parameters
  ----------
  data : observed original series; must provide diff(), pct_change() and
         shift() (e.g. a pandas Series)
  pred (np.array) : forecasts of the preprocessed series
  start (int) : first period of the forecast window
  data_num (int) : number of observations used for fitting; that many
                   periods before `start` are drawn as history
  interval (int) : length of the forecast window
  prepro (str) : preprocessing method ('diff', 'pct', 'logdiff')
  axis (matplotlib.axes.Axes) : Axes object to draw on

  Raises
  ------
  ValueError : if `prepro` is not a supported method. Previously this case
               fell through and failed later because `target` was never
               assigned.
  """
  # Rebuild the preprocessed series from the original observations.
  if prepro == 'diff':
    target = data.diff().values
  elif prepro == 'pct':
    target = data.pct_change().values
  elif prepro == 'logdiff':
    target = (np.log(data) - np.log(data.shift(1))).values
  else:
    # Fail before anything is drawn, with the same message `invert` prints.
    raise ValueError('{}は対応していない前処理です'.format(prepro))

  forecast_x = np.arange(start, start+interval)
  # History used for fitting (default colour, no legend entry)
  axis.plot(np.arange(start-data_num, start), target[start-data_num:start], marker='.')
  # Actual vs. predicted values over the forecast window
  axis.plot(forecast_x, target[start:start+interval], c='green', label='Actual', marker='.')
  axis.plot(forecast_x, pred, c='r', label='Predict', marker='.')
  axis.set_ylabel('diff', fontsize=17)
  axis.legend()
  return None

In [None]:
def plot_original_series(data, pred, start, interval, axis, xlim, ylim):
  '''Plot observed values and back-transformed forecasts on the original scale.

  Parameters
  ----------
  data (np.array) : observed values
  pred (np.array) : forecasts, already converted back to the original scale
  start (int) : number of observed periods preceding the forecast window
  interval (int) : length of the forecast window
  axis (matplotlib.axes.Axes) : Axes object to draw on
  xlim (tuple) : x-axis range, passed to set_xlim
  ylim (tuple) : y-axis range, passed to set_ylim
  '''
  last = start + interval
  marker_style = {'marker': '.'}

  # Periods are numbered from 1 on the x axis.
  observed_x = np.arange(1, last + 1)
  forecast_x = np.arange(start + 1, last + 1)

  axis.plot(observed_x, data[:last], c='green', label='Actual', **marker_style)
  axis.plot(forecast_x, pred, c='r', label='Predict', **marker_style)

  axis.set_xlim(xlim)
  axis.set_ylim(ylim)
  axis.set_ylabel('Passengers', fontsize=17)
  axis.legend()
  return None

In [None]:
def plot_predict(data, pred, start, data_num, prepro, xlim=None, ylim=None):
  """Draw AR(MA) forecasts next to the observed data, on both scales.

  Produces one figure with two panels: the preprocessed series (left) and
  the original series with back-transformed forecasts (right), then shows it.

  Parameters
  ----------
  data (np.array) : observed values
  pred (np.array) : forecasts of the preprocessed series
  start (int) : first period to plot ( > data_num)
  data_num (int) : number of observations used for fitting
  prepro (str) : preprocessing method ('diff', 'pct', 'logdiff')
  xlim (tuple) : x-axis range of the original-series panel
  ylim (tuple) : y-axis range of the original-series panel
  """
  period = len(pred) # number of forecast periods, taken from the length of pred

  # Back-transform the forecasts to the original scale.
  # NOTE(review): `span` is never passed, so `pred` is always chained as one
  # multi-step forecast, even when the caller supplies a 2-D array of
  # rolling one-step forecasts - confirm that this is intended.
  pred_inverted = invert_predict(data, pred, start, prepro, period)

  # Compare observed and forecast values for the preprocessed and the original series
  fig, axes = plt.subplots(1, 2, figsize=(12, 5))
  # Global setting: it stays in effect for figures created after this call.
  plt.rcParams["font.size"] = 12
  plt.subplots_adjust(wspace = 0.3)
  plot_processed_series(data, pred, start, data_num, period, prepro, axes[0])
  plot_original_series(data, pred_inverted, start, period, axes[1], xlim, ylim)

  plt.show()
  return None

In [None]:
target = df["Passengers"].diff().values

start = 101
period = 1
data_num = 100

pred = sm.tsa.AR(target[start-data_num:start]).fit(maxlag=12).predict(start=data_num, end=data_num+period-1)
plot_predict(df["Passengers"], pred, start, data_num, 'diff')

In [None]:
start = 101
period = 1
data_num = 100
pred_seq = np.arange(start+period-1, len(df))
pred_AR = np.empty((len(pred_seq), period), dtype=float)

for i, j in enumerate(pred_seq):
  pred_AR[i] = sm.tsa.AR(target[j-period+1-data_num:j-period+1]).fit(maxlag=12).predict(start=data_num, end=data_num+period-1)
plot_predict(df["Passengers"], pred_AR, start, data_num, 'diff')

In [None]:
## 自己回帰移動平均モデル（ARMA）

#target = df["Passengers"].diff().values

start = 101
period = 1
data_num = 100

pred2 = sm.tsa.ARMA(target[start-data_num:start], order=(9, 3)).fit(method='mle').predict(start=data_num, end=data_num+period-1)
plot_predict(df["Passengers"], pred2, start, data_num, 'diff')

In [None]:
#target = df["Passengers"].diff().values

start = 101
period = 1
data_num = 100
pred_seq = np.arange(start+period-1, len(df))
pred_ARMA = np.empty((len(pred_seq), period), dtype=float)

for i, j in enumerate(pred_seq):
  pred_ARMA[i] = sm.tsa.ARMA(target[j-period+1-data_num:j-period+1], order=(9, 3)).fit(method='mle').predict(start=data_num, end=data_num+period-1)
plot_predict(df["Passengers"], pred_ARMA, start, data_num, 'diff')

########
# 12分くらいかかる
########

#### <font color=green>**1.3.** </font> ライブラリの公式サンプルコードたち

#### <font color=green>**1.3.1.** </font> MLE : Maximum Likelihood Estimation

In [None]:
## Time Series Analysis by State Space Methods statespace
# https://thequackdaddy.github.io/statsmodels.github.io/0.9.0/statespace.html
# https://thequackdaddy.github.io/statsmodels.github.io/0.9.0/examples/index.html

In [None]:
import numpy as np
from scipy.signal import lfilter
import statsmodels.api as sm

In [None]:
# True model parameters
nobs = int(1e3)
true_phi = np.r_[0.5, -0.2]
true_sigma = 1**0.5

In [None]:
# Simulate a time series
np.random.seed(1234)
disturbances = np.random.normal(0, true_sigma, size=(nobs,))
endog = lfilter([1], np.r_[1, -true_phi], disturbances)

In [None]:
# Construct the model
class AR2(sm.tsa.statespace.MLEModel):
  """Two-state state space model with three parameters.

  The first two parameters fill the first row of the transition matrix and
  the third is the state disturbance variance, which gives a second-order
  autoregression in state space form.
  """

  def __init__(self, endog):
    # Two states, one disturbance, stationary initialization
    super().__init__(endog, k_states=2, k_posdef=1, initialization='stationary')

    # Fixed parts of the representation: observe the first state, carry the
    # first state into the second one, and feed the disturbance into the
    # first state.
    self['design'] = [1, 0]
    self['transition'] = [[0, 0], [1, 0]]
    self['selection', 0, 0] = 1

  def update(self, params, transformed=True, **kwargs):
    """Place the parameter vector into the system matrices."""
    params = super().update(params, transformed, **kwargs)

    ar_coefs, variance = params[:2], params[2]
    self['transition', 0, :] = ar_coefs
    self['state_cov', 0, 0] = variance

  @property
  def start_params(self):
    """Starting values: both leading parameters at 0, variance at 1."""
    return [0, 0, 1]

In [None]:
# Create and fit the model
mod = AR2(endog)
res = mod.fit()
print(res.summary())

#### <font color=green>**1.3.2.** </font> SARIMAX : Introduction

In [None]:
## https://thequackdaddy.github.io/statsmodels.github.io/0.9.0/examples/notebooks/generated/statespace_sarimax_stata.html

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm
import statsmodels.api as sm
import matplotlib.pyplot as plt
from datetime import datetime
import requests
from io import BytesIO

In [None]:
# Dataset
wpi1 = requests.get('http://www.stata-press.com/data/r12/wpi1.dta').content
data = pd.read_stata(BytesIO(wpi1))
data.index = data.t

data['ln_wpi'] = np.log(data['wpi'])
data['D.ln_wpi'] = data['ln_wpi'].diff()

In [None]:
# Fit the model
mod = sm.tsa.statespace.SARIMAX(data['wpi'], trend='c', order=(1,1,1))
res = mod.fit(disp=False)
print(res.summary())

In [None]:
# Graph data
fig, axes = plt.subplots(1, 2, figsize=(15,4))

# Levels
axes[0].plot(data.index._mpl_repr(), data['wpi'], '-')
axes[0].set(title='US Wholesale Price Index')

# Log difference
axes[1].plot(data.index._mpl_repr(), data['D.ln_wpi'], '-')
axes[1].hlines(0, data.index[0], data.index[-1], 'r')
axes[1].set(title='US Wholesale Price Index - difference of logs');

In [None]:
# Graph data
fig, axes = plt.subplots(1, 2, figsize=(15,4))

fig = sm.graphics.tsa.plot_acf(data.iloc[1:]['D.ln_wpi'], lags=40, ax=axes[0])
fig = sm.graphics.tsa.plot_pacf(data.iloc[1:]['D.ln_wpi'], lags=40, ax=axes[1])

In [None]:
# Fit the model
mod = sm.tsa.statespace.SARIMAX(data['ln_wpi'], trend='c', order=(1,1,1))
res = mod.fit(disp=False)
print(res.summary())

In [None]:
# Dataset
air2 = requests.get('http://www.stata-press.com/data/r12/air2.dta').content
data = pd.read_stata(BytesIO(air2))
data.index = pd.date_range(start=datetime(data.time[0], 1, 1), periods=len(data), freq='MS')
data['lnair'] = np.log(data['air'])

# Fit the model
mod = sm.tsa.statespace.SARIMAX(data['lnair'], order=(2,1,0), seasonal_order=(1,1,0,12), simple_differencing=True)
res = mod.fit(disp=False)
print(res.summary())

In [None]:
# Dataset
friedman2 = requests.get('http://www.stata-press.com/data/r12/friedman2.dta').content
data = pd.read_stata(BytesIO(friedman2))
data.index = data.time

# Variables
endog = data.loc['1959':'1981', 'consump']
exog = sm.add_constant(data.loc['1959':'1981', 'm2'])

# Fit the model
mod = sm.tsa.statespace.SARIMAX(endog, exog, order=(1,0,1))
res = mod.fit(disp=False)
print(res.summary())

In [None]:
# Dataset
raw = pd.read_stata(BytesIO(friedman2))
raw.index = raw.time
data = raw.loc[:'1981']

# Variables
endog = data.loc['1959':, 'consump']
exog = sm.add_constant(data.loc['1959':, 'm2'])
nobs = endog.shape[0]

# Fit the model
mod = sm.tsa.statespace.SARIMAX(endog.loc[:'1978-01-01'], exog=exog.loc[:'1978-01-01'], order=(1,0,1))
fit_res = mod.fit(disp=False)
print(fit_res.summary())

In [None]:
mod = sm.tsa.statespace.SARIMAX(endog, exog=exog, order=(1,0,1))
res = mod.filter(fit_res.params)

In [None]:
# In-sample one-step-ahead predictions
predict = res.get_prediction()
predict_ci = predict.conf_int()

In [None]:
# Dynamic predictions
predict_dy = res.get_prediction(dynamic='1978-01-01')
predict_dy_ci = predict_dy.conf_int()

In [None]:
# Graph
fig, ax = plt.subplots(figsize=(9,4))
npre = 4
ax.set(title='Personal consumption', xlabel='Date', ylabel='Billions of dollars')

# Plot data points
data.loc['1977-07-01':, 'consump'].plot(ax=ax, style='o', label='Observed')

# Plot predictions
predict.predicted_mean.loc['1977-07-01':].plot(ax=ax, style='r--', label='One-step-ahead forecast')
ci = predict_ci.loc['1977-07-01':]
ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], color='r', alpha=0.1)
predict_dy.predicted_mean.loc['1977-07-01':].plot(ax=ax, style='g', label='Dynamic forecast (1978)')
ci = predict_dy_ci.loc['1977-07-01':]
ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], color='g', alpha=0.1)

legend = ax.legend(loc='lower right')

In [None]:
# Prediction error

# Graph
fig, ax = plt.subplots(figsize=(9,4))
npre = 4
ax.set(title='Forecast error', xlabel='Date', ylabel='Forecast - Actual')

# In-sample one-step-ahead predictions and 95% confidence intervals
predict_error = predict.predicted_mean - endog
predict_error.loc['1977-10-01':].plot(ax=ax, label='One-step-ahead forecast')
ci = predict_ci.loc['1977-10-01':].copy()
ci.iloc[:,0] -= endog.loc['1977-10-01':]
ci.iloc[:,1] -= endog.loc['1977-10-01':]
ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], alpha=0.1)

# Dynamic predictions and 95% confidence intervals
predict_dy_error = predict_dy.predicted_mean - endog
predict_dy_error.loc['1977-10-01':].plot(ax=ax, style='r', label='Dynamic forecast (1978)')
ci = predict_dy_ci.loc['1977-10-01':].copy()
ci.iloc[:,0] -= endog.loc['1977-10-01':]
ci.iloc[:,1] -= endog.loc['1977-10-01':]
ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], color='r', alpha=0.1)

legend = ax.legend(loc='lower left');
legend.get_frame().set_facecolor('w')

#### <font color=green>**1.3.3.** </font> SARIMAX : Model selection, missing data

In [None]:
## https://thequackdaddy.github.io/statsmodels.github.io/0.9.0/examples/notebooks/generated/statespace_sarimax_internet.html

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
import requests
from io import BytesIO
from zipfile import ZipFile

# Download the dataset
response = requests.get('http://www.ssfpack.com/files/DK-data.zip')
# Stop on an HTTP error here instead of failing later inside ZipFile with a
# misleading "not a zip file" message.
response.raise_for_status()
dk = response.content
f = BytesIO(dk)
zipped = ZipFile(f)
# Whitespace-separated file: skip the first line and name the two columns.
# The separator is a raw string because '\s' is not a valid escape sequence
# in a normal string literal.
df = pd.read_table(
    BytesIO(zipped.read('internet.dat')),
    skiprows=1, header=None, sep=r'\s+', engine='python',
    names=['internet','dinternet']
)

In [None]:
# Get the basic series
dta_full = df.dinternet[1:].values
dta_miss = dta_full.copy()

# Remove datapoints
missing = np.r_[6,16,26,36,46,56,66,72,73,74,75,76,86,96]-1
dta_miss[missing] = np.nan

In [None]:
import warnings

# AIC tables indexed by [p, q]. They start out as NaN so that the skipped
# (0, 0) entry and any failed fit show up as missing instead of as an AIC
# of 0, which would look like the best model in the table.
aic_full = pd.DataFrame(np.full((6,6), np.nan, dtype=float))
aic_miss = pd.DataFrame(np.full((6,6), np.nan, dtype=float))

warnings.simplefilter('ignore')

# Iterate over all ARMA(p,q) models with p and q in 0..5
for p in range(6):
  for q in range(6):
    if p == 0 and q == 0:
      continue

    # Same estimation for the complete series, then for the series with
    # missing datapoints; each result goes into its own table.
    for series, aic_table in ((dta_full, aic_full), (dta_miss, aic_miss)):
      mod = sm.tsa.statespace.SARIMAX(series, order=(p,0,q), enforce_invertibility=False)
      try:
        res = mod.fit(disp=False)
        aic_table.iloc[p,q] = res.aic
      except Exception:
        # A failed estimation is recorded as missing. `Exception` rather
        # than a bare `except`, so that interrupting this long loop from
        # the keyboard still works.
        aic_table.iloc[p,q] = np.nan

In [None]:
# Statespace
mod = sm.tsa.statespace.SARIMAX(dta_miss, order=(1,0,1))
res = mod.fit(disp=False)
print(res.summary())

In [None]:
# In-sample one-step-ahead predictions, and out-of-sample forecasts
nforecast = 20
predict = res.get_prediction(end=mod.nobs + nforecast)
idx = np.arange(len(predict.predicted_mean))
# alpha=0.5 requests a 50% interval (not the usual 95%)
predict_ci = predict.conf_int(alpha=0.5)

# Graph
fig, ax = plt.subplots(figsize=(12,6))
ax.xaxis.grid()
ax.plot(dta_miss, 'k.')

# Plot: everything except the last `nforecast` points in gray, the last
# `nforecast` points as a dashed black line.
ax.plot(idx[:-nforecast], predict.predicted_mean[:-nforecast], 'gray')
# The 'k--' format string already selects the dashed style; the extra
# linestyle='--' keyword was redundant and is dropped.
ax.plot(idx[-nforecast:], predict.predicted_mean[-nforecast:], 'k--', linewidth=2)
ax.fill_between(idx, predict_ci[:, 0], predict_ci[:, 1], alpha=0.15)

ax.set(title='Figure 8.9 - Internet series');

#### <font color=green>**1.3.4.** </font> VARMAX models

In [None]:
## https://thequackdaddy.github.io/statsmodels.github.io/0.9.0/examples/notebooks/generated/statespace_varmax.html

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
dta = sm.datasets.webuse('lutkepohl2', 'http://www.stata-press.com/data/r12/')
dta.index = dta.qtr
endog = dta.loc['1960-04-01':'1978-10-01', ['dln_inv', 'dln_inc', 'dln_consump']]

In [None]:
exog = endog['dln_consump']
mod = sm.tsa.VARMAX(endog[['dln_inv', 'dln_inc']], order=(2,0), trend='nc', exog=exog)
res = mod.fit(maxiter=1000, disp=False)
print(res.summary())

In [None]:
ax = res.impulse_responses(10, orthogonalized=True).plot(figsize=(13,3))
ax.set(xlabel='t', title='Responses to a shock to `dln_inv`');

In [None]:
mod = sm.tsa.VARMAX(endog[['dln_inv', 'dln_inc']], order=(0,2), error_cov_type='diagonal')
res = mod.fit(maxiter=1000, disp=False)
print(res.summary())

In [None]:
mod = sm.tsa.VARMAX(endog[['dln_inv', 'dln_inc']], order=(1,1))
res = mod.fit(maxiter=1000, disp=False)
print(res.summary())

#### <font color=green>**1.3.5.** </font> Dynamic factors and coincident indices

In [None]:
## https://thequackdaddy.github.io/statsmodels.github.io/0.9.0/examples/notebooks/generated/statespace_dfm_coincident.html

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

np.set_printoptions(precision=4, suppress=True, linewidth=120)

In [None]:
from pandas_datareader.data import DataReader

# Get the datasets from FRED
start = '1979-01-01'
end = '2014-12-01'
indprod = DataReader('IPMAN', 'fred', start=start, end=end)
income = DataReader('W875RX1', 'fred', start=start, end=end)
sales = DataReader('CMRMTSPL', 'fred', start=start, end=end)
emp = DataReader('PAYEMS', 'fred', start=start, end=end)
# dta = pd.concat((indprod, income, sales, emp), axis=1)
# dta.columns = ['indprod', 'income', 'sales', 'emp']

In [None]:
# HMRMT = DataReader('HMRMT', 'fred', start='1967-01-01', end=end)
# CMRMT = DataReader('CMRMT', 'fred', start='1997-01-01', end=end)

In [None]:
# HMRMT_growth = HMRMT.diff() / HMRMT.shift()
# sales = pd.Series(np.zeros(emp.shape[0]), index=emp.index)

# # Fill in the recent entries (1997 onwards)
# sales[CMRMT.index] = CMRMT

# # Backfill the previous entries (pre 1997)
# idx = sales.loc[:'1997-01-01'].index
# for t in range(len(idx)-1, 0, -1):
#     month = idx[t]
#     prev_month = idx[t-1]
#     sales.loc[prev_month] = sales.loc[month] / (1 + HMRMT_growth.loc[prev_month].values)

In [None]:
dta = pd.concat((indprod, income, sales, emp), axis=1)
dta.columns = ['indprod', 'income', 'sales', 'emp']

In [None]:
dta.loc[:, 'indprod':'emp'].plot(subplots=True, layout=(2, 2), figsize=(15, 6));

In [None]:
# Create log-differenced series
dta['dln_indprod'] = (np.log(dta.indprod)).diff() * 100
dta['dln_income'] = (np.log(dta.income)).diff() * 100
dta['dln_sales'] = (np.log(dta.sales)).diff() * 100
dta['dln_emp'] = (np.log(dta.emp)).diff() * 100

# De-mean and standardize
dta['std_indprod'] = (dta['dln_indprod'] - dta['dln_indprod'].mean()) / dta['dln_indprod'].std()
dta['std_income'] = (dta['dln_income'] - dta['dln_income'].mean()) / dta['dln_income'].std()
dta['std_sales'] = (dta['dln_sales'] - dta['dln_sales'].mean()) / dta['dln_sales'].std()
dta['std_emp'] = (dta['dln_emp'] - dta['dln_emp'].mean()) / dta['dln_emp'].std()

In [None]:
# Get the endogenous data
endog = dta.loc['1979-02-01':, 'std_indprod':'std_emp']

# Create the model
mod = sm.tsa.DynamicFactor(endog, k_factors=1, factor_order=2, error_order=2)
initial_res = mod.fit(method='powell', disp=False)
res = mod.fit(initial_res.params, disp=False)

In [None]:
print(res.summary(separate_params=False))

In [None]:
fig, ax = plt.subplots(figsize=(13,3))

# Plot the factor
dates = endog.index._mpl_repr()
ax.plot(dates, res.factors.filtered[0], label='Factor')
ax.legend()

# Retrieve and also plot the NBER recession indicators
rec = DataReader('USREC', 'fred', start=start, end=end)
ylim = ax.get_ylim()
ax.fill_between(dates[:-3], ylim[0], ylim[1], rec.values[:-4,0], facecolor='k', alpha=0.1);

In [None]:
res.plot_coefficients_of_determination(figsize=(8,2));

In [None]:
usphci = DataReader('USPHCI', 'fred', start='1979-01-01', end='2014-12-01')['USPHCI']
usphci.plot(figsize=(13,3));

In [None]:
dusphci = usphci.diff()[1:].values
def compute_coincident_index(mod, res):
  """Build a coincident index from the filtered factor of a fitted model.

  Reads the notebook-level objects `dta`, `usphci` and `dusphci` in
  addition to its arguments.

  Parameters
  ----------
  mod : model whose `ssm` supplies the design and transition matrices and
        whose `nobs` gives the number of observations
  res : results object supplying `filter_results.kalman_gain` and
        `factors.filtered`

  Returns
  -------
  pandas.Series : the index, dated with `dta.index` (first entry dropped)
                  and rescaled to equal `usphci` at 1992-07-01
  """
  # Estimate W(1) from the Kalman gain at the last time point
  design = mod.ssm['design']
  transition = mod.ssm['transition']
  ss_kalman_gain = res.filter_results.kalman_gain[:,:,-1]
  k_states = ss_kalman_gain.shape[0]

  W1 = np.linalg.inv(np.eye(k_states) - np.dot(
      np.eye(k_states) - np.dot(ss_kalman_gain, design),
      transition
      )).dot(ss_kalman_gain)[0]

  # Compute the factor mean vector
  factor_mean = np.dot(W1, dta.loc['1972-02-01':, 'dln_indprod':'dln_emp'].mean())

  # Normalize the factor so its spread matches the differenced USPHCI.
  # The multiplication creates a new array: the earlier in-place `*=`
  # rescaled the array held by `res` itself, silently changing the results
  # object for every later use.
  filtered = res.factors.filtered[0]
  factor = filtered * (np.std(usphci.diff()[1:]) / np.std(filtered))

  # Compute the coincident index by accumulating factor + mean
  coincident_index = np.zeros(mod.nobs+1)
  # The initial value is arbitrary; here it is set to
  # facilitate comparison
  coincident_index[0] = usphci.iloc[0] * factor_mean / dusphci.mean()
  for t in range(0, mod.nobs):
    coincident_index[t+1] = coincident_index[t] + factor[t] + factor_mean

  # Attach dates
  coincident_index = pd.Series(coincident_index, index=dta.index).iloc[1:]

  # Normalize to use the same base year as USPHCI
  coincident_index *= (usphci.loc['1992-07-01'] / coincident_index.loc['1992-07-01'])

  return coincident_index

In [None]:
fig, ax = plt.subplots(figsize=(13,3))

# Compute the index
coincident_index = compute_coincident_index(mod, res)

# Plot the factor
dates = endog.index._mpl_repr()
ax.plot(dates, coincident_index, label='Coincident index')
ax.plot(usphci.index._mpl_repr(), usphci, label='USPHCI')
ax.legend(loc='lower right')

# Retrieve and also plot the NBER recession indicators
ylim = ax.get_ylim()
ax.fill_between(dates[:-3], ylim[0], ylim[1], rec.values[:-4,0], facecolor='k', alpha=0.1);

In [None]:
from statsmodels.tsa.statespace import tools
class ExtendedDFM(sm.tsa.DynamicFactor):
  """One-factor dynamic factor model with three extra loading parameters.

  The parent model is set up with factor_order=4, but only the first two
  factor AR coefficients are estimated; the other two are reset to zero on
  every update. The three extra parameters sit at the end of the parameter
  vector and are written into row 3, columns 1-3 of the design matrix.
  """

  def __init__(self, endog, **kwargs):
    # Set the model up as if the factor order were 4
    super().__init__(endog, k_factors=1, factor_order=4, error_order=2, **kwargs)

    # `self.parameters` is an ordered dict mapping each parameter type to
    # the number of parameters of that type; register the new ones.
    self.parameters['new_loadings'] = 3

    # Locate the four factor AR parameters (a_1, ..., a_4) in the full
    # parameter vector and cache one slice for the pair that is estimated
    # and one for the pair that is held at zero.
    ar_start = sum(
        self.parameters[block]
        for block in ('factor_loadings', 'exog', 'error_cov'))
    self._params_factor_ar = np.s_[ar_start:ar_start+2]
    self._params_factor_zero = np.s_[ar_start+2:ar_start+4]

  @property
  def start_params(self):
    # Parent start values followed by zeros for the three new loadings
    # (zeros for simplicity; any initialization would do)
    return np.r_[super().start_params, 0, 0, 0]

  @property
  def param_names(self):
    # Parent names followed by one name per new loading parameter
    base_names = super().param_names
    extra_names = [
        'loading.L%d.f1.%s' % (lag, self.endog_names[3]) for lag in range(1,4)]
    return base_names + extra_names

  def transform_params(self, unconstrained):
    # Usual DFM transformation on everything except the new parameters
    constrained = super().transform_params(unconstrained[:-3])

    # Redo the factor AR constraint: the parent constrained an AR(4), but
    # only an AR(2) is wanted here
    ar_slice = self._params_factor_ar
    constrained[ar_slice] = tools.constrain_stationary_univariate(
        unconstrained[ar_slice])

    # New parameters are passed through unchanged
    return np.r_[constrained, unconstrained[-3:]]

  def untransform_params(self, constrained):
    # Usual DFM untransformation on everything except the new parameters
    unconstrained = super().untransform_params(constrained[:-3])

    # Redo the factor AR unconstraint for an AR(2) instead of an AR(4)
    ar_slice = self._params_factor_ar
    unconstrained[ar_slice] = tools.unconstrain_stationary_univariate(
        constrained[ar_slice])

    # New parameters are passed through unchanged
    return np.r_[unconstrained, constrained[-3:]]

  def update(self, params, transformed=True, complex_step=False):
    # Perform the transformation, if required
    if not transformed:
      params = self.transform_params(params)
    # Hold the third and fourth factor AR coefficients at zero
    params[self._params_factor_zero] = 0

    base_params, new_loadings = params[:-3], params[-3:]

    # Usual DFM update without the new parameters
    super().update(base_params, transformed=True, complex_step=complex_step)

    # Finally, place the new parameters in the design matrix
    self.ssm['design', 3, 1:4] = new_loadings

In [None]:
# Create the model
extended_mod = ExtendedDFM(endog)
initial_extended_res = extended_mod.fit(maxiter=1000, disp=False)
extended_res = extended_mod.fit(initial_extended_res.params, method='nm', maxiter=1000)
print(extended_res.summary(separate_params=False))

In [None]:
extended_res.plot_coefficients_of_determination(figsize=(8,2));

In [None]:
fig, ax = plt.subplots(figsize=(13,3))

# Compute the index
extended_coincident_index = compute_coincident_index(extended_mod, extended_res)

# Plot the factor
dates = endog.index._mpl_repr()
ax.plot(dates, coincident_index, '-', linewidth=1, label='Basic model')
ax.plot(dates, extended_coincident_index, '--', linewidth=3, label='Extended model')
ax.plot(usphci.index._mpl_repr(), usphci, label='USPHCI')
ax.legend(loc='lower right')
ax.set(title='Coincident indices, comparison')

# Retrieve and also plot the NBER recession indicators
ylim = ax.get_ylim()
ax.fill_between(dates[:-3], ylim[0], ylim[1], rec.values[:-4,0], facecolor='k', alpha=0.1);

#### <font color=green>**1.3.6.** </font> Detrending, Stylized Facts and the Business Cycle

In [None]:
## https://thequackdaddy.github.io/statsmodels.github.io/0.9.0/examples/notebooks/generated/statespace_structural_harvey_jaeger.html

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
# Datasets
from pandas_datareader.data import DataReader

# Get the raw data from the 'fred' data source (needs network access)
start = '1948-01'
end = '2008-01'
us_gnp = DataReader('GNPC96', 'fred', start=start, end=end)
us_gnp_deflator = DataReader('GNPDEF', 'fred', start=start, end=end)
# The monetary base is averaged within each 'QS' (quarter-start) bin
us_monetary_base = DataReader('AMBSL', 'fred', start=start, end=end).resample('QS').mean()
# Recession indicator: last value per quarter, first column as a plain array
recessions = DataReader('USRECQ', 'fred', start=start, end=end).resample('QS').last().values[:,0]

# Construct the dataframe: natural log of each series, joined column-wise
dta = pd.concat(map(np.log, (us_gnp, us_gnp_deflator, us_monetary_base)), axis=1)
dta.columns = ['US GNP','US Prices','US monetary base']
# x positions reused for the recession shading in the plotting cell
# (private pandas helper — NOTE(review): confirm availability in your pandas)
dates = dta.index._mpl_repr()

In [None]:
# Plot the data (three log series on one axis)
ax = dta.plot(figsize=(13,3))
ylim = ax.get_ylim()
ax.xaxis.grid()
# Shade the periods where `recessions` is truthy; the band is inset by 1e-5
# at the top and bottom of the current y-limits.
ax.fill_between(dates, ylim[0]+1e-5, ylim[1]-1e-5, recessions, facecolor='k', alpha=0.1);

In [None]:
# Model specifications
# Both dicts are keyword arguments for sm.tsa.UnobservedComponents.

# Unrestricted model, using string specification
unrestricted_model = {
    'level': 'local linear trend', 'cycle': True, 'damped_cycle': True, 'stochastic_cycle': True
    }

# Unrestricted model, setting components directly
# This is an equivalent, but less convenient, way to specify a
# local linear trend model with a stochastic damped cycle:
# unrestricted_model = {
#     'irregular': True, 'level': True, 'stochastic_level': True, 'trend': True, 'stochastic_trend': True,
#     'cycle': True, 'damped_cycle': True, 'stochastic_cycle': True
# }

# The restricted model forces a smooth trend
restricted_model = {
    'level': 'smooth trend', 'cycle': True, 'damped_cycle': True, 'stochastic_cycle': True
    }

# Restricted model, setting components directly
# This is an equivalent, but less convenient, way to specify a
# smooth trend model with a stochastic damped cycle. Notice
# that the difference from the local linear trend model is that
# `stochastic_level=False` here.
# restricted_model = {
#     'irregular': True, 'level': True, 'stochastic_level': False, 'trend': True, 'stochastic_trend': True,
#     'cycle': True, 'damped_cycle': True, 'stochastic_cycle': True
# }

In [None]:
# Fit one unobserved-components model per series. Prices and money are fitted
# twice: once unrestricted and once with the smooth-trend restriction.
# All fits use the Powell optimizer with convergence output suppressed.

# Output
output_mod = sm.tsa.UnobservedComponents(dta['US GNP'], **unrestricted_model)
output_res = output_mod.fit(method='powell', disp=False)

# Prices
prices_mod = sm.tsa.UnobservedComponents(dta['US Prices'], **unrestricted_model)
prices_res = prices_mod.fit(method='powell', disp=False)

prices_restricted_mod = sm.tsa.UnobservedComponents(dta['US Prices'], **restricted_model)
prices_restricted_res = prices_restricted_mod.fit(method='powell', disp=False)

# Money
money_mod = sm.tsa.UnobservedComponents(dta['US monetary base'], **unrestricted_model)
money_res = money_mod.fit(method='powell', disp=False)

money_restricted_mod = sm.tsa.UnobservedComponents(dta['US monetary base'], **restricted_model)
money_restricted_res = money_restricted_mod.fit(method='powell', disp=False)

In [None]:
print(output_res.summary())

In [None]:
fig = output_res.plot_components(legend_loc='lower right', figsize=(15, 9));

In [None]:
# Create Table I: one row per fitted model, one column per parameter symbol.
start = dta.index[0]
end = dta.index[-1]
time_range = '%d:%d-%d:%d' % (start.year, start.quarter, end.year, end.quarter)
models = [
          ('US GNP', time_range, 'None'),
          ('US Prices', time_range, 'None'),
          ('US Prices', time_range, r'$\sigma_\eta^2 = 0$'),
          ('US monetary base', time_range, 'None'),
          ('US monetary base', time_range, r'$\sigma_\eta^2 = 0$'),
          ]
index = pd.MultiIndex.from_tuples(models, names=['Series', 'Time range', 'Restrictions'])
parameter_symbols = [
    r'$\sigma_\zeta^2$', r'$\sigma_\eta^2$', r'$\sigma_\kappa^2$', r'$\rho$',
    r'$2 \pi / \lambda_c$', r'$\sigma_\varepsilon^2$',
]

results = (output_res, prices_res, prices_restricted_res, money_res, money_restricted_res)
table_i = np.full((len(results), len(parameter_symbols)), np.nan)

for i, res in enumerate(results):
  # Look the estimates up by parameter name instead of by position: the
  # restricted (smooth trend) models have no 'sigma2.level' parameter, so the
  # positions shift between models. Positional unpacking previously put the
  # level variance (eta) in the zeta column and the trend variance (zeta) in
  # the eta column for the unrestricted models.
  params = dict(zip(res.model.param_names, res.params))

  table_i[i, :] = [
        params.get('sigma2.trend', np.nan)*1e7,   # sigma_zeta^2 (trend/slope disturbance)
        params.get('sigma2.level', np.nan)*1e7,   # sigma_eta^2 (level disturbance); NaN when restricted to 0
        params['sigma2.cycle']*1e7,               # sigma_kappa^2
        params['damping.cycle'],                  # rho
        2 * np.pi / params['frequency.cycle'],    # cycle period 2*pi/lambda_c
        params['sigma2.irregular']*1e7,           # sigma_epsilon^2
        ]

# Display floats with 4 significant digits and NaN as '-'.
# Note that this changes the pandas display option for the rest of the session.
pd.set_option('float_format', lambda x: '%.4g' % np.round(x, 2) if not np.isnan(x) else '-')
table_i = pd.DataFrame(table_i, index=index, columns=parameter_symbols)
table_i

#### <font color=green>**1.3.7.** </font> Trends and cycles in unemployment

In [None]:
## https://thequackdaddy.github.io/statsmodels.github.io/0.9.0/examples/notebooks/generated/statespace_cycles.html

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
from pandas_datareader.data import DataReader
# Download the 'UNRATE' series from the 'fred' source, starting in January 1954
# (needs network access; no end date is given).
endog = DataReader('UNRATE', 'fred', start='1954-01-01')

In [None]:
# Hodrick-Prescott decomposition into cycle and trend with smoothing parameter 129600
hp_cycle, hp_trend = sm.tsa.filters.hpfilter(endog, lamb=129600)

In [None]:
# Random-walk level plus an autoregressive component of order 4
mod_ucarima = sm.tsa.UnobservedComponents(endog, 'rwalk', autoregressive=4)
# Here the powell method is used, since it achieves a
# higher loglikelihood than the default L-BFGS method
res_ucarima = mod_ucarima.fit(method='powell', disp=False)
print(res_ucarima.summary())

In [None]:
# Random-walk level plus a stochastic, damped cycle
mod_uc = sm.tsa.UnobservedComponents(
    endog, 'rwalk',
    cycle=True, stochastic_cycle=True, damped_cycle=True,
    )
# Here the powell method gets close to the optimum
res_uc = mod_uc.fit(method='powell', disp=False)
# but to get to the highest loglikelihood we do a
# second round using the L-BFGS method.
# (the Powell estimates are passed as starting parameters)
res_uc = mod_uc.fit(res_uc.params, disp=False)
print(res_uc.summary())

In [None]:
# Two stacked panels comparing the three decompositions:
# top = level/trend estimates, bottom = cycle estimates.
fig, axes = plt.subplots(2, figsize=(13,5));
axes[0].set(title='Level/trend component')
axes[0].plot(endog.index, res_uc.level.smoothed, label='UC')
# NOTE(review): the legend says 'UC-ARIMA(2,0)' but mod_ucarima is built with
# autoregressive=4 — confirm which is intended.
axes[0].plot(endog.index, res_ucarima.level.smoothed, label='UC-ARIMA(2,0)')
axes[0].plot(hp_trend, label='HP Filter')
axes[0].legend(loc='upper left')
axes[0].grid()

axes[1].set(title='Cycle component')
axes[1].plot(endog.index, res_uc.cycle.smoothed, label='UC')
# For the UC-ARIMA model the smoothed autoregressive component plays the role of the cycle
axes[1].plot(endog.index, res_ucarima.autoregressive.smoothed, label='UC-ARIMA(2,0)')
axes[1].plot(hp_cycle, label='HP Filter')
axes[1].legend(loc='upper left')
axes[1].grid()

fig.tight_layout();

#### <font color=green>**1.3.8.** </font> State space modeling : Local Linear Trends

In [None]:
## https://thequackdaddy.github.io/statsmodels.github.io/0.9.0/examples/notebooks/generated/statespace_local_linear_trend.html

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
"""
Univariate Local Linear Trend Model
"""
class LocalLinearTrend(sm.tsa.statespace.MLEModel):
  """Univariate local linear trend model written as a custom MLEModel.

  The state vector has two elements and the three estimated parameters are
  the variances listed in ``param_names``.
  """
  def __init__(self, endog):
    """Set up a two-state model for ``endog`` with fixed system matrices."""
    # Model order
    k_states = k_posdef = 2

    # Initialize the statespace
    # (the first k_states observations are excluded from the log-likelihood)
    super(LocalLinearTrend, self).__init__(
        endog, k_states=k_states, k_posdef=k_posdef,
        initialization='approximate_diffuse',
        loglikelihood_burn=k_states
        )

    # Initialize the matrices
    # design picks the first state; transition adds the second state to the first
    self.ssm['design'] = np.array([1, 0])
    self.ssm['transition'] = np.array([[1, 1],
                                       [0, 1]])
    self.ssm['selection'] = np.eye(k_states)

    # Cache some indices
    # (index tuple addressing the diagonal of 'state_cov')
    self._state_cov_idx = ('state_cov',) + np.diag_indices(k_posdef)

  @property
  def param_names(self):
    """Names of the three variance parameters, in parameter-vector order."""
    return ['sigma2.measurement', 'sigma2.level', 'sigma2.trend']

  @property
  def start_params(self):
    """Starting values: the standard deviation of ``endog``, repeated three times."""
    return [np.std(self.endog)]*3

  def transform_params(self, unconstrained):
    """Map optimizer values to parameters by squaring them."""
    return unconstrained**2

  def untransform_params(self, constrained):
    """Inverse of ``transform_params``: take the square root."""
    return constrained**0.5

  def update(self, params, *args, **kwargs):
    """Write the parameter vector into the covariance matrices."""
    # The parent update returns the parameters to use from here on
    params = super(LocalLinearTrend, self).update(params, *args, **kwargs)
        
    # Observation covariance
    self.ssm['obs_cov',0,0] = params[0]

    # State covariance
    # (the remaining two parameters go on the diagonal)
    self.ssm[self._state_cov_idx] = params[1:]

In [None]:
import requests
from io import BytesIO
from zipfile import ZipFile
    
# Download the dataset (needs network access)
response = requests.get('http://staff.feweb.vu.nl/koopman/projects/ckbook/OxCodeAll.zip')
# Fail here with a clear HTTP error rather than later with an obscure zip error
response.raise_for_status()
ck = response.content
zipped = ZipFile(BytesIO(ck))
# Whitespace-separated text file; the first row is skipped and column names are
# supplied explicitly. The separator is a raw string so that '\s' is not
# treated as an (invalid) string escape.
df = pd.read_table(
    BytesIO(zipped.read('OxCodeIntroStateSpaceBook/Chapter_2/NorwayFinland.txt')),
    skiprows=1, header=None, sep=r'\s+', engine='python',
    names=['date','nf', 'ff']
)

In [None]:
# Load Dataset: build a year-start DatetimeIndex from the first and last 'date' values.
# Positional access (.iloc) keeps this cell re-runnable after the index has
# been replaced, and 'YS' is the non-deprecated spelling of the year-start
# frequency previously written as 'AS'.
first_year = df['date'].iloc[0]
last_year = df['date'].iloc[-1]
df.index = pd.date_range(start='%d-01-01' % first_year, end='%d-01-01' % last_year, freq='YS')

# Log transform
df['lff'] = np.log(df['ff'])

# Setup the model
mod = LocalLinearTrend(df['lff'])

# Fit it using MLE (recall that we are fitting the three variance parameters)
res = mod.fit(disp=False)
print(res.summary())

In [None]:
# Perform prediction and forecasting
# get_prediction() with no arguments covers the fitted sample;
# get_forecast('2014') extends beyond it up to the given date label.
predict = res.get_prediction()
forecast = res.get_forecast('2014')

In [None]:
fig, ax = plt.subplots(figsize=(10,4))

# Plot the results
df['lff'].plot(ax=ax, style='k.', label='Observations')
predict.predicted_mean.plot(ax=ax, label='One-step-ahead Prediction')
# 95% interval for the one-step-ahead predictions; the first two points are skipped
predict_ci = predict.conf_int(alpha=0.05)
# NOTE(review): the shaded bands are positioned with integers 0..n-1 rather
# than dates. This only lines up with the pandas-drawn lines if the axis
# units coincide with these integers — verify the figure visually.
predict_index = np.arange(len(predict_ci))
ax.fill_between(predict_index[2:], predict_ci.iloc[2:, 0], predict_ci.iloc[2:, 1], alpha=0.1)

forecast.predicted_mean.plot(ax=ax, style='r', label='Forecast')
forecast_ci = forecast.conf_int()
# Forecast positions continue directly after the in-sample positions
forecast_index = np.arange(len(predict_ci), len(predict_ci) + len(forecast_ci))
ax.fill_between(forecast_index, forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1], alpha=0.1)

# Cleanup the image
ax.set_ylim((4, 8));
legend = ax.legend(loc='lower left');

#### <font color=green>**1.3.9.** </font> Autoregressive Moving Average (ARMA) : Sunspots data

In [None]:
## https://thequackdaddy.github.io/statsmodels.github.io/0.9.0/examples/notebooks/generated/statespace_arma_0.html

In [None]:
from __future__ import print_function
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm

In [None]:
from statsmodels.graphics.api import qqplot

In [None]:
print(sm.datasets.sunspots.NOTE)

In [None]:
dta = sm.datasets.sunspots.load_pandas().data

In [None]:
# Replace the index with dates spanning 1700..2008 and drop the now redundant
# YEAR column. `del` mutates dta in place, so re-running this cell alone
# fails once the column is gone.
dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
del dta["YEAR"]

In [None]:
dta.plot(figsize=(12,4));

In [None]:
# Autocorrelation (top) and partial autocorrelation (bottom) of the series, 40 lags each
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
# plot_acf is given the values as a 1-D array
fig = sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dta, lags=40, ax=ax2)

In [None]:
# AR(2) with a constant, estimated through the SARIMAX state space model
arma_mod20 = sm.tsa.statespace.SARIMAX(dta, order=(2,0,0), trend='c').fit(disp=False)
print(arma_mod20.params)

In [None]:
arma_mod30 = sm.tsa.statespace.SARIMAX(dta, order=(3,0,0), trend='c').fit(disp=False)

In [None]:
print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic)

In [None]:
print(arma_mod30.params)

In [None]:
print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)

In [None]:
sm.stats.durbin_watson(arma_mod30.resid)

In [None]:
# Plot the residuals of the AR(3) model.
# Draw on the Axes explicitly; the previous `ax = plt.plot(...)` rebound `ax`
# to the list of Line2D objects returned by plt.plot.
fig = plt.figure(figsize=(12,4))
ax = fig.add_subplot(111)
ax.plot(arma_mod30.resid);

In [None]:
resid = arma_mod30.resid

In [None]:
stats.normaltest(resid)

In [None]:
# Q-Q plot of the residuals (fit=True, reference line through the quartiles)
fig = plt.figure(figsize=(12,4))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)

In [None]:
# ACF (top) and PACF (bottom) of the residuals, 40 lags each
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)

In [None]:
# Autocorrelations of the residuals with Ljung-Box Q statistics and p-values.
# nlags is passed explicitly and the lag column is derived from the result
# length, so the table no longer depends on the library's default number of
# lags matching the previously hard-coded 40.
r, q, p = sm.tsa.acf(resid, nlags=40, qstat=True)
# r[0] is the lag-0 autocorrelation, which has no Q statistic, so it is dropped
data = np.c_[range(1, len(q) + 1), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))

In [None]:
# Dynamic prediction from 1990 through 2012 with the AR(3) model
predict_sunspots = arma_mod30.predict(start='1990', end='2012', dynamic=True)

In [None]:
# Observed series from 1950 onwards with the prediction overlaid in red
fig, ax = plt.subplots(figsize=(12, 8))
dta.loc['1950':].plot(ax=ax)
predict_sunspots.plot(ax=ax, style='r');

In [None]:
def mean_forecast_err(y, yhat):
  """Return the mean of ``y - yhat``.

  The subtraction is done with ``y.sub(yhat)`` and the result is averaged
  with its ``mean()`` method.
  """
  errors = y.sub(yhat)
  return errors.mean()

In [None]:
mean_forecast_err(dta.SUNACTIVITY, predict_sunspots)

### <font color=blue>**2.** </font> Linear Dynamical System（LDS）

#### <font color=green>**2.1.** </font> 状態空間モデル

In [None]:
## 出典 : https://logics-of-blue.com/python-state-space-models/

In [None]:
# 基本のライブラリを読み込む
import numpy as np
import pandas as pd
from scipy import stats

# グラフ描画
from matplotlib import pylab as plt
import seaborn as sns

# グラフを横長にする
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

# 統計モデル
import statsmodels.api as sm

In [None]:
# Read the CSV with the 'Month' column parsed into a DatetimeIndex.
# `pd.datetime` no longer exists in current pandas, and a `date_parser`
# without `parse_dates` is never applied, so use `parse_dates=True` exactly
# like the loader at the top of this notebook.
data = pd.read_csv('https://raw.githubusercontent.com/jiai-tus/FirstTerm/main/20210518/datasets/AirPassengers.csv',
                   index_col='Month', parse_dates=True, dtype='float')

# Passenger counts as a time series
ts = data['#Passengers']

# Plot
plt.plot(ts)
plt.show()

In [None]:
# Estimate a local level model
mod_local_level = sm.tsa.UnobservedComponents(ts, 'local level')

# Estimate the parameters by maximum likelihood
res_local_level = mod_local_level.fit()

# Summary of the estimated parameters
print(res_local_level.summary())

# Plot the estimated state / trend
# (the figure size is changed globally through rcParams)
rcParams['figure.figsize'] = 15, 15
fig = res_local_level.plot_components()

In [None]:
# Local linear trend model

mod_trend = sm.tsa.UnobservedComponents(
    ts,
    'local linear trend'
)

# Estimate the parameters by maximum likelihood
# A warning was raised with the default optimizer, so BFGS is used instead
res_trend = mod_trend.fit(method='bfgs')

# Summary of the estimated parameters
print(res_trend.summary())

# Plot the estimated state / trend
# (the figure size is changed globally through rcParams)
rcParams['figure.figsize'] = 15, 20
fig = res_trend.plot_components()

In [None]:
# Local level model with a seasonal component (period 12)

mod_season_local_level = sm.tsa.UnobservedComponents(
    ts,
    'local level',
    seasonal=12
)

# First estimate the parameters with Nelder-Mead, then use that result as the
# starting point for a second optimization with BFGS.
res_season_local_level = mod_season_local_level.fit(
    method='bfgs', 
    maxiter=500, 
    start_params=mod_season_local_level.fit(method='nm', maxiter=500).params,
)

# Summary of the estimated parameters
print(res_season_local_level.summary())

# Plot the estimated state, trend and seasonal effect
rcParams['figure.figsize'] = 15, 20
fig = res_season_local_level.plot_components()

In [None]:
# Local linear trend model with a seasonal component (period 12)

mod_season_trend = sm.tsa.UnobservedComponents(
    ts,
    'local linear trend',
    seasonal=12
)

# First estimate the parameters with Nelder-Mead, then use that result as the
# starting point for a second optimization with BFGS.
res_season_trend = mod_season_trend.fit(
    method='bfgs', 
    maxiter=500, 
    start_params=mod_season_trend.fit(method='nm', maxiter=500).params,
)

# Summary of the estimated parameters
print(res_season_trend.summary())

# Plot the estimated state, trend and seasonal effect
rcParams['figure.figsize'] = 15, 20
fig = res_season_trend.plot_components()

In [None]:
# See the following documentation for details
# http://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.structural.UnobservedComponents.html

# Local linear trend model with a seasonal component,
# but without a variance for the trend

mod_season_trend_d = sm.tsa.UnobservedComponents(
    ts,
    'local linear deterministic trend',
    seasonal=12
)

# First estimate the parameters with Nelder-Mead, then use that result as the
# starting point for a second optimization with BFGS.
res_season_trend_d = mod_season_trend_d.fit(
    method='bfgs', 
    maxiter=500, 
    start_params=mod_season_trend_d.fit(method='nm', maxiter=500).params,
)

# Summary of the estimated parameters
print(res_season_trend_d.summary())

# Plot the estimated state, trend and seasonal effect
rcParams['figure.figsize'] = 15, 20
fig = res_season_trend_d.plot_components()

In [None]:
# See the following documentation for details
# http://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.structural.UnobservedComponents.html

# Local linear trend model with a seasonal component,
# but without variances for the trend and the observation error

mod_season_rw = sm.tsa.UnobservedComponents(
    ts,
    'random walk with drift',
    seasonal=12
)

# First estimate the parameters with Nelder-Mead, then use that result as the
# starting point for a second optimization with BFGS.
res_season_rw = mod_season_rw.fit(
    method='bfgs', 
    maxiter=500, 
    start_params=mod_season_rw.fit(method='nm', maxiter=500).params,
)

# Summary of the estimated parameters
print(res_season_rw.summary())

# Plot the estimated state, trend and seasonal effect
rcParams['figure.figsize'] = 15, 20
fig = res_season_rw.plot_components()

In [None]:
# Collect the AIC of every model fitted so far.
# The table is built in one step from (name, result) pairs. The previous
# chained assignment `aic_list.loc[i]["col"] = value` writes into an
# intermediate object and is not guaranteed to update the DataFrame.
fitted_models = [
    ("res_local_level", res_local_level),
    ("res_trend", res_trend),
    ("res_season_local_level", res_season_local_level),
    ("res_season_trend", res_season_trend),
    ("res_season_trend_d", res_season_trend_d),
    ("res_season_rw", res_season_rw),
]
aic_list = pd.DataFrame(
    [(name, res.aic) for name, res in fitted_models],
    columns=["model", "aic"],
)

# Show the result
aic_list

In [None]:
# Forecast with the seasonal random-walk-with-drift model
pred = res_season_rw.predict('1960-01-01', '1961-12-01')

# Plot the observed data together with the forecast (red)
rcParams['figure.figsize'] = 15, 6
plt.plot(ts)
plt.plot(pred, "r")

#### <font color=green>**2.2.** </font> Kalman filter

In [None]:
## 出典 : https://qiita.com/matsui_685/items/16b81bf0ad9a24c54e52

In [None]:
import numpy as np

In [None]:
# Initial position [x, y]
initial_xy = [6., 17.]

In [None]:
# Measurement interval
dt = 0.1

In [None]:
# Measured positions [x, y]
measurements = [[7., 15.], 
                  [8., 14.], 
                  [9., 13.], 
                  [10., 12.], 
                  [11., 11.], 
                  [12., 10.]] 

## Only the position can be measured; the velocity cannot be observed.
## Here we assume six measurements taken at 0.1-second intervals.

In [None]:
# 4-dimensional state holding the initial position and the initial velocity
# (column vector: x, y, then two velocity entries initialised to 0)
x = np.array([[initial_xy[0]], 
              [initial_xy[1]], 
              [0.], 
              [0.]])

In [None]:
# External input (all zeros here); it is added to the state in the prediction step
u = np.array([[0.], 
              [0.], 
              [0.], 
              [0.]]) 

In [None]:
# Covariance matrix of the state estimate
## The larger these values, the more widely the estimate is spread,
## i.e. the less precise the value is.
## Here the position entries are 0 and the velocity entries are 100.
P = np.array([[0., 0., 0., 0.], 
              [0., 0., 0., 0.], 
              [0., 0., 100., 0.], 
              [0., 0., 0., 100.]])

In [None]:
# State transition matrix
## Used to compute the state at the next time step
## (each position advances by dt times the corresponding velocity)
F = np.array([[1., 0., dt, 0.], 
              [0., 1., 0., dt], 
              [0., 0., 1., 0.], 
              [0., 0., 0., 1.]])

In [None]:
# Observation matrix
## Only the position is observed, so this extracts the two position entries
## from the 4-dimensional state.
H = np.array([[1., 0., 0, 0], 
              [0., 1., 0., 0.]])

In [None]:
# Measurement noise
## The measurements are uncertain because of noise
R = np.array([[0.1, 0], 
              [0, 0.1]])

In [None]:
# Identity matrix with the same dimension as the state (4x4)
I = np.identity((len(x)))    

In [None]:
def kalman_filter(x, P):
  """Run one predict/update cycle per entry of the global ``measurements``.

  Uses the module-level matrices ``F``, ``u``, ``H``, ``R`` and ``I``.

  Parameters
  ----------
  x : ndarray
      Initial state (column vector).
  P : ndarray
      Initial state covariance.

  Returns
  -------
  tuple
      ``(x, P)`` after the last measurement, both converted to nested lists.
  """
  for observed in measurements:
    # Prediction step
    x = F @ x + u
    P = F @ P @ F.T

    # Measurement update step
    Z = np.array([observed])
    innovation = Z.T - H @ x
    innovation_cov = H @ P @ H.T + R
    gain = P @ H.T @ np.linalg.inv(innovation_cov)
    x = x + gain @ innovation
    P = (I - gain @ H) @ P

  return x.tolist(), P.tolist()

In [None]:
print("6回の計測後の位置と速度の予測値：\n{}".format(kalman_filter(x, P)[0]))

In [None]:
# 得られた予測値は、6回目の観測値[12,10]よりわずかに下方へ修正されています。
# 速度は10分の1にして0.1秒単位で計算すると、x方向に約1で、y方向に約-1となっており、観測値の変化から推測できる値に近い結果が出ています

In [None]:
kalman_filter(x, P)

#### <font color=green>**2.3.** </font> Kalman smoother

In [None]:
## 出典 : https://qiita.com/Kosuke-Szk/items/9f7b7d71dc2d435fe2cf

In [None]:
## 衛星の回転運動モデル

import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Number of time steps
N = 60

# Consider a 4-dimensional system that linearly approximates the rotational
# motion of a satellite. State: attitude angle, angular velocity, mean
# component of the angular acceleration, random component of the angular
# acceleration.
nx = 4

# The quantity to estimate is the satellite's attitude angle (1-D observation)
ny = 1

In [None]:
# NOTE: F, H, R and G are np.matrix objects, so they are always 2-D
# (H and G are 1x4 row matrices).
F = np.matrix([[1, 1, 0.5, 0.5],
              [0, 1,   1,   1],
              [0, 0,   1,   0],
              [0, 0,   0, 0.606]]) # transition matrix

H = np.matrix([1,0,0,0])  # observation matrix

R = np.matrix([1]) # observation noise covariance

G = np.matrix([0,0,0,1])  # routes the process noise into the 4th state

# Variance of the Gaussian (process) noise
q = 0.0064

x = np.zeros([N,nx])  # state vectors (true values), one row per step
y = np.zeros([N,ny])  # observation vectors, one row per step

In [None]:
# Initial true state
x[0,:] = np.array([1.25, 0.06, 0.01, -0.003])

# Simulate the true states and the noisy observations.
# np.random.normal takes a standard deviation as its second argument, whereas
# q and R are variances (the filter uses them as covariances), so take the
# square root here to keep the simulation consistent with the filter.
for i in range(1,N):
  x[i,:] = F.dot(x[i-1]) + G*np.random.normal(0, np.sqrt(q))
  y[i,:] = H.dot(x[i,:]) + np.random.normal(0, np.sqrt(R))

In [None]:
# Kalman filter
xp = np.zeros([N, nx])     # one-step-ahead predicted states
Pp = np.zeros([N, nx, nx]) # one-step-ahead predicted covariances

xp[0,:] = np.array([0,0,0,0])  # initial estimate
Pp[0,:,:] = np.diag([10,10,10,10]) # initial covariance

xu = np.copy(xp)   # filtered states
Pu = np.copy(Pp)   # filtered covariances

# Process-noise covariance G^T q G as a full (nx, nx) matrix. G is a row
# vector, so the previous G.dot(q).dot(G.T) collapsed to a 1x1 value that was
# then broadcast onto every entry of the predicted covariance.
state_noise_cov = q * np.outer(G, G)

for i in range(1,N):
  # Time update step
  xp[i,:] = F.dot(xu[i-1,:])
  Pp[i,:,:] = F.dot(Pu[i-1,:,:]).dot(F.T) + state_noise_cov
  # Measurement update step
  K = Pp[i,:,:].dot(H.T).dot(np.linalg.inv(H.dot(Pp[i,:,:]).dot(H.T)+R))
  xu[i,:] = xp[i,:] + K.dot(y[i,:] - H.dot(xp[i,:])).squeeze()
  Pu[i,:,:] = Pp[i,:,:] - K.dot(H).dot(Pp[i])

In [None]:
# True state, observations and filtered estimate of the first state component.
# Labels containing LaTeX are raw strings so that '\h' is not treated as an
# (invalid) string escape sequence; the label text itself is unchanged.
plt.figure(figsize=(10,6))
plt.plot(x[:,0], label="True $x_t$")
plt.plot(y[:,0], marker="x", alpha=0.8, label="Observation $y_t$")
plt.plot(xu[:,0], marker=".", alpha=0.8, label=r"Filtered estimate $\hat{x_{t/t}}$")
plt.legend()
plt.ylabel(r"$x_t,y_t,\hat{x_{t/t}}$")
plt.xlabel("Number of steps t")
plt.show()

In [None]:
# Kalman smoother (backward pass over the filter output)
xs = np.zeros([N, nx])     # smoothed states
Ps = np.zeros([N, nx, nx]) # smoothed covariances

# At the last step the smoothed estimate equals the filtered estimate
xs[-1,:] = xu[-1,:]
Ps[-1,:,:] = Pu[-1,:,:]

for i in reversed(range(N-1)):
    # Smoothing step
    # C is the smoother gain built from the filtered covariance at step i
    # and the predicted covariance at step i+1.
    C = Pu[i,:,:].dot(F.T).dot(np.linalg.inv(Pp[i+1,:,:]))
    Ps[i,:,:] = Pu[i,:,:] + C.dot(Ps[i+1,:,:]-Pp[i+1,:,:]).dot(C.T)
    xs[i,:] = xu[i,:] + C.dot(xs[i+1]-xp[i+1])

In [None]:
# True state, observations, filtered and smoothed estimates of the first state
# component. Labels containing LaTeX are raw strings so that '\h' is not
# treated as an (invalid) string escape sequence; the label text is unchanged.
plt.figure(figsize=(10,6))
plt.plot(x[:,0], label="True $x_t$")
plt.plot(y[:,0], marker="x", alpha=0.8, label="Observation $y_t$")
plt.plot(xu[:,0], marker="^", alpha=0.8, markerfacecolor="None", label=r"Filtered estimate $\hat{x_{t/t}}$")
plt.plot(xs[:,0], marker="o", alpha=0.8, markerfacecolor="None", label=r"Smoothed estimate $\hat{x}_{t/N}^{(1)}$")
plt.legend()
plt.ylabel(r"$x_t,y_t,\hat{x_{t/t}}$")
plt.xlabel("Number of steps t")
plt.tight_layout()
# The figure is also written to the current working directory
plt.savefig("ex2_image.png")
plt.show()

### <font color=blue>**3.** </font> Hidden Markov Model

#### <font color=green>**3.1.** </font> Viterbi algorithm

In [None]:
## Viterbi algorithm
## https://ja.m.wikipedia.org/wiki/ビタビアルゴリズム

In [None]:
def forward_viterbi(y, X, sp, tp, ep):
  """Run the forward and Viterbi recursions together over the outputs ``y``.

  Parameters
  ----------
  y : iterable
      Observed outputs, in order.
  X : iterable
      Hidden states (iterated several times, so it must be re-iterable).
  sp : mapping
      Start probability per state.
  tp : mapping of mappings
      ``tp[source][target]`` transition probability.
  ep : mapping of mappings
      ``ep[state][output]`` emission probability.

  Returns
  -------
  tuple
      ``(total, argmax, valmax)``: the summed probability over all paths, the
      most probable state path (``None`` if no path has probability above 0)
      and that path's probability.
  """
  # Per state: (summed probability, best path so far, probability of that path)
  table = {}
  for state in X:
    table[state] = (sp[state], [state], sp[state])

  for symbol in y:
    updated = {}
    for target in X:
      forward_sum = 0
      best_path = None
      best_score = 0
      for origin in X:
        fwd, path, score = table[origin]
        # The output is emitted by the origin state, then we move to the target
        step = ep[origin][symbol] * tp[origin][target]
        forward_sum += fwd * step
        candidate = score * step
        if candidate > best_score:
          best_path, best_score = path + [target], candidate
      updated[target] = (forward_sum, best_path, best_score)
    table = updated

  # Combine the final states: sum for the total, max for the best path
  forward_sum = 0
  best_path = None
  best_score = 0
  for state in X:
    fwd, path, score = table[state]
    forward_sum += fwd
    if score > best_score:
      best_path, best_score = path, score
  return (forward_sum, best_path, best_score)

In [None]:
states = ('Rainy', 'Sunny')

In [None]:
# Probability of each hidden state at the start of the sequence
start_probability = {
    'Rainy': 0.6,
    'Sunny': 0.4
  }

In [None]:
# transition_probability[source][target]: probability of moving from the
# source state to the target state
transition_probability = {
    'Rainy' : {
        'Rainy': 0.7,
        'Sunny': 0.3
         },
    'Sunny' : {
        'Rainy': 0.4,
        'Sunny': 0.6
        }
  }

In [None]:
# emission_probability[state][output]: probability of observing the output
# while in the given hidden state
emission_probability = {
    'Rainy' : {
        'walk': 0.1, 
        'shop': 0.4, 
        'clean': 0.5
        },
    
    'Sunny' : {
        'walk': 0.6,
        'shop': 0.3, 
        'clean': 0.1
        },
  }

In [None]:
# Run the algorithm on a three-step observation sequence
observations = ('walk', 'clean', 'shop')

total, argmax, valmax = forward_viterbi(observations,
                                        states,
                                        start_probability,
                                        transition_probability,
                                        emission_probability)

# total  : summed probability over all state paths
# argmax : most probable state path
# valmax : probability of that path
print("observations \t: {}".format(observations))
print("\t total \t: {}".format(total))
print("\t argmax \t: {}".format(argmax))
print("\t valmax \t: {}".format(valmax))

In [None]:
# Same run as the previous cell, with the observation sequence repeated twice
observations = ('walk', 'clean', 'shop', 'walk', 'clean', 'shop')

total, argmax, valmax = forward_viterbi(observations,
                                        states,
                                        start_probability,
                                        transition_probability,
                                        emission_probability)

print("observations \t: {}".format(observations))
print("\t total \t: {}".format(total))
print("\t argmax \t: {}".format(argmax))
print("\t valmax \t: {}".format(valmax))

#### <font color=green>**3.2.** </font> イカサマを見抜く

In [None]:
## 出典 : https://sites.google.com/site/ryunosukehm/study/ml-with-python/impact-ss2017/hmm_toy_example

In [None]:
!pip install hmmlearn

In [None]:
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from hmmlearn import hmm

In [None]:
# Toss a single fair coin 100 times and plot the resulting sequence

# Draw all tosses in one call as a (100, 1) column of 0/1 values instead of
# growing the array with np.append inside a loop (which copies it every time).
X = np.random.binomial(1, p=0.5, size=(100, 1))

plt.plot(X)
plt.xlabel('Trial')
plt.yticks((0,1))
ax = plt.gca()
ax.set_yticklabels(['Tail','Head'])
plt.show()

In [None]:
'''最初，
  表が出る確率は 0.7  裏が出る確率は 0.3

しかし途中で，
  表が出る確率は 0.0  裏が出る確率は 1.0
にすり替える
'''

In [None]:
# 50 tosses with P(head) = 0.7 followed by 50 tosses with P(head) = 0.0,
# stacked into one (100, 1) column of 0/1 values. Each half is drawn in a
# single call instead of growing the array with np.append inside a loop.
X = np.concatenate([
    np.random.binomial(1, p=0.7, size=(50, 1)),
    np.random.binomial(1, p=0.0, size=(50, 1)),
])

plt.plot(X)
plt.xlabel('Trial')
plt.yticks((0,1))
ax = plt.gca()
ax.set_yticklabels(['Tail','Head'])
plt.show()

In [None]:
# Two-state HMM over a discrete 0/1 observation sequence.
# Newer hmmlearn releases provide CategoricalHMM for exactly this case and
# changed what MultinomialHMM means, so prefer CategoricalHMM when it exists
# and fall back to MultinomialHMM on older releases.
discrete_hmm_cls = getattr(hmm, "CategoricalHMM", hmm.MultinomialHMM)
model = discrete_hmm_cls(n_components=2)

model.fit(X)
# L: log probability of the decoded path, Z: most likely hidden state per trial
L,Z = model.decode(X)

plt.plot(Z)
plt.xlabel('Trial')
plt.yticks((0,1))
plt.show()

In [None]:
print(model.emissionprob_)

# Row 1: P(tail | state 0), P(head | state 0)
# Row 2: P(tail | state 1), P(head | state 1)
# NOTE(review): which fitted state ends up as index 0 or 1 is presumably
# arbitrary between runs — confirm before interpreting the rows.

In [None]:
'''最初
  裏が出る確率 0.3  表が出る確率 0.7
すり替え後
  裏が出る確率 1.0    表が出る確率 0.0 
'''

In [None]:
# Decoded hidden state per trial
# NOTE(review): the tick labels assume state 0 is the fair coin and state 1
# the swapped one; nothing in this cell fixes which fitted state gets which
# index — check against model.emissionprob_.
plt.plot(Z)
plt.xlabel('Trial')
plt.yticks((0,1))
ax = plt.gca()
ax.set_yticklabels(['Fair','Cheating'])
plt.show()

#### <font color=green>**3.3.** </font> Sampling from HMM

In [None]:
## 出典 : https://hmmlearn.readthedocs.io/en/latest/auto_examples/plot_hmm_sampling.html

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from hmmlearn import hmm

In [None]:
# Prepare parameters for a 4-components HMM

# Initial population probability
startprob = np.array([0.6, 0.3, 0.1, 0.0])

# The transition matrix, note that there are no transitions possible
# between component 1 and 3
transmat = np.array([[0.7, 0.2, 0.0, 0.1],
                     [0.3, 0.5, 0.2, 0.0],
                     [0.0, 0.3, 0.5, 0.2],
                     [0.2, 0.0, 0.2, 0.6]])

# The means of each component (one 2-D mean per row)
means = np.array([[0.0,  0.0],
                  [0.0, 11.0],
                  [9.0, 10.0],
                  [11.0, -1.0]])

# The covariance of each component: 0.5 * identity, stacked to shape (4, 2, 2)
covars = .5 * np.tile(np.identity(2), (4, 1, 1))

# Build an HMM instance and set parameters
model = hmm.GaussianHMM(n_components=4, covariance_type="full")

# Instead of fitting it from the data, we directly set the estimated
# parameters, the means and covariance of the components
model.startprob_ = startprob
model.transmat_ = transmat
model.means_ = means
model.covars_ = covars

In [None]:
# Generate samples: X holds the observations, Z the hidden state per sample
X, Z = model.sample(500)

# Plot the sampled data (consecutive samples joined by a line)
plt.figure(figsize=(12, 8))
plt.plot(X[:, 0], X[:, 1], ".-", label="observations", ms=6,
         mfc="orange", alpha=0.7)

# Indicate the component numbers (1-based) at each component mean
for i, m in enumerate(means):
    plt.text(m[0], m[1], 'Component %i' % (i + 1),
             size=17, horizontalalignment='center',
             bbox=dict(alpha=.7, facecolor='w'))
plt.legend(loc='best')
plt.show()

#### <font color=green>**3.4.** </font> HMMlearn 公式チュートリアル

In [None]:
# hmmlearn Tutorial
# https://hmmlearn.readthedocs.io/en/latest/tutorial.html

# 日本語解説記事 :
# http://keik-117.hatenablog.com/entry/2016/07/05/213903
# https://qiita.com/ryo-ma/items/ac26c78cf8ff99bc329c

In [None]:
## Building HMM and generating samples

import numpy as np
from hmmlearn import hmm

np.random.seed(42)

In [None]:
# You can build a HMM instance by passing the parameters described above to the constructor.
# Then, you can generate samples from the HMM by calling sample().

# Three hidden states with full 2x2 covariance matrices
model = hmm.GaussianHMM(n_components=3,
                        covariance_type="full")
model.startprob_ = np.array([0.6, 0.3, 0.1])
model.transmat_ = np.array([[0.7, 0.2, 0.1],
                            [0.3, 0.5, 0.2],
                            [0.3, 0.3, 0.4]])
model.means_ = np.array([[0.0, 0.0],
                          [3.0, -3.0],
                          [5.0, 10.0]])
# Identity covariance for every state, stacked to shape (3, 2, 2)
model.covars_ = np.tile(np.identity(2),
                        (3, 1, 1))
# X: sampled observations, Z: the hidden state that generated each one
X, Z = model.sample(100)

In [None]:
'''GaussianHMM
Covariance parameters shape depends on covariance_type:

(n_components, )                        if "spherical",
(n_components, n_features)              if "diag",
(n_components, n_features, n_features)  if "full"
(n_features, n_features)                if "tied",
'''

In [None]:
X.shape

In [None]:
Z

In [None]:
# The transition probability matrix need not be ergodic.
# For instance, a left-right HMM can be defined as follows:

# init_params / params list the parameter codes that are initialised /
# updated during fitting; the start probabilities are in neither list.
lr = hmm.GaussianHMM(n_components=3,
                     covariance_type="diag",
                     init_params="cm",
                     params="cmt")
# Always start in the first state; each state can only stay or move one
# state to the right, and the last state is absorbing.
lr.startprob_ = np.array([1.0, 0.0, 0.0])
lr.transmat_ = np.array([[0.5, 0.5, 0.0],
                         [0.0, 0.5, 0.5],
                         [0.0, 0.0, 1.0]])

In [None]:
# If any of the required parameters are missing,
# sample() will raise an exception.
# The expected error is caught and shown so that "Run All" continues past
# this cell. The result of sample() is not assigned, so the X and Z from the
# previous cells stay available.

model = hmm.GaussianHMM(n_components=3)
try:
  model.sample(100)
except (ValueError, AttributeError) as err:
  print("sample() failed as expected: {!r}".format(err))

In [None]:
'''Each HMM parameter has a character code 
 which can be used to customize its initialization and estimation.
The EM algorithm needs a starting point to proceed,
 thus prior to training each parameter is assigned a value either
  random or computed from the data. 
It is possible to hook into this process and provide a starting point explicitly. 
To do so
1. ensure that the character code for the parameter is missing from init_params
 and then
2. set the parameter to the desired value.

For example, consider a HMM with an explicitly initialized transition probability matrix:
'''

# The transition matrix code 't' is left out of init_params, so the value
# assigned below is used as the starting point.
model = hmm.GaussianHMM(n_components=3,
                        n_iter=100,
                        init_params="mcs")
model.transmat_ = np.array([[0.7, 0.2, 0.1],
                            [0.3, 0.5, 0.2],
                            [0.3, 0.3, 0.4]])

In [None]:
# A similar trick applies to parameter estimation.
# If you want to fix some parameter at a specific value,
# remove the corresponding character from params and set the parameter value before training.

In [None]:
## Training HMM parameters and inferring the hidden states

# This time, the input is a single sequence of observed values.
# Note, the states in remodel will have a different order than those in the generating model.
# NOTE(review): X is presumably still the 100 samples drawn from the
# generating model in an earlier cell — confirm no cell in between rebinds it.

remodel = hmm.GaussianHMM(n_components=3,
                          covariance_type="full",
                          n_iter=100)
remodel.fit(X)
# Most likely hidden state for every observation in X
Z2 = remodel.predict(X)

In [None]:
Z2

In [None]:
## Monitoring convergence
# You can use the monitor_ attribute to diagnose convergence:

remodel.monitor_

In [None]:
remodel.monitor_.history

In [None]:
remodel.monitor_.converged

In [None]:
## Working with multiple sequences

# Consider two 1D sequences:
X1 = [[0.5], [1.0], [-1.0], [0.42], [0.24]]
X2 = [[2.4], [4.2], [0.5], [-0.24]]

# To pass both sequences to fit() or predict(), first concatenate them
# into a single array and then compute an array of sequence lengths:
# (note that this rebinds X, replacing the samples used in earlier cells)
X = np.concatenate([X1, X2])
lengths = [len(X1), len(X2)]

# Finally, just call the desired method with X and lengths:
hmm.GaussianHMM(n_components=3).fit(X, lengths)

In [None]:
## Saving and loading HMM
# After training, a HMM can be easily persisted for future use with the standard pickle module:
# The file is written to the current working directory. The object returned
# by pickle.load is not assigned here, so it is discarded.
# Only unpickle files from a trusted source: loading a pickle can execute code.

import pickle
with open("filename.pkl", "wb") as file: pickle.dump(remodel, file)
with open("filename.pkl", "rb") as file: pickle.load(file)

In [None]:
# Use custom convergence criteria by subclassing ConvergenceMonitor
# and redefining the converged method.
# The resulting subclass can be used by creating an instance
# and pointing a model’s monitor_ attribute to it prior to fitting.

from hmmlearn.base import ConvergenceMonitor
from hmmlearn import hmm

class ThresholdMonitor(ConvergenceMonitor):
  """Convergence monitor that stops on an absolute threshold.

  Training is considered converged when the iteration budget is used up or
  when the most recent value in `history` reaches `tol`.
  """

  @property
  def converged(self):
    """True once `iter` equals `n_iter` or the latest history value is >= `tol`."""
    if self.iter == self.n_iter:
      return True
    # Guard against an empty history: reading this property before any
    # iteration has been reported would otherwise raise IndexError.
    # NOTE(review): the stock ConvergenceMonitor is understood to perform a
    # similar length check before indexing — confirm against hmmlearn.
    return len(self.history) > 0 and self.history[-1] >= self.tol

# Build a 2-state model and replace its default monitor with a ThresholdMonitor
# configured from the same tol / n_iter / verbose values.
# NOTE: this rebinds the name `model` used by earlier cells.
model = hmm.GaussianHMM(n_components=2, tol=5, verbose=True)
model.monitor_ = ThresholdMonitor(model.monitor_.tol,
                                  model.monitor_.n_iter,
                                  model.monitor_.verbose)

In [None]:
# Instantiate (and display) an unfitted 2-state Gaussian HMM.
from hmmlearn.hmm import GaussianHMM
GaussianHMM(n_components=2)

In [None]:
'''GMMHMM
Covariance parameters shape depends on covariance_type:

(n_components, n_mix)                          if "spherical",
(n_components, n_mix, n_features)              if "diag",
(n_components, n_mix, n_features, n_features)  if "full",
(n_components, n_features, n_features)         if "tied".
'''

In [None]:
# Instantiate (and display) an unfitted 2-state MultinomialHMM.
# NOTE(review): recent hmmlearn releases are understood to have changed the
# semantics of MultinomialHMM and to recommend CategoricalHMM for sequences of
# discrete symbols — confirm which class is intended here.
from hmmlearn.hmm import MultinomialHMM
MultinomialHMM(n_components=2)

#### <font color=green>**3.5.** </font> 自然言語処理への適用例 : 品詞推定

In [None]:
## 出典 : 
# https://www.kabuku.co.jp/developers/hmm
# https://github.com/takafumihoriuchi/natural_language_processing/blob/master/viterbi_pos_estimate.py

In [None]:
"""Created on May 17, 2018
@author: Takafumi Horiuchi
"""

In [None]:
# for POS estimation
import numpy as np
import nltk
import math

# for printing progress-bar
import time
import sys

In [None]:
# remove sentence boundaries from raw bigrams made by nltk
def make_tagged_word_bigrams(sents):
  """Return the within-sentence bigrams of tagged words.

  The sentences are flattened into one token stream (with boundary markers added
  by make_sent_words), turned into bigrams, and the bigram that joins the end
  marker of one sentence to the start marker of the next is dropped.

  Returns a lazy `filter` object; wrap it in list() to materialize it.
  """
  sentence_boundary = (('_end', '</s>'), ('start_', '<s>'))

  def within_sentence(bigram):
    return bigram != sentence_boundary

  return filter(within_sentence, nltk.bigrams(make_sent_words(sents)))

In [None]:
# remake word list
def make_sent_words(sents):
  """Flatten `sents` into one list of tokens.

  Each sentence is first passed through mod_sent(), so the resulting stream
  contains the start/end markers around every sentence.
  """
  return [tagged_word
          for idx in range(len(sents))
          for tagged_word in mod_sent(sents[idx])]

In [None]:
# add dummy tokens (for beginning and ending) to each sentence
def mod_sent(tokens):
  """Return a new list with sentence-boundary markers around `tokens`.

  Args:
    tokens: iterable of (word, tag) pairs for one sentence.

  Returns:
    A new list: ('start_', '<s>'), then the tokens, then ('_end', '</s>').

  The input is deliberately left untouched. Inserting into the caller's list
  would stack additional markers every time the same sentence is processed
  again (e.g. when a cell is re-run) and would fail for non-list inputs.
  """
  return [('start_', '<s>'), *tokens, ('_end', '</s>')]

In [None]:
# p(w|t)
# word emission probability with add-α smoothing
def p_t_w(t_w, tag, word, alpha=0.01):
  """Smoothed emission probability of `word` under `tag`.

  Args:
    t_w: mapping tag -> frequency distribution over words; the distribution
      must support item lookup plus the N() and B() methods.
    tag: POS tag to condition on.
    word: word whose probability is requested.
    alpha: additive smoothing constant.

  Returns:
    (count(tag, word) + alpha) / (N + alpha * B) for the distribution of `tag`.
  """
  word_dist = t_w[tag]
  smoothed_count = word_dist[word] + alpha
  smoothed_total = word_dist.N() + alpha * word_dist.B()
  return smoothed_count / smoothed_total

In [None]:
# p(t_i|t_i-1)
# POS transition probability with add-α smoothing
def p_t_t(t_t, tag1, tag2, alpha=0.01):
  """Smoothed transition probability from `tag1` to `tag2`.

  Args:
    t_t: mapping tag -> frequency distribution over following tags; the
      distribution must support item lookup plus the N() and B() methods.
    tag1: tag to condition on (the previous tag).
    tag2: tag whose probability is requested (the next tag).
    alpha: additive smoothing constant.

  Returns:
    (count(tag1, tag2) + alpha) / (N + alpha * B) for the distribution of `tag1`.
  """
  next_tag_dist = t_t[tag1]
  smoothed_count = next_tag_dist[tag2] + alpha
  smoothed_total = next_tag_dist.N() + alpha * next_tag_dist.B()
  return smoothed_count / smoothed_total

In [None]:
# helper method of viterbi()
def calc_table(S, T, V, i, j, pos_tags, tokens, t_w, t_t):
  """Compute one cell of the Viterbi table.

  Args:
    S: number of POS tags (rows of V).
    T: number of tokens; unused here, kept for the existing call signature.
    V: table of log-probabilities; column i-1 must already be filled.
    i: index of the current token.
    j: index of the candidate tag for the current token.
    pos_tags: sequence of tags, indexed by row number.
    tokens: token sequence, indexed by column number.
    t_w: emission counts, as consumed by p_t_w().
    t_t: transition counts, as consumed by p_t_t().

  Returns:
    (max_prob, max_k): the best log-probability of reaching tag j at token i,
    and the row index of the previous tag that achieves it.
  """
  # The emission term does not depend on the previous tag k, so compute it once
  # instead of once per k. The additions below keep the original order, so the
  # resulting values are unchanged.
  log_emission = math.log(p_t_w(t_w, pos_tags[j], tokens[i]))

  max_prob = -np.inf
  max_k = 0
  for k in range(S):
    prob = V[k][i-1] + log_emission + math.log(p_t_t(t_t, pos_tags[k], pos_tags[j]))
    if prob > max_prob:
      max_prob, max_k = prob, k
  return max_prob, max_k

In [None]:
def viterbi(sentence, pos_tags, t_w, t_t):
  """Estimate the most likely POS tag sequence for `sentence`.

  Args:
    sentence: raw sentence string; it is tokenized with nltk.word_tokenize.
    pos_tags: sequence of candidate POS tags.
    t_w: emission counts, as consumed by p_t_w().
    t_t: transition counts, as consumed by p_t_t().

  Returns:
    A list of (token, tag) pairs, one per token, excluding the dummy start token.
  """
  tokens = nltk.word_tokenize(sentence)   # ['Time', 'flies', 'like', 'an', 'arrow', '.']
  tokens.insert(0, '<s>')                 # ['<s>', 'Time', 'flies', 'like', 'an', 'arrow', '.']

  S = len(pos_tags)                       # S: number of POS (47)
  T = len(tokens)                         # T: number of tokens
  V = np.zeros((S, T), dtype=np.float32)  # V: log-probability table
  B = np.zeros((S, T), dtype=int)         # B: back-pointer table

  ## initialization
  # Column 0 belongs to the dummy start token, whose tag is known to be '<s>'.
  # Give that state log-probability 0 and every other state -inf, so that the
  # first real token is scored with p(tag | '<s>'). With an all-zero column the
  # induction would instead pick the most favourable previous tag of any kind.
  # If '<s>' is not among pos_tags the column is left at zero.
  start_idx = next((idx for idx, tag in enumerate(pos_tags) if tag == '<s>'), None)
  if start_idx is not None:
    V[:, 0] = -np.inf
    V[start_idx, 0] = 0.0

  ## induction
  for i in range(1, T):
    for j in range(S):
      V[j][i], B[j][i] = calc_table(S, T, V, i, j, pos_tags, tokens, t_w, t_t)

  ## termination and path-readout
  X = np.zeros((T), dtype=int)
  max_prob = -np.inf
  for j in range(S):
    if V[j][T-1] > max_prob:
      max_prob = V[j][T-1]
      X[T-1] = j
  # follow the back-pointers from the last token to the first
  for i in range(T-2, -1, -1):
    X[i] = B[X[i+1]][i+1]

  # convert POS-index to POS-tag
  pos_seq = [pos_tags[pos_idx] for pos_idx in X]

  # drop the dummy start token and its tag
  return list(zip(tokens[1:], pos_seq[1:]))

In [None]:
def setup_progbar(width):
  """Draw an empty progress bar of `width` cells on stdout.

  Writes the bracketed, space-filled bar, flushes, then writes width + 1
  backspace characters so later output starts just inside the opening bracket.
  """
  out = sys.stdout
  out.write("[{}]".format(" " * width))
  out.flush()
  out.write("\b" * (width + 1))

In [None]:
def update_progbar():
  """Advance the progress bar by one cell: write '=' to stdout and flush."""
  out = sys.stdout
  out.write("=")
  out.flush()

In [None]:
def calc_accuracy(tagged_sents, pos_tags, t_w, t_t):
  """Evaluate the tagger on gold-tagged sentences.

  Each sentence is rebuilt as a space-joined string, tagged with viterbi(), and
  the predicted tags are compared position by position with the gold tags.

  Args:
    tagged_sents: sized collection of sentences, each a sequence of (word, tag).
    pos_tags: sequence of candidate POS tags.
    t_w: emission counts, as consumed by p_t_w().
    t_t: transition counts, as consumed by p_t_t().

  Returns:
    (accuracy_token, accuracy_sent, pos_accuracy) where pos_accuracy maps a tag
    to {'correct', 'total', 'accuracy'}. 'accuracy' is None for a tag that never
    occurs in the gold data. Gold tags missing from `pos_tags` get an entry too.
    Both overall accuracies are 0.0 when there is nothing to evaluate.
  """
  ## setup progress bar
  test_size = len(tagged_sents)
  max_width = 78
  progbar_width = min(test_size, max_width)
  setup_progbar(progbar_width)
  ticks_shown = 0

  ## setup dictionary for POS specific accuracy
  pos_accuracy = {pos: {'correct': 0, 'total': 0, 'accuracy': 0.0}
                  for pos in pos_tags}

  ## evaluate created HMM
  total_word_cnt = 0
  correct_word_cnt = 0
  correct_sent_cnt = 0
  for sent_no, answer in enumerate(tagged_sents, start=1):
    sentence = " ".join(str(tagged_word[0]) for tagged_word in answer)
    token_pos = viterbi(sentence, pos_tags, t_w, t_t)
    # NOTE(review): viterbi() re-tokenizes the joined string; if that yields a
    # different token sequence than the gold one, zip() silently truncates or
    # misaligns the comparison — confirm whether this is acceptable.
    all_pos_matched = True
    for pred, ans in zip(token_pos, answer):
      # A gold tag that is not in pos_tags would otherwise raise KeyError.
      gold_stats = pos_accuracy.setdefault(
          ans[1], {'correct': 0, 'total': 0, 'accuracy': 0.0})
      gold_stats['total'] += 1
      total_word_cnt += 1
      if pred[1] == ans[1]:
        correct_word_cnt += 1
        gold_stats['correct'] += 1
      else:
        all_pos_matched = False
    if all_pos_matched:
      correct_sent_cnt += 1

    # update progress bar: integer arithmetic draws exactly progbar_width ticks
    # by the time the last sentence is processed
    ticks_due = sent_no * progbar_width // test_size
    while ticks_shown < ticks_due:
      update_progbar()
      ticks_shown += 1

  # calculate POS specific accuracy
  for stats in pos_accuracy.values():
    if stats['total'] == 0:
      stats['accuracy'] = None
    else:
      stats['accuracy'] = stats['correct'] / stats['total']

  # avoid ZeroDivisionError when no sentence or no token was evaluated
  accuracy_token = correct_word_cnt / total_word_cnt if total_word_cnt else 0.0
  accuracy_sent = correct_sent_cnt / test_size if test_size else 0.0
  return accuracy_token, accuracy_sent, pos_accuracy

In [None]:
# Print an introductory banner with an example of the tagger's output format.
# print() joins its arguments with a single space, so every banner line after
# the first starts with one space.
print("\n+------------------------------------------------------------------------------+\n",
      "This is a HMM based POS estimator created by Takafumi Horiuchi in May of 2018.\n",
      "Input of the sentence \"We choose to go to the Moon.\" could output the following:\n",
      "[('We', 'PRP'), ('choose', 'VB'), ('to', 'TO'), ('go', 'VB'), ('to', 'TO'), ('the', 'DT'), ('moon', 'NN'), ('.', '.')]\n",
      "\nloading POS tagsets (may consume few seconds) ...\n")

In [None]:
## load POS tagset from Penn Treebank
nltk.download('treebank')
# nltk.word_tokenize() (used by viterbi) needs the Punkt tokenizer models.
# Newer NLTK releases look them up under the 'punkt_tab' resource instead of
# 'punkt', so request both to work with old and new versions alike.
nltk.download('punkt')
nltk.download('punkt_tab')
tagged_sents = nltk.corpus.treebank.tagged_sents()

In [None]:
## split tagset to train and test
# The first 80% of the sentences (in corpus order, no shuffling) are used for
# training and the remainder for testing.
train_ratio = 0.8
train_size = int(len(tagged_sents) * train_ratio)
tagged_sents_train = tagged_sents[:train_size]
tagged_sents_test = tagged_sents[train_size:]

# Materialize the within-sentence bigrams of (word, tag) pairs from the training split.
tagged_word_bigrams = list(make_tagged_word_bigrams(tagged_sents_train))

In [None]:
## word emission count (t_w[tag][word])
# For every bigram, count the first element's word under the first element's tag.
t_w = nltk.ConditionalFreqDist([(d[0][1], d[0][0]) for d in tagged_word_bigrams])

In [None]:
## state transition count (t_t[t1][t2])
# For every bigram, count the second element's tag under the first element's tag.
t_t = nltk.ConditionalFreqDist([(d[0][1], d[1][1]) for d in tagged_word_bigrams])

In [None]:
## a list of possible pos tags (</s> is not included)
# These are the conditions of t_t, i.e. every tag that occurs as the first
# element of a training bigram.
pos_tags = list(t_t.keys())

In [None]:
## sentence to evaluate POS
# A fixed sample sentence is used; the commented-out line below is the
# interactive alternative.

#sentence = input("input a sentence: ")
sentence = "I have a pen."  ###

# Tag the sentence; the result is a list of (token, tag) pairs.
token_pos = viterbi(sentence, pos_tags, t_w, t_t)

In [None]:
#######
# This cell is very slow: the accuracy measurement tags every test sentence.
#######

## show results
print("POS estimation result:")
for each_token_pos in token_pos:
  print(each_token_pos)

# Everything below must stay outside the loop above. Evaluating inside it would
# repeat the whole test-set pass once per token of the sample sentence.
print("\nConditions: Penn-Treebank as POS tagset; train : test = %.2f : %.2f" % (train_ratio, 1 - train_ratio))
print("--------------------------------------------------------------------------------")

## test model precision
print("measuring precision of model (may consume several minutes) ...")
prec_token, prec_sent, pos_acc = calc_accuracy(tagged_sents_test, pos_tags, t_w, t_t)
print("\nmodel precision")
print("token based accuracy    :", prec_token)
print("sentence based accuracy :", prec_sent)
print("\nPOS specific accuracy   :")

for each_pos in pos_tags:
  print(each_pos, "\t---\t", pos_acc[each_pos])

print("+------------------------------------------------------------------------------+\n")

#### <font color=green>**3.6.** </font> HDP-HMM（階層ディリクレ過程隠れマルコフモデル）

In [None]:
# ライブラリ : https://github.com/bnpy/bnpy

# サンプルコード : https://bnpy.readthedocs.io/en/latest/examples/08_mocap6/plot-02-demo=merge_moves_for_hdphmm.html

In [None]:
# Clone the bnpy sources into the current working directory.
!git clone https://github.com/bnpy/bnpy.git

In [None]:
!pip install -e /content/bnpy/.

In [None]:
## install成功したらruntimeを再起動させる

In [None]:
## Merge moves with HDP-HMM
# https://bnpy.readthedocs.io/en/latest/examples/08_mocap6/plot-02-demo=merge_moves_for_hdphmm.html

In [None]:
import bnpy
import numpy as np
import os

from matplotlib import pylab
import seaborn as sns

# Default figure size applied to every figure created below.
FIG_SIZE = (10, 5)
pylab.rcParams['figure.figsize'] = FIG_SIZE

In [None]:
# Setup: Load data

# Read 'dataset.npz' from the 'mocap6' folder under bnpy.DATASET_PATH.
dataset_path = os.path.join(bnpy.DATASET_PATH, 'mocap6')
dataset = bnpy.data.GroupXData.read_npz(
    os.path.join(dataset_path, 'dataset.npz'))

In [None]:
'''データの説明 : https://github.com/bnpy/bnpy/tree/master/bnpy/datasets/mocap6

Six sequences were collected from files available at mocap.cs.cmu.edu:
  - Subject 13: trials 29, 30, and 31
  - Subject 14: trials 6, 14, and 20

Each of the six sequences has been annotated to indicate which of a set of 12 possible exercises is being performed at each timestep.

The raw AMC mocap sensor data from these sequences was post-processed as follows:
  - 12 sensor channels were kept as representative of gross motor behavior. Remaining channels were discarded.
  - Each sensor channel was adjusted to have zero-mean.
  - Each channel was block-averaged to a final frame rate of 10 fps (down from 120 fps in the raw data).
'''

In [None]:
# Setup: Initialization hyperparameters

# Keyword arguments forwarded to every bnpy.run() call below (initialization).
init_kwargs = dict(
    K=20,  # presumably the initial number of states — confirm against the bnpy docs
    initname='randexamples',
    )

# Keyword arguments forwarded to every bnpy.run() call below (learning algorithm).
alg_kwargs = dict(
    nLap=29,
    nTask=1, nBatch=1, convergeThr=0.0001,
    )

In [None]:
# Setup: HDP-HMM hyperparameters

# Keyword arguments forwarded to every bnpy.run() call below (allocation model).
hdphmm_kwargs = dict(
    gamma = 5.0,       # top-level Dirichlet concentration parameter
    transAlpha = 0.5,  # trans-level Dirichlet concentration parameter
    )

In [None]:
# Setup: Gaussian observation model hyperparameters

# Keyword arguments forwarded to every bnpy.run() call below (observation model).
gauss_kwargs = dict(
    sF = 1.0,          # Set prior so E[covariance] = identity
    ECovMat = 'eye',
    )

In [None]:
# All-Pairs : Try all possible pairs of merges every 10 laps

allpairs_merge_kwargs = {
    'm_startLap': 10,
    # Limit on the number of merges attempted each lap: the maximum number of
    # tries for each cluster. A very high value (50) effectively means
    # "try all pairs".
    'm_maxNumPairsContainingComp': 50,
    # "Reactivation" limits: a cluster becomes eligible again after 10 passes
    # through the dataset, or when its size changes by 400%.
    'm_nLapToReactivate': 10,
    'm_minPercChangeInNumAtomsToReactivate': 400 * 0.01,
    # How to rank pairs (the order in which merges are tried):
    # 'total_size' + 'descending' tries the largest combined clusters first.
    'm_pair_ranking_procedure': 'total_size',
    'm_pair_ranking_direction': 'descending',
}

# Merge all keyword groups into one dict; on a duplicate key the later group wins.
allpairs_trained_model, allpairs_info_dict = bnpy.run(
    dataset, 'HDPHMM', 'DiagGauss', 'memoVB',
    output_path='/tmp/mocap6/trymerge-K=20-model=HDPHMM+DiagGauss-ECovMat=1*eye-merge_strategy=all_pairs/',
    moves='merge,shuffle',
    **{
        **alg_kwargs,
        **init_kwargs,
        **hdphmm_kwargs,
        **gauss_kwargs,
        **allpairs_merge_kwargs,
    }
)

In [None]:
# Large-Pairs : Try 5-largest-size pairs of merges every 10 laps

largepairs_merge_kwargs = {
    'm_startLap': 10,
    # Limit on the number of merges attempted each lap: the maximum number of
    # tries for each cluster.
    'm_maxNumPairsContainingComp': 5,
    # "Reactivation" limits: a cluster becomes eligible again after 10 passes
    # through the dataset, or when its size changes by 400%.
    'm_nLapToReactivate': 10,
    'm_minPercChangeInNumAtomsToReactivate': 400 * 0.01,
    # How to rank pairs (the order in which merges are tried):
    # 'total_size' + 'descending' tries the largest size clusters first.
    'm_pair_ranking_procedure': 'total_size',
    'm_pair_ranking_direction': 'descending',
}

# Merge all keyword groups into one dict; on a duplicate key the later group wins.
largepairs_trained_model, largepairs_info_dict = bnpy.run(
    dataset, 'HDPHMM', 'DiagGauss', 'memoVB',
    output_path='/tmp/mocap6/trymerge-K=20-model=HDPHMM+DiagGauss-ECovMat=1*eye-merge_strategy=large_pairs/',
    moves='merge,shuffle',
    **{
        **alg_kwargs,
        **init_kwargs,
        **hdphmm_kwargs,
        **gauss_kwargs,
        **largepairs_merge_kwargs,
    }
)

In [None]:
# Good-ELBO-Pairs : Rank pairs of merges by improvement to observation model

goodelbopairs_merge_kwargs = {
    'm_startLap': 10,
    # Limit on the number of merges attempted each lap: the maximum number of
    # tries for each cluster.
    'm_maxNumPairsContainingComp': 5,
    # "Reactivation" limits: a cluster becomes eligible again after 10 passes
    # through the dataset, or when its size changes by 400%.
    'm_nLapToReactivate': 10,
    'm_minPercChangeInNumAtomsToReactivate': 400 * 0.01,
    # How to rank pairs (the order in which merges are tried):
    # 'obsmodel_elbo' ranks pairs by improvement to the observation model ELBO.
    'm_pair_ranking_procedure': 'obsmodel_elbo',
    'm_pair_ranking_direction': 'descending',
}

# Merge all keyword groups into one dict; on a duplicate key the later group wins.
goodelbopairs_trained_model, goodelbopairs_info_dict = bnpy.run(
    dataset, 'HDPHMM', 'DiagGauss', 'memoVB',
    output_path='/tmp/mocap6/trymerge-K=20-model=HDPHMM+DiagGauss-ECovMat=1*eye-merge_strategy=good_elbo_pairs/',
    moves='merge,shuffle',
    **{
        **alg_kwargs,
        **init_kwargs,
        **hdphmm_kwargs,
        **gauss_kwargs,
        **goodelbopairs_merge_kwargs,
    }
)

In [None]:
# Compare loss function vs wallclock time

# One curve per merge strategy: the run's loss history against its elapsed
# seconds, each with its own color and legend label.
pylab.figure()
for info_dict, color_str, label_str in [
        (allpairs_info_dict, 'k', 'all_pairs'),
        (largepairs_info_dict, 'g', 'large_pairs'),
        (goodelbopairs_info_dict, 'b', 'good_elbo_pairs')]:
    pylab.plot(
        info_dict['elapsed_time_sec_history'],
        info_dict['loss_history'],
        '.-',
        color=color_str,
        label=label_str)
pylab.legend(loc='upper right')
pylab.xlabel('elapsed time (sec)')
pylab.ylabel('loss')

In [None]:
# Compare number of active clusters vs wallclock time

# One curve per merge strategy: the run's K history against its elapsed
# seconds, each with its own color and legend label.
pylab.figure()
for info_dict, color_str, label_str in [
        (allpairs_info_dict, 'k', 'all_pairs'),
        (largepairs_info_dict, 'g', 'large_pairs'),
        (goodelbopairs_info_dict, 'b', 'good_elbo_pairs')]:
    pylab.plot(
        info_dict['elapsed_time_sec_history'],
        info_dict['K_history'],
        '.-',
        color=color_str,
        label=label_str)
pylab.legend(loc='upper right')
pylab.xlabel('elapsed time (sec)')
pylab.ylabel('num. clusters (K)')

# Show the figures without blocking execution.
pylab.show(block=False)