# 1. Factor model to reduce dimensionality

$$\eta_t=\Phi \eta_{t-1} + \nu_t$$
$$x_t=\Lambda \eta_t + \epsilon_t$$

Where $\nu_t\sim \mathcal{N}_k(0,I)$ and $\epsilon_t\sim \mathcal{N}_p(0, \Sigma_0)$, with $k < p$. We can use the Kalman filter to estimate this model and fill missing values. The latent factors $\eta_t$ are going to be used to build the predictive model.

Open question: how can I add structure so that the factors represent components of different frequencies, in other words, components that change at different paces?

In [None]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from src.d01_data.dengue_data_api import DengueDataApi
from src.d00_utils.utils import variable_analysis

In [None]:
from scipy import signal
# Load the dengue data through the project API with interpolate=False
# (presumably this leaves missing values in place so the state-space model
# can handle them, as the markdown above suggests — TODO confirm).
dda = DengueDataApi(interpolate=False)
# Split into two feature/target pairs, (x1, y1) and (x2, y2). random=False
# presumably yields a deterministic, non-shuffled split — TODO confirm in
# DengueDataApi.split_data.
x1, x2, y1, y2 = dda.split_data(random=False)

In [None]:
from src.d04_modeling.dynamic_factor_model import DynamicFactorModel
from collections import defaultdict
    
# Build the dynamic factor model from the first split. Copies are passed, so
# the x1/y1 objects themselves are not handed to the model.
# factors=3 requests three latent factors; factor_orders=2 is presumably the
# lag order of the factor transition.
# NOTE(review): the markdown above writes a first-order transition
# (eta_t = Phi eta_{t-1} + nu_t) — confirm which order is intended.
dfm_model = DynamicFactorModel(x1.copy(), y1.copy(), factors=3, factor_orders=2)
# Estimate the model (details live in DynamicFactorModel.fit).
dfm_model.fit()

In [None]:
# City key used to select results/model and to slice the data with .loc below
# ('sj' presumably stands for San Juan — confirm against the data set).
city = 'sj'
# Fitted results object and the underlying model object for this city, as
# exposed by DynamicFactorModel.
res_dfm = dfm_model.get_model_results(city)
mod_dfm = dfm_model.get_model(city)

In [None]:
# Filtered estimates of the latent factors for `city`.
# Work on a copy so that renaming the columns below cannot alter the frame
# held by the results object.
eta_filtered = res_dfm.factors['filtered'].copy()
# Name the factors eta0, eta1, ... from the actual number of columns instead
# of repeating the factor count used when the model was built.
eta_filtered.columns = ['eta%i' % i for i in range(eta_filtered.shape[1])]
# Covariance of the filtered factor estimates, kept for later use.
eta_filtered_cov = res_dfm.factors['filtered_cov']

fig, ax = plt.subplots(figsize=(8, 6))
for factor in eta_filtered.columns:
    ax.plot(eta_filtered.index, eta_filtered[factor], label=factor)
ax.set_title('Filtered latent factors (%s)' % city)
ax.set_xlabel('time')
ax.set_ylabel('factor value')
# The labels passed to ax.plot are only rendered once a legend is requested.
ax.legend()
plt.show()

In [None]:
# For every latent factor: print its own first-lag coefficient from the fitted
# parameters, then the head of the table returned by variable_analysis.
# Iterating the columns keeps this cell in step with the number of factors
# instead of repeating the count as a literal.
# NOTE(review): the parameter key 'L1.<i>-><i>' assumes the model labels its
# factors 0, 1, 2, ... — confirm against res_dfm.params.index.
for i, col in enumerate(eta_filtered.columns):
    ar_coef = res_dfm.params['L1.%i->%i' % (i, i)]
    print("[eta%i] AR coefficient: %.4f" % (i, ar_coef))
    # ylim is forwarded to variable_analysis (presumably the y-range of a
    # plot it draws — TODO confirm in src.d00_utils.utils).
    factor_summary = variable_analysis(eta_filtered[col], col, ylim=[1e-2, 1e3])
    print(factor_summary.head())

In [None]:
# Diagnostics on the split the model was built from (x1, y1): prediction plot,
# residual analysis, and the model's in-sample evaluation.
dfm_model.plot_prediction(x1, y1)
dfm_model.analyze_residuals(x1, y1)
dfm_model.insample_model_evaluation()
# Same prediction plot on the second split (x2, y2), which was not passed to
# the model constructor, followed by its MAE (presumably mean absolute error
# — see DynamicFactorModel.get_mae).
dfm_model.plot_prediction(x2, y2)
print("MAE DFM: %.4f" % dfm_model.get_mae(x2, y2))

In [None]:
# Extend the fitted results with the (x2, y2) data for `city`, with the
# observed target included, and compare the predictions with the observations.
# NOTE(review): res_dfm.extend / res.predict are assumed to follow the
# statsmodels results API (fitted parameters reused, no re-estimation) —
# confirm against what DynamicFactorModel.get_model_results returns.
endog, exog = dfm_model.format_data_arimax(x2.loc[city], y2.loc[city], interpolate=False)
# The model takes the target and the covariates together as a single frame.
endog = pd.concat([endog.to_frame(), exog], axis=1)
res = res_dfm.extend(endog=endog)
y_log_hat = res.predict()['total_cases']
# Observed target put through the same resample + transform pipeline as the
# model's endog, so both series are on the same scale.
y_log = dfm_model.transform_endog(dfm_model.resample(y2.loc[city]))

# The figure is created only after the data preparation has succeeded.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(y_log_hat.index, y_log_hat, label='prediction')
ax.plot(y_log.index, y_log, label='obs')
ax.set_title('Prediction vs. observed (%s): target observed in extension' % city)
ax.set_xlabel('time')
ax.set_ylabel('transformed total_cases')
ax.legend()
plt.show()

In [None]:
# Same comparison as with the observed target, but the target is replaced by
# NaN before extending the results, so the predictions cannot draw on the
# observed case counts of this period.
# NOTE(review): res_dfm.extend / res.predict are assumed to follow the
# statsmodels results API (fitted parameters reused, missing values handled
# by the filter) — confirm against DynamicFactorModel.get_model_results.
y2_nan = pd.Series(np.nan, index=x2.index, name=y2.name)
endog, exog = dfm_model.format_data_arimax(x2.loc[city], y2_nan.loc[city], interpolate=False)
# The model takes the (masked) target and the covariates as a single frame.
endog = pd.concat([endog.to_frame(), exog], axis=1)
res = res_dfm.extend(endog=endog)
y_log_hat = res.predict()['total_cases']
# The real observations, resampled and transformed, used only for plotting.
y_log = dfm_model.transform_endog(dfm_model.resample(y2.loc[city]))

# The figure is created only after the data preparation has succeeded.
fig, ax = plt.subplots(figsize=(8, 6))
# Prediction first, observations second: the same order (and therefore the
# same colours) as the observed-target figure.
ax.plot(y_log_hat.index, y_log_hat, label='prediction')
ax.plot(y_log.index, y_log, label='obs')
ax.set_title('Prediction vs. observed (%s): target masked with NaN in extension' % city)
ax.set_xlabel('time')
ax.set_ylabel('transformed total_cases')
ax.legend()
plt.show()