## Programming Assignment 03

Student ID: 

Collaborators: 

In [53]:
import numpy as np
import pandas as pd
import scipy.stats as st

from statsmodels.tsa.api import VAR

#### Helper Functions from previous assignments

In [54]:
def Z_matrix(y: np.array, p: int, c: int):
    """Calculate the Z-matrix for a given input

    Args:
        y (np.array): input with all the data of shape (T + p) × K
        p (int): lags
        c (int): intercept yes=1, no=0

    Returns:
        (np.array): Z-matrix for given input
    """

    y = y.T

    #determine matrix dimensions:
    T = y.shape[1] - p
    K = y.shape[0]

    # build Z-matrix
    if c==1:
        Z = np.ones((1, T+p), dtype=float)

    # 1b stacked lagged data
    for i in range(p):
        #add i columns of leading zeros (EDIT: empty, comp cost lower) to ktpmat
        zeros = np.zeros((K, i), dtype=float)
        zerostack = np.hstack((zeros, y[:,:(T+p-i)]))
        # vertically stack this to Z
        Z = np.vstack((Z, zerostack))

    # cutting of leading p columns and retrieving Z
    Z = Z[:, p-1:-1]

    return Z

In [55]:
def B_matrix(y: np.array, p: int, c: int):
    """Calculates the B matrix with the estimated coefficients

    Args:
        y (np.array): input with all the data of shape (T + p) × K
        p (int): lags
        c (int): intercept yes=1, no=0

    Returns:
        _type_: B = matrix with estimated coefficients; Z=Z-matrix; sigma_u=covariance matrix
    """

    # get Z-matrix from function above
    Z = Z_matrix(y, p, c)

    y = y.T # transpose y
    y = y[:,p:] # first p observations are lost as we need prior lags for estimation
    K = y.shape[0] # number of variables
    T = y.shape[1] # number of observations

    # calculate B
    B = y @ Z.T @ np.linalg.inv((Z@Z.T))

    # calculate sigma_u (covariance matrix)
    sigma_u = (1/(T-K*p-1))*(y-(B@Z))@(y-(B@Z)).T

    return B, Z, sigma_u

#### Exercise 1

In [56]:
def resid_bootstrap(Tpkmat, p):
    '''
    :param Tpkmat: a T + p × K matrix of observations on yt,
    :param p: the lag length p,
    :param R: and the number of bootstrap replications R as input.
    :return: returns the bootstrap standard errors of the VAR coefficients in B
    '''

    y = Tpkmat.T # transpose input matrix to K x (T+p)
    T = y.shape[1] - p # get T (number of observations)
    K = y.shape[0]

    '''
    Description from Lütkepohl, appendix D, page 709
    (1) The parameters of the model under consideration are estimated. Let uthat, t = 1, . . . , T, be the estimation residuals.
    '''

    B, Z, sigma_u = B_matrix(Tpkmat, p, c=1)

    '''
    (2) Centered residuals are computed (usual average). Bootstrap residuals u∗1, . . . , u∗T are then obtained by randomly drawing with replacement from the centered residuals.
    '''

    uthat = y[:,p:] - (B@Z)
    uthatbar = np.sum(uthat, axis=1)/T
    uthatcenterded = uthat - uthatbar.T.reshape(K, 1)
    draws = np.random.randint(0, T, T)


    '''
    (3) Bootstrap time series are computed recursively [...]where the same initial values may be used for each generated series, (y∗ −p+1, . . . , y∗0) = (y−p+1, . . . , y0).
    '''
    # set bootstrap time series pre-sample values to the same presample series from original data for every repetition

    bs_y = y[:,:p]

    for i in range(T):
        y_t = B[:,0] + uthatcenterded[:,draws[i]]
        for l in range(p):
            y_t = y_t + (B[:, (l*K+1):(l*K+K+1)] @ bs_y[:,-l])
        bs_y = np.hstack((bs_y, y_t.reshape(K, 1)))

    B_bs, Z_bs, sigma_u_bs = B_matrix(bs_y.T, p, c=1)

    return B_bs, Z_bs, sigma_u_bs

In [57]:
def bootstrap_se(Tpkmat, p, R):
    K = Tpkmat.shape[1]

    B_bs_list = np.empty((K, K*p+1))
    for i in range(R):
        B_bs, _, _ = resid_bootstrap(Tpkmat, p)
        B_bs_list = np.dstack((B_bs_list, B_bs))

    Bbar_bs_list = np.mean(B_bs_list, axis = 2)
    deviation = B_bs_list - Bbar_bs_list[:, :, None]
    deviation_squared = deviation**2
    sd = np.sqrt(np.sum(deviation_squared, axis=2)/(R-1))
    se = sd/np.sqrt(R)

    return se

#### Exercise 2

In [58]:
# read in data
awm = pd.read_csv("awm19up18.csv")
awm.rename(columns={awm.columns[0]: "Q" }, inplace = True)

of_interest = ["Q", "YER", "ITR", "LTN", "STN"]
awm = awm[awm.columns.intersection(of_interest)]
awm.set_index('Q', inplace=True)

In [59]:
# calculate logs and first differences and assign names accordingly
awm["YER_log"] = np.log(awm['YER'])
awm["ITR_log"] = np.log(awm['ITR'])

awm["d_lgdp"] = awm["YER_log"].diff()
awm["d_invest"] = awm["ITR_log"].diff()

awm["d_lgdp"] = awm["d_lgdp"] * 400
awm["d_invest"] = awm["d_invest"] * 400

awm["d_R"] = awm["LTN"].diff()
awm["d_r"] = awm["STN"].diff()

awm.dropna(inplace=True)

In [60]:
# get the input for our function
y_t = np.array(awm[["d_lgdp", "d_invest", "d_R", "d_r"]])

In [61]:
# test our function
B, Z, sigma_u = B_matrix(y_t, p=2, c=1)
B_se = bootstrap_se(y_t, 2, R=499)

In [62]:
# show our B
B

array([[ 8.19991990e-01,  4.41893745e-01, -3.09516135e-02,
         3.53840313e-01,  2.80812932e-01,  1.40348551e-01,
         2.89658568e-02, -6.61782716e-01, -6.99814626e-01],
       [-1.00607134e+00,  1.20312057e+00, -2.28829643e-01,
        -1.26631209e+00,  6.99606104e-01, -5.17309344e-02,
         2.41250166e-01, -1.21968589e+00, -1.57970603e+00],
       [-6.57473325e-02,  1.20603459e-02,  1.78784492e-04,
         5.15582681e-01,  3.78561218e-02,  1.32232293e-02,
        -5.88137520e-03, -2.20154123e-01,  7.06111408e-02],
       [-2.43278598e-01,  7.28410851e-02, -7.47895590e-03,
         4.22382718e-01,  2.78528354e-01,  3.87124972e-02,
        -2.40842053e-03, -2.89226900e-01, -7.25841414e-02]])

In [63]:
# show our B_se
B_se

array([[0.77199959, 2.72830039, 3.01713176, 0.0250545 , 0.09392389,
        2.42179748, 2.91557482, 0.05281537, 0.04734476],
       [0.58567125, 3.36774495, 3.01003662, 0.07035399, 0.14134962,
        2.63616228, 5.41163659, 0.11859751, 0.11287958],
       [0.01557032, 0.04975942, 0.12830272, 0.02851678, 0.03290027,
        0.04137466, 0.07066602, 0.00897546, 0.02203668],
       [0.01727625, 0.22509772, 0.40636296, 0.03550641, 0.07136207,
        0.18387232, 0.34499978, 0.00989128, 0.02510667]])

In [88]:
# Compare to built-in function from VAR package
model = VAR(y_t)
var = model.fit(2) #number = lag order
var.bse # standard errors

array([[0.2697177 , 0.6727813 , 0.04033584, 0.0646388 ],
       [0.09634184, 0.24031419, 0.01440777, 0.02308866],
       [0.03812077, 0.09508808, 0.0057009 , 0.00913578],
       [0.53710206, 1.3397423 , 0.08032274, 0.12871841],
       [0.33948962, 0.8468197 , 0.05077012, 0.08135989],
       [0.10218602, 0.25489185, 0.01528175, 0.02448924],
       [0.03699542, 0.09228102, 0.0055326 , 0.00886608],
       [0.54350851, 1.3557225 , 0.08128081, 0.13025374],
       [0.32538508, 0.81163747, 0.04866081, 0.07797969]])

In [89]:
# show whole built-in output 
var.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sun, 11, Dec, 2022
Time:                     13:33:30
--------------------------------------------------------------------
No. of Equations:         4.00000    BIC:                    1.40924
Nobs:                     189.000    HQIC:                   1.04192
Log likelihood:          -1111.54    FPE:                    2.20793
AIC:                     0.791766    Det(Omega_mle):         1.83304
--------------------------------------------------------------------
Results for equation y1
           coefficient       std. error           t-stat            prob
------------------------------------------------------------------------
const         0.819992         0.269718            3.040           0.002
L1.y1         0.441894         0.096342            4.587           0.000
L1.y2        -0.030952         0.038121           -0.812           0.417
L1.y3         0.353840

#### Exercise 3

In [65]:
def var2sim(A1: np.array, A2: np.array, sigma_u: np.array, T: int):
    """A function that simulates time series data from a K-dimensional VAR(2) process yt = A1 y_t−1 + A2 y_t−2 + u_t, 
    where the innovations ut are drawn from a multivariate normal distribution with mean zero and covariance matrix Σ_u. 
    Uses y_−1 = y_0 = 0 as starting values, where 0 is a K ×1 vector of zeros.
    Generates time series of length T+50 and discards the first 50 observations, such that it returns a time series of total length equal to T.

    Args:
        A1 (np.array): coefficient matrix at lag 1
        A2 (np.array): coefficient matrix at lag 2
        sigma_u (np.array): covariance matrix Σ_u
        T (int): number of observations

    Returns:
        np.array: T x K matrix of observations on y_t
    """
    K = sigma_u.shape[0]

    # set starting values
    y_tminus1 = np.zeros((K, 1))
    y_tminus2 = np.zeros((K, 1))

    P = np.linalg.cholesky(sigma_u)

    y = np.empty((K, 0))
    for i in range(T+50):
        # draw disturbance u_t
        u_t = P @ np.random.standard_normal(K)
        u_t = u_t.reshape(K, 1)
        #recursively calculate y_t
        y_t = A1@y_tminus1 + A2@y_tminus2 - 2 + u_t
        y = np.hstack((y, y_t))
        y_tminus2 = y_tminus1
        y_tminus1 = y_t

    # discard first 50 observations
    y = y[:,50:]

    return y.T

In [66]:
# Test our var2sim function

K = 4
p = 2
T = 100

A1 = B[:,1:K+1]
A2 = B[:,K+1:2*K+1]

var2sim(A1, A2, sigma_u, T)

array([[ 2.44720354e+00,  2.14269914e+01, -3.06054021e+00,
        -2.41752988e+00],
       [ 2.09055764e+00,  1.32675429e+01, -3.10036387e+00,
        -3.09827884e+00],
       [ 9.47547682e-01,  1.01143746e+01, -3.66864437e+00,
        -3.12173869e+00],
       [ 3.11624412e+00,  1.25712859e+01, -3.15849271e+00,
        -3.36343711e+00],
       [ 6.79516546e-01,  1.13724498e+01, -3.87501410e+00,
        -3.59190541e+00],
       [ 2.51167774e+00,  1.54586110e+01, -3.62193240e+00,
        -4.56896113e+00],
       [ 2.44200803e+00,  7.89630716e+00, -3.05822001e+00,
        -2.89842730e+00],
       [-5.36722091e-01,  1.56144322e+01, -3.93759297e+00,
        -2.98150197e+00],
       [-4.05484749e+00,  1.78753013e+00, -3.90275247e+00,
        -4.53795197e+00],
       [-7.50716143e+00, -7.42399219e-01, -3.44468935e+00,
        -3.70980235e+00],
       [-1.73380681e+00,  1.19811899e+00, -4.21822881e+00,
        -4.63749881e+00],
       [-1.91596647e+00, -9.01803644e-01, -3.91545364e+00,
      

#### Exercise 4

In [67]:
def hstep_forecast(y: np.array, p: int, h: int):
    """A function that computes the h-step ahead point forecasts y_T (h) and the corresponding MSE matrix Σˆ_y(h) based on a VAR(p) with intercept

    Args:
        y (np.array): K × T matrix of observations
        p (int): lag order
        h (int): forecast horizon

    Returns:
        list: h-step ahead forecasts and the corresponding MSE matrix
    """
    
    K = y.shape[0]
    T = y.shape[1]

    # retrieving estimates
    B, Z, sigma_u = B_matrix(y.T, p, c=1)

    # constructing matrices
    J1 = np.hstack((np.zeros((K, 1)), np.identity(K), np.zeros((K, K*(p-1)))))

    row0 = np.hstack((np.ones((1,1)), np.zeros((1, K*p))))
    rowz = np.hstack((np.zeros((K*(p-1), 1)), np.identity(K*(p-1)), np.zeros((K*(p-1), K))))
    B = np.vstack((row0, B, rowz))

    Zt = y[:,-p:]                       # selecting y[:,-p:] from t-p up to t
    Zt = Zt[:,::-1].T.flatten()             # reverse order horizontally, transpose and flatten.
    Zt = np.hstack((np.array([(1)]), Zt)).reshape(K*p+1, 1)   # adding one leading 1, transposing, dimension is: 1+K*T x 1
    # predicting y_th

    y_th = J1@B**h@Zt


    # ab hier alles wild. wie berechnen wir small sample mse? oder bootstrappen wir hier (formula von task1)?

    # THIS IS ONLY THE FORMULA FOR one-step ahead forecast, others need to be calculated differently, see Lütkepohl page 98 ff
    sigma_hat_yh_hat = sigma_u + ((K*p+1)/T)*sigma_u

    ## was machen wir mit all dem stuff auf seite 97 ff? offensichtlich ist sigma_hat_yt_hat(2) != sigma_hat_yt(2) (see page 99)
    PHIi = J1@B**(i-1)@J1.T
    #sigma_hat_yh_hat = sigma_u + PHIi@sigma_u@PHIi.T + 1/T * omega_hat




    return y_th, sigma_hat_yh_hat

#### Exercise 5

In [68]:
# Define parameters

T = 100
p = 2

A1 = np.array([(0.4, 0.25), (0.0, 0.5)])
A2 = np.array([(0.2, 0.4), (0.0, 0.0)])
sigma_u = np.array([(1, 0.5), (0.5, 1)])

In [69]:
# Test functions

time_series_TK = var2sim(A1, A2, sigma_u, T)

# Forecast horizon h = 1
h = 1
y_th1, mse_mat1 = hstep_forecast(time_series_TK.T, p, h)

# Forecast horizon h = 2
h = 4
y_th4, mse_mat4 = hstep_forecast(time_series_TK.T, p, h)

NameError: name 'i' is not defined

In [None]:
# set up 95 % interval forecast (assuming data is generated from gaussian process)

CIone95 = [y_th1 - 1.96 * np.sqrt(np.diag(mse_mat1)), y_th1 + 1.96 * np.sqrt(np.diag(mse_mat1))]
CIfour95 = [y_th4 - 1.96 * np.sqrt(np.diag(mse_mat4)), y_th4 + 1.96 * np.sqrt(np.diag(mse_mat4))]
