In [1]:
import numpy as np
import matplotlib
import math
import pandas as pd 

jobs_df = pd.read_csv('./data/UK_job_vacancies.csv')

def rmse(predicted, actual):
    diffs = predicted - actual
    return math.sqrt(np.mean(diffs ** 2))

def linear_regression(design_matrix, target_vector):
    return np.linalg.inv(design_matrix.transpose() @ design_matrix) @ design_matrix.transpose() @ target_vector

def train_ar(values, order):
    target_vector = np.array(values[order:])
    lagged_values = []
    for i in range(len(values) - order):
        lagged_values.append(values[i:i+order])
    design_matrix = np.array(lagged_values)
    return linear_regression(design_matrix, target_vector)

In [2]:


# set AR deg to a larger magnitude - 20

series = jobs_df['Vacancies']

ar_deg = 10

ma_deg = 2

coef = np.append(train_ar(series, ar_deg), np.zeros(ma_deg))

print(coef)
# coef = [-0.0941, -0.1832, 1.2754, 0.5262, 0.7267] # coefs from statsmodels
errors = [0.0] * len(series)

for x in range(20):
    design_matrix_rows = []
    for i in range(max(ar_deg, ma_deg), len(series)):
        values = np.append(series[i-ar_deg:i], errors[i-ma_deg:i])
        pred = np.dot(values, coef)
        design_matrix_rows.append(values)
        errors[i] = series[i] - pred
        
    design_matrix = np.array(design_matrix_rows)
    coef = linear_regression(design_matrix, series[max(ar_deg, ma_deg):])

print("step", x, ":", math.sqrt(np.dot(errors, errors)/(len(errors) - max(ar_deg, ma_deg))))
print(coef) # 12.0543
# print(design_matrix)


[ 0.04289748 -0.12204264 -0.13419879  0.60114682 -0.54379875 -0.29137531
  1.05463138 -0.50054911 -1.13997554  2.03387541  0.          0.        ]
step 19 : 9.107590717940694e+60
[ 3.17344572e+02 -6.43771588e+02  1.73437306e+02  6.90726809e+02
 -3.25780348e+02 -1.67960576e+03  3.33672663e+03 -2.82343339e+03
  9.59873640e+02 -3.63829133e+00  2.06873163e-56  1.18113673e-56]
