In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import os.path as osp
import sys
from sklearn.linear_model import LinearRegression

sys.path.insert(0,'src')

In [2]:
from data_funcs import plot_data

In [3]:
# Load the pickled RAWS observations (all stations) and keep only station
# 'CPTC2' for visualization.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# pickle files from a trusted source.
dat = pd.read_pickle('data/raws_dat.pickle')['CPTC2']

## AR Model

For a model with $K$ time lags and $P$ other covariates, the response $y_t$ at time $t$ is modeled as:

$$
y_t = \beta_0 + \gamma t + \sum_{k=1}^K \beta_k y_{t-k} + \sum_{j=1}^P \alpha_j x_{j,t} + \epsilon_t
$$

In [12]:
# Helper Functions
def build_lags(v, lags):
    """Build a matrix of lagged copies of a data vector.

    Parameters
    ----------
    v : data vector to lag
    lags : iterable of integers; one output column is produced per lag,
        in the order given

    Returns
    -------
    numpy array with one column per lag. Rows that contain a missing value
    in any lag column (e.g. the leading rows created by shifting) are dropped.
    """
    frame = pd.DataFrame({'x': v})
    source = frame['x']

    # One shifted copy of the series per requested lag, keyed "lag<l>"
    shifted = {f"lag{step}": source.shift(step) for step in lags}
    lagged = pd.DataFrame(shifted, index=frame.index)

    complete_rows = lagged.dropna()
    return complete_rows.to_numpy()

def predict_ar(m, K, f, XX, ts):
    """Recursively forecast `ts` steps ahead with a fitted autoregressive model.

    The design row for each step is the K most recent values of the series,
    most recent first (lag 1, ..., lag K), followed by that step's covariate
    row taken from XX. Forecasts are fed back in as lagged values for the
    following steps, so observed data is only used while fewer than K
    forecasts exist.

    Parameters
    ----------
    m : model object exposing ``predict(X)`` for a single-row 2-D matrix
    K : int, number of time lag terms in m
    f : observed values (array-like); only the last K entries are used
    XX : covariate matrix (DataFrame or array-like); row i, by position,
        holds the covariates for forecast step i
    ts : int, number of time steps to forecast

    Returns
    -------
    numpy array of length ts with the forecasts.

    Raises
    ------
    ValueError
        If K is negative, f has fewer than K values, or XX has fewer than
        ts rows.
    """
    preds = np.zeros(ts)  # also rejects a negative ts, as before
    if ts == 0:
        return preds  # nothing to forecast

    observed = np.asarray(f).ravel()  # also accepts a pandas Series / list
    if K < 0:
        raise ValueError("K must be non-negative")
    if observed.size < K:
        raise ValueError(f"need at least K={K} observed values, got {observed.size}")

    # Use positional rows so the result does not depend on XX's index labels
    covars = XX.to_numpy() if hasattr(XX, "to_numpy") else np.asarray(XX)
    if covars.shape[0] < ts:
        raise ValueError(f"XX has {covars.shape[0]} rows but ts={ts} steps were requested")

    # Rolling buffer: the last K observations followed by the forecasts.
    # The K values preceding forecast i are always series[i:i+K].
    series = np.concatenate((observed[observed.size - K:], preds))

    for i in range(ts):
        window = series[i:i + K][::-1]  # most recent value first (lag 1 ... lag K)
        Xtemp = np.column_stack((window.reshape(1, K), covars[i:i + 1]))
        # predict returns an array; extract the scalar explicitly rather than
        # relying on deprecated implicit array-to-scalar conversion
        series[K + i] = np.ravel(m.predict(Xtemp))[0]

    return series[K:]

def build_ar(dat, lags, hours = 720, h2 = 480):
    """Fit a linear autoregressive model of 'fm' and set up its test covariates.

    The first `h2` time steps are used for training; steps `h2` to `hours`
    form the test period. The training matrix columns are, in order: the
    `lags` lagged values of 'fm' (lag 1 first), the time index 't', the
    hour-of-day index 'hour', then 'rain', 'Ed' and 'wind_speed'.

    Parameters
    ----------
    dat : dictionary of fmda data; the keys 'fm', 'rain', 'Ed' and
        'wind_speed' are read. Assumes each holds a 1-D array of at least
        `hours` values and that 'fm' has no missing values in the training
        window (rows with missing lags are dropped by build_lags, which
        would misalign the covariate columns) -- TODO confirm against the data.
    lags : (int) number of time lags to model
    hours : (int) total number of time steps considered (train + test)
    h2 : (int) number of time steps used for training

    Returns
    -------
    dict with keys:
        'h2', 'hours' : the arguments as given
        'train' : training matrix (numpy array)
        'm' : fitted LinearRegression model
        'fits' : model predictions on the training matrix
        'test' : DataFrame of test-period covariates
            ('t', 'hour', 'rain', 'Ed', 'wind_speed')

    Raises
    ------
    ValueError
        If the sizes do not satisfy lags < h2 <= hours.
    """
    if not (lags < h2 <= hours):
        raise ValueError(f"expected lags < h2 <= hours, got lags={lags}, h2={h2}, hours={hours}")

    # Return dictionary
    mod = {'h2': h2, 'hours': hours}

    # Time params
    h = np.arange(0, hours)
    # Hour-of-day index cycling 0-23, starting at time 0 (not necessarily
    # lined up with actual time of day). A 24-value cycle is required here;
    # range(0, 23) would only produce 0-22 and drift by one hour per day.
    hour = h % 24

    # Build training matrix: lagged response first, then covariates.
    # The first `lags` rows have incomplete lags, so every covariate is
    # sliced from `lags` to stay aligned with the rows build_lags keeps.
    X = build_lags(dat['fm'][0:h2], lags = np.arange(1, lags+1))
    X = pd.DataFrame(X)
    X['t'] = h[lags:h2].tolist()
    X['hour'] = hour[lags:h2].tolist()
    X['rain'] = dat['rain'][lags:h2].tolist()
    X['Ed'] = dat['Ed'][lags:h2].tolist()
    X['wind_speed'] = dat['wind_speed'][lags:h2].tolist()
    X = X.to_numpy()

    mod['train'] = X

    # Fit model
    mod["m"] = LinearRegression().fit(X, dat['fm'][lags:h2])
    mod['fits'] = mod["m"].predict(X)

    # Set up prediction matrix (covariates only; the lag columns are filled
    # in step by step at forecast time)
    X2 = pd.DataFrame({
        't': h[h2:hours].tolist(),
        'hour': hour[h2:hours].tolist(),
        'rain': dat['rain'][h2:hours].tolist(),
        'Ed': dat['Ed'][h2:hours].tolist(),
        'wind_speed': dat['wind_speed'][h2:hours].tolist(),
    })
    mod['test'] = X2

    # Forecasts are not computed here. To forecast the test period, call e.g.
    #   predict_ar(mod['m'], lags, dat['fm'][0:h2], mod['test'], hours - h2)

    return mod

In [9]:
# Fit the AR model with a single time lag (lags=1) on the selected station,
# using build_ar's default hours / h2 split
mod1 = build_ar(dat, 1)