In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import os.path as osp
import sys
from sklearn.linear_model import LinearRegression

# Put the local 'src' directory first on the module search path so that
# project modules located there can be imported by later cells.
sys.path.insert(0,'src')

In [None]:
from data_funcs import plot_data

In [None]:
# Load the pickled RAWS observations (all stations) and keep a single
# station, 'CPTC2', for visualization.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
dat = pd.read_pickle('data/raws_dat.pickle')['CPTC2']

In [None]:
# Set up time params:
# train: 0-480 hours (20 days)
# test: 480-720 hours (10 days)

h2 = 480     # end of the training window (exclusive), in hours
hours = 720  # total number of hours considered (train + test)
h = np.arange(0, hours)  # hour index 0..hours-1, used as the linear time covariate

# Hour-of-day covariate: repeat 0-23 starting at time 0, not necessarily lined
# up with actual time of day. The cycle must contain 24 values (0..23); a
# 23-value cycle would drift by one hour every day.
hour = np.resize(np.arange(24), hours)

## AR Model

For a model with $K$ time lags and $P$ other covariates:

$$
y_t = \beta_0 + \beta_1 t + \sum_{k=1}^K \phi_k y_{t-k} + \sum_{j=1}^P \alpha_j x_{j,t} + \epsilon_t
$$

In [None]:
# Helper Functions
def build_lags(v, lags):
    """Build a matrix of lagged copies of a data vector.

    Parameters
    ----------
    v : array-like
        Data vector to lag.
    lags : iterable of int
        Lags to apply. Each lag ``k`` produces one column holding ``v``
        shifted by ``k`` positions (``pandas.Series.shift(k)``).

    Returns
    -------
    numpy.ndarray
        2-D array with one column per lag, in the order given. Any row that
        contains a missing value in one of the lag columns is dropped; for
        positive lags this removes at least the first ``max(lags)`` rows.
    """
    X = pd.DataFrame({'x': v})
    for lag in lags:
        X[f"lag{lag}"] = X['x'].shift(lag)
    # Keep only the lagged columns; the unshifted series is not a predictor.
    X = X.drop(['x'], axis=1)
    # Shifting introduces NaN at the start of each column; drop those rows.
    X = X.dropna().to_numpy()
    return X



In [None]:
# Build the training design matrix.
# Columns, in order: the `lags` lagged fm values, the hour index t, the
# hour-of-day feature, then the rain, Ed and wind_speed covariates.
# Every covariate is sliced from `lags` to `h2` so its length matches the rows
# returned by build_lags (assumes build_lags drops exactly the first `lags`
# rows, i.e. fm has no missing values in the window — TODO confirm).
lags = 1
X = (
    pd.DataFrame(build_lags(dat['fm'][0:h2], lags=np.arange(1, lags + 1)))
    .assign(
        t=h[lags:h2].tolist(),
        hour=hour[lags:h2].tolist(),
        rain=dat['rain'][lags:h2].tolist(),
        Ed=dat['Ed'][lags:h2].tolist(),
        wind_speed=dat['wind_speed'][lags:h2].tolist(),
    )
    .to_numpy()
)

In [None]:
# Fit ordinary least squares on the training window; the target is fm over the
# same rows as the design matrix (hours lags..h2-1).
mod = LinearRegression()
mod.fit(X, dat['fm'][lags:h2])

# In-sample fitted values for the training rows.
fits = mod.predict(X)