In [1]:
suppressPackageStartupMessages({
  library(data.table)
  library(glmnet)
  library(ggplot2)
})

In [None]:
# ---------------------------
# 1. Load data and keep only Apple
# ---------------------------
load(here::here("data","X_data"))
X <- as.data.table(X_data)
setorder(X, date)

# keep only Apple returns + topic variables
apple <- "AAPL"   # or whatever ticker Apple has in your data
topic_cols <- grep("^Topic_", names(X), value = TRUE)

# --- parameters ---
K <- 3       # number of lags
L <- 30      # training window
nfolds <- 10 # CV folds

In [8]:
# ---------------------------
# 2. Build lags for Apple and topics
# ---------------------------
lag_block <- function(dt, cols, K, tag) {
  lags <- lapply(0:(K-1), function(k) {
    lagged <- dt[, shift(.SD, n = k), .SDcols = cols]
    setnames(lagged, paste0(cols, "_", tag, "Lag", k))
    lagged
  })
  do.call(cbind, lags)
}

Xlag <- cbind(
  X[, .(date)],
  lag_block(X, apple, K, "R"),     # Apple return lags
  lag_block(X, topic_cols, K, "T") # Topic lags
)

# drop first rows with missing lags
first_valid <- K
Xlag <- Xlag[(first_valid+1):.N]
pred_cols <- setdiff(names(Xlag), "date")
Xmat <- as.matrix(Xlag[, ..pred_cols])
dates <- Xlag$date
y <- X[[apple]][(first_valid+1):.N]



ERROR: Error in `[.data.table`(dt, , shift(.SD, n = k), .SDcols = cols): Some items of .SDcols are not column names: [AAPL]


In [7]:
# ---------------------------
# 3. Rolling LASSO forecast
# ---------------------------
forecasts <- data.table(date = dates[(L+1):length(dates)],
                        forecast = NA_real_,
                        actual   = NA_real_,
                        r2_insample = NA_real_)

for (t in seq(L+1, length(dates))) {
  # window indices
  train_idx <- (t-L):(t-1)
  
  X_train <- Xmat[train_idx, , drop = FALSE]
  y_train <- y[train_idx]
  
  # fit LASSO with CV
  fit <- cv.glmnet(x = X_train, y = y_train,
                   alpha = 1, family = "gaussian",
                   nfolds = nfolds, standardize = TRUE)
  
  # in-sample R^2
  yhat_train <- predict(fit, newx = X_train, s = "lambda.min")
  r2 <- 1 - sum((y_train - yhat_train)^2) / sum((y_train - mean(y_train))^2)
  
  # forecast next day return
  X_next <- Xmat[t, , drop = FALSE]
  f <- as.numeric(predict(fit, newx = X_next, s = "lambda.min"))
  
  forecasts[t - L, `:=`(forecast = f,
                        actual   = y[t],
                        r2_insample = r2)]
}


ERROR: Error: object 'dates' not found


In [None]:
# ---------------------------
# 4. Plot results
# ---------------------------
ggplot(forecasts, aes(x = date)) +
  geom_line(aes(y = actual), color = "black") +
  geom_line(aes(y = forecast), color = "red") +
  labs(title = "Rolling LASSO Forecast vs Actual (Apple)",
       y = "Return", x = "Date") +
  theme_minimal()

ggplot(forecasts, aes(x = date, y = r2_insample)) +
  geom_line(color = "blue") +
  labs(title = "In-sample R² of LASSO (30-day rolling window)", y = "R²", x = "Date") +
  theme_minimal()
