### 1 Loading the data

In [1]:
setwd("D:/PUCP/JP-TC")

# Load the data
data <- read.csv('wage2015_subsample_inference.csv')

# Separate the features and the target variable
X <- data[, !(names(data) %in% c('wage', 'lwage'))]  # Drop columns 'wage' and 'lwage'
y <- data['lwage']

In [2]:
# Load necessary package
library(stats)

# Create the design matrix
X_flexible <- model.matrix(~ 0 + sex + (exp1 + exp2 + exp3 + exp4 + hsg + scl + clg + ad + so + we + ne + 
                                         factor(occ2) + factor(ind2))^2, data = X)

In [3]:
y <- as.matrix(y)

X_flexible <- as.matrix(X_flexible)

### 2 Creating the Lasso Cross-Validation Procedure

4. The `log_grid` function is pretty straight forward

In [4]:
log_grid <- function(lower, upper, log_step) {
  log_grid <- seq(lower, upper, length.out = 1 / log_step)
  return(exp(log_grid))
}

5. To code the `k_folds` function, there are many different approaches. However, we sticked to using only numpy. With this library, we exploited the kronecker product operation and block matrices to build the $k$-folds. Also, we addressed the issue of divisibility between the sample size $n$ and $k$ using an if-else statement depending on the module of $n/k$

In [5]:
k_folds <- function(data, k = 5) {
  
  module <- nrow(data) %% k
  floor <- nrow(data) %/% k 
  
  if (module == 0) {
    trues <- matrix(1, nrow = floor, ncol = 1)
    split_matrix <- kronecker(diag(k), trues)
  } else {
    trues_g1 <- matrix(1, nrow = floor + 1, ncol = 1)
    split_matrix_g1 <- kronecker(diag(module), trues_g1)
    
    trues_g2 <- matrix(1, nrow = floor, ncol = 1)
    split_matrix_g2 <- kronecker(diag(k - module), trues_g2)
    
    split_matrix <- rbind(
      cbind(split_matrix_g1, matrix(0, nrow = nrow(split_matrix_g1), ncol = ncol(split_matrix_g2))),
      cbind(matrix(0, nrow = nrow(split_matrix_g2), ncol = ncol(split_matrix_g1)), split_matrix_g2)
    )
  }
  
  sm_bool <- split_matrix == 1
  splits <- lapply(1:k, function(x) sm_bool[, x])
  
  return(splits)
}

6. For the `optimal_lambda` search function, we basically adapted the code provided in the labs so it can use the functions of log-grid and our own $k$-folds function

In [6]:

optimal_lambda <- function(Y, X, lambda_bounds, k = 5, niter = 100) {
  
  library(glmnet)
  
  Y <- drop(Y)
  
  if (is.vector(X)) {
    X <- matrix(X, ncol = 1)
  }
  
  folds <- k_folds(X, k)
  all_lambdas <- exp(seq(lambda_bounds[1], lambda_bounds[2], length.out = niter))
  all_mse <- numeric(niter)
  
  for (l in all_lambdas) {
    split_pes <- numeric(k)
    
    for (i in seq_len(k)) {
      X_train <- X[!folds[[i]], ]
      X_test <- X[folds[[i]], ]
      y_train <- Y[!folds[[i]]]
      y_test <- Y[folds[[i]]]
      
      model <- glmnet(X_train, y_train, alpha = 1, lambda = l,standardize = FALSE)
      predict <- predict(model, X_test, s = l)
      
      pe <- sum((y_test - predict)^2)
      split_pes[i] <- pe
    }
    
    all_mse[which(all_lambdas == l)] <- mean(split_pes)
  }
  
  selected <- which.min(all_mse)
  optimal_lambda <- all_lambdas[selected]
  optimal_model <- glmnet(X, Y, alpha = 1, lambda = optimal_lambda)
  optimal_coef <- coef(optimal_model, s = optimal_lambda)
  
  output <- list(
    optimal_lambda = optimal_lambda,
    optimal_coef = optimal_coef,
    all_lambdas = all_lambdas,
    all_mse = all_mse
  )
  
  return(output)
}

7. The `predict_model` function can be easily implemented using the results of `optimal_function`

In [7]:
predict_model <- function(optimal_model, X) {
  
  intercept <- matrix(1, nrow = nrow(X), ncol = 1)
  Z <- cbind(intercept, X)
  
  return(Z %*% optimal_model$optimal_coef)
}

### 3 Applying the Lasso Cross-Validation Procedure

We split the sample in train and test

In [8]:
library(caTools)


split <- sample.split(y, SplitRatio = 0.75)
X_flexible_train <- subset(X_flexible, split == TRUE)
X_flexible_test <- subset(X_flexible, split == FALSE)
y_train <- y[split]
y_test <- y[!split]

"package 'caTools' was built under R version 4.3.3"


8. We perform the OLS fitting

In [9]:
model_ls <- lm(y_train ~ ., data = data.frame(y_train = y_train, X_flexible_train))

9. Npw we search the optimal lambda using our `optimal_lambda` function

In [10]:
model_lasso <- optimal_lambda(y_train, X_flexible_train, c(-7, 7))

"package 'glmnet' was built under R version 4.3.3"
Loading required package: Matrix

Loaded glmnet 4.1-8



In [11]:
print(model_lasso$optimal_lambda)

[1] 0.001605462


In [12]:
print(model_lasso$optimal_coef)

981 x 1 sparse Matrix of class "dgCMatrix"
                                         s1
(Intercept)                    2.721740e+00
sex                           -6.289675e-02
exp1                           6.384099e-03
exp2                           .           
exp3                           .           
exp4                           .           
hsg                            .           
scl                            .           
clg                            3.080748e-01
ad                             3.982615e-01
so                             .           
we                             5.575766e-02
ne                             .           
factor(occ2)1                  2.427481e-01
factor(occ2)2                  1.546268e-01
factor(occ2)3                  9.047187e-02
factor(occ2)4                  3.834206e-02
factor(occ2)5                  .           
factor(occ2)6                 -1.108579e-02
factor(occ2)7                  .           
factor(occ2)8                 -6.

10. Now we use HDM for python (hdmpy) to estimate the model using the theoretically optimal penalty parameter.

In [13]:
install.packages("hdm")
library(hdm)

model_rlasso <- rlasso(X_flexible_train, y_train)

Installing package into 'C:/Users/User/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)



package 'hdm' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\User\AppData\Local\Temp\RtmpQPC3XQ\downloaded_packages


"package 'hdm' was built under R version 4.3.3"


As you may notice, the optimal penalty parameter resulting from this procedure is not comparable in size to the cross validation result. This is due to the fact that this penalty is the theoretically optimal parameter for the Lasso estimator under data-driven penalty loadings. That is:

\begin{equation*}

\hat{\beta} = \arg \ \underset{\beta}{\min} \sum_{i=1}^n (y_i - x_{i}^{\prime}\beta)^2 + \frac{\lambda}{n} \lVert \hat{\Psi}\beta \rVert_1

\end{equation*}

Where $\hat{\Psi} = diag(\hat{\psi_1},\hat{\psi_2},\dots,\hat{\psi_p})$ are the data-driven penalty loadings chosen to be a function of the data depending on the setting. For more detail, you can check the [package documentation](https://arxiv.org/pdf/1608.00354)

In [14]:
rlambda = model_rlasso$lambda0
rlambda

11. The predictive capability of each model (OLS, Lasso and RLasso) is reported via $MSE$ and $R^2$ out of sample

In [15]:
y_predict_ols <- predict(model_ls, data.frame(X_flexible_test))

MSE_ols <- mean((y_test - y_predict_ols)^2)

R2_test_ols <- 1 - MSE_ols / var(y_test)

print(R2_test_ols)

"prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases"


[1] 0.08568767


In [16]:
# Lasso CV

y_predict_lasso <- predict_model(model_lasso, X_flexible_test)
MSE_lasso <- mean((y_test - y_predict_lasso)^2)
R2_test_lasso <- 1 - MSE_lasso / var(y_test)

print(R2_test_lasso)

[1] 0.2807233


In [17]:
# Rigurous Lasso

y_predict_rlasso <- predict(model_rlasso, newdata = X_flexible_test)
MSE_rlasso <- mean((y_test - y_predict_rlasso)^2)
R2_test_rlasso <- 1 - MSE_rlasso / var(y_test)

print(R2_test_rlasso)

[1] 0.3111814
