In [1]:
using Pkg
using CSV
using Distributions
using DataFrames
using Dates
using Plots
using Random
using LinearAlgebra
using LaTeXStrings
using Lasso
# using MLBase
using Statistics
using GLMNet

### 1 Loading the data

In [2]:
# Change the working directory
cd("D:/PUCP/JP-TC")

# Load the necessary packages
using CSV, DataFrames

# Read the data
data = CSV.read("wage2015_subsample_inference.csv", DataFrame);

# Separate the features and the target variable
data = select(data, Not(["wage", "rownames"])); # Drop columns 'wage' and 'lwage'

In [3]:
# Create the design matrix

design = @formula( lwage ~ 0 + sex + (exp1 + exp2 + exp3 + exp4 + hsg + scl + clg + ad + 
so + we + ne + occ2 + ind2))
X_flexible = modelmatrix(design, data);                      

In [4]:
y = [data[:,1];;]

5150×1 Matrix{Float64}:
 2.2633643798407643
 3.872802292274865
 2.403126322215923
 2.634927936273247
 3.361976668508874
 2.4622152385859297
 2.9565115604007097
 2.9565115604007097
 2.4849066497880004
 2.9565115604007097
 ⋮
 3.117779707996832
 2.822980167776187
 3.1796551117149194
 2.6280074934286737
 2.6925460145662448
 3.138833117194664
 3.649658740960655
 3.4955080611333966
 2.8511510447428834

### 2 Creating the Lasso Cross-Validation Procedure

4. The `log_grid` function is pretty straight forward

In [33]:
function log_grid(lower::Int, upper::Int, log_step::Float64)
    log_grid = range(lower, stop=upper, length= Int(1 /log_step))
    return exp.(log_grid)
end

log_grid (generic function with 2 methods)

5. To code the `k_folds` function, there are many different approaches. However, we sticked to using only numpy. With this library, we exploited the kronecker product operation and block matrices to build the $k$-folds. Also, we addressed the issue of divisibility between the sample size $n$ and $k$ using an if-else statement depending on the module of $n/k$

In [6]:
function k_folds(data::AbstractArray, k::Int = 5)
    mdl = size(data, 1) % k
    floor = size(data, 1) ÷ k 

    if mdl == 0
        trues = fill(1, floor, 1)
        split_matrix = kron(I(k), trues)
    else
        trues_g1 = fill(1, floor + 1, 1)
        split_matrix_g1 = kron(I(mdl), trues_g1)
        
        trues_g2 = fill(1, floor, 1)
        split_matrix_g2 = kron(I(k - mdl), trues_g2)
        
        split_matrix = [split_matrix_g1  zeros(size(split_matrix_g1, 1), size(split_matrix_g2, 2));
                        zeros(size(split_matrix_g2, 1), size(split_matrix_g1, 2))  split_matrix_g2]
    end
    
    sm_bool = split_matrix .== 1
    splits = [sm_bool[:, x] for x in 1:k]
    
    return splits
end

k_folds (generic function with 2 methods)

6. For the `optimal_lambda` search function, we basically adapted the code provided in the labs so it can use the functions of log-grid and our own $k$-folds function

In [174]:
using GLMNet
function optimal_lambda(Y::AbstractVector, X::AbstractArray, lambda_bounds::Tuple{Int, Int}, k::Int = 5; niter::Int = 100)
    Y = vec(Y) 

    if ndims(X) == 1
        X = reshape(X, :, 1)
    end

    folds = k_folds(X, k)
    all_lambdas = log_grid(lambda_bounds[1],lambda_bounds[2], 1/niter)
    all_mse = zeros(niter)

    for (j, l) in enumerate(all_lambdas)
        split_pes = zeros(k)
        
        for i in 1:k
            X_train = X[.!folds[i], :]
            X_test = X[folds[i], :]
            y_train = Y[.!folds[i]]
            y_test = Y[folds[i]]

            model = glmnet(X_train, y_train, alpha=1.0, lambda=[l])
            predict = GLMNet.predict(model, X_test)

            pe = sum((y_test - predict).^2)
            split_pes[i] = pe
        end

        all_mse[j] = mean(split_pes)
    end

    selected = argmin(all_mse)
    optimal_lambda = all_lambdas[selected]
    optimal_model = glmnet(X, Y, alpha=1.0, lambda=[optimal_lambda])
    optimal_coef = [optimal_model.a0;optimal_model.betas[:]]

    output = Dict(
        "optimal_lambda" => optimal_lambda,
        "optimal_coef" => optimal_coef,  #issue
        "all_lambdas" => all_lambdas,
        "all_mse" => all_mse
    )

    return output
end

optimal_lambda (generic function with 2 methods)

7. The `predict_model` function can be easily implemented using the results of `optimal_function`

In [187]:
function predict_model(optimal_model::Dict, X::AbstractArray)
    intercept = ones(size(X, 1), 1)
    Z = [intercept;; X]
    
    return Z * optimal_model["optimal_coef"]
end

predict_model (generic function with 1 method)

### 3 Applying the Lasso Cross-Validation Procedure

We split the sample in train and test

In [25]:
train_sample = rand(Float64, size(data)[1]) .< 0.80
test_sample = .!(train_sample)
y_train, y_test = y[train_sample], y[test_sample]
X_flexible_train, X_flexible_test = X_flexible[train_sample, :], X_flexible[test_sample, :];

8. We perform the OLS fitting

In [155]:
using GLM, DataFrames

# Fitting a linear regression model
model_ls = lm([ones(size(X_flexible_train)[1]);;X_flexible_train],y_train)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:

Coefficients:
───────────────────────────────────────────────────────────────────────
           Coef.  Std. Error       t  Pr(>|t|)    Lower 95%   Upper 95%
───────────────────────────────────────────────────────────────────────
x1    2.82293     0.0691773    40.81    <1e-99   2.68731      2.95856
x2   -0.0951806   0.0163281    -5.83    <1e-08  -0.127193    -0.0631686
x3    0.0281351   0.0115791     2.43    0.0151   0.00543366   0.0508365
x4   -0.0838956   0.117094     -0.72    0.4737  -0.313463     0.145672
x5    0.0156588   0.0437893     0.36    0.7207  -0.070192     0.10151
x6   -0.00182871  0.00537928   -0.34    0.7339  -0.012375     0.0087176
x7    0.117898    0.0519481     2.27    0.0233   0.0160513    0.219744
x8    0.253109    0.0521105     4.86    <1e-05   0.150944     0.355274
x9    0.543313    0.0529705    10.26    <1e-23   0.439462     0.647164
x

9. Now we search the optimal lambda using our `optimal_lambda` function

In [175]:
# Finding the optimal lambda and fitting the Lasso model using the training data
model_lasso = optimal_lambda(y_train, X_flexible_train, (-7, 7))

Dict{String, Any} with 4 entries:
  "optimal_coef"   => [3.13263, -0.0549881, 0.00627277, 0.0, 0.0, 0.0, -0.09324…
  "all_mse"        => [213.914, 213.864, 213.821, 213.753, 213.656, 213.549, 21…
  "optimal_lambda" => 0.0177689
  "all_lambdas"    => [0.000911882, 0.0010504, 0.00120996, 0.00139375, 0.001605…

In [176]:
# Printing the optimal lambda
model_lasso["optimal_lambda"]

0.017768944609069942

In [177]:
# Printing the optimal coefficients
model_lasso["optimal_coef"]

15-element Vector{Float64}:
  3.132631959797634
 -0.05498805908555213
  0.00627276757380166
  0.0
  0.0
  0.0
 -0.09324321682448818
  0.0
  0.23665716689555433
  0.4042965544988727
  0.0
  0.0
  0.0
 -0.016992863063534744
 -0.010210126354513696

10. Now we use HDM for python (hdmpy) to estimate the model using the theoretically optimal penalty parameter.

In [128]:
using HDMjl

In [178]:
model_rlasso = rlasso(X_flexible_train, y_train)

Dict{String, Any} with 15 entries:
  "tss"          => 1348.46
  "dev"          => [0.904667, -0.565009, 0.393842, -0.50592, -0.0116236, -0.01…
  "model"        => [0.0 31.0 … 10.0 9.0; 0.0 18.0 … 19.0 4.0; … ; 1.0 12.0 … 1…
  "loadings"     => [0.246062, 5.40921, 2.04982, 7.43547, 27.4597, 0.206141, 0.…
  "sigma"        => 0.500159
  "lambda0"      => 470.239
  "lambda"       => [115.708, 2543.62, 963.907, 3496.45, 12912.6, 96.9354, 103.…
  "intercept"    => 3.17201
  "iter"         => 4
  "residuals"    => [0.459347, -0.392823, 0.230223, -0.618978, 0.149122, 0.0272…
  "rss"          => 1028.4
  "index"        => Bool[1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1]
  "beta"         => [-0.0943658, 0.00874918, 0.0, 0.0, 0.0, -0.112374, 0.0, 0.2…
  "options"      => Dict{String, Any}("intercept"=>true, "post"=>true, "meanx"=…
  "coefficients" => [3.17201, -0.0943658, 0.00874918, 0.0, 0.0, 0.0, -0.112374,…

As you may notice, the optimal penalty parameter resulting from this procedure is not comparable in size to the cross validation result. This is due to the fact that this penalty is the theoretically optimal parameter for the Lasso estimator under data-driven penalty loadings. That is:

\begin{equation*}

\hat{\beta} = \arg \ \underset{\beta}{\min} \sum_{i=1}^n (y_i - x_{i}^{\prime}\beta)^2 + \frac{\lambda}{n} \lVert \hat{\Psi}\beta \rVert_1

\end{equation*}

Where $\hat{\Psi} = diag(\hat{\psi_1},\hat{\psi_2},\dots,\hat{\psi_p})$ are the data-driven penalty loadings chosen to be a function of the data depending on the setting. For more detail, you can check the [package documentation](https://arxiv.org/pdf/1608.00354)

In [179]:
rlambda = model_rlasso["lambda0"]
rlambda

470.23886001420425

11. The predictive capability of each model (OLS, Lasso and RLasso) is reported via $MSE$ and $R^2$ out of sample

In [180]:
# OLS 

y_predict_ols = GLM.predict(model_ls, [ones(size(X_flexible_test)[1]);;X_flexible_test])
MSE_ols = mean((y_test-y_predict_ols).^2)
R2_test_ols = 1-MSE_ols/var(y_test)

0.18936386968849128

In [194]:
# Lasso CV

y_predict_lasso = predict_model(model_lasso, X_flexible_test)
MSE_lasso = mean((y_test-y_predict_lasso).^2)
R2_test_lasso = 1-MSE_lasso/var(y_test)


0.1848957453620037

In [198]:
# Rigurous Lasso
intercept = ones(size(X_flexible_test,1))
Z = [intercept;;X_flexible_test]
y_predict_rlasso = Z*model_rlasso["coefficients"]

MSE_rlasso = mean((y_test-y_predict_rlasso).^2)
R2_test_rlasso = 1-MSE_rlasso/var(y_test)

0.18471427916881367