In [1]:
using Pkg
using CSV
using Distributions
using DataFrames
using Dates
using Plots
using Random
using LinearAlgebra
using LaTeXStrings
using Lasso
# using MLBase
using Statistics
using GLMNet
using MLJ

### 1 Loading the data

In [2]:
# Change the working directory
cd("D:/PUCP/JP-TC")

# Load the necessary packages
using CSV, DataFrames

# Read the data
data = CSV.read("wage2015_subsample_inference.csv", DataFrame);

# Separate the features and the target variable
data = select(data, Not(["wage", "rownames"])); # Drop columns 'wage' and 'lwage'

In [3]:
# Create the design matrix

design = @formula( lwage ~ 0 + sex + (exp1 + exp2 + exp3 + exp4 + hsg + scl + clg + ad + 
so + we + ne + occ2 + ind2))
X_flexible = modelmatrix(design, data);                      

In [4]:
y = [data[:,1];;]

5150×1 Matrix{Float64}:
 2.2633643798407643
 3.872802292274865
 2.403126322215923
 2.634927936273247
 3.361976668508874
 2.4622152385859297
 2.9565115604007097
 2.9565115604007097
 2.4849066497880004
 2.9565115604007097
 ⋮
 3.117779707996832
 2.822980167776187
 3.1796551117149194
 2.6280074934286737
 2.6925460145662448
 3.138833117194664
 3.649658740960655
 3.4955080611333966
 2.8511510447428834

### 2 Creating the Lasso Cross-Validation Procedure

4. The `log_grid` function is pretty straight forward

In [5]:
function log_grid(lower::Int, upper::Int, log_step::Int)
    log_grid = range(lower, stop=upper, length=1 / log_step)
    return exp.(log_grid)
end

log_grid (generic function with 1 method)

5. To code the `k_folds` function, there are many different approaches. However, we sticked to using only numpy. With this library, we exploited the kronecker product operation and block matrices to build the $k$-folds. Also, we addressed the issue of divisibility between the sample size $n$ and $k$ using an if-else statement depending on the module of $n/k$

In [6]:
function k_folds(data::AbstractArray, k::Int = 5)
    mdl = size(data, 1) % k
    floor = size(data, 1) ÷ k 

    if mdl == 0
        trues = fill(1, floor, 1)
        split_matrix = kron(I(k), trues)
    else
        trues_g1 = fill(1, floor + 1, 1)
        split_matrix_g1 = kron(I(mdl), trues_g1)
        
        trues_g2 = fill(1, floor, 1)
        split_matrix_g2 = kron(I(k - mdl), trues_g2)
        
        split_matrix = [split_matrix_g1  zeros(size(split_matrix_g1, 1), size(split_matrix_g2, 2));
                        zeros(size(split_matrix_g2, 1), size(split_matrix_g1, 2))  split_matrix_g2]
    end
    
    sm_bool = split_matrix .== 1
    splits = [sm_bool[:, x] for x in 1:k]
    
    return splits
end

k_folds (generic function with 2 methods)

6. For the `optimal_lambda` search function, we basically adapted the code provided in the labs so it can use the functions of log-grid and our own $k$-folds function

In [19]:
using MLJ, Random, StatsBase

function optimal_lambda(Y::AbstractVector, X::AbstractArray, lambda_bounds::Tuple{Int, Int}, k::Int = 5; niter::Int = 100)
    Y = vec(Y)  # Ensure Y is a 1D array

    if ndims(X) == 1
        X = reshape(X, :, 1)
    end

    folds = k_folds(X, k)
    all_lambdas = exp.(range(lambda_bounds[1], stop=lambda_bounds[2], length=niter))
    all_mse = zeros(niter)

    for (j, l) in enumerate(all_lambdas)
        split_pes = zeros(k)
        
        for i in 1:k
            X_train = X[.!folds[i], :]
            X_test = X[folds[i], :]
            y_train = Y[.!folds[i]]
            y_test = Y[folds[i]]

            model = Lasso(alpha=l)
            !fit(X_train, y_train)
            predict = predict(model, X_test)

            pe = sum((y_test - predict).^2)
            split_pes[i] = pe
        end

        all_mse[j] = mean(split_pes)
    end

    selected = argmin(all_mse)
    optimal_lambda = all_lambdas[selected]
    optimal_model = glmnet(X, Y, alpha=1.0, lambda=[optimal_lambda])
    optimal_coef = vcat(optimal_model.intercept_, coef(optimal_model))

    output = Dict(
        "optimal_lambda" => optimal_lambda,
        "optimal_coef" => optimal_coef,
        "all_lambdas" => all_lambdas,
        "all_mse" => all_mse
    )

    return output
end

optimal_lambda (generic function with 2 methods)

7. The `predict_model` function can be easily implemented using the results of `optimal_function`

In [8]:
function predict_model(optimal_model::Dict, X::AbstractArray)
    intercept = ones(size(X, 1), 1)
    Z = hcat(intercept, X)
    
    return Z * reshape(optimal_model["optimal_coef"], :, 1)
end

predict_model (generic function with 1 method)

### 3 Applying the Lasso Cross-Validation Procedure

We split the sample in train and test

In [9]:
train_sample = rand(Float64, size(data)[1]) .< 0.80
test_sample = .!(train_sample)
y_train, y_test = y[train_sample], y[test_sample]
X_flexible_train, X_flexible_test = X_flexible[train_sample, :], X_flexible[test_sample, :];

8. We perform the OLS fitting

In [10]:
using GLM, DataFrames


# Fitting a linear regression model
model_ls = lm(X_flexible_train,y_train)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:

Coefficients:
─────────────────────────────────────────────────────────────────────────
           Coef.  Std. Error       t  Pr(>|t|)     Lower 95%    Upper 95%
─────────────────────────────────────────────────────────────────────────
x1   -0.0832756   0.0191439    -4.35    <1e-04  -0.120808     -0.0457431
x2    0.222663    0.012255     18.17    <1e-70   0.198637      0.24669
x3   -1.64079     0.130375    -12.59    <1e-34  -1.8964       -1.38518
x4    0.494905    0.0503171     9.84    <1e-21   0.396256      0.593554
x5   -0.0518033   0.00631768   -8.20    <1e-15  -0.0641894    -0.0394172
x6    1.60241     0.0453767    35.31    <1e-99   1.51345       1.69137
x7    1.77841     0.0440866    40.34    <1e-99   1.69198       1.86484
x8    2.21322     0.0404132    54.76    <1e-99   2.13399       2.29245
x9    2.43521     0.0447395    54.43    <1e-99   2.3475       

9. Npw we search the optimal lambda using our `optimal_lambda` function

In [20]:
# Finding the optimal lambda and fitting the Lasso model using the training data
model_lasso = optimal_lambda(y_train, X_flexible_train, (-7, 7))

MethodError: MethodError: objects of type Module are not callable

In [117]:
# Printing the optimal lambda
println(model_lasso["optimal_lambda"])

0.0016054624479407073


In [133]:
# Printing the optimal coefficients
println(model_lasso["optimal_coef"])

[ 2.74211151e+00  1.92056918e-01  2.19697055e-01  1.98834039e-01
  6.11655366e-04  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -1.30250733e-01 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -1.19425276e-01 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -3.07064932e-02  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -3.90294823e-02 -0.00000000e+00  0.00000000e+00
  2.99489128e-02  0.00000000e+00  3.83567086e-02 -0.00000000e+00
 -0.00000000e+00 -2.17286049e-03 -0.00000000e+00 -2.23640240e-02
 -2.28784353e-01  0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000

For comparison, we can see that our code replicates exactly (but less efficiently) the results of the routine embedded in LassoCV

In [130]:
using GLM, MLJ

# Generate the grid of alpha values (lambda values)
alphas = exp(range(-7, stop = 7, length = 100))

# Fitting Lasso model with cross-validation
model = LassoCV(alphas)
fitted = fit!(model, X_flexible_train, y_train)

In [131]:
fitted.best_lambda

np.float64(0.0016054624479407073)

In [135]:
np.hstack((fitted.intercept_,fitted.coef_))

array([ 2.74211151e+00,  1.92056918e-01,  2.19697055e-01,  1.98834039e-01,
        6.11655366e-04,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -1.30250733e-01, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -1.19425276e-01, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -3.07064932e-02,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -3.90294823e-02, -0.00000000e+00,  0.00000000e+00,
        2.99489128e-02,  0.00000000e+00,  3.83567086e-02, -0.00000000e+00,
       -0.00000000e+00, -2.17286049e-03, -0.00000000e+00, -2.23640240e-02,
       -2.28784353e-01,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

10. Now we use HDM for python (hdmpy) to estimate the model using the theoretically optimal penalty parameter.

In [136]:
# !git clone https://github.com/maxhuppertz/hdmpy
import hdmpy

In [137]:
model_rlasso = hdmpy.rlasso(X_flexible_train,y_train)

As you may notice, the optimal penalty parameter resulting from this procedure is not comparable in size to the cross validation result. This is due to the fact that this penalty is the theoretically optimal parameter for the Lasso estimator under data-driven penalty loadings. That is:

\begin{equation*}

\hat{\beta} = \arg \ \underset{\beta}{\min} \sum_{i=1}^n (y_i - x_{i}^{\prime}\beta)^2 + \frac{\lambda}{n} \lVert \hat{\Psi}\beta \rVert_1

\end{equation*}

Where $\hat{\Psi} = diag(\hat{\psi_1},\hat{\psi_2},\dots,\hat{\psi_p})$ are the data-driven penalty loadings chosen to be a function of the data depending on the setting. For more detail, you can check the [package documentation](https://arxiv.org/pdf/1608.00354)

In [138]:
rlambda = model_rlasso.est['lambda0']
rlambda

np.float64(617.5092395386587)

11. The predictive capability of each model (OLS, Lasso and RLasso) is reported via $MSE$ and $R^2$ out of sample

In [148]:
# OLS 

y_predict_ols = model_ls.predict(X_flexible_test)
MSE_ols = np.mean((y_test-y_predict_ols)**2)
R2_test_ols = 1-MSE_ols/np.var(y_test)

In [145]:
# Lasso CV

y_predict_lasso = predict_model(model_lasso, X_flexible_test)
MSE_lasso = np.mean((y_test-y_predict_lasso)**2)
R2_test_lasso = 1-MSE_lasso/np.var(y_test)


In [158]:
# Rigurous Lasso
intercept = np.ones((X_flexible_test.shape[0],1))
Z = np.hstack((intercept,X_flexible_test))
y_predict_rlasso = Z@model_rlasso.est['coefficients'].to_numpy()

MSE_lasso = np.mean((y_test-y_predict_lasso)**2)
R2_test_lasso = 1-MSE_lasso/np.var(y_test)

In [152]:
model_rlasso.est['coefficients'].to_numpy()

array([[ 2.71428000e+00],
       [ 2.37534975e-01],
       [ 2.13217801e-01],
       [ 2.67714266e-01],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [-1.62529429e-01],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [-2.68111905e-01],
       [ 0.00000000e+00],
       [ 3.30115974e-02],
       [-7.76382024e-02],
       [-2.63716934e-01],
       [ 0.00000000e+00],
       [-1.67552014e-01],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 1.75920170e-01],
       [ 0.00000000e+00],
       [ 1.05706230e-01],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.0