In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import patsy
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter('ignore')


### 1 Loading the data

In [3]:
os.chdir("D:/PUCP/JP-TC")

data = pd.read_csv('wage2015_subsample_inference.csv')

X = data.drop(['wage', 'lwage'], axis=1)
y = data[['lwage']]

In [4]:
X_flexible = patsy.dmatrix('0+ sex + (exp1+exp2+exp3+exp4+hsg+scl+clg+ad+so+we+ne+C(occ2)+C(ind2))**2',
                      X, return_type='dataframe')

In [5]:
y = y.to_numpy()
X_flexible = X_flexible.to_numpy()

### 2 Creating the Lasso Cross-Validation Procedure

4. The `log_grid` function is pretty straight forward

In [6]:
def log_grid(lower:int, upper:int, log_step:int):
    
    log_grid = np.linspace(lower,upper,int(1/log_step))
    
    return np.exp(log_grid)

5. To code the `k_folds` function, there are many different approaches. However, we sticked to using only numpy. With this library, we exploited the kronecker product operation and block matrices to build the $k$-folds. Also, we addressed the issue of divisibility between the sample size $n$ and $k$ using an if-else statement depending on the module of $n/k$

In [7]:
def k_folds(data:np.ndarray, k:int = 5):

    
    module = data.shape[0]%k
    floor = data.shape[0]//k 

    if module == 0:
        
        trues = np.repeat(1, floor).reshape(-1,1)

        split_matrix = np.kron(np.eye(k), trues)
        
    else:
        
        trues_g1 = np.repeat(1,floor+1).reshape(-1,1)

        split_matrix_g1 = np.kron(np.eye(module), trues_g1)

        trues_g2 = np.repeat(1,floor).reshape(-1,1)

        split_matrix_g2 = np.kron(np.eye(k-module), trues_g2)
        
        split_matrix = np.block([[split_matrix_g1, np.zeros((split_matrix_g1.shape[0],split_matrix_g2.shape[1]))],
                                     [np.zeros((split_matrix_g2.shape[0],split_matrix_g1.shape[1])), split_matrix_g2]])
        
    sm_bool = split_matrix == 1
    
    splits = [sm_bool[:,x] for x in range(k)]
        
    return splits

6. For the `optimal_lambda` search function, we basically adapted the code provided in the labs so it can use the functions of log-grid and our own $k$-folds function

In [8]:

def optimal_lambda(Y:np.ndarray, X:np.ndarray, lambda_bounds:tuple, k:int=5, *, niter:int = 100):
    
    from sklearn.linear_model import Lasso
    
    Y = Y.squeeze()
    
    if len(X.shape) == 1:
        
        X = X.reshape(-1,1)
        
    folds = k_folds(X,k)
    
    all_lambdas = log_grid(lambda_bounds[0],lambda_bounds[1],1/niter)
        
    all_mse = np.zeros(niter)

    for l in all_lambdas:
        
        split_pes = np.zeros(k)
        
        for i in range(k):
            
            X_train, X_test = X[~folds[i]],  X[folds[i]]
            y_train, y_test = Y[~folds[i]],  Y[folds[i]]

            
            model = Lasso(alpha=l).fit(X_train, y_train)
            
            predict = model.predict(X_test)
            
            pe = np.sum((y_test-predict)**2)
            
            split_pes[i] = pe
            
        all_mse[all_lambdas == l] = np.mean(split_pes)

    selected = np.where(all_mse == np.min(all_mse))

    optimal_lambda = all_lambdas[selected][0]

    optimal_model = Lasso(alpha=optimal_lambda).fit(X,Y)
    
    optimal_coef = np.hstack((optimal_model.intercept_,optimal_model.coef_))
    
    output = {'optimal_lambda':optimal_lambda, 'optimal_coef':optimal_coef,
              'all_lambdas': all_lambdas, 'all_mse':all_mse}
    
    return output

7. The `predict_model` function can be easily implemented using the results of `optimal_function`

In [9]:
def predict_model(optimal_model:dict, X:np.ndarray):

    intercept = np.ones((X.shape[0],1))
    Z = np.hstack((intercept,X))
    return Z@optimal_model['optimal_coef'].reshape(-1,1)

In [18]:
X_flexible_train.shape

(4120, 980)

### 3 Applying the Lasso Cross-Validation Procedure

We split the sample in train and test

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_flexible_train, X_flexible_test, y_train, y_test = train_test_split(X_flexible, y, test_size= 0.2)

8. We perform the OLS fitting

In [12]:
from sklearn.linear_model import LinearRegression
model_ls = LinearRegression()
model_ls.fit(X_flexible_train,y_train)

9. Npw we search the optimal lambda using our `optimal_lambda` function

In [13]:
model_lasso = optimal_lambda(y_train,X_flexible_train, (-7,7))

In [14]:
print(model_lasso['all_mse'])

[189.48858513 189.37697851 189.27797951 189.21841569 189.26878342
 189.52962527 189.85343543 190.36558096 190.93055087 191.679036
 192.64540952 193.6266622  194.39405716 195.25928025 196.36050402
 197.48622865 198.72532779 200.36385093 202.2461461  203.55738484
 204.26839459 204.64604751 205.43410903 206.02436035 206.77825682
 207.49556857 208.4872388  209.75251045 211.29296016 212.98207143
 214.66867318 216.46794788 218.23715345 219.96417253 222.00202443
 224.36414392 227.09827758 229.63986137 232.40016662 235.3451906
 237.45463549 239.01197673 240.86964186 243.32844402 247.0913061
 251.9950296  254.94995277 257.6142977  259.01063628 259.67275174
 260.17173369 260.47140056 260.67726563 260.73758923 260.74857764
 260.76186039 260.77976501 260.80411288 260.83711786 260.88164212
 260.94156025 261.02198569 261.12986757 261.2743253  261.46743789
 261.72547563 262.06977331 262.41804353 262.50583821 262.52404239
 262.52061321 262.5191056  262.50974762 262.49794605 262.48166637
 262.45701139 

In [117]:
print(model_lasso['optimal_lambda'])

0.0016054624479407073


In [133]:
print(model_lasso['optimal_coef'])

[ 2.74211151e+00  1.92056918e-01  2.19697055e-01  1.98834039e-01
  6.11655366e-04  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -1.30250733e-01 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -1.19425276e-01 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -3.07064932e-02  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -3.90294823e-02 -0.00000000e+00  0.00000000e+00
  2.99489128e-02  0.00000000e+00  3.83567086e-02 -0.00000000e+00
 -0.00000000e+00 -2.17286049e-03 -0.00000000e+00 -2.23640240e-02
 -2.28784353e-01  0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000

For comparison, we can see that our code replicates exactly (but less efficiently) the results of the routine embedded in LassoCV

In [130]:
from sklearn.linear_model import LassoCV
model = LassoCV(alphas = log_grid(-7,7,1/100))
fitted = model.fit(X_flexible_train,y_train)

In [131]:
fitted.alpha_

np.float64(0.0016054624479407073)

In [135]:
np.hstack((fitted.intercept_,fitted.coef_))

array([ 2.74211151e+00,  1.92056918e-01,  2.19697055e-01,  1.98834039e-01,
        6.11655366e-04,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -1.30250733e-01, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -1.19425276e-01, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -3.07064932e-02,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -3.90294823e-02, -0.00000000e+00,  0.00000000e+00,
        2.99489128e-02,  0.00000000e+00,  3.83567086e-02, -0.00000000e+00,
       -0.00000000e+00, -2.17286049e-03, -0.00000000e+00, -2.23640240e-02,
       -2.28784353e-01,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

10. Now we use HDM for python (hdmpy) to estimate the model using the theoretically optimal penalty parameter.

In [136]:
# !git clone https://github.com/maxhuppertz/hdmpy
import hdmpy

In [137]:
model_rlasso = hdmpy.rlasso(X_flexible_train,y_train)

As you may notice, the optimal penalty parameter resulting from this procedure is not comparable in size to the cross validation result. This is due to the fact that this penalty is the theoretically optimal parameter for the Lasso estimator under data-driven penalty loadings. That is:

\begin{equation*}

\hat{\beta} = \arg \ \underset{\beta}{\min} \sum_{i=1}^n (y_i - x_{i}^{\prime}\beta)^2 + \frac{\lambda}{n} \lVert \hat{\Psi}\beta \rVert_1

\end{equation*}

Where $\hat{\Psi} = diag(\hat{\psi_1},\hat{\psi_2},\dots,\hat{\psi_p})$ are the data-driven penalty loadings chosen to be a function of the data depending on the setting. For more detail, you can check the [package documentation](https://arxiv.org/pdf/1608.00354)

In [138]:
rlambda = model_rlasso.est['lambda0']
rlambda

np.float64(617.5092395386587)

11. The predictive capability of each model (OLS, Lasso and RLasso) is reported via $MSE$ and $R^2$ out of sample

In [148]:
# OLS 

y_predict_ols = model_ls.predict(X_flexible_test)
MSE_ols = np.mean((y_test-y_predict_ols)**2)
R2_test_ols = 1-MSE_ols/np.var(y_test)

In [145]:
# Lasso CV

y_predict_lasso = predict_model(model_lasso, X_flexible_test)
MSE_lasso = np.mean((y_test-y_predict_lasso)**2)
R2_test_lasso = 1-MSE_lasso/np.var(y_test)


In [158]:
# Rigurous Lasso
intercept = np.ones((X_flexible_test.shape[0],1))
Z = np.hstack((intercept,X_flexible_test))
y_predict_rlasso = Z@model_rlasso.est['coefficients'].to_numpy()

MSE_lasso = np.mean((y_test-y_predict_lasso)**2)
R2_test_lasso = 1-MSE_lasso/np.var(y_test)

In [152]:
model_rlasso.est['coefficients'].to_numpy()

array([[ 2.71428000e+00],
       [ 2.37534975e-01],
       [ 2.13217801e-01],
       [ 2.67714266e-01],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [-1.62529429e-01],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [-2.68111905e-01],
       [ 0.00000000e+00],
       [ 3.30115974e-02],
       [-7.76382024e-02],
       [-2.63716934e-01],
       [ 0.00000000e+00],
       [-1.67552014e-01],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 1.75920170e-01],
       [ 0.00000000e+00],
       [ 1.05706230e-01],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.00000000e+00],
       [ 0.0

In [26]:
from sklearn.linear_model import LassoCV

In [30]:
y = np.random.uniform(0,1,100)
X = 1+1*y + np.random.normal(0,1,100)

In [42]:
alphas = log_grid(-7,7,1/100)

In [43]:
alphas

array([9.11881966e-04, 1.05039829e-03, 1.20995548e-03, 1.39374965e-03,
       1.60546245e-03, 1.84933476e-03, 2.13025167e-03, 2.45384031e-03,
       2.82658257e-03, 3.25594499e-03, 3.75052824e-03, 4.32023947e-03,
       4.97649076e-03, 5.73242767e-03, 6.60319260e-03, 7.60622811e-03,
       8.76162633e-03, 1.00925314e-02, 1.16256030e-02, 1.33915507e-02,
       1.54257487e-02, 1.77689446e-02, 2.04680757e-02, 2.35772092e-02,
       2.71586249e-02, 3.12840635e-02, 3.60361628e-02, 4.15101135e-02,
       4.78155662e-02, 5.50788271e-02, 6.34453891e-02, 7.30828452e-02,
       8.41842462e-02, 9.69719679e-02, 1.11702165e-01, 1.28669902e-01,
       1.48215066e-01, 1.70729172e-01, 1.96663206e-01, 2.26536662e-01,
       2.60947943e-01, 3.00586353e-01, 3.46245901e-01, 3.98841207e-01,
       4.59425824e-01, 5.29213342e-01, 6.09601694e-01, 7.02201167e-01,
       8.08866647e-01, 9.31734785e-01, 1.07326679e+00, 1.23629773e+00,
       1.42409333e+00, 1.64041539e+00, 1.88959711e+00, 2.17662993e+00,
      

In [45]:
model = LassoCV(alphas=alphas)
fit = model.fit(X.reshape(-1,1),y)
fit.alpha_

np.float64(0.0009118819655545162)