In [49]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import patsy
from sklearn.preprocessing import StandardScaler


### 1 Loading the data

In [70]:
# NOTE(review): machine-specific working directory; the CSV below is read
# relative to it, so adjust this path when running elsewhere.
os.chdir("D:/PUCP/JP-TC")

data = pd.read_csv('wage2015_subsample_inference.csv')

# Regressors: every column except 'lwage', 'wage' and 'rownames'.
X = data[[col for col in data.columns
          if col not in ('lwage', 'wage', 'rownames')]]
# Outcome, kept as a one-column DataFrame.
y = data[['lwage']]

In [72]:
# Standardise the outcome and the regressors; each gets its own freshly
# fitted StandardScaler.
y_norm, X_norm = (StandardScaler().fit_transform(frame) for frame in (y, X))

In [73]:
# Flexible design matrix: no intercept ('0+'), 'sex' as a main effect, and
# the remaining terms with all their pairwise interactions ('(...)**2').
X_flexible = patsy.dmatrix(
    formula_like='0+ sex + (exp1+exp2+exp3+exp4+hsg+scl+clg+ad+so+we+ne+C(occ2)+C(ind2))**2',
    data=data,
    return_type='dataframe',
)

### 2 Creating the Lasso Cross-Validation Procedure

4. Logarithmically spaced grid

In [74]:
def log_grid(lower:float, upper:float, log_step:float):
    """Build a grid of values that are equally spaced on the log scale.

    Parameters
    ----------
    lower, upper : float
        Grid bounds expressed in logs; the returned grid runs from
        ``exp(lower)`` to ``exp(upper)``, both included.
    log_step : float
        Reciprocal of the number of grid points, i.e. the grid has
        ``1/log_step`` points (truncated to an integer).

    Returns
    -------
    np.ndarray
        One-dimensional array of grid values in increasing order when
        ``lower < upper``.

    Raises
    ------
    ZeroDivisionError
        If ``log_step`` is zero.
    """
    # Callers pass log_step = 1/n; the round trip 1/(1/n) can land a hair
    # below n in floating point, and plain truncation would then drop a point.
    # The small tolerance snaps such values back while still truncating
    # genuinely fractional counts.
    n_points = int(1/log_step + 1e-9)

    exponents = np.linspace(lower, upper, n_points)

    return np.exp(exponents)

5. K-fold splitting function

In [75]:
def k_folds(data:np.ndarray, k:int = 5):
    """Split the rows of ``data`` into ``k`` contiguous folds.

    Rows are assigned in their existing order (no shuffling). When the number
    of rows is not a multiple of ``k``, the first ``n_rows % k`` folds receive
    one extra row each.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        Only ``data.shape[0]`` (the number of rows) is used.
    k : int
        Number of folds.

    Returns
    -------
    list of np.ndarray
        ``k`` boolean vectors of length ``n_rows``; the i-th vector is True
        exactly on the rows that belong to fold i.
    """
    n_rows = data.shape[0]
    base_size, n_larger = divmod(n_rows, k)

    # The leading folds absorb the remainder, one extra row each.
    fold_sizes = [base_size + 1 if j < n_larger else base_size for j in range(k)]

    # edges[j] is the first row of fold j, edges[j + 1] is one past its last row.
    edges = np.concatenate(([0], np.cumsum(fold_sizes)))

    row_ids = np.arange(n_rows)

    return [(row_ids >= edges[j]) & (row_ids < edges[j + 1]) for j in range(k)]

6. Optimal lambda search function

In [76]:

def optimal_lambda(Y:np.ndarray, X:np.ndarray, lambda_bounds:tuple, k:int=5, *, niter:int = 100):
    """Choose the Lasso penalty by k-fold cross-validation on a log-spaced grid.

    For every penalty in the grid, a Lasso is fitted on the training part of
    each fold and scored by mean squared error on the held-out part; the
    penalty with the lowest average score is then refitted on the full sample.

    Parameters
    ----------
    Y : array-like
        Outcome, either a 1-D vector or a single-column 2-D array.
    X : array-like
        Regressor matrix with one row per observation (arrays and DataFrames
        are both accepted; it is converted with ``np.asarray``).
    lambda_bounds : tuple
        ``(lower, upper)`` bounds of the penalty grid, expressed in logs and
        forwarded to ``log_grid``.
    k : int
        Number of folds, forwarded to ``k_folds``.
    niter : int
        Requested number of penalties; ``1/niter`` is passed to ``log_grid``
        as the step.

    Returns
    -------
    dict
        ``'optimal_lambda'``    penalty with the lowest cross-validated MSE
        (the first one in case of ties),
        ``'optimal_coef'``      coefficients of the full-sample refit,
        ``'optimal_intercept'`` intercept of the full-sample refit,
        ``'all_lambdas'``       the penalty grid,
        ``'all_mse'``           cross-validated MSE for each grid point.
    """
    from sklearn.linear_model import Lasso

    X_arr = np.asarray(X)
    y_arr = np.asarray(Y)
    # A column-vector outcome minus a 1-D prediction would broadcast to an
    # n-by-n matrix and corrupt the error, so flatten single-column outcomes.
    if y_arr.ndim == 2 and y_arr.shape[1] == 1:
        y_arr = y_arr.ravel()

    all_lambdas = log_grid(lambda_bounds[0],lambda_bounds[1],1/niter)

    folds = k_folds(X_arr,k)

    # Sized from the grid itself so the scores always line up with the penalties.
    all_mse = np.zeros(len(all_lambdas))

    for j, penalty in enumerate(all_lambdas):

        fold_mse = np.zeros(k)

        for i, held_out in enumerate(folds):

            model = Lasso(alpha=penalty).fit(X_arr[~held_out], y_arr[~held_out])

            residuals = y_arr[held_out] - model.predict(X_arr[held_out])

            # Mean (not sum) so that folds of different sizes weigh equally.
            fold_mse[i] = np.mean(residuals**2)

        all_mse[j] = np.mean(fold_mse)

    # argmin returns the first minimiser, matching a "first match" selection.
    best_lambda = all_lambdas[int(np.argmin(all_mse))]

    # Refit on the outcome that was passed in, not on any notebook-level variable.
    final_model = Lasso(alpha=best_lambda).fit(X_arr, y_arr)

    output = {'optimal_lambda':best_lambda, 'optimal_coef':final_model.coef_,
              'optimal_intercept':final_model.intercept_,
              'all_lambdas': all_lambdas, 'all_mse':all_mse}

    return output

7. Prediction function

In [84]:
def predict_model(optimal_model:dict, X:np.ndarray):
    """Compute linear predictions from a fitted-model dictionary.

    Parameters
    ----------
    optimal_model : dict
        Must contain ``'optimal_coef'`` (one coefficient per column of ``X``).
        If it also contains ``'optimal_intercept'``, that value is added to
        every prediction; otherwise the prediction is ``X @ coef`` only.
    X : array-like
        Regressor matrix with one row per observation.

    Returns
    -------
    Column of predictions with one row per row of ``X``.
    """
    coef_column = np.asarray(optimal_model['optimal_coef']).reshape(-1,1)

    fitted = X@coef_column

    # Only touch the result when an intercept is actually supplied, so that
    # dictionaries without the key produce exactly X @ coef.
    if 'optimal_intercept' in optimal_model:
        fitted = fitted + optimal_model['optimal_intercept']

    return fitted

### 3 Applying the Lasso Cross-Validation Procedure

8. OLS fitting

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares of the standardised outcome on the flexible design matrix.
model_ls = LinearRegression().fit(X_flexible, y_norm)
model_ls

9. Optimal lambda search

In [None]:
# Cross-validated penalty search over 10 grid points with log-bounds (-7, 7).
lambda_CV = optimal_lambda(y_norm, X_flexible, (-7, 7), niter=10)
for key in ('optimal_lambda', 'optimal_coef'):
    print(lambda_CV[key])