# Lasso model for high dimensional data

In [1]:
import pandas as pd 
import numpy as np 
import sklearn.linear_model as lm
from sklearn.preprocessing import PolynomialFeatures

import LassoFunctions as lf
%load_ext autoreload
%autoreload 2

## Read data 

In [17]:
path = '/Users/johan/Documents/04 Div Uni/09 Asset Pricing Data/data.csv'
dat = pd.read_csv(path, low_memory=False)

print(f'The data contains {dat.shape[0]} rows (hours) and {dat.shape[1]} columns (variables)')

The data contains 120523 rows (hours) and 471 columns (variables)


In [26]:
dat.iloc[:,113].unique()

array([True, False, nan], dtype=object)

In [18]:
# Assuming df is your DataFrame
for index, row in dat.iterrows():
    if isinstance(row[113], float):  # Column index 112 corresponds to column 113 (0-based index)
        print(f"Row {index} with float value in column 113: {row[113]}")
        break

  if isinstance(row[113], float):  # Column index 112 corresponds to column 113 (0-based index)


Row 57497 with float value in column 113: nan


  print(f"Row {index} with float value in column 113: {row[113]}")


In [19]:
# Assuming df is your DataFrame
for index, row in dat.iterrows():
    if isinstance(row[113], float):  # Column index 112 corresponds to column 113 (0-based index)
        print(f"Row {index} with float value in column 113: {row[113]}")
        print(row)
        break

  if isinstance(row[113], float):  # Column index 112 corresponds to column 113 (0-based index)


Row 57497 with float value in column 113: nan
SpotPriceDKK                                 218.339996
from                          2018-03-11 07:00:00+00:00
to                            2018-03-11 08:00:00+00:00
mean_temp_Aabenraa                                  2.6
mean_temp_Aalborg                                  -0.6
                                        ...            
wind_dir_W_Vejen                                  False
wind_dir_W_Vejle                                  False
wind_dir_W_Vesthimmerlands                        False
wind_dir_W_Viborg                                 False
wind_dir_W_Ærø                                    False
Name: 57497, Length: 471, dtype: object


  print(f"Row {index} with float value in column 113: {row[113]}")


# Variable Selection

We select the following variables for our analysis:

In [None]:
# control variables



ds = ['lgdp_initial']

# all RHS variables
xs = ds + zs

# LHS variable
ys = ['gdp_growth']

# avoiding missings
all_vars = ys + xs
I = dat[all_vars].notnull().all(1)

# print number of control variables
print(f'Number of control variables before added interaction and squared terms: {len(zs)}')

Number of control variables before added interaction and squared terms: 45


In [None]:
# Create interaction terms in control variables using PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=True) # set to 1 for no interaction terms set to 2 for interaction terms and squared terms
Z_base = dat.loc[I, zs].values
Z = poly.fit_transform(Z_base)
Z_names = poly.get_feature_names_out(input_features=zs)

Z_df = pd.DataFrame(Z, columns=Z_names)

# dropping columns if there is no variation
Z_df = Z_df.loc[:, Z_df.std() > 0].reset_index(drop=True)
Z = Z_df.values
Z_names = Z_df.columns
print(f'Number of control variables in Z: {Z.shape[1]}')


d = dat.loc[I, ds].values.reshape(-1, 1)
X = np.hstack((d,Z))
X_names = pd.Index(ds + list(Z_df.columns))
y = dat.loc[I, ys].values.reshape(-1, 1)*100 # easier to interpret coefficients in %

Number of control variables in Z: 1064


In [5]:
X_stan=lf.standardize(X)
Z_stan=lf.standardize(Z)
d_stan=lf.standardize(d)

N = X.shape[0]

print(f'The final dataset contains {Z_stan.shape[0]} countries and {Z_stan.shape[1]} control variables')

The final dataset contains 84 countries and 1064 control variables


# Post Double Lasso

## BRT

In [6]:
# calculate first penalty
penalty_BRTyx = lf.brt_pen(X_stan,y, do_print=True)

Max term:  0.99
Penalty_BRT:  0.71


In [7]:
# run first lasso
fit_BRTyx = lm.Lasso(alpha=penalty_BRTyx).fit(X_stan, y)
coefs = fit_BRTyx.coef_

# calculate residuals
resyx = y - fit_BRTyx.predict(X_stan).reshape(-1,1)
resyxz = resyx + d_stan * coefs[0]

# Selected variables
selected_variables_BRT_first = (coefs!=0)
print('Selected varriables: ', X_names[selected_variables_BRT_first].to_list())

Selected varriables:  ['cenlong pother', 'asia pother']


In [8]:
# calculate second penalty
penalty_BRTdz = lf.brt_pen(Z_stan,d, do_print=True)

Max term:  0.99
Penalty_BRT:  0.67


In [9]:
# run second lasso
fit_BRTdz = lm.Lasso(alpha=penalty_BRTdz).fit(Z_stan, d)
coefs = fit_BRTdz.coef_

# calculate residuals
resdz = d - fit_BRTdz.predict(Z_stan).reshape(-1,1)

# Selected variables
selected_variables_BRT_second = (coefs!=0)
print('Selected varriables: ', Z_names[selected_variables_BRT_second].to_list())

Selected varriables:  ['abslat demreg', 'uvdamage pdiv']


In [10]:
# finding alpha
resid_BRT = {'resyx': resyx, 'resdz': resdz, 'resyxz': resyxz}
alpha_BRT = lf.alpha_pdl(resid_BRT, d, do_print=True)

# finding standard errors
se_BRT = lf.se_pdl(resid_BRT, do_print=True)[0]

alpha PDL:  -0.213281
Standard error PDL:  0.166696


## BCCH

In [11]:
# calculate first penalty
penalty_BCCHyx = lf.bcch_pen(X_stan, y, do_print=True)

Penalty_BCCH:  2.82


In [12]:
# run first lasso
fit_BCCHyx = lm.Lasso(alpha=penalty_BCCHyx).fit(X_stan, y)
coefs = fit_BCCHyx.coef_

# calculate residuals
resyx = y - fit_BCCHyx.predict(X_stan).reshape(-1,1)
resyxz = resyx + d_stan * coefs[0]

# Selected variables
selected_variables_BCCH_first = (coefs!=0)
print('Selected varriables: ', X_names[selected_variables_BCCH_first].to_list())

Selected varriables:  []


In [13]:
# calculate second penalty
penalty_BCCHdz = lf.bcch_pen(Z_stan,d, do_print=True)

Penalty_BCCH:  1.31


In [14]:
# run second lasso
fit_BCCHdz = lm.Lasso(alpha=penalty_BCCHdz).fit(Z_stan, d)
coefs = fit_BCCHdz.coef_

# calculate residuals
resdz = d - fit_BCCHdz.predict(Z_stan).reshape(-1,1)

# Selected variables
selected_variables_BCCH_second = (coefs!=0)
print('Selected varriables: ', Z_names[selected_variables_BCCH_second].to_list())

Selected varriables:  []


In [15]:
# calculate alpha
resid_BCCH = {'resyx': resyx, 'resdz': resdz, 'resyxz': resyxz}
alpha_BCCH = lf.alpha_pdl(resid_BCCH, d, do_print=True)

# finding standard errors
se_BCCH = lf.se_pdl(resid_BCCH, do_print=True)[0]

alpha PDL:  -0.128434
Standard error PDL:  0.122495


## Gathering results

### Results

In [16]:
BCCH_res = [alpha_BCCH, se_BCCH]
BRT_res = [alpha_BRT, se_BRT]

res = { 'BRT': BRT_res, 'BCCH': BCCH_res}

lf.generate_table(res, 'output/pdl_results.tex')

LaTeX table saved to output/pdl_results.tex


### Penalties

In [17]:
BCCH_penalty = [penalty_BCCHyx, penalty_BCCHdz]
BRT_penalty = [penalty_BRTyx, penalty_BRTdz]

penalties = {'BRT': BRT_penalty, 'BCCH': BCCH_penalty}

In [18]:
path = 'output/pdl_penalties.tex'
cols = list(penalties.keys())
num_models = len(cols)

lines = []

lines.append("\\begin{tabular}{" + "l" + "c" * num_models + "}")
lines.append("\\hline\\hline\\\\[-1.8ex]")
header_row = [""] + cols
lines.append(" & ".join(header_row) + " \\\\")
lines.append("\\hline")

# For each variable in label_x
lines.append("First stage & " + " & ".join([f"{penalties[col][0]:.4f}" for col in cols]) + " \\\\")
lines.append("Second stage & " + " & ".join([f"{penalties[col][1]:.4f}" for col in cols]) + " \\\\")

lines.append("\\hline\\hline")

# End of table
lines.append("\\end{tabular}")

# Save to file
with open(path, 'w') as f:
    f.write('\n'.join(lines))

print(f"LaTeX table saved to {path}")

LaTeX table saved to output/pdl_penalties.tex


### Variable selection

In [19]:
# translate variables to labels using lbl_all dictionary and zs
lbls_zs = [lbl_all[var] for var in zs]
lbls_ds = [lbl_all[var] for var in ds]
lbls_ys = [lbl_all[var] for var in ys]

lbls_zs = [variable.replace('%', '\\%') for variable in lbls_zs] # preparing for latex


In [20]:
path = 'output/model_variables.tex'

lines = []

lines.append("\\begin{longtable}{lp{11cm}}")
lines.append("\\caption{Variables used in model} \\\\")
lines.append('\\label{tab:model-vars} \\\\')
lines.append("\\hline\\hline\\\\[-1.8ex]")
lines.append(" & Variable Name \\\\")
lines.append("\\hline")

# dependent variable
lines.append(f'Dependent Variable & {lbls_ys[0]} \\\\')
lines.append("\\hline")

# independent variable
lines.append(f'Independent Variables & {lbls_ds[0]} \\\\')
lines.append("\\hline")

# control variables
lines.append('Control Variables')
for variable in lbls_zs:
    lines.append(f" & {variable} \\\\")

lines.append("\\hline\\hline")
lines.append("\\end{longtable}")

# Save to file
with open(path, 'w') as f:
    f.write('\n'.join(lines))

print(f"LaTeX table saved to {path}")

LaTeX table saved to output/model_variables.tex


In [21]:
BRT_var = [X_names[selected_variables_BRT_first].to_list(), Z_names[selected_variables_BRT_second].to_list()]
BCCH_var = [X_names[selected_variables_BCCH_first].to_list(), Z_names[selected_variables_BCCH_second].to_list()]

vars = {'BRT': BRT_var, 'BCCH': BCCH_var}

In [22]:
path = 'output/pdl_variables.tex'
cols = list(vars.keys())
num_models = len(cols)

lines = []

lines.append("\\begin{tabular}{" + "l" + "c" * num_models + "}")
lines.append("\\hline\\hline\\\\[-1.8ex]")
header_row = [""] + cols
lines.append(" & ".join(header_row) + " \\\\")
lines.append("\\hline")

# Print each variable in the First stage on its own row
lines.append("First stage")
for i in range(max(len(vars[col][0]) for col in cols)):  # Determine the maximum number of variables in the first stage
    row = [""]  # Start each row with an empty cell for the label column
    for col in cols:
        row.append(vars[col][0][i] if i < len(vars[col][0]) else "")  # Add variable or empty string if no more variables
    lines.append(" & ".join(row) + " \\\\")

# Print each variable in the Second stage on its own row
lines.append("Second stage")
for i in range(max(len(vars[col][1]) for col in cols)):  # Determine the maximum number of variables in the second stage
    row = [""]  # Start each row with an empty cell for the label column
    for col in cols:
        row.append(vars[col][1][i] if i < len(vars[col][1]) else "")  # Add variable or empty string if no more variables
    lines.append(" & ".join(row) + " \\\\")

lines.append("\\hline\\hline")
lines.append("\\end{tabular}")

# Save to file
with open(path, 'w') as f:
    f.write('\n'.join(lines))

print(f"LaTeX table saved to {path}")

LaTeX table saved to output/pdl_variables.tex


## Effect of the penalty, $\lambda$

In [23]:
penalty_grid = np.geomspace(0.001, 3, num = 50)

coefs = []
MSE = []
for lamb in penalty_grid:
    fit = lm.Lasso(alpha = lamb, max_iter=10000).fit(X_stan,y) 
    coefs.append(fit.coef_)
    pred = fit.predict(X_stan).reshape(-1,1)
    MSE.append(np.mean((y-pred)**2))

  model = cd_fast.enet_coordinate_descent(


In [24]:
lf.plot_lasso_path(penalty_grid, coefs, None, -1, vlines={'BCCH': penalty_BCCHyx, 'BRT': penalty_BRTyx}, do_print=False, save_path='output/lasso_path.png')

Figure saved to output/lasso_path.png


In [25]:
lf.plot_MSE_path(penalty_grid, MSE,0.1, do_print=False, save_path='output/MSE_path.png')

Figure saved to output/MSE_path.png
