In [1]:
# basics
import numpy as np
import pandas as pd 
from pydataset import data

# graphics
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# scikit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# pandas display options: raise the row / column / width limits used when printing frames
for option_name, option_value in {
    'display.max_rows': 5000,
    'display.max_columns': 5000,
    'display.width': 10000,
}.items():
    pd.set_option(option_name, option_value)

# [Elastic Net Regression](https://en.wikipedia.org/wiki/Elastic_net_regularization)

### What is it?
A regularized regression model that combines the Lasso ($L_1$) and Ridge ($L_2$) penalties.

### When do I use it?
a] I have lots of predictors in the data and do not know much about them.

b] When I want to use a linear model as a part of my ML pipeline (it works well as a first prototype).

### Why do I use it?
To get good predictions together with model explainability. To get rid of nonsense predictors. It's super fast.

### Why do I NOT use it?
It's a linear model, which often struggles to beat more flexible ML models.

# How does it work?

# What is regularization?

A way of penalizing the loss function with an additional term that depends on the coefficients.

### Regularized loss function

$L(\hat{y}, y)_{reg} = L(\hat{y}, y) + \lambda * R(\beta)$.

#### For example:
Instead of just least squares we have


$L(\hat{y}, y)_{reg} = \sum_{p}{(\hat{y}-y)^2} + 0.8 * \sum_{p}{\beta^2}$.

### But whyyy?

This way we can put some constraints on the size of regression parameters and thus balance the bias/variance problem of ML. 

For example, by not letting them explode.

# Ridge regularization == $L_2$ regularization

Penalizes the sum of squares of the regression coefficients.

##### Regularization function is
$L_2(\beta) = \sum_{p}{\beta^2}$

##### The loss function for linear regression is
$L(\hat{y}, y)_{ridge} = \sum_{p}{(\hat{y}-y)^2} + \lambda * \sum_{p}{\beta^2}$, where $\lambda \geq 0$.

##### What does it actually do?

It makes the coefficient values smaller. With increasing $\lambda$ the coefficients go asymptotically to 0.

##### Good when?
When we know that all the predictors in the prediction dataset are important.

### Example:

In [196]:
# generate noisy linear data: y = 8 * x + Gaussian noise, plus an occasional large positive offset
# (each point gets the extra N(30, 5) term with probability 0.2)
# fix the seed so the data, and every fit / plot built on it, is reproducible on Restart & Run All
np.random.seed(42)
size = 10
X = np.linspace(1, 10, size)
y = 8 * X + np.random.normal(2, 10, size) + np.random.binomial(1, 0.2, size) * np.random.normal(30, 5, size)

In [197]:
# ordinary least-squares fit on the single feature
# (reshape(-1, 1) turns the 1-D X into the one-column 2-D array passed to fit)
model_linear = LinearRegression().fit(X.reshape(-1, 1), y)
print(model_linear)
print(f"Coeffs are: b_0:{model_linear.intercept_}, b_1:{model_linear.coef_}")
# fitted line evaluated at the training points: b_0 + b_1 * x
model_linear_fit_line = model_linear.intercept_ + model_linear.coef_ * X

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
Coeffs are: b_0:21.89839718302582, b_1:[5.85447595]


In [198]:
# mostly-ridge elastic net: l1_ratio=0.1 puts 10% of the penalty on L1 and 90% on L2,
# alpha=10 is the overall penalty strength
model_elastic = ElasticNet(alpha=10, l1_ratio=0.1).fit(X.reshape(-1, 1), y)
print(model_elastic)
print(f"Coeffs are: b_0:{model_elastic.intercept_}, b_1:{model_elastic.coef_}")
# fitted line evaluated at the training points: b_0 + b_1 * x
model_elastic_fit_line = model_elastic.intercept_ + model_elastic.coef_ * X

ElasticNet(alpha=10, copy_X=True, fit_intercept=True, l1_ratio=0.1,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
Coeffs are: b_0:39.0170383115717, b_1:[2.74199574]


In [199]:
# plot the data & fitted values with plotly
trace0 = go.Scatter(x=X, y=y, mode='markers', name='Original data')
trace1 = go.Scatter(x=X, y=model_linear_fit_line, mode='lines', name='Linear net model fit')
trace2 = go.Scatter(x=X, y=model_elastic_fit_line, mode='lines', name='Elastic net model fit')
plot_data = [trace0, trace1, trace2]
iplot(plot_data)

# Lasso regularization == $L_1$ regularization

Penalizes the sum of absolute values of the regression coefficients.

##### Regularization function is
$L_1(\beta) = \sum_{p}{|\beta|}$

##### The loss function for linear regression is
$L(\hat{y}, y)_{lasso} = \sum_{p}{(\hat{y}-y)^2} + \lambda * \sum_{p}{|\beta|}$, where $\lambda \geq 0$.

##### What does it actually do?

It sets coefficients with little impact on the original loss function to exactly zero... and thus allows us to throw all kinds of candidate predictors into the prediction set ;-)

##### Good when?
When we know that some of the predictors in the prediction dataset are not important.

# Lasso vs. Ridge penalty illustration

<img src="./lasso_vs_ridge.png" />

# Elastic Net == Lasso regularization + Ridge regularization

Penalizes both the sum of absolute values and the sum of squares of the regression coefficients.

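##### Regularization function is
$L(\beta) = \alpha \sum_{p}{|\beta|} + \frac{1-\alpha}{2} \sum_{p}{\beta ^2}$

##### The loss function for linear regression is
$L(\hat{y}, y)_{elastic} = \sum_{p}{(\hat{y}-y)^2} + \lambda * \big[\alpha \sum_{p}{|\beta|} + \frac{1-\alpha}{2} \sum_{p}{\beta ^2}\big]$, where $\lambda \geq 0, \alpha \in [0, 1]$ ($\alpha = 1$ gives Lasso, $\alpha = 0$ gives Ridge). In scikit-learn's `ElasticNet`, $\lambda$ plays the role of the `alpha` argument and $\alpha$ the role of `l1_ratio`.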

##### What does it actually do?

It sets coefficients with little impact on the original loss function to zero (the Lasso part), which gives a sparse prediction set, and shrinks the remaining coefficients (the Ridge part).


## But how is $\lambda$ set? => Cross-validation!


<img src="./cv_mse.png" />

# Coefficient values vs. L1 norm

<img src="./coeffs_plot.png" />

# Example

In [276]:
# preview the first rows of the 'Fair' dataset instead of dumping the whole frame
data('Fair').head(10)

Unnamed: 0,sex,age,ym,child,religious,education,occupation,rate,nbaffairs
1,male,37.0,10.0,no,3,18,7,4,0
2,female,27.0,4.0,no,4,14,6,4,0
3,female,32.0,15.0,yes,1,12,1,4,0
4,male,57.0,15.0,yes,5,18,6,5,0
5,male,22.0,0.75,no,2,17,6,3,0
6,female,32.0,1.5,no,2,17,5,5,0
7,female,22.0,0.75,no,2,12,1,3,0
8,male,57.0,15.0,yes,2,14,4,4,0
9,female,32.0,15.0,yes,4,16,1,2,0
10,male,22.0,1.5,no,4,14,4,5,0


In [261]:
# Build the modelling frame from the 'Fair' dataset.
# The two binary text columns are encoded with an explicit mapping rather than by writing
# integers into a text column through .loc (which newer pandas rejects for string dtypes).
# A label missing from the mapping becomes NaN, so astype(int) still fails loudly.
df = pd.DataFrame(data('Fair')).assign(
    sex=lambda frame: frame['sex'].map({'male': 0, 'female': 1}).astype(int),
    child=lambda frame: frame['child'].map({'no': 0, 'yes': 1}).astype(int),
)
# predictors and target: 'rate' is modelled from the remaining numeric columns
# NOTE(review): 'child' is encoded above but is not part of the predictors — confirm this is intentional
X = df[['religious','age','sex','ym','education','occupation','nbaffairs']]
y = df['rate']

In [262]:
# Linear regression model: fit on the full data and report the in-sample (training) MSE
regression = LinearRegression().fit(X, y)
MSE_lm = mean_squared_error(y_true=y, y_pred=regression.predict(X))
print(MSE_lm)

1.049873864469667


In [274]:
# CV: 5-fold cross-validated MSE of the plain linear model.
# The parameter grid is empty, so the search only scores the estimator as configured.
lm_search = GridSearchCV(
    estimator=regression,
    param_grid={},
    scoring='neg_mean_squared_error',
    refit=True,
    cv=5,
).fit(X, y)
# the scorer reports negated MSE; abs() turns it back into a positive error
print(f"CV score is: {abs(lm_search.best_score_)}")

CV score is: 1.1137026331064832



The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.



In [267]:
# map each predictor name to its coefficient in the plain linear regression
coef_dict_baseline = dict(zip(X.columns, regression.coef_))
coef_dict_baseline

{'religious': 0.04235281110639179,
 'age': -0.00905964542867384,
 'sex': 0.0888201333708709,
 'ym': -0.030458802565476555,
 'education': 0.06810255742293703,
 'occupation': -0.00597950685299818,
 'nbaffairs': -0.07882571247653965}

In [275]:
# Elastic net model: grid-search the penalty strength (alpha) and the L1/L2 mix (l1_ratio)
# with 5-fold cross-validation, scored by negated MSE
# NOTE(review): the `normalize` argument was removed from scikit-learn's linear models in
# version 1.2 — this cell needs the older scikit-learn the notebook was run with; confirm the pin.
elastic = ElasticNet(normalize=True)
elastic_param_grid = {
    'alpha': np.logspace(-5, 2, 10),
    'l1_ratio': [.2, .4, .6, .65, .7, .75, .8, .9],
}
search = GridSearchCV(
    estimator=elastic,
    param_grid=elastic_param_grid,
    scoring='neg_mean_squared_error',
    refit=True,
    cv=5,
)
search.fit(X, y)
print(search.best_params_)
# abs() turns the negated MSE back into a positive error
print(f"CV score is: {abs(search.best_score_)}")

{'alpha': 0.0021544346900318843, 'l1_ratio': 0.8}
CV score is: 1.0843454928901914



The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.



In [250]:
# map each predictor name to its coefficient in the best cross-validated elastic-net model;
# kept under its own name so the linear-regression dict `coef_dict_baseline` is not overwritten
coef_dict_elastic = dict(zip(X.columns, search.best_estimator_.coef_))
coef_dict_elastic

{'religious': 0.033962065418850655,
 'age': -0.0095679573021939,
 'sex': 0.05697735090151964,
 'ym': -0.02571070532111094,
 'education': 0.05286874416056274,
 'occupation': -0.0,
 'nbaffairs': -0.06849433387738459}

## Sources
[Scikit documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV)