This notebook follows [Friedman et al. (2007), p. 6](http://arxiv.org/pdf/0708.1485.pdf) and solves the elastic-net problem

$$\min_{\beta} \frac{1}{2n} \sum_{i} \Big(y_i -\sum_{j} x_{ij} \beta_j\Big)^2 + \lambda \sum_{j} \Big(\alpha |\beta_j| + (1-\alpha) \frac{\beta_{j}^2}{2} \Big)$$

If the data are standardized so that $\frac{1}{n}\sum_i x_{ij}^2 = 1$ and $\sum_i x_{ij} = 0$ for every column $j$, the coordinate-wise update can be written as:

$$ \beta_j \leftarrow \frac{S\left(\frac{1}{n}\sum_i x_{ij} (y_i - \hat{y_i^j}) , \lambda \alpha \right)}{1+\lambda(1-\alpha)} $$

where the soft-threshold operator is given by

$$S(\beta, \tau) = \mathrm{sign}(\beta)\, (|\beta|-\tau)_{+}, \qquad (u)_{+} = \max(u, 0)$$

and $\hat{y_i^j} = \sum_{k \neq j } x_{ik} \beta_{k}$ is the fitted value for observation $i$ computed from the current coefficients with the $j$-th column left out (the partial fit).

### Import packages

In [1]:
## packages
import numpy as np
import pandas as pd
import scipy as sp
import scipy.linalg as la
import matplotlib.pyplot as plt
import statsmodels as sm
%matplotlib inline

from sklearn.metrics import r2_score
from sklearn.preprocessing import normalize, scale




## **Code the algorithm**

In [2]:
## packages
import numpy as np
import pandas as pd
import scipy as sp
import scipy.linalg as la
import matplotlib.pyplot as plt
import statsmodels as sm
%matplotlib inline

from sklearn.metrics import r2_score
from sklearn.preprocessing import normalize, scale



### Code the soft threshold operator

In [3]:
def S(z, gamma):
    '''
    Soft-threshold operator: S(z, gamma) = sign(z) * max(|z| - gamma, 0).

    INPUT
    -z: scalar or array of values to shrink
    -gamma: threshold (scalar, or array broadcastable against z)
    OUTPUT
    -z moved towards zero by gamma; entries with |z| <= gamma become 0.
     Scalars in give a scalar out, arrays are handled element-wise.
    '''
    # np.maximum implements the positive part (.)_+ without a Python-level
    # branch, so the same expression works for scalars and for arrays.
    shrunk_magnitude = np.maximum(np.abs(z) - gamma, 0)
    return np.sign(z) * shrunk_magnitude

### Code coordinate descent function

In [4]:


def coordinate_descent(data_X, data_y, b0 , maxiter = 200000, tol = 0.00001, alpha = .5, l1_ratio = 1):
    '''
    Elastic net fitted by cyclic coordinate descent on standardized data.

    INPUT
    -data_X: array (n_samples, n_features) of exogenous variables
    -data_y: array (vector) of the endogenous variable
    -b0: starting point, one coefficient per column of data_X (not modified)
    -maxiter: maximum number of full sweeps over the coordinates
    -tol: relative tolerance; a sweep converges when
          ||b_new - b_old|| < tol * ||b_old||, or when nothing moved at all
    -alpha: penalization/shrinking parameter
    -l1_ratio: weight put on the lasso, 1-l1_ratio is the weight put on the ridge
    OUTPUT
    -b: coefficients at convergence, or the last iterate when maxiter is
        exhausted
    -Note: output is for standardized coefficients (X and y are both scaled)
    SIDE EFFECTS
    -prints the shape of X and a status message; on convergence plots the
     first (up to 10) iterates with matplotlib.
    '''
    #1. Standardize the data
    X = scale(data_X, axis=0)
    y = np.ravel(scale(data_y))
    print(X.shape)

    #2. Initialize
    N, p = X.shape
    b = np.array(b0, dtype=float)          # private copy: b0 is left untouched
    l1_penalty = alpha * l1_ratio
    shrink = 1 + alpha * (1 - l1_ratio)
    # (1/N) * x_j'x_j per column: 1 for standardized columns, 0 for constant
    # ones, so the update below stays exact in both cases.
    col_sq = np.einsum('ij,ij->j', X, X) / N
    # Full residual y - Xb, kept up to date after every coordinate move so
    # each update costs O(N) instead of a fresh O(N*p) matrix product.
    resid = y - np.dot(X, b)
    max_path = 10                          # only the first 10 iterates are plotted
    path = [b.copy()]

    #3. Outer loop: one pass = one sweep over all coordinates
    for itr in range(1, maxiter + 1):
        b_prev = b.copy()

        #3.1 Cyclic sweep: every coordinate sees the already-updated ones
        for j in range(p):
            x_j = X[:, j]
            # (1/N) * x_j'(y - sum_{k != j} x_k b_k), via the full residual
            rho = np.dot(x_j, resid) / N + col_sq[j] * b[j]
            b_j = S(rho, l1_penalty) / shrink
            if b_j != b[j]:
                resid -= x_j * (b_j - b[j])
                b[j] = b_j

        if len(path) < max_path:
            path.append(b.copy())

        #3.2 Check convergence (written without a division so a zero
        #    previous iterate cannot produce inf/nan)
        step = np.linalg.norm(b - b_prev)
        if step == 0 or step < tol * np.linalg.norm(b_prev):
            #Plot the path
            plt.plot(path[:10])
            #Print message:
            print('Convergence achieved after', itr, 'iterations and coefficients',b )
            return  b

    #4. Convergence was not achieved: still hand back the last iterate
    print('Maximum iterations', maxiter,'exhausted, coefficents \n',b)
    return b



In [5]:
import numba
from numba import jit
@jit(nopython=True, cache=True)
def _numba_cd_sweeps(Xt, y, b, maxiter, tol, l1_penalty, shrink):
    '''
    Compiled core of numba_coordinate_descent: runs the coordinate sweeps.

    INPUT
    -Xt: C-contiguous float64 array (n_features, n_samples), i.e. X transposed
         so that each feature is a contiguous row
    -y: float64 vector of length n_samples
    -b: float64 starting coefficients, updated IN PLACE
    -maxiter: maximum number of sweeps
    -tol: relative tolerance on the change between two sweeps
    -l1_penalty: soft-threshold level (alpha * l1_ratio)
    -shrink: ridge denominator (1 + alpha * (1 - l1_ratio))
    OUTPUT: b, head, sweeps, converged
    -b: final coefficients
    -head: the first (up to 10) iterates, one per row, starting point included
    -sweeps: number of sweeps performed
    -converged: True when the tolerance was met before maxiter
    '''
    p = Xt.shape[0]
    N = Xt.shape[1]

    # (1/N) * x_j'x_j per feature (1 for standardized, 0 for constant columns)
    col_sq = np.zeros(p)
    for j in range(p):
        acc = 0.0
        for i in range(N):
            acc += Xt[j, i] * Xt[j, i]
        col_sq[j] = acc / N

    # Full residual y - Xb for the starting point
    resid = y.copy()
    for j in range(p):
        b_j = b[j]
        if b_j != 0.0:
            for i in range(N):
                resid[i] -= Xt[j, i] * b_j

    head = np.zeros((10, p))
    for j in range(p):
        head[0, j] = b[j]
    n_head = 1

    sweeps = 0
    converged = False
    for itr in range(maxiter):
        step_sq = 0.0   # ||b_new - b_old||^2 for this sweep
        ref_sq = 0.0    # ||b_old||^2
        for j in range(p):
            old = b[j]
            rho = 0.0
            for i in range(N):
                rho += Xt[j, i] * resid[i]
            rho = rho / N + col_sq[j] * old

            # Soft threshold written inline: a plain-Python helper cannot be
            # called from nopython-compiled code.
            excess = abs(rho) - l1_penalty
            if excess > 0.0:
                if rho > 0.0:
                    new = excess / shrink
                else:
                    new = -excess / shrink
            else:
                new = 0.0

            delta = new - old
            if delta != 0.0:
                for i in range(N):
                    resid[i] -= Xt[j, i] * delta
                b[j] = new
            step_sq += delta * delta
            ref_sq += old * old

        sweeps = itr + 1
        if n_head < 10:
            for j in range(p):
                head[n_head, j] = b[j]
            n_head += 1

        # Division-free relative test: safe when the previous iterate is zero
        if step_sq == 0.0 or np.sqrt(step_sq) < tol * np.sqrt(ref_sq):
            converged = True
            break

    return b, head[:n_head], sweeps, converged


def numba_coordinate_descent(data_X, data_y, b0 , maxiter = 200000, tol = 0.00001, alpha = .5, l1_ratio = 1):
    '''
    Elastic net by cyclic coordinate descent with a numba-compiled inner loop.

    Only the numeric sweeps are compiled (in _numba_cd_sweeps). Scaling,
    plotting and printing stay in this plain-Python wrapper because they
    call sklearn / matplotlib, which nopython mode cannot compile.

    INPUT
    -data_X: array (n_samples, n_features) of exogenous variables
    -data_y: array (vector) of the endogenous variable
    -b0: starting point, one coefficient per column of data_X (not modified)
    -maxiter: maximum number of full sweeps over the coordinates
    -tol: relative tolerance; a sweep converges when
          ||b_new - b_old|| < tol * ||b_old||, or when nothing moved at all
    -alpha: penalization/shrinking parameter
    -l1_ratio: weight put on the lasso, 1-l1_ratio is the weight put on the ridge
    OUTPUT
    -b: coefficients at convergence, or the last iterate when maxiter is
        exhausted
    -Note: output is for standardized coefficients (X and y are both scaled)
    SIDE EFFECTS
    -prints the shape of X and a status message; on convergence plots the
     first (up to 10) iterates with matplotlib.
    '''
    #1. Standardize the data
    X = scale(data_X, axis=0)
    y = np.ravel(scale(data_y))
    print(X.shape)

    #2. Lay the data out for the compiled kernel: features as contiguous
    #   rows, float64 everywhere, and a private copy of the start point.
    Xt = np.ascontiguousarray(X.T, dtype=np.float64)
    y = np.ascontiguousarray(y, dtype=np.float64)
    b = np.array(b0, dtype=np.float64)

    #3. Run the sweeps
    b, head, sweeps, converged = _numba_cd_sweeps(
        Xt, y, b, int(maxiter), float(tol),
        float(alpha * l1_ratio), float(1 + alpha * (1 - l1_ratio)))

    #4. Report
    if converged:
        #Plot the path
        plt.plot(head)
        #Print message:
        print('Convergence achieved after', sweeps, 'iterations and coefficients',b )
        return  b

    #If convergence is not achieved, still hand back the last iterate
    print('Maximum iterations', maxiter,'exhausted, coefficents \n',b)
    return b

### **Proceed to test**

#### Generate some data

In [6]:
# Synthetic sparse regression problem: Gaussian design, 5 non-zero
# coefficients out of n_features, plus small i.i.d. Gaussian noise.
np.random.seed(42)

pp = 100
n = 100000
n_samples, n_features =n , pp
data_X = np.random.randn(n_samples, n_features)
coef = 3 * np.random.randn(n_features)
inds = np.arange(n_features)
np.random.shuffle(inds)
coef[inds[5:]] = 0  # sparsify coef: keep only the first 5 shuffled indices
data_y = np.dot(data_X, coef)
# add noise: one independent N(0, 0.02^2) draw per sample. The shape must be
# passed as size=; as the first positional argument it is read as the mean
# (loc) and yields a single draw around n_samples.
data_y += 0.02 * np.random.normal(size=n_samples)

#### Compare the elastic net and the two versions

In [None]:
from sklearn.linear_model import ElasticNet
# Standardize the design matrix column-wise (axis = 0) and the response,
# overwriting the raw arrays, so every fit below sees the same scaled inputs.
data_X =scale(data_X, axis = 0)
data_y = scale(data_y)

In [None]:
%%time
# Reference fit: scikit-learn's ElasticNet with alpha=1, l1_ratio=0.5;
# only the fitted coefficient vector is kept.
enet = ElasticNet(alpha=1, l1_ratio=0.5).fit(data_X,data_y).coef_
enet

In [None]:
# Display the reference coefficients again, outside the timed cell.
enet

In [None]:
%%time
# Time the pure-Python solver from an all-zero start, with the same alpha and
# l1_ratio as the ElasticNet fit; tol = .1 is a loose stopping tolerance.
coordinate_descent(data_X, data_y, b0 = np.zeros(data_X.shape[1]), maxiter = 200000, tol = .1, alpha = 1, l1_ratio = .5)

In [None]:
%%time
# Time the numba-decorated solver with the same arguments as the pure-Python
# run; the first call also pays the JIT compilation cost unless it is cached.
numba_coordinate_descent(data_X, data_y, b0 = np.zeros(data_X.shape[1]), maxiter = 200000, tol = 0.1, alpha = 1, l1_ratio = .5)