In [1]:
from random import randrange
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.preprocessing import normalize


def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5, error=1e-9):
    """
    sample a few random elements and only return numerical
    in this dimensions
    """

    for i in range(num_checks):
        ix = tuple([randrange(m) for m in x.shape])

        oldval = x[ix]
        x[ix] = oldval + h  # increment by h
        fxph = f(x)  # evaluate f(x + h)
        x[ix] = oldval - h  # increment by h
        fxmh = f(x)  # evaluate f(x - h)
        x[ix] = oldval  # reset

        grad_numerical = (fxph - fxmh) / (2 * h)
        grad_analytic = analytic_grad[ix]
        rel_error = abs(grad_numerical - grad_analytic) / (
            abs(grad_numerical) + abs(grad_analytic)
        )
        print(
            "numerical: %f analytic: %f, relative error: %e"
            % (grad_numerical, grad_analytic, rel_error)
        )
        assert rel_error < error

def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

In [2]:
data = load_diabetes()
X, y = data.data, data.target

In [3]:
def mse_loss_vectorized(w, b, X, y):
    """
    MSE loss function WITHOUT FOR LOOPs , NO REGULARIZATION
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    - gradient with respect to bias b
    """
    loss = 0.0
    dw = np.zeros_like(w)
    
    loss = (1.0 / len(X)) * (np.linalg.norm(X @ w + b - y, ord=2)) ** 2 
    dw = (2.0 / len(X)) * (X @ w + b - y) @ X 
    db = (2.0 / len(X)) * (X @ w + b - y) @ np.ones(len(X))
    
    return loss, dw, np.array(db).reshape(1,)

In [23]:
def soft_threshold(x, delta):
    if x > delta:
        return x - delta
    elif abs(x) <= delta:
        return 0
    elif x < -delta:
        return x + delta
    else:
        return "error in soft_threshold"

# Lasso Subgradient Descent

In [19]:
def l1_subgradient(w):
    """
    Subgradient of the L1 loss
    """
    dw = np.zeros_like(w)
    # YOUR CODE HERE
    # raise NotImplementedError()
    dw = np.where(w <= 0, -1, 1)
    return dw
    

def lasso_subgradient_mse_loss_vectorized(w, b, X, y, alpha):
    """
    MSE loss function adding the subgradient for w
    """
    loss, dw, db = mse_loss_vectorized(w, b, X, y)
    # Add the subgradient to dw
    # YOUR CODE HERE
    #raise NotImplementedError()
    dw += alpha * l1_subgradient(w)
    return loss, dw, db

In [20]:
class LassolSubgradientDescent():
    def __init__(self,  alpha=0.1):
        self.w = None
        self.b = None
        self.alpha = alpha

    def train(self, X, y, learning_rate=1e-3, num_iters=100, batch_size=200, verbose=False):
        N, d = X.shape
        
        if self.w is None: # Initialization
            self.w = 0.001 * np.random.randn(d)
            self.b = 0.0

        # Run stochastic gradient descent to optimize w
        
        loss_history = []
        for it in range(num_iters):
            X_batch = None
            y_batch = None
                                                               
            # Sample batch_size elements in X_batch and y_batch
            # X_batch shape is  (batch_size, d) and y_batch shape is (batch_size,)                                                                                          
            # Hint: Use np.random.choice to generate indices
            i = np.random.choice(N, batch_size)
            X_batch = X[i]
            y_batch = y[i]
            
            # evaluate loss and gradient
            loss, dw, db = self.loss(X_batch, y_batch)

            # perform parameter update                                                                
            # Update the weights w using the gradient and the learning rate.  
            # YOUR CODE HERE
            # raise NotImplementedError()
            self.w = self.w - dw * learning_rate
            self.b = self.b - db * learning_rate
            
            if verbose and it % 10000 == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))
    
    
    def predict(self, X):
        # YOUR CODE HERE
        # raise NotImplementedError()
        return X @ self.w + self.b
        
    def loss(self, X_batch, y_batch):
        return lasso_subgradient_mse_loss_vectorized(self.w, self.b, X_batch, y_batch, self.alpha)

In [21]:
model = LassolSubgradientDescent(alpha=0.1)
model.train(X, y, learning_rate=1e-2,verbose=True, num_iters=200_000)
pred = model.predict(X)
mse = mean_squared_error(pred, y)

sk_model = Lasso(alpha=0.1, fit_intercept=True)
sk_model.fit(X, y)
sk_pred = sk_model.predict(X)
sk_mse = mean_squared_error(sk_pred, y)

print("MSE scikit-learn:", sk_mse)
print("MSE Coordinate descent model :", mse)
assert mse - sk_mse < 50

iteration 0 / 200000: loss 29084.441000
iteration 10000 / 200000: loss 3325.051135
iteration 20000 / 200000: loss 2920.266086
iteration 30000 / 200000: loss 2689.382114
iteration 40000 / 200000: loss 3262.216206
iteration 50000 / 200000: loss 2472.315066
iteration 60000 / 200000: loss 3006.465068
iteration 70000 / 200000: loss 3271.020721
iteration 80000 / 200000: loss 3142.086463
iteration 90000 / 200000: loss 3355.231139
iteration 100000 / 200000: loss 2990.349893
iteration 110000 / 200000: loss 2817.098979
iteration 120000 / 200000: loss 2854.323095
iteration 130000 / 200000: loss 2900.121413
iteration 140000 / 200000: loss 3051.975382
iteration 150000 / 200000: loss 3565.336954
iteration 160000 / 200000: loss 3007.808465
iteration 170000 / 200000: loss 3169.516880
iteration 180000 / 200000: loss 3452.520921
iteration 190000 / 200000: loss 2617.928033
MSE scikit-learn: 2912.521795117546
MSE Coordinate descent model : 2888.5590631391706


In [22]:
model = LassolSubgradientDescent(alpha=2)
model.train(X, y, learning_rate=1e-2,verbose=True, num_iters=200_000)
pred = model.predict(X)
mse = mean_squared_error(pred, y)

sk_model = Lasso(alpha=2, fit_intercept=True)
sk_model.fit(X, y)
sk_pred = sk_model.predict(X)
sk_mse = mean_squared_error(sk_pred, y)

print("MSE scikit-learn:", sk_mse)
print("MSE Coordinate descent model :", mse)
assert mse - sk_mse < 50

iteration 0 / 200000: loss 30308.028815
iteration 10000 / 200000: loss 4426.045141
iteration 20000 / 200000: loss 3741.705448
iteration 30000 / 200000: loss 3991.036258
iteration 40000 / 200000: loss 3863.047133
iteration 50000 / 200000: loss 3817.124396
iteration 60000 / 200000: loss 3991.173946
iteration 70000 / 200000: loss 3726.165369
iteration 80000 / 200000: loss 3416.234407
iteration 90000 / 200000: loss 3956.946075
iteration 100000 / 200000: loss 3524.113015
iteration 110000 / 200000: loss 4216.585425
iteration 120000 / 200000: loss 3627.220743
iteration 130000 / 200000: loss 3784.481900
iteration 140000 / 200000: loss 3720.478022
iteration 150000 / 200000: loss 3818.190933
iteration 160000 / 200000: loss 3621.932908
iteration 170000 / 200000: loss 3279.170849
iteration 180000 / 200000: loss 3950.437527
iteration 190000 / 200000: loss 3856.304123
MSE scikit-learn: 5650.287416333697
MSE Coordinate descent model : 3808.2072903841336


# Lasso Proximal Gradient Descent (iterative soft thresholding algorithm or ISTA)

In [50]:
class LassoProximalGradientDescent():
    def __init__(self,  alpha=0.1):
        self.w = None
        self.b = None
        self.alpha = alpha

    def train(self, X, y, learning_rate=1e-3, num_iters=100, batch_size=200, verbose=False):
        N, d = X.shape
        
        if self.w is None: # Initialization
            self.w = 0.001 * np.random.randn(d)
            self.b = 0.0

        # Run stochastic gradient descent to optimize w
        
        loss_history = []
        for it in range(num_iters):
            X_batch = None
            y_batch = None
                                                               
            # Sample batch_size elements in X_batch and y_batch
            # X_batch shape is  (batch_size, d) and y_batch shape is (batch_size,)                                                                                          
            # Hint: Use np.random.choice to generate indices
            # YOUR CODE HERE
            # raise NotImplementedError()
            i = np.random.choice(N, batch_size)
            X_batch = X[i]
            y_batch = y[i]
            
            # evaluate loss and gradient
            loss, dw, db = self.loss(X_batch, y_batch)

            # perform parameter update                                                                
            # Update the weights w using the gradient and the learning rate.  
            # PROJECT THE GRADIENT FOR w USING soft_threshold
            # YOUR CODE HERE
            # raise NotImplementedError()
            # self.w = self.w - dw * learning_rate
            self.b = self.b - db * learning_rate
            for j in range(len(self.w)):
                self.w[j] = soft_threshold(self.w[j] - dw[j] * learning_rate, learning_rate * self.alpha)
            
            if verbose and it % 10000 == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))

    def predict(self, X):
        # YOUR CODE HERE
        #raise NotImplementedError()
        return X @ self.w + self.b

    def loss(self, X_batch, y_batch):
        return mse_loss_vectorized(self.w, self.b, X_batch, y_batch)

In [51]:
model = LassoProximalGradientDescent(alpha=0.1)
model.train(X, y, learning_rate=1e-2,verbose=True, num_iters=200_000)
pred = model.predict(X)
mse= mean_squared_error(pred, y)

sk_model = Lasso(alpha=0.1, fit_intercept=True)
sk_model.fit(X, y)
sk_pred = sk_model.predict(X)
sk_mse = mean_squared_error(sk_pred, y)

print("MSE scikit-learn:", sk_mse)
print("MSE Coordinate descent model :", mse)
assert mse - sk_mse < 50

iteration 0 / 200000: loss 27763.485913
iteration 10000 / 200000: loss 3895.628457
iteration 20000 / 200000: loss 3048.950470
iteration 30000 / 200000: loss 2557.222497
iteration 40000 / 200000: loss 2630.101896
iteration 50000 / 200000: loss 2903.025772
iteration 60000 / 200000: loss 2753.424484
iteration 70000 / 200000: loss 3182.598740
iteration 80000 / 200000: loss 2924.720647
iteration 90000 / 200000: loss 2785.552882
iteration 100000 / 200000: loss 2540.413656
iteration 110000 / 200000: loss 2903.632166
iteration 120000 / 200000: loss 2496.911586
iteration 130000 / 200000: loss 2806.412581
iteration 140000 / 200000: loss 2587.501825
iteration 150000 / 200000: loss 2778.105739
iteration 160000 / 200000: loss 3165.354891
iteration 170000 / 200000: loss 3119.247905
iteration 180000 / 200000: loss 3031.266637
iteration 190000 / 200000: loss 3151.056374
MSE scikit-learn: 2912.521795117546
MSE Coordinate descent model : 2888.7069400785517


In [47]:
model = LassoProximalGradientDescent(alpha=2)
model.train(X, y, learning_rate=1e-2,verbose=True, num_iters=200_000)
pred = model.predict(X)
mse= mean_squared_error(pred, y)

sk_model = Lasso(alpha=2, fit_intercept=True)
sk_model.fit(X, y)
sk_pred = sk_model.predict(X)
sk_mse = mean_squared_error(sk_pred, y)

print("MSE scikit-learn:", sk_mse)
print("MSE Coordinate descent model :", mse)
assert mse - sk_mse < 50

iteration 0 / 200000: loss 31788.681539
iteration 10000 / 200000: loss 4689.811699
iteration 20000 / 200000: loss 3698.230181
iteration 30000 / 200000: loss 4069.194737
iteration 40000 / 200000: loss 3730.518009
iteration 50000 / 200000: loss 4014.043251
iteration 60000 / 200000: loss 3537.864196
iteration 70000 / 200000: loss 3792.785780
iteration 80000 / 200000: loss 4411.820840
iteration 90000 / 200000: loss 3959.730354
iteration 100000 / 200000: loss 3626.720853
iteration 110000 / 200000: loss 3741.176104
iteration 120000 / 200000: loss 3550.444875
iteration 130000 / 200000: loss 3513.039465
iteration 140000 / 200000: loss 3526.970384
iteration 150000 / 200000: loss 3504.057635
iteration 160000 / 200000: loss 3439.984244
iteration 170000 / 200000: loss 3482.916262
iteration 180000 / 200000: loss 3516.905432
iteration 190000 / 200000: loss 4133.374563
MSE scikit-learn: 5650.287416333697
MSE Coordinate descent model : 3811.455529512126


# Lasso Projected Gradient Descent

In [40]:
class LassoProjectedGradientDescent():
    def __init__(self,  alpha=0.1):
        self.w_p = None
        self.w_n = None
        self.b = None
        self.alpha = alpha

    def train(self, X, y, learning_rate=1e-3, num_iters=100, batch_size=200, verbose=False):
        N, d = X.shape
        
        if self.w_p is None: # Initialization
            self.w_p = 0.001 * np.random.randn(d)
            self.w_n = 0.001 * np.random.randn(d)
            self.b = 0.0

        # Run stochastic gradient descent to optimize w
        
        loss_history = []
        for it in range(num_iters):
            X_batch = None
            y_batch = None
                                                               
            # Sample batch_size elements in X_batch and y_batch
            # X_batch shape is  (batch_size, d) and y_batch shape is (batch_size,)                                                                                          
            # Hint: Use np.random.choice to generate indices
            # YOUR CODE HERE
            # raise NotImplementedError()
            i = np.random.choice(N, batch_size)
            X_batch = X[i]
            y_batch = y[i]
            
            # evaluate loss and gradient
            loss, dw, db = self.loss(X_batch, y_batch)

            # perform parameter update                                                                
            # Update the weights w using the gradient and the learning rate.  
            # Project for w_p and w_n
            # YOUR CODE HERE
            # raise NotImplementedError()
            w = (self.w_p - self.w_n) - dw * learning_rate
            self.w_p = np.maximum(w, 0)
            self.w_n = np.maximum(-w, 0)
            self.b = self.b - db * learning_rate
            self.w_p = np.where(self.w_p < 0, 0, self.w_p)
            self.w_n = np.where(self.w_n < 0, 0, self.w_n)
            
            if verbose and it % 10000 == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))

    def predict(self, X):
        # YOUR CODE HERE
        #raise NotImplementedError()
        return X @ (self.w_p - self.w_n) + self.b
    
    def loss(self, X_batch, y_batch):
        # YOUR CODE HERE
        # raise NotImplementedError()
        return lasso_subgradient_mse_loss_vectorized(self.w_p - self.w_n, self.b, X_batch, y_batch, self.alpha)

In [41]:
model = LassoProjectedGradientDescent(alpha=0.1)
model.train(X, y, learning_rate=1e-2,verbose=True, num_iters=200_000)
pred = model.predict(X)
mse= mean_squared_error(pred, y)

sk_model = Lasso(alpha=0.1, fit_intercept=True)
sk_model.fit(X, y)
sk_pred = sk_model.predict(X)
sk_mse = mean_squared_error(sk_pred, y)

print("MSE scikit-learn:", sk_mse)
print("MSE Coordinate descent model :", mse)
assert mse - sk_mse < 50

iteration 0 / 200000: loss 29560.709746
iteration 10000 / 200000: loss 2827.890857
iteration 20000 / 200000: loss 2801.631702
iteration 30000 / 200000: loss 3225.400448
iteration 40000 / 200000: loss 2755.110150
iteration 50000 / 200000: loss 3395.691432
iteration 60000 / 200000: loss 2446.848952
iteration 70000 / 200000: loss 2764.453022
iteration 80000 / 200000: loss 2358.378998
iteration 90000 / 200000: loss 2849.362201
iteration 100000 / 200000: loss 3304.655346
iteration 110000 / 200000: loss 2773.813605
iteration 120000 / 200000: loss 3011.740006
iteration 130000 / 200000: loss 3116.679838
iteration 140000 / 200000: loss 3191.150344
iteration 150000 / 200000: loss 3030.634704
iteration 160000 / 200000: loss 2553.372641
iteration 170000 / 200000: loss 2586.143330
iteration 180000 / 200000: loss 2784.399509
iteration 190000 / 200000: loss 2822.182766
MSE scikit-learn: 2912.521795117546
MSE Coordinate descent model : 2888.9505343333085


In [42]:
model = LassoProjectedGradientDescent(alpha=2)
model.train(X, y, learning_rate=1e-2,verbose=True, num_iters=200_000)
pred = model.predict(X)
mse= mean_squared_error(pred, y)

sk_model = Lasso(alpha=2, fit_intercept=True)
sk_model.fit(X, y)
sk_pred = sk_model.predict(X)
sk_mse = mean_squared_error(sk_pred, y)

print("MSE scikit-learn:", sk_mse)
print("MSE Coordinate descent model :", mse)
assert mse - sk_mse < 50

iteration 0 / 200000: loss 27384.304466
iteration 10000 / 200000: loss 3976.156375
iteration 20000 / 200000: loss 3700.695031
iteration 30000 / 200000: loss 3537.437642
iteration 40000 / 200000: loss 4231.570999
iteration 50000 / 200000: loss 3692.101587
iteration 60000 / 200000: loss 3180.179385
iteration 70000 / 200000: loss 3798.434756
iteration 80000 / 200000: loss 3436.802529
iteration 90000 / 200000: loss 4144.859056
iteration 100000 / 200000: loss 3342.038438
iteration 110000 / 200000: loss 3540.129709
iteration 120000 / 200000: loss 3510.431450
iteration 130000 / 200000: loss 3933.133896
iteration 140000 / 200000: loss 4063.363845
iteration 150000 / 200000: loss 3843.636494
iteration 160000 / 200000: loss 4362.581206
iteration 170000 / 200000: loss 4103.541205
iteration 180000 / 200000: loss 3332.461391
iteration 190000 / 200000: loss 3723.669956
MSE scikit-learn: 5650.287416333697
MSE Coordinate descent model : 3812.3329002053706


# Lasso Coordinate Descent (no intercept)

In [5]:
def fit_svd(X, y, alpha=1.0):
    """SVD approach"""
    # I matrix identity
    I = np.eye(len(X[0]), len(X[0]))
    U, s, Vh = np.linalg.svd(X, full_matrices=False)
    Uh = np.transpose(U)
    V = np.transpose(Vh)
    S = np.diag(s)
    term1 = S @ S + alpha * I
    for i in range(len(S)):
        if term1[i][i] != 0:
            term1[i][i] = 1 / term1[i][i]
    term2 = S @ Uh @ y
    return V @ term1 @ term2

# source : Ridge-regression assignment

Source of the algorithm below : Kevin P. Murphy, Machine Learning, a Probabilistic Perspective. MIT Press, 2012

In [24]:
class LassoCoordinateDescent():
    def __init__(self, alpha=0.1):
        self.w = None
        self.alpha = alpha

    def train(self, X, y, num_iters=1000):
        N, d = X.shape
        self.w = fit_svd(X, y, self.alpha)
        last_w = np.zeros_like(self.w)
        
        while (self.w - last_w).all() < 0.0000001 and num_iters > 0:
            num_iters -= 1
            last_w = self.w
            
            for j in range(d):
                a = 2 * np.sum(X[:, j] * X[:, j])
                c = 2 * np.sum(X[:, j] * (y - X @ self.w + self.w[j] * X[:, j]))
                self.w[j] = soft_threshold(a / c, self.alpha / a)
                a = 0
                c = 0

    def predict(self, X): 
        return X @ self.w

In [25]:
model = LassoCoordinateDescent(alpha=0.1)
model.train(X, y)
pred = model.predict(X)
mse= mean_squared_error(pred, y)

sk_model = Lasso(alpha=0.1, fit_intercept=False)
sk_model.fit(X, y)
sk_pred = sk_model.predict(X)
sk_mse = mean_squared_error(sk_pred, y)

print("MSE scikit-learn:", sk_mse)
print("MSE Coordinate descent model :", mse)
assert mse - sk_mse < 50

MSE scikit-learn: 26057.118798659812
MSE Coordinate descent model : 26035.042445634015


In [26]:
model = LassoCoordinateDescent(alpha=2)
model.train(X, y)
pred = model.predict(X)
mse= mean_squared_error(pred, y)

sk_model = Lasso(alpha=2, fit_intercept=False)
sk_model.fit(X, y)
sk_pred = sk_model.predict(X)
sk_mse = mean_squared_error(sk_pred, y)

print("MSE scikit-learn:", sk_mse)
print("MSE Coordinate descent model :", mse)
assert mse - sk_mse < 50

MSE scikit-learn: 28794.88441987604
MSE Coordinate descent model : 26759.06576808725
