In [13]:
import numpy as np
import cvxpy as cp
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import scipy

$$ \text{minimize  } (1/2)\| Ax - b\|^2_2$$

where the columns of $A$ represent the features/predictors of interest and $b$ is a vector containing the values of the target variable. The $1/2$ in front of the objective is just to simplify the gradient.

The squared L2 norm is just the dot product of the input with itself, so we can restate the objective as 

$$ (1/2)(Ax-b)^T(Ax-b) = (1/2)x^TA^TAx - (A^Tb)^Tx + (1/2)b^Tb $$

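The matrix $A^TA$ is always positive semidefinite, so the objective function is convex. If $A$ has full column rank, then $A^TA$ is positive definite, so the objective is strictly convex and has a unique minimizer. That means we can actually solve the minimization problem!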

The gradient of the objective function is 

$$ A^TAx - A^Tb. $$

Setting the gradient equal to 0 and solving for $x$, we find that the analytical solution is 

$$ x = (A^TA)^{-1}A^Tb $$


$$ \text{minimize  } \| Ax-b \|_1 $$

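Note that the objective function is not differentiable wherever a residual $(Ax-b)_i$ is zero, so the gradient and Hessian are not defined everywhere. So, we can't directly use Gradient Descent or Newton's Method to fit the model. Introducing an auxiliary vector $t$ that bounds the absolute value of each residual, we can recast the problem as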

$$ \begin{aligned} \text{minimize  } \quad & \mathbf{1}^Tt \\ \text{subject to  } \quad & -t \preceq Ax-b \preceq t \end{aligned} $$

**TODO:** Come back to this after learning about solving convex optimization problems with inequality constraints.

In [2]:
def l2_norm(x):
    """Return the Euclidean (L2) norm of x: the square root of the sum of its squared entries."""
    sum_of_squares = np.sum(x**2)
    return np.sqrt(sum_of_squares)
def l1_norm(x):
    """Return the L1 norm of x: the sum of the absolute values of its entries."""
    magnitudes = np.abs(x)
    return np.sum(magnitudes)

In [3]:
def back_track(x, M, objective, alpha, beta, descent_direction, gradient):
    """Backtracking line search: shrink the step until the sufficient-decrease test passes.

    Starting from t = 1, t is multiplied by beta until
    objective(x + t*d, M) <= objective(x, M) + alpha * t * (gradient(x, M) @ d),
    where d = descent_direction(x, M).

    Parameters
    ----------
    x : current point.
    M : extra parameter forwarded unchanged to every callable.
    objective : callable (x, M) -> scalar objective value.
    alpha : fraction of the predicted linear decrease that must be achieved.
    beta : factor by which t is multiplied on each rejection.
    descent_direction : callable (x, M) -> search direction at x.
    gradient : callable (x, M) -> gradient of the objective at x.

    Returns
    -------
    The accepted step size t (1 if the full step is accepted immediately).
    """
    # x does not change during the search, so the direction, the objective at x
    # and the directional derivative are evaluated once instead of on every
    # pass through the loop.
    direction = descent_direction(x, M)
    f_x = objective(x, M)
    slope = gradient(x, M) @ direction
    t = 1
    while objective(x + t*direction, M) > f_x + alpha*t*slope:
        t = t*beta
    return t
def back_track2(x, H, M, objective, alpha, beta, descent_direction, gradient):
    """Backtracking line search for directions that also depend on a matrix H.

    Identical to back_track except that the search direction is obtained from
    descent_direction(x, H, M). Starting from t = 1, t is multiplied by beta until
    objective(x + t*d, M) <= objective(x, M) + alpha * t * (gradient(x, M) @ d).

    Parameters
    ----------
    x : current point.
    H : matrix forwarded to descent_direction (e.g. a Hessian approximation).
    M : extra parameter forwarded unchanged to every callable.
    objective : callable (x, M) -> scalar objective value.
    alpha : fraction of the predicted linear decrease that must be achieved.
    beta : factor by which t is multiplied on each rejection.
    descent_direction : callable (x, H, M) -> search direction at x.
    gradient : callable (x, M) -> gradient of the objective at x.

    Returns
    -------
    The accepted step size t (1 if the full step is accepted immediately).
    """
    # Neither x nor H changes during the search: compute the direction (which
    # may involve a linear solve), the objective at x and the directional
    # derivative once rather than on every loop iteration.
    direction = descent_direction(x, H, M)
    f_x = objective(x, M)
    slope = gradient(x, M) @ direction
    t = 1
    while objective(x + t*direction, M) > f_x + alpha*t*slope:
        t = t*beta
    return t

In [4]:
# generate problem data
# A seeded generator makes the data (and every number reported further down)
# reproducible under Restart Kernel -> Run All.
rng = np.random.default_rng(0)
m=2000  # number of samples (rows of A)
n=100   # number of features (columns of A)
A = rng.uniform(-10,10,(m,n))
A_test = rng.uniform(-10,10,(m,n))
real_weights = rng.random(n)  # ground-truth weights, uniform on [0, 1)
# standardize both sets with statistics estimated on the training matrix only
scaler = StandardScaler().fit(A)
A = scaler.transform(A)
A_test = scaler.transform(A_test)
b = A@real_weights + rng.normal(0,5,m)
# inflate the last 20 training targets by a factor of 100 so they act as gross outliers
b[-20:] = 100*b[-20:]
# the test targets are left uncorrupted
b_test = A_test@real_weights + rng.normal(0,5,m)

In [5]:
def huber(x, M):
    """Element-wise Huber penalty of the residual A@x - b.

    Reads the module-level design matrix A and target vector b. For each
    residual r the penalty is r**2 when |r| <= M and M*(2*|r| - M) otherwise.

    Parameters
    ----------
    x : weight vector.
    M : threshold between the quadratic and the linear region.

    Returns
    -------
    Array with one penalty value per residual.
    """
    # the residual needs a matrix-vector product, so compute it once and reuse it
    residual = A@x - b
    abs_residual = np.abs(residual)
    return np.where(abs_residual <= M, residual**2, M*(2*abs_residual - M))
def huber_prime(x, M):
    """Gradient with respect to x of the summed Huber penalty of A@x - b.

    Reads the module-level design matrix A and target vector b. The derivative
    of the penalty with respect to each residual r is 2*r when |r| <= M and
    2*M*sign(r) otherwise; the result is A.T applied to that vector.

    Parameters
    ----------
    x : weight vector.
    M : threshold between the quadratic and the linear region.

    Returns
    -------
    Gradient vector with one entry per column of A.
    """
    # the residual needs a matrix-vector product, so compute it once and reuse it
    residual = A@x - b
    per_residual = np.where(np.abs(residual) <= M, 2*residual, 2*M*np.sign(residual))
    return A.T@per_residual
def neg_grad(x,M):
    """Return the negated output of huber_prime(x, M), used as the gradient-descent direction."""
    gradient = huber_prime(x, M)
    return -1 * gradient
def total_penalty(x,M):
    """Return the scalar objective: the sum of the values produced by huber(x, M)."""
    per_residual = huber(x, M)
    return np.sum(per_residual)
def QN_step(x, H, M):
    """Quasi-Newton step -H^{-1} g, where g = huber_prime(x, M).

    Parameters
    ----------
    x : current point.
    H : square matrix used in place of the Hessian.
    M : Huber threshold forwarded to huber_prime.

    Returns
    -------
    The step vector d that solves H d = -g.
    """
    # Solve the linear system directly instead of forming the explicit inverse:
    # it needs less work and is numerically more accurate.
    return -np.linalg.solve(H, huber_prime(x, M))

In [6]:
# Quasi-Newton minimization of the total Huber penalty.
# objective is not twice continuously differentiable, which violates assumptions for Newton's Method
# BFGS update to Hessian
H = np.eye(n)  # initial Hessian approximation
x = np.zeros(n) # initial point (sized from n rather than a hard-coded 100)
tol = 1e-5  # stop once one iteration lowers the objective by less than this
# set up parameters
M = 1
alpha = 0.01
beta = 0.5
i = 0
diff = np.inf
objective_values = [total_penalty(x,M)]
while i <= 1000:
    if diff < tol:
        break
    t = back_track2(x,H, M, total_penalty, alpha, beta, QN_step, huber_prime)
    x_prev = x
    grad_prev = huber_prime(x,M)
    x = x + t*QN_step(x,H,M)
    s = (x - x_prev).reshape(n,1)                    # change in the iterate
    y = (huber_prime(x,M) - grad_prev).reshape(n,1)  # change in the gradient
    objective_values.append(total_penalty(x,M))
    diff = objective_values[-2]-objective_values[-1]
    # update hessian approx
    # Skip the update when the curvature y^T s is not safely positive: dividing
    # by it would produce inf/nan entries or destroy positive-definiteness of H.
    curvature = (y.T@s).item()
    if curvature > 1e-10*np.sqrt((s.T@s).item()*(y.T@y).item()):
        Hs = H@s
        # (H s)(s^T H) is built from two matrix-vector products, avoiding a full matrix-matrix product
        H = H + (y@y.T)/curvature - (Hs@(s.T@H))/(s.T@Hs).item()
    i += 1
num_iters = i
# objective_values[k] is the objective after k iterations (k = 0 is the starting
# point), so the x axis must have exactly len(objective_values) entries.
fig = go.Figure(data=go.Scatter(x=np.arange(len(objective_values)), y=objective_values, mode='lines'))
fig.show()
print(np.min(objective_values))
# test-set error: L2 norm of the test residual divided by the number of samples
print(l2_norm(A_test@x-b_test)/m)

40627.57051352472
0.11687273928030648


In [7]:
# Gradient descent with backtracking line search on the total Huber penalty.
x = np.zeros(n) # initial point (sized from n rather than a hard-coded 100)
grad_tol = 1e-5       # stop when the gradient norm falls below this
objective_tol = 1e-5  # stop when one iteration lowers the objective by less than this
# set up parameters
M = 1
alpha = 0.01
beta = 0.6
i = 0
diff = np.inf
objective_values = [total_penalty(x,M)]
while i <= 1000:
    # evaluate the gradient once per iteration; it serves both the stopping
    # test and the update step
    grad = huber_prime(x,M)
    if np.sqrt(grad@grad) < grad_tol or diff < objective_tol:
        break
    t = back_track(x, M, total_penalty, alpha, beta, neg_grad, huber_prime)
    x = x - t*grad
    objective_values.append(total_penalty(x,M))
    diff = objective_values[-2]-objective_values[-1]
    i += 1
num_iters = i
# objective_values[k] is the objective after k iterations (k = 0 is the starting
# point), so the x axis must have exactly len(objective_values) entries.
fig = go.Figure(data=go.Scatter(x=np.arange(len(objective_values)), y=objective_values, mode='lines'))
fig.show()

In [8]:
# Test-set error of the current x: L2 norm of the test residual divided by m.
gd_test_residual = A_test @ x - b_test
l2_norm(gd_test_residual) / m

0.11687738476870498

In [9]:
# Reference model: scikit-learn's HuberRegressor fitted on the training data.
hr = HuberRegressor(epsilon=1.08, alpha=1.0, max_iter=10000)
hr = hr.fit(A, b)

In [10]:
# Total Huber penalty of the scikit-learn coefficients plus the L2 norm of those coefficients.
# NOTE(review): only hr.coef_ is evaluated here; hr.intercept_ is not part of this value.
hr_weights = hr.coef_
total_penalty(hr_weights, M) + l2_norm(hr_weights)

40635.86585254757

In [11]:
# Test-set error of the HuberRegressor predictions: L2 norm of the residual divided by m.
hr_test_residual = hr.predict(A_test) - b_test
l2_norm(hr_test_residual) / m

0.11668254355084141

In [14]:
# Baseline: ordinary least squares fitted on the same training data,
# scored with the same test-error measure (L2 norm of the residual divided by m).
lr = LinearRegression()
lr = lr.fit(A, b)
lr_test_residual = lr.predict(A_test) - b_test
l2_norm(lr_test_residual) / m

0.4440902118697147

In [15]:
# Solve the Huber-penalty problem with cvxpy as an independent reference optimum.
z = cp.Variable(n)
# Pass the notebook's threshold M explicitly instead of relying on cp.huber's
# implicit default, so this objective stays consistent with total_penalty(., M)
# if M is ever changed.
huber_term = cp.sum(cp.huber(A@z-b, M))
# objective = summed Huber penalty of the residual + L2 norm of the weights
objective = cp.Minimize(huber_term + cp.norm(z,2))
prob = cp.Problem(objective, [])
prob.solve()

40634.24860152859

In [16]:
# Smallest objective value recorded in objective_values, for comparison with the cvxpy optimum.
best_objective = np.min(objective_values)
best_objective

40627.567396968516