In [1]:
import numpy as np
import cvxpy as cp
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import scipy

# Robust Regression

$$ \text{minimize  } \| Ax-b \|_1 $$

with variable $x$, where $A$ is an $m\times n$ matrix and $b$ is an $m \times 1$ vector.

Note that the objective function is not differentiable everywhere (the absolute value has a kink at zero), so its gradient and Hessian do not exist at every point and we cannot apply Gradient Descent or Newton's Method to it directly. Instead, we can recast the problem as an inequality constrained LP:

$$ \text{minimize  } \mathbf{1}^Tt \\ \text{subject to  } -t \preceq Ax-b \preceq t $$

To solve this problem, we recast it as an unconstrained minimization problem using the log barrier function.

$$ \text{Log Barrier Function: } \qquad s \cdot f_0(x) + \phi(x) $$

where

$$ \phi(x) = - \displaystyle\sum_{i=1}^{\ell} \log(-f_i(x)),$$

$f_i(x)$ are the $\ell$ inequality constraint functions, and $f_0(x)$ is the objective function.

For this application, there are $2m$ inequality constraints. For each $i$, $i=1,2,...,m$

$$-t_i \leq a_i^Tx - b_i \leq t_i $$

where $a_i^T$ is the $i$th row of the matrix $A$. We write this inequality as two constraint functions in standard form:

$$ f_i(x,t) = -t_i - a_i^Tx + b_i \leq 0 \qquad \qquad g_i(x,t) = -t_i+a_i^T x-b_i \leq 0 $$

We write the log barrier objective as

$$ s \cdot \mathbf{1}^Tt - \displaystyle\sum_{i=1}^{m} \log(-f_i(x,t)) - \displaystyle\sum_{i=1}^{m} \log(-g_i(x,t))$$


In [2]:
# generate some test data
# A fixed seed makes the data (and every result below) reproducible under
# Restart Kernel -> Run All.
SEED = 0
rng = np.random.default_rng(SEED)
m=2000
n=4
# Four features drawn from different distributions, plus a column of 1's
# so that the last weight acts as an intercept.
A = np.concatenate([rng.beta(2,5,(m,1)), rng.normal(1,0.25, (m,1)),
                    rng.binomial(1,0.7,(m,1)), rng.uniform(1,2,(m,1)),
                   np.ones((m,1))], axis=1) # column of 1's to fit an intercept
# Ground-truth weights, uniform on [-10, 10].
real_weights = 10*rng.uniform(-1,1,n+1)
# Observations: the linear model perturbed by standard normal noise.
b = A@real_weights - rng.normal(0,1, m)

We can find an initial feasible point. We need to find values of $x, t$ such that 

$$ -f_i(x,t) > 0 \qquad \text{and} \qquad -g_i(x,t) >0$$

If we initialize $x$ to be the vector of all zeros, then we need to initialize $t$ so that

$$ t_i - b_i > 0 \qquad \text{and} \qquad t_i+b_i > 0 $$

for all $i$. We can achieve this by setting every $t_i$ to a value slightly larger than the maximum of the absolute value of all the values in the vector $b$.

In [3]:
def log_barrier_objective(x,t,s, A,b):
    """Evaluate the log-barrier objective of the l1-regression LP at (x, t).

    Returns s * sum(t) minus the sum of log(t + A x - b) and minus the sum of
    log(t - A x + b). The arguments of both logarithms must be positive for
    the value to be finite and real.
    """
    Ax = A @ x
    linear_term = s * np.sum(t)
    lower_barrier = np.sum(np.log(t + Ax - b))
    upper_barrier = np.sum(np.log(t - Ax + b))
    return linear_term - lower_barrier - upper_barrier
def l1_norm(x):
    """Return the sum of the absolute values of the entries of x."""
    return np.abs(x).sum()

Using the barrier method to solve the original problem, we now need to minimize the log barrier function for a given value of $s$. In order to implement Newton's method, we need to know the gradient and Hessian of the log barrier objective.

Write the objective as

$$ s \cdot \mathbf{1}^T t - \displaystyle\sum_{i=1}^m \log((t+Ax-b)_i) - \displaystyle\sum_{i=1}^m \log((t-Ax+b)_i) $$

The derivative with respect to the vector $x$ is

$$-A^T \frac{1}{t+Ax-b} + A^T \frac{1}{t-Ax+b} = A^T \left(\frac{-1}{t+Ax-b} +  \frac{1}{t-Ax+b}\right)$$

and the derivative with respect to $t$ is

$$s \cdot \mathbf{1} - \frac{1}{t+Ax-b} - \frac{1}{t-Ax+b}.$$

When we write $\frac{1}{v}$ where $v$ is a vector, we mean the vector with entries $\frac{1}{v_i}$.



## Newton's Method

As the number of data points grows ($m$ gets larger), the time it takes to compute the Newton step increases dramatically. So, for a large number of observations, Newton's method will be slow. The subgradient method runs much faster, but the results of the method may vary depending on how the step length is adjusted. 

In [4]:
def grad(x,t,s, A, b):
    """Gradient of the log-barrier objective with respect to (x, t).

    Parameters
    ----------
    x : 1-D array of regression weights.
    t : 1-D array of slack variables, one per row of A.
    s : barrier parameter multiplying the linear term sum(t).
    A, b : data matrix and target vector.

    Returns
    -------
    1-D array: the derivative with respect to x followed by the derivative
    with respect to t.
    """
    r = A @ x - b
    inv_plus = 1 / (t + r)
    inv_minus = 1 / (t - r)
    grad_x = A.T @ (inv_minus - inv_plus)
    # Broadcasting the scalar s sizes the t-block from t itself, so the
    # function no longer depends on a notebook-level global for the row count.
    grad_t = s - inv_plus - inv_minus
    return np.append(grad_x, grad_t)

In [5]:
def hessian(x,t,s, A, b):
    """Hessian of the log-barrier objective with respect to (x, t).

    The result is the block matrix [[H_xx, H_xt], [H_tx, H_tt]], with the
    x-block first so that it lines up with the vector returned by ``grad``.
    ``s`` is accepted for a uniform signature but is not used: the term
    s * sum(t) is linear and contributes nothing to the second derivatives.
    """
    r = A @ x - b
    inv_sq_plus = 1 / (t + r) ** 2
    inv_sq_minus = 1 / (t - r) ** 2
    w = inv_sq_plus + inv_sq_minus   # diagonal of H_tt
    c = inv_sq_plus - inv_sq_minus   # diagonal scaling of the cross blocks
    # Scale by broadcasting instead of multiplying by dense m-by-m diagonal
    # matrices; the values are the same but the O(m^2 n) matmuls are avoided.
    xx = (A.T * w) @ A               # A^T diag(w) A
    xt = A.T * c                     # A^T diag(c)
    return np.block([[xx, xt], [xt.T, np.diag(w)]])
def newton_step(x, t, s, A, b):
    """Newton direction for the log-barrier objective at (x, t).

    ``A`` and ``b`` are passed through to ``hessian`` and ``grad``, which both
    require them; the earlier three-argument form omitted them and therefore
    always raised a TypeError.

    Returns the vector d that solves H d = -g, laid out like the output of
    ``grad`` (x-part first, then t-part).
    """
    # Solve the linear system rather than forming the explicit inverse.
    return -np.linalg.solve(hessian(x, t, s, A, b), grad(x, t, s, A, b))

In [6]:
# Backtracking line search
def backtrack(objective, x, t, s, A, b, grad, descent_direction, alpha, beta):
    """Backtracking line search along ``descent_direction`` from (x, t).

    Parameters
    ----------
    objective : callable ``objective(x, t, s, A, b)`` returning a scalar.
    x, t : current iterate; ``descent_direction`` holds the x-part in its
        first ``len(x)`` entries and the t-part in the remainder.
    s, A, b : forwarded unchanged to ``objective``.
    grad : gradient vector at (x, t), laid out like ``descent_direction``.
    alpha : sufficient-decrease coefficient.
    beta : factor by which the step is multiplied on each rejection.

    Returns
    -------
    The accepted step length (1 if the full step is accepted).
    """
    # Split the direction using the length of x itself rather than a
    # notebook-level global for the number of features.
    k = len(x)
    dx = descent_direction[:k]
    dt = descent_direction[k:]

    def outside_domain(step):
        # Both log arguments must be strictly positive at the trial point.
        t_new = t + step * dt
        Ax_new = A @ (x + step * dx)
        return np.min(t_new + Ax_new - b) <= 0 or np.min(t_new - Ax_new + b) <= 0

    t0 = 1
    # we need to make sure that the input is in the domain
    while outside_domain(t0):
        t0 = t0*beta

    # These two quantities do not depend on the step, so compute them once.
    f_current = objective(x, t, s, A, b)
    slope = grad @ descent_direction
    while objective(x + t0*dx, t + t0*dt, s, A, b) > f_current + alpha*t0*slope:
        t0 = t0*beta
    return t0

In [7]:
# Barrier method: for an increasing sequence of barrier parameters s,
# minimise the log-barrier objective with Newton's method (the "centering
# step"), warm-starting each solve from the previous solution.

# feasible starting point
x = np.zeros(n+1)
# With x = 0, any t_i strictly above max|b| keeps both log arguments positive.
t = np.full(m, np.max(np.abs(b)) + 0.1)
# some descent parameters
tol = 1e-7
alpha = 0.1
beta = 0.5
max_iter = 100 # for centering step
s = 1
mu = 100
num_inequalities = 2*m
# Outer loop: continue while num_inequalities/s is still at least tol.
while num_inequalities/s >= tol:
    # centering step: at most max_iter Newton iterations
    for _ in range(max_iter):
        g = grad(x,t,s,A,b)
        H = hessian(x,t,s,A,b)
        # Solve H d = -g instead of forming the inverse, and use a name that
        # does not overwrite the newton_step function defined earlier.
        step_direction = -np.linalg.solve(H, g)
        # Decrement g^T H^{-1} g; a small value means this centering problem
        # is solved to tolerance.
        dec = -g@step_direction
        if dec < tol:
            break
        step_length = backtrack(log_barrier_objective,x,t, s,A,b,g, step_direction,alpha,beta)
        x = x+step_length*step_direction[:n+1]
        t = t+step_length*step_direction[n+1:]
    s = s*mu

In [8]:
# Display the current weight vector x; its last entry multiplies the column
# of ones in A, i.e. it is the intercept.
x

array([-5.05688759,  4.85376368,  4.14309911,  5.98446951, -8.874058  ])

In [9]:
# Display the weights that were used to generate b, for comparison with the fit.
real_weights

array([-4.87701104,  4.69968674,  4.19236809,  6.06693577, -8.89723347])

In [10]:
# Value of the original objective ||Ax - b||_1 at the current x.
l1_norm(A@x-b)

1597.8423398056564

In [11]:
# Original Problem
# Solve the unconstrained l1-regression problem directly with cvxpy so the
# result can be compared against the other solvers in this notebook.
z = cp.Variable(n + 1)
l1_residual = cp.norm(A @ z - b, 1)
objective = cp.Minimize(l1_residual)
prob = cp.Problem(objective)  # no constraints
prob.solve()

1597.842339490808

In [12]:
# Value of the cvxpy variable z after prob.solve().
z.value

array([-5.05688759,  4.85376368,  4.14303744,  5.98446951, -8.87399632])

## Subgradient Method

Newton's Method is slow when the data matrix $A$ is large. A faster method is the subgradient method.

A subgradient of $\| Ax - b \|_1$ is

$$ A^T\text{sign}(Ax-b)$$

In [13]:
def subgrad(x,A,b):
    """Return A^T sign(Ax - b), a subgradient of ||Ax - b||_1 at x."""
    residual_signs = np.sign(A @ x - b)
    return A.T @ residual_signs

In [14]:
# Subgradient method started from x = 0. The iterates are not monotone in
# the objective, so the best point seen so far is tracked separately.
x = np.zeros(n + 1)
max_iters = 10000
step_length = 0.1
best_x = np.zeros(n + 1)
best_val = l1_norm(A @ x - b)
# i counts completed updates; the loop performs max_iters + 1 of them.
for i in range(1, max_iters + 2):
    x = x - step_length * subgrad(x, A, b)
    obj_val = l1_norm(A @ x - b)
    if obj_val < best_val:
        best_x, best_val = x, obj_val
    # Diminishing step size used for the next update.
    step_length = 0.1 / i

In [15]:
# Best weight vector recorded by the subgradient loop.
best_x

array([-5.10674996,  4.83185408,  4.14118545,  5.96648906, -8.80879584])

In [16]:
# Lowest value of ||Ax - b||_1 recorded by the subgradient loop.
best_val

1597.859547532496