# Logistic regression

## Gradient of the cost function

### Unvectorized

$\textbf{x}^{(i)} \in \mathbb{R}^p$

$y_i \in \{0, 1\}$

$\textbf{w} \in \mathbb{R}^p$

$b \in \mathbb{R}$

$J(w, b) = \frac{1}{n} \sum_{i=1}^n \left[ -y_i \log(\sigma(\textbf{w}^T \textbf{x}^{(i)} + b)) - (1 - y_i) \log(1 - \sigma(\textbf{w}^T \textbf{x}^{(i)} + b)) \right]$

$z_i = \textbf{w}^T \textbf{x}^{(i)}$

$a_i = \sigma(z_i)$

$l_i = -y_i \log a_i - (1 - y_i) \log(1 - a_i)$

$\frac{\partial l_i}{\partial w_j} = \frac{\partial l_i}{\partial a_i}\frac{\partial a_i}{\partial z_i}\frac{\partial z_i}{\partial w_j}$

$\frac{\partial l_i}{\partial a_i} = \frac{-y_i}{a_i} + \frac{1 - y_i}{1 - a_i}$

$\frac{\partial a_i}{\partial z_i} = a_i (1 - a_i)$

$\frac{\partial z_i}{\partial w_j} = \frac{\partial \left( w_1 \textbf{x}_1^{(i)} + \dots + w_j \textbf{x}_j^{(i)}  + \dots + w_p \textbf{x}_p^{(i)} \right)}{w_j} = x_j^{(i)}$

$\frac{\partial l_i}{\partial a_i}\frac{\partial a_i}{\partial z_i} = -y_i (1 - a_i) + (1 - y_i) a_i = -y_i + a_i y_i + a_i - a_i y_i = a_i - y_i$

$\frac{\partial l_i}{\partial w_j} = (a_i - y_i) x_j^{(i)}$

$\frac{\partial l_i}{\partial b} = \frac{\partial l_i}{\partial a_i}\frac{\partial a_i}{\partial z_i}\frac{\partial z_i}{\partial b} = (a_i - y_i)$

$\frac{\partial J}{\partial w_j} = \frac{1}{n} \sum_{i=1}^n (a_i - y_i) x_j^{(i)}$

$\frac{\partial J}{\partial b} = \frac{1}{n} \sum_{i=1}^n (a_i - y_i)$

### Vectorized

#### Matrix layouts

$\frac{\partial J}{\partial w_j} = \frac{1}{n} \sum_{i=1}^n (a_i - y_i) x_j^{(i)}$

$\frac{\partial J}{\partial w_j} = \begin{bmatrix} \frac{\partial{J}}{w_1} \\ \vdots \\ \frac{\partial J}{\partial w_p} \end{bmatrix}$

$\textbf{a} = \begin{bmatrix} a_1 \\ \vdots \\ a_n \end{bmatrix}$

$\textbf{y} = \begin{bmatrix} y_1 \\ \vdots \\ y_n \end{bmatrix}$

$\textbf{a} - \textbf{y} = \begin{bmatrix} a_1 - y_1 \\ \vdots \\ a_n - y_n \end{bmatrix}$

$\textbf{X} = \begin{bmatrix} (\textbf{x}^{(1)})^T \\ \vdots \\ (\textbf{x}^{(n)})^T \end{bmatrix}$

$\textbf{X}^T = \begin{bmatrix} \textbf{x}^{(1)} & \cdots & \textbf{x}^{(n)} \end{bmatrix}$

$\textbf{X}^T (\textbf{a} - \textbf{y}) = (a_1 - y_1) \textbf{x}^{(1)} + \dots + (a_n - y_n) \textbf{x}^{(n)}$ (Matrix-vector multiplication using the "column view")

$\frac{1}{n} \textbf{X}^T (\textbf{a} - \textbf{y}) = \frac{1}{n} \begin{bmatrix} (a_1 - y_1) x_1^{(1)} + \dots + (a_n - y_n)x_1^{(n)} \\ \vdots \\ (a_1 - y_1) x_p^{(1)} + \dots + (a_n - y_n)x_p^{(n)}
\end{bmatrix} = \begin{bmatrix} \frac{1}{n} \sum_{i=1}^n (a_i - y_i) x_1^{(1)} \\ \vdots \\ \frac{1}{n} \sum_{i=1}^n(a_i - y_i) x_p^{(1)}
\end{bmatrix} = \begin{bmatrix} \frac{\partial J}{\partial w_1} \\ \vdots \\ \frac{\partial J}{\partial w_p} \end{bmatrix} = \nabla_{\textbf{w}} J(\textbf{w}, b)$

#### Matrix calculus

$\textbf{y} \in \{0, 1\}^n$

$\textbf{X} \in \mathbb{R}^{n \times p}$

$\textbf{w} \in \mathbb{R}^p$

$J(\textbf{w}) = \frac{1}{n}\left[-\textbf{y}^T \log \sigma(\textbf{X} \textbf{w}) - (1 - \textbf{y})^T \log(1 - \sigma(\textbf{X} \textbf{w})) \right]$

$\frac{\partial J}{\partial \textbf{w}} = \frac{\partial z}{\partial \textbf{w}} \frac{\partial a}{\partial \textbf{z}} \frac{\partial J}{\partial \textbf{a}}$ (reverse of the chain rule because we're using denominator layout)

$\frac{\partial \textbf{z}}{\partial \textbf{w}} = \frac{\partial (\textbf{X} \textbf{w})}{\textbf{w}} = \textbf{X}^T$ (the same shape as the partial derivative in denominator layout as expected)

$\frac{\partial \textbf{a}}{\partial \textbf{z}} = \begin{bmatrix} \frac{\partial a_1}{\partial z_1} & \dots & \frac{\partial a_1}{\partial z_n} \\ \vdots \\ \frac{\partial a_n}{\partial z_1} & \dots & \frac{\partial a_n}{\partial z_n} \end{bmatrix} = \begin{bmatrix} a_1 (1 - a_1) & \dots & 0 \\ \vdots \\ 0 & \dots & a_n (1 - a_n) \end{bmatrix}$

$\frac{\partial J}{\partial \textbf{a}} = \frac{1}{n} \begin{bmatrix} -\frac{a_1}{y_1} + \frac{(1-a_1)}{(1-y_1)} \\ \vdots \\ -\frac{a_n}{y_n} + \frac{(1-a_n)}{(1-y_n)} \end{bmatrix}$

$\frac{\partial J}{\partial \textbf{w}} = \frac{1}{n} \textbf{X}^T \begin{bmatrix} a_1 (1 - a_1) & \dots & 0 \\ \vdots \\ 0 & \dots & a_n (1 - a_n) \end{bmatrix} \begin{bmatrix} -\frac{a_1}{y_1} + \frac{(1-a_1)}{(1-y_1)} \\ \vdots \\ -\frac{a_n}{y_n} + \frac{(1-a_n)}{(1-y_n)} \end{bmatrix} = \frac{1}{n} \textbf{X}^T \begin{bmatrix} a_1 - y_1 \\ \vdots \\ a_n - y_n \end{bmatrix} = \frac{1}{n} \textbf{X}^T (\textbf{a} - \textbf{y})$

## Implementation

In [19]:
import scipy.special
import numpy as np
import warnings
def warn(*args, **kwargs):
    pass
warnings.warn = warn
import sklearn.linear_model

np.random.seed(0)
n = 1024
p = 2
X = np.random.randn(n, p)
w_true = np.random.randn(p, 1)
b_true = np.random.randn(1, 1)
z = np.dot(X, w_true) + b_true
p = scipy.special.expit(z)
y = np.zeros((n, 1))
for i in range(n):
    r = np.random.random()
    if r <= p[i, 0]:
        y[i, 0] = 1.

model = sklearn.linear_model.LogisticRegression(solver="lbfgs")
model.fit(X, y.flatten())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
def _sigmoid(z):
    return 1./(1. + np.exp(-z))


def train_sgd(X, y):
    n, p = X.shape

    np.random.seed(0)
    w = np.random.randn(p, 1) * 0.01
    b = np.zeros((1, 1))

    lr = 0.01
    batch_size = 8
    num_epochs = 100
    for _ in range(num_epochs):
        start = 0
        end = batch_size
        while end <= n:
            X_batch = X[start:end, :]
            y_batch = y[start:end, :]
            z = np.dot(X, w) + b
            a = _sigmoid(z)
            dz = a - y
            dw = (1./n) * np.dot(X.T, dz)
            db = np.mean(dz)
            w = w - lr * dw
            b = b - lr * db
            start = end
            end = start + batch_size
    return w, b

w, b = train_sgd(X, y)
print("true:")
print(w_true)
print(b_true)
print("sklearn:")
print(model.coef_.T)
print(model.intercept_.reshape((1, 1)))
print("sgd:")
print(w)
print(b)

e_sklearn = abs(w_true[0, 0] - model.coef_[0, 0])
e_sklearn += abs(w_true[1, 0] - model.coef_[0, 1])
e_sklearn += abs(b_true[0, 0] - model.intercept_[0])
e = abs(w_true[0, 0] - w[0, 0])
e += abs(w_true[1, 0] - w[1, 0])
e += abs(b_true[0, 0] - b[0, 0])
assert e <= e_sklearn

assert w.shape == w_true.shape
assert abs(w_true[0, 0] - w[0, 0]) < 0.05
assert abs(w_true[1, 0] - w[1, 0]) < 0.05
assert b.shape == b_true.shape
assert abs(b_true[0, 0] - b[0, 0]) < 0.05

true:
[[-1.69613127]
 [ 0.73018353]]
[[-1.85748327]]
sklearn:
[[-1.69067103]
 [ 0.68004971]]
[[-1.79793506]]
sgd:
[[-1.71500546]
 [ 0.6894603 ]]
[[-1.81023436]]


## Sources

* [Logistic Regression Gradient Descent (C1W2L09)](https://www.youtube.com/watch?v=z_xiwjEdAC4&list=PLkDaE6sCZn6Ec-XTbcX1uRg2_u4xOEky0&index=15) 
* [Vectorizing Logistic Regression's Gradient Computation (C1W2L14)](https://www.youtube.com/watch?v=2BkqApHKwn0&list=PLkDaE6sCZn6Ec-XTbcX1uRg2_u4xOEky0&index=20)
* https://stats.stackexchange.com/questions/46523/how-to-simulate-artificial-data-for-logistic-regression