In [1]:
# !pip install scikit-learn

In [2]:
import numpy as np
from sklearn.linear_model import LogisticRegression

In [3]:
# ce: -\sum_i P(x_i) * log(Q(x_i))
# y = outcome, p = prediction
# bce: (1 - y) * Log(1-p) + y * log(p)

In [4]:
# sigmoid
# logits q0, q1
# e^q0 / (e^q0 + e^q1) 
# 1 / (1 + e^(-q1))

In [52]:
np.random.seed(0)
n = 100_000
n_X = 5
X = np.random.normal(0, 1, size=(n, n_X))
betas = np.random.normal(0, 0.3, size=n_X)
intercept = np.random.normal(0, 0.3)
eps = np.random.normal(0, 0.3, size=n)
print(betas, intercept)

z = X @ betas + intercept + eps
y = (1 / (1 + np.exp(-z)) > 0.5).astype(int)

train_pct = 0.9
train_n = int(train_pct * n)
train_X, train_y = X[:train_n], y[:train_n]
test_X, test_y = X[train_n:], y[train_n:]

[ 0.44589139 -0.14182575  0.42520219  0.26549794  0.01300365] 0.10740014109110481


In [53]:
def bce_loss(p, y):
    loss = -np.mean((1 - y) * np.log(1 - p) + y * np.log(p))
    grad = ((1-y)/(1-p) - y/p) / len(y)
    return loss, grad


class Sigmoid:
    def __init__(self):
        self.x = None

    def forward(self, x, *, no_grad):
        if not no_grad:
            self.x = x
        return 1 / (1 + np.exp(-x))

    def backward(self, grad):
        # d_loss / d_z * d_z / d_x
        # f(x) = 1 / (1 + e^-x)
        # d/dx f(x) = -1 / (1 + e^-x)^2 * (-e^-x)
        # = e^-x / (1 + e^-x)^2
        dz_dx = (np.exp(-self.x) / (1 + np.exp(-self.x)) ** 2)
        self.x = None
        return np.einsum("bj,bj->bj", grad, dz_dx)

    def step_fn(self, *, lr):
        pass

class Linear:
    def __init__(self, *, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.W = np.random.normal(
            0, np.sqrt(2 / self.input_dim), size=(output_dim, input_dim)
        )
        self.b = np.zeros(output_dim)
        self.x = None
        self.grad_cache = {}

    def forward(self, x, *, no_grad):
        if not no_grad:
            self.x = x
        wx = np.einsum("ji,bi->bj", self.W, x)
        return wx + self.b

    def backward(self, grad):
        b, j = grad.shape
        # return d_loss / d_x = d_loss / d_z * d_z / d_x
        # store d_loss / d_W = d_loss / d_z * d_z / d_W
        # store d_loss / d_b = d_loss / d_z * d_z / d_b

        # w1 * x1 + w2 * x2 + b
        dz_dx = self.W  # ji
        dz_dW = self.x  # Bi
        dz_db = 1
        self.grad_cache["W"] = np.einsum("bj,bi->ji", grad, dz_dW)
        self.grad_cache["b"] = np.einsum("bj,->j", grad, dz_db)
        self.x = None
        return np.einsum("bj,ji->bi", grad, dz_dx)

    def step_fn(self, *, lr):
        self.W -= lr * self.grad_cache["W"]
        self.b -= lr * self.grad_cache["b"]
        self.grad_cache = {}


class LogReg:
    def __init__(self, input_dim):
        self.input_dim = input_dim
        self.linear = Linear(input_dim=input_dim, output_dim=1)
        self.sigmoid = Sigmoid()

    def forward(self, x, no_grad=False):
        x = self.linear.forward(x, no_grad=no_grad)
        x = self.sigmoid.forward(x, no_grad=no_grad)
        return x

    def backward(self, grad):
        grad = self.sigmoid.backward(grad)
        grad = self.linear.backward(grad)
        return grad

    def step_fn(self, *, lr):
        self.sigmoid.step_fn(lr=lr) # doesn't do anything
        self.linear.step_fn(lr=lr)

In [54]:
model = LogReg(n_X)
lr = 0.1
batch_size = 256

for it, idx in enumerate(range(0, train_n, batch_size)):
    batch_X = train_X[idx:idx+batch_size]
    batch_y = train_y[idx:idx+batch_size].reshape(-1, 1)
    
    p = model.forward(batch_X)
    loss, grad = bce_loss(p, batch_y)
    _ = model.backward(grad)
    model.step_fn(lr=lr)
    if it % 25 == 0:
        test_y_ = test_y.reshape(-1, 1)
        val_p = model.forward(test_X)
        val_loss, _ = bce_loss(val_p, test_y_)
        val_accuracy = ((val_p > 0.5).astype(int) == test_y_).mean()
        print(it, loss, train_loss, val_loss, val_accuracy)

0 1.0426366186325917 0.1733359007417484 0.9970746370598077 0.5035
25 0.6361983949684096 0.1733359007417484 0.6171528049929246 0.6688
50 0.48987423878073494 0.1733359007417484 0.45532785529878755 0.7915
75 0.38770475558579814 0.1733359007417484 0.3852365364492618 0.8466
100 0.3479609680802494 0.1733359007417484 0.3524435404459022 0.8667
125 0.37148281850400594 0.1733359007417484 0.3334678304468279 0.8726
150 0.3226197873797627 0.1733359007417484 0.3221778312128625 0.874
175 0.33306451644294643 0.1733359007417484 0.3139797520219594 0.8748
200 0.3253706665103777 0.1733359007417484 0.30776854176652924 0.8755
225 0.3473251564289802 0.1733359007417484 0.3029058829616264 0.8759
250 0.32217171724555177 0.1733359007417484 0.2991764472546533 0.8753
275 0.293676455364145 0.1733359007417484 0.2959045277523712 0.8752
300 0.31097113579570873 0.1733359007417484 0.29314526563889515 0.8753
325 0.31719618971314134 0.1733359007417484 0.2908719913705171 0.8751
350 0.3196167599504045 0.1733359007417484 0.2

In [55]:
train_y.mean()

np.float64(0.5591333333333334)

In [56]:
test_y.mean()

np.float64(0.5546)

In [57]:
betas, intercept

(array([ 0.44589139, -0.14182575,  0.42520219,  0.26549794,  0.01300365]),
 0.10740014109110481)

In [58]:
model.linear.W, model.linear.b

(array([[ 1.85469171, -0.55072033,  1.77522004,  1.10502778,  0.05569297]]),
 array([0.44974052]))

# baseline

In [59]:
clf = LogisticRegression(random_state=0).fit(train_X, train_y)

In [60]:
clf.score(train_X, train_y)

0.8721111111111111

In [61]:
clf.score(test_X, test_y)

0.8757

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

y_prob = clf.predict_proba(test_X)[:, 1]
loss = log_loss(test_y, y_prob)
print(loss)

0.2715855723687466


In [63]:
clf.coef_, clf.intercept_

(array([[ 2.65052164, -0.8390301 ,  2.5420956 ,  1.58524263,  0.09128322]]),
 array([0.65365801]))