In [1]:
from random import randrange
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.datasets import load_breast_cancer, load_diabetes


def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5, error=1e-9):
    """
    sample a few random elements and only return numerical
    in this dimensions
    """

    for i in range(num_checks):
        ix = tuple([randrange(m) for m in x.shape])

        oldval = x[ix]
        x[ix] = oldval + h  # increment by h
        fxph = f(x)  # evaluate f(x + h)
        x[ix] = oldval - h  # increment by h
        fxmh = f(x)  # evaluate f(x - h)
        x[ix] = oldval  # reset

        grad_numerical = (fxph - fxmh) / (2 * h)
        grad_analytic = analytic_grad[ix]
        rel_error = abs(grad_numerical - grad_analytic) / (
            abs(grad_numerical) + abs(grad_analytic)
        )
        print(
            "numerical: %f analytic: %f, relative error: %e"
            % (grad_numerical, grad_analytic, rel_error)
        )
        assert rel_error < error

def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

In [2]:
from random import randrange
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.datasets import load_breast_cancer, load_diabetes


def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5, error=1e-9):
    """
    sample a few random elements and only return numerical
    in this dimensions
    """

    for i in range(num_checks):
        ix = tuple([randrange(m) for m in x.shape])

        oldval = x[ix]
        x[ix] = oldval + h  # increment by h
        fxph = f(x)  # evaluate f(x + h)
        x[ix] = oldval - h  # increment by h
        fxmh = f(x)  # evaluate f(x - h)
        x[ix] = oldval  # reset

        grad_numerical = (fxph - fxmh) / (2 * h)
        grad_analytic = analytic_grad[ix]
        rel_error = abs(grad_numerical - grad_analytic) / (
            abs(grad_numerical) + abs(grad_analytic)
        )
        print(
            "numerical: %f analytic: %f, relative error: %e"
            % (grad_numerical, grad_analytic, rel_error)
        )
        #assert rel_error < error

def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

# Linear regression

In [3]:
data = load_diabetes()
X_train1, y_train1 = data.data, data.target
w1 = np.random.randn(X_train1.shape[1]) * 0.0001
b1 = np.random.randn(1) * 0.0001

In [4]:
def mse_loss_naive(w, b, X, y, alpha=0):
    """
    MSE loss function WITH FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    - gradient with respect to bias b
    """
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0.0
    
    # computation of loss
    for i in range(len(X)):
        temp = 0
        for j in range(len(X[0])):
            temp += X[i][j] * w[j] 
        temp += b 
        loss += (temp - y[i]) ** 2
    loss = (1 / len(X)) * loss + alpha * (np.linalg.norm(w, ord=2) ** 2)
    
    # computation of dw
    for k in range(len(X[0])):
        v = 0
        for i in range(len(X)):
            u = 0
            for j in range(len(X[0])):
                u += X[i][j] * w[j] 
            v += X[i][k] * (u + b - y[i])
        dw[k] = (2 / len(X)) * v + 2 * alpha * w[k]
        
    # computation of db
    for i in range(len(X)):
        temp = 0
        for j in range(len(X[0])):
            temp += X[i][j] * w[j] 
        temp += b 
        db += (temp - y[i])
    db *= (2 / len(X)) 
        
    return loss, dw, np.array(db).reshape(1,)

## Naive Linear regression loss

In [5]:
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)

sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  0.0
Gradient check w
numerical: -1.275041 analytic: -1.275043, relative error: 6.481984e-07
numerical: -3.153318 analytic: -3.153316, relative error: 3.261201e-07
numerical: -1.376394 analytic: -1.376393, relative error: 9.351795e-08
numerical: -4.296089 analytic: -4.296087, relative error: 2.050001e-07
numerical: 2.892059 analytic: 2.892059, relative error: 1.818295e-08
numerical: -2.801911 analytic: -2.801912, relative error: 3.068893e-07
numerical: -3.153318 analytic: -3.153316, relative error: 3.261201e-07
numerical: -1.275041 analytic: -1.275043, relative error: 6.481984e-07
numerical: -1.275041 analytic: -1.275043, relative error: 6.481984e-07
numerical: 2.892059 analytic: 2.892059, relative error: 1.818295e-08
numerical: -1.376394 analytic: -1.376393, relative error: 9.351795e-08
numerical: -2.801911 analytic: -2.801912, relative error: 3.068893e-07
numerical: -1.376394 analytic: -1.376393, relative error: 9.351795e-08
numerical: -3.234124 analytic: -3.234125, rela

## Naive Ridge regression loss

In [6]:
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: 2.891846 analytic: 2.891846, relative error: 3.751684e-08
numerical: -3.153166 analytic: -3.153164, relative error: 3.296858e-07
numerical: -2.801625 analytic: -2.801627, relative error: 3.096951e-07
numerical: -1.553183 analytic: -1.553182, relative error: 5.411299e-07
numerical: -1.274956 analytic: -1.274958, relative error: 6.445730e-07
numerical: -4.145782 analytic: -4.145782, relative error: 6.952943e-09
numerical: -3.234068 analytic: -3.234069, relative error: 1.818144e-07
numerical: -4.145782 analytic: -4.145782, relative error: 6.952943e-09
numerical: -0.315335 analytic: -0.315334, relative error: 1.633028e-06
numerical: -2.801625 analytic: -2.801627, relative error: 3.096951e-07
numerical: -1.553183 analytic: -1.553182, relative error: 5.411299e-07
numerical: 2.891846 analytic: 2.891846, relative error: 3.751684e-08
numerical: -1.376200 analytic: -1.376200, relative error: 5.856052e-08
numerical: -3.234068 analytic: -3.234069, relative error: 1.8181

In [7]:
def mse_loss_vectorized(w, b, X, y, alpha=0):
    """
    MSE loss function WITHOUT FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    - gradient with respect to bias b
    """
    loss = 0.0
    dw = np.zeros_like(w)
    
    regularizer = alpha * (np.linalg.norm(w, ord=2) ** 2)
    
    loss = (1.0 / len(X)) * (np.linalg.norm(X @ w + b - y, ord=2)) ** 2 + regularizer
    dw = (2.0 / len(X)) * (X @ w + b - y) @ X + 2 * alpha * w
    db = (2.0 / len(X)) * (X @ w + b - y) @ np.ones(len(X))
    
    return loss, dw, np.array(db).reshape(1,)

## Vectorised Linear regression loss

In [8]:
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)

sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  1.2512623000643182e-16
Gradient check w
numerical: -2.801912 analytic: -2.801912, relative error: 1.475119e-08
numerical: -2.801912 analytic: -2.801912, relative error: 1.475119e-08
numerical: -3.153317 analytic: -3.153316, relative error: 6.653789e-08
numerical: -0.315453 analytic: -0.315453, relative error: 6.855219e-07
numerical: -1.275043 analytic: -1.275043, relative error: 6.223316e-09
numerical: -1.553187 analytic: -1.553187, relative error: 2.973792e-08
numerical: -1.275043 analytic: -1.275043, relative error: 6.223316e-09
numerical: -4.145424 analytic: -4.145424, relative error: 6.692186e-09
numerical: -1.376393 analytic: -1.376393, relative error: 1.047164e-07
numerical: -1.275043 analytic: -1.275043, relative error: 6.223316e-09
numerical: -3.153317 analytic: -3.153316, relative error: 6.653789e-08
numerical: -3.234124 analytic: -3.234125, relative error: 3.671298e-08
numerical: 2.892059 analytic: 2.892059, relative error: 4.471305e-08
numerical: -1.376393 anal

## Vectorized ridge regression loss

In [9]:
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: -3.234069 analytic: -3.234069, relative error: 4.120288e-08
numerical: -0.315333 analytic: -0.315334, relative error: 6.743532e-07
numerical: -1.376199 analytic: -1.376200, relative error: 1.397017e-07
numerical: 2.891846 analytic: 2.891846, relative error: 2.538378e-08
numerical: -1.376199 analytic: -1.376200, relative error: 1.397017e-07
numerical: -4.296145 analytic: -4.296145, relative error: 1.322478e-09
numerical: -3.234069 analytic: -3.234069, relative error: 4.120288e-08
numerical: -3.234069 analytic: -3.234069, relative error: 4.120288e-08
numerical: -1.553182 analytic: -1.553182, relative error: 1.411825e-08
numerical: -4.145782 analytic: -4.145782, relative error: 1.498489e-08
numerical: -3.153165 analytic: -3.153164, relative error: 7.009101e-08
numerical: -4.145782 analytic: -4.145782, relative error: 1.498489e-08
numerical: -4.296145 analytic: -4.296145, relative error: 1.322478e-09
numerical: -1.376199 analytic: -1.376200, relative error: 1.39

# Logistic regression

In [10]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

data = load_breast_cancer()
X_train2, y_train2 = data.data, data.target
w2 = np.random.randn(X_train2.shape[1]) * 0.0001
b2 = np.random.randn(1) * 0.0001

# Naive

In [35]:
def log_loss_naive(w, b, X, y, alpha=0):
    """
    log loss function WITH FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    """
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0.0
    
    wh = np.transpose(w)
    for i in range(len(X)):
        u = sigmoid(X[i] @ wh + b)
        loss += y[i] * np.log(u)+ (1 - y[i]) * np.log(1 - u)
        dw += X[i] * (u - y[i]) 
        db += (u - y[i]) 
    loss = -loss / len(X) + alpha * (np.linalg.norm(w, ord=2) ** 2) 
    dw = dw / len(X) + 2 * alpha * w
    db = db / len(X)
    
    return loss, dw, np.array(db).reshape(1,)

In [38]:
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  4.853520116333816e-16
Gradient check w
numerical: -0.022759 analytic: -0.022759, relative error: 6.403406e-10
numerical: -0.683470 analytic: -0.683470, relative error: 1.170994e-09
numerical: -0.003085 analytic: -0.003085, relative error: 2.984326e-09
numerical: 17.938050 analytic: 17.937818, relative error: 6.480297e-06
numerical: -0.000577 analytic: -0.000577, relative error: 7.895948e-09
numerical: -0.012001 analytic: -0.012001, relative error: 6.033965e-11
numerical: 0.024270 analytic: 0.024270, relative error: 1.124185e-09
numerical: -0.000577 analytic: -0.000577, relative error: 7.895948e-09
numerical: -0.009732 analytic: -0.009732, relative error: 4.420325e-10
numerical: -0.683470 analytic: -0.683470, relative error: 1.170994e-09
numerical: -2.509023 analytic: -2.509023, relative error: 8.270212e-10
numerical: -0.914779 analytic: -0.914779, relative error: 4.757496e-10
numerical: -0.001148 analytic: -0.001148, relative error: 1.496700e-08
numerical: -0.183530 analy

# Naive with regularization

In [39]:
loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: -0.183197 analytic: -0.183197, relative error: 3.996336e-11
numerical: -5.346257 analytic: -5.346257, relative error: 2.511763e-08
numerical: -0.183197 analytic: -0.183197, relative error: 3.996336e-11
numerical: -0.003313 analytic: -0.003313, relative error: 2.634636e-09
numerical: -0.022809 analytic: -0.022809, relative error: 5.757887e-10
numerical: -0.000906 analytic: -0.000906, relative error: 1.883098e-08
numerical: -2.040793 analytic: -2.040793, relative error: 4.265740e-10
numerical: 5.508514 analytic: 5.508514, relative error: 4.563281e-08
numerical: -0.683636 analytic: -0.683636, relative error: 1.173664e-09
numerical: -2.040793 analytic: -2.040793, relative error: 4.265740e-10
numerical: -0.011817 analytic: -0.011817, relative error: 3.463368e-11
numerical: -0.031446 analytic: -0.031446, relative error: 1.259131e-10
numerical: -0.914802 analytic: -0.914802, relative error: 4.744188e-10
numerical: 0.024504 analytic: 0.024504, relative error: 1.1726

# Vectorized

In [47]:
def log_loss_vectorized(w, b,X, y, alpha=0):
    """
    log loss function WITHOUT FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    """
    loss = 0.0
    dw = np.zeros_like(w)
    regularizer = alpha * (np.linalg.norm(w, ord=2) ** 2)
    
    u = sigmoid(X @ w + b)
    loss = - (1 / len(X)) * np.sum(y * np.log(u) + (1 - y) * np.log(1 - u)) + regularizer
    dw =  (1 / len(X)) * (u - y) @ X + 2 * alpha * w
    db = (1 / len(X)) * np.sum(u - y)
    
    return loss, dw, np.array(db).reshape(1,)

In [50]:
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  8.089200193889689e-17
Gradient check w
numerical: -0.009732 analytic: -0.009732, relative error: 1.568327e-10
numerical: 62.480214 analytic: 62.479407, relative error: 6.461319e-06
numerical: 0.006009 analytic: 0.006009, relative error: 5.205287e-11
numerical: 0.006811 analytic: 0.006811, relative error: 8.383055e-10
numerical: -0.009404 analytic: -0.009404, relative error: 3.586835e-11
numerical: -0.001142 analytic: -0.001142, relative error: 1.707702e-10
numerical: 0.007404 analytic: 0.007404, relative error: 3.086702e-10
numerical: -0.914779 analytic: -0.914779, relative error: 5.060909e-10
numerical: -0.003085 analytic: -0.003085, relative error: 2.852732e-10
numerical: -0.000577 analytic: -0.000577, relative error: 1.732034e-09
numerical: -0.022759 analytic: -0.022759, relative error: 1.525179e-10
numerical: -0.003085 analytic: -0.003085, relative error: 2.852732e-10
numerical: -0.001323 analytic: -0.001323, relative error: 1.088202e-09
numerical: -0.022759 analytic:

# Vectorized with regularization

In [51]:
loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: -0.031446 analytic: -0.031446, relative error: 3.764958e-11
numerical: 0.012482 analytic: 0.012482, relative error: 1.564808e-10
numerical: -0.000906 analytic: -0.000906, relative error: 4.489492e-10
numerical: -3.770286 analytic: -3.770287, relative error: 6.590938e-08
numerical: -3.770286 analytic: -3.770287, relative error: 6.590938e-08
numerical: 0.006147 analytic: 0.006147, relative error: 2.132562e-10
numerical: 0.007347 analytic: 0.007347, relative error: 2.668036e-10
numerical: 17.937932 analytic: 17.937700, relative error: 6.480340e-06
numerical: 0.007347 analytic: 0.007347, relative error: 2.668036e-10
numerical: 0.012474 analytic: 0.012474, relative error: 3.433207e-10
numerical: -2.040793 analytic: -2.040793, relative error: 4.238538e-10
numerical: 0.007347 analytic: 0.007347, relative error: 2.668036e-10
numerical: 0.093923 analytic: 0.093923, relative error: 2.471080e-10
numerical: -0.683636 analytic: -0.683636, relative error: 1.210203e-09
num

# Gradient descent for Linear models

In [62]:
class LinearModel():
    def __init__(self):
        self.w = None
        self.b = None

    def train(self, X, y, learning_rate=1e-3, alpha=0, num_iters=100, batch_size=200, verbose=False):
        N, d = X.shape
        
        if self.w is None: # Initialization
            self.w = 0.001 * np.random.randn(d)
            self.b = 0.0

        # Run stochastic gradient descent to optimize w
        
        loss_history = []
        for it in range(num_iters):
            X_batch = None
            y_batch = None
                                                               
            # Sample batch_size elements in X_batch and y_batch
            # X_batch shape is  (batch_size, d) and y_batch shape is (batch_size,)                                                                                          
            # Hint: Use np.random.choice to generate indices
            i = np.random.choice(N, batch_size)
            X_batch = X[i]
            y_batch = y[i]
            
            # evaluate loss and gradient
            loss, dw, db = self.loss(X_batch, y_batch, alpha)
            loss_history.append(loss)

            # perform parameter update                                                                
            # Update the weights w and bias b using the gradient and the learning rate.          
            self.w = self.w - dw * learning_rate
            self.b = self.b - db * learning_rate
            
            if verbose and it % 10000 == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))
                
        return loss_history

    def predict(self, X):
        pass

    def loss(self, X_batch, y_batch, reg):
        pass

class LinearRegressor(LinearModel):
    """ Linear regression """

    def loss(self, X_batch, y_batch, alpha):
        return mse_loss_vectorized(self.w, self.b, X_batch, y_batch, alpha)
    
    def predict(self, X):
        return X @ self.w + self.b

class LogisticRegressor(LinearModel):
    """ Logistic regression """

    def loss(self, X_batch, y_batch, alpha):
        return log_loss_vectorized(self.w, self.b, X_batch, y_batch, alpha)
    
    def predict(self, X):
        """ Return prediction labels vector of 0 or 1 """
        #return sigmoid(X @ self.w + self.b)
        u = sigmoid(X @ self.w + self.b)
        u[u < 0.5] = 0
        u[u > 0.5] = 1
        return u

## Linear regression with gradient descent

In [63]:
from sklearn.linear_model import LinearRegression

sk_model = LinearRegression(fit_intercept=True)
sk_model.fit(X_train1, y_train1)
sk_pred = sk_model.predict(X_train1)
sk_mse = mean_squared_error(sk_pred, y_train1)

model = LinearRegressor()
model.train(X_train1, y_train1, num_iters=75000, batch_size=64, learning_rate=1e-2, verbose=True)
pred = model.predict(X_train1)
mse = mean_squared_error(pred, y_train1)

print("MSE scikit-learn:", sk_mse)
print("MSE gradient descent model :", mse)
assert mse - sk_mse < 100

iteration 0 / 75000: loss 31135.838826
iteration 10000 / 75000: loss 2153.119967
iteration 20000 / 75000: loss 2824.307317
iteration 30000 / 75000: loss 3305.047969
iteration 40000 / 75000: loss 2497.481924
iteration 50000 / 75000: loss 2973.117152
iteration 60000 / 75000: loss 3694.808164
iteration 70000 / 75000: loss 2515.538239
MSE scikit-learn: 2859.6903987680657
MSE gradient descent model : 2884.362616848996


## Logistc regression with gradient descent

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train2)

sk_model = LogisticRegression(fit_intercept=True)
sk_model.fit(X_train2, y_train2)
sk_pred = sk_model.predict(X_train2)
sk_log_loss = log_loss(sk_pred, y_train2)

model = LogisticRegressor()
model.train(X_train2, y_train2, num_iters=75000, batch_size=64, learning_rate=1e-3, verbose=True)
pred = model.predict(X_train2)
model_log_loss = log_loss(pred, y_train2)

print("Log-loss scikit-learn:", sk_log_loss)
print("Log-loss gradiet descent model :", model_log_loss)
print("Error :", rel_error(sk_log_loss, model_log_loss))
assert rel_error(sk_log_loss, model_log_loss) < 1e-7

iteration 0 / 75000: loss 0.692383
iteration 10000 / 75000: loss 0.069834
iteration 20000 / 75000: loss 0.103321
iteration 30000 / 75000: loss 0.096072
iteration 40000 / 75000: loss 0.132246
iteration 50000 / 75000: loss 0.060531
iteration 60000 / 75000: loss 0.050685
iteration 70000 / 75000: loss 0.050146
Log-loss scikit-learn: 0.4249086712816093
Log-loss gradiet descent model : 0.4249086712816093
Error : 0.0
