In [1]:
def logistic_pen(weights, data, targets, hyperparameters):
    """
    Calculate the penalized negative log likelihood and its derivatives with
    respect to weights. Also return the predictions.

    The penalty is an L2 (weight decay) term over every weight except the
    bias, plus an additive term that depends only on M and the weight decay.

    Note: N is the number of examples and
          M is the number of features per example.

    Inputs:
        weights:    (M+1) x 1 vector of weights, where the last element
                    corresponds to bias (intercepts).
        data:       N x M data matrix where each row corresponds
                    to one data point.
        targets:    N x 1 vector of targets class probabilities.
        hyperparameters: The hyperparameters dictionary. Only the
                    "weight_decay" entry is read here.

    Outputs:
        f:          The sum of the loss over all data points plus the
                    penalty terms. This is the objective that we want to
                    minimize.
        df:         (M+1) x 1 vector of derivative of f w.r.t. weights.
        y:          The predictions returned by
                    logistic_predict(weights, data) (presumably N x 1
                    probabilities -- confirm against logistic_predict).
    """
    N, M = data.shape[0], data.shape[1]
    wd = hyperparameters["weight_decay"]
    y = logistic_predict(weights, data)

    # The bias (last row of weights) is deliberately left out of the penalty.
    w_without_b = weights[:-1, :]
    penalty = (wd / 2) * np.sum(w_without_b * w_without_b)
    # NOTE(review): for a zero-mean Gaussian prior with precision wd, the
    # negative log prior contributes +(M/2)*log(2*pi/wd); this term carries the
    # opposite sign. It is constant in the weights, so df is unaffected, but
    # confirm the intended sign before comparing f across weight_decay values.
    constant = -((M / 2) * np.log((2 * math.pi) / wd)) if wd != 0 else 0

    # Floor the log arguments so a saturated prediction (exactly 0 or 1) gives
    # a large finite loss instead of -inf, and so that 0 * log(0) does not turn
    # the whole objective into NaN. Probabilities at or above the floor are
    # left untouched, so non-saturated results are unchanged.
    floor = np.finfo(np.float64).tiny
    log_y = np.log(np.maximum(y, floor))
    log_not_y = np.log(np.maximum(1 - y, floor))

    cross_entropy = (-np.sum(np.dot(targets.T, log_y))
                     - np.sum(np.dot((1 - targets.T), log_not_y)))
    f = cross_entropy + penalty + constant

    # Append a column of ones so the bias gradient falls out of the same
    # matrix product as the feature gradients.
    new_data = np.hstack((data, np.ones((N, 1))))
    # Zero-pad the bias row: the bias receives no weight-decay gradient.
    reg = np.pad(wd * w_without_b, ((0, 1), (0, 0)), 'constant')
    df = np.dot(new_data.T, (y - targets)) + reg
    return f, df, y
