In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Load data

In [2]:
# Fetch the bundled breast-cancer dataset and pull out features and labels.
data = load_breast_cancer()
X, y = data['data'], data['target']
# Reshape the labels into a single-column 2-D array (one row per sample),
# which is the layout the from-scratch gradient code broadcasts against.
y = y.reshape(-1, 1)

In [3]:
# Keep only the samples whose label differs from 2.
# NOTE(review): the breast-cancer target is presumably binary (0/1), in which
# case this filter keeps every row — confirm whether it is still needed.
rows_to_keep = np.where(y != 2)[0]  # row indices; y has a single column
X = X[rows_to_keep]
y = y[rows_to_keep]

# Baseline with sklearn package

In [4]:
# Reference result: stratified 5-fold cross-validation of scikit-learn's
# unpenalised logistic regression, against which the from-scratch version
# below is compared.
nb_splits = 5
skf = StratifiedKFold(n_splits=nb_splits, shuffle=True, random_state=42)
# NOTE(review): scikit-learn >= 1.2 deprecates the string 'none' in favour of
# penalty=None — confirm which version this notebook targets before changing.
lr = LogisticRegression(penalty='none', solver='saga', max_iter=10000)

# Per-fold accuracies.
score_train, score_test = [], []

for train_index, test_index in skf.split(X, y):
    # scikit-learn estimators expect 1-D label vectors, hence the ravel().
    X_train, y_train = X[train_index], y[train_index].ravel()
    X_test, y_test = X[test_index], y[test_index].ravel()

    lr.fit(X_train, y_train)

    score_train.append(accuracy_score(y_train, lr.predict(X_train)))
    score_test.append(accuracy_score(y_test, lr.predict(X_test)))

# Summarise the folds: mean and spread of the accuracy, in percent.
print(
    "Mean accuracy on train : {0:.1f}%".format(100 * np.mean(score_train)),
    "Standard deviation on train : {0:.1f}%".format(100 * np.std(score_train)),
    "Mean accuracy on test : {0:.1f}%".format(100 * np.mean(score_test)),
    "Standard deviation on test : {0:.1f}%".format(100 * np.std(score_test)),
    sep="\n",
)


Mean accuracy on train : 92.4%
Standard deviation on train : 0.6%
Mean accuracy on test : 92.3%
Standard deviation on test : 2.8%


# Logistic Regression from scratch

In [5]:
def sigmoid(X, W):
    """Return the logistic function applied to the linear scores ``X @ W.T``.

    Parameters
    ----------
    X : array-like
        Design matrix; its last dimension must match the last dimension of
        ``W``. In this notebook it is the feature matrix with a leading
        bias column, as built by ``init_X``.
    W : ndarray
        Weight row(s). In this notebook it has shape ``(1, n_columns_of_X)``.

    Returns
    -------
    ndarray
        ``1 / (1 + exp(-X @ W.T))`` element-wise, with the shape of
        ``np.dot(X, W.T)`` (one column per row of ``W``).

    Notes
    -----
    The naive formula overflows in ``np.exp`` for large negative scores
    (which unscaled features easily produce). ``np.logaddexp(0, -z)``
    evaluates ``log(1 + exp(-z))`` without overflow, so the result is
    computed as ``exp(-log(1 + exp(-z)))``.
    """
    logits = np.dot(X, W.T)
    return np.exp(-np.logaddexp(0, -logits))

In [6]:
def update_weights(X, y, W, alpha):
    """Apply one gradient-descent step to ``W`` and return the new weights.

    The step direction is the column-wise mean of
    ``X * sigmoid(X, W) - y * X``, scaled by the learning rate ``alpha``.

    Parameters
    ----------
    X : ndarray
        Design matrix (rows are samples).
    y : ndarray
        Labels; assumed to be a single column aligned with the rows of
        ``X`` so that ``y * X`` broadcasts row-wise — TODO confirm for
        other callers.
    W : ndarray
        Current weights.
    alpha : float
        Learning rate.

    Returns
    -------
    ndarray
        ``W`` minus ``alpha`` times the averaged gradient.
    """
    probabilities = sigmoid(X, W)
    # Per-sample gradient contributions, averaged over the rows (axis 0).
    per_sample = X * probabilities - y * X
    step = alpha * per_sample.mean(axis=0)
    return W - step

In [7]:
def compute_loss(X, y, W):
    """Return the mean negative log-likelihood of a logistic model.

    With ``z = X @ W.T`` the per-sample log-likelihood is
    ``z * y - log(1 + exp(z))``; the function returns minus its mean.

    Parameters
    ----------
    X : array-like
        Design matrix (rows are samples).
    y : ndarray
        Labels. Assumed to have the same shape as ``np.dot(X, W.T)``
        (a single column in this notebook); a 1-D ``y`` would broadcast
        against the column of scores — TODO confirm callers always pass a
        column.
    W : ndarray
        Weight row(s).

    Returns
    -------
    float
        The averaged loss.

    Notes
    -----
    ``log(1 + exp(z))`` is evaluated with ``np.logaddexp(0, z)`` so that
    large scores do not overflow ``np.exp`` and turn the loss into ``inf``.
    """
    logits = np.dot(X, W.T)  # computed once and reused in both terms
    log_likelihood = logits * y - np.logaddexp(0, logits)
    return -np.mean(log_likelihood)

In [8]:
def init_X(X):
    """Return ``X`` with a leading column of ones (the bias/intercept term).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Nested lists are accepted and converted to an array.

    Returns
    -------
    ndarray of shape (n_samples, n_features + 1)
        ``X`` with a column of ones prepended as column 0.
    """
    X = np.asarray(X)
    bias = np.ones((X.shape[0], 1))
    return np.concatenate((bias, X), axis=1)

def init_W(X, seed=42):
    """Draw reproducible initial weights for a model fitted on ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix *without* the bias column; only its number of
        columns is used.
    seed : int, optional
        Seed for the dedicated ``RandomState``. Defaults to 42, so calls
        that omit it keep drawing the same weights.

    Returns
    -------
    ndarray of shape (1, n_features + 1)
        Standard-normal weights, one per feature plus one for the bias.
    """
    n_features = np.shape(X)[1] + 1  # +1 for the bias column
    return np.random.RandomState(seed).normal(size=(1, n_features))

In [9]:
def fit_lr(X, y, nb_iter, stop_nb_iter, nb_iter_in_tolerance, stop_tolerance, alpha=1e-4):
    """Fit a logistic regression by batch gradient descent.

    Parameters
    ----------
    X : ndarray
        Feature matrix without bias column; the bias is added by ``init_X``.
    y : ndarray
        Labels, passed unchanged to ``update_weights`` and ``compute_loss``.
    nb_iter : int
        Maximum number of gradient steps.
    stop_nb_iter : int
        Number of consecutive iterations whose loss change must stay below
        ``stop_tolerance`` before training stops.
    nb_iter_in_tolerance : int
        Starting value of the consecutive-iteration counter. It is
        overwritten with 0 on the first iteration (there is no previous loss
        to compare with), so the passed value has no effect; the parameter is
        kept for interface compatibility.
    stop_tolerance : float
        Threshold on ``abs(loss - previous_loss)``.
    alpha : float, optional
        Learning rate of the gradient step. Defaults to ``1e-4``, the value
        that used to be hard-coded.

    Returns
    -------
    W : ndarray
        Weights after the last performed update.
    saved_losses : list
        Loss after each iteration, in order. The loss of the iteration on
        which convergence is declared is not appended.
    """
    X_and_bias = init_X(X)
    W = init_W(X)

    saved_losses = list()
    for i in range(nb_iter):
        W = update_weights(X_and_bias, y, W, alpha)
        loss = compute_loss(X_and_bias, y, W)

        # Count consecutive iterations whose loss barely moved; any larger
        # change (or the very first iteration) resets the streak.
        if saved_losses and abs(loss - saved_losses[-1]) < stop_tolerance:
            nb_iter_in_tolerance += 1
        else:
            nb_iter_in_tolerance = 0

        if nb_iter_in_tolerance >= stop_nb_iter:
            # `i` is the zero-based index of the current iteration.
            print("The logistic regression has converged in {} iteration".format(i))
            break

        saved_losses.append(loss)
    else:
        # Reached only when the loop ran out of iterations without a break.
        print("The logistic regression has not converged")

    return W, saved_losses

In [10]:
def predict_lr(X, W):
    """Predict hard 0/1 labels for ``X`` using the fitted weights ``W``.

    The bias column is added with ``init_X``, probabilities are obtained
    from ``sigmoid`` and then thresholded at 0.5 in place.

    Parameters
    ----------
    X : ndarray
        Feature matrix without bias column.
    W : ndarray
        Weights as returned by ``fit_lr``.

    Returns
    -------
    ndarray
        Array of the same shape as the sigmoid output, holding 1 where the
        probability is at least 0.5 and 0 where it is below 0.5.
    """
    probabilities = sigmoid(init_X(X), W)

    # Build both masks before overwriting anything, then threshold in place.
    is_positive = probabilities >= 0.5
    is_negative = probabilities < 0.5
    probabilities[is_positive] = 1
    probabilities[is_negative] = 0

    return probabilities

In [11]:
# Optimisation settings handed to fit_lr.
nb_iter = 30000           # maximum number of gradient steps
stop_nb_iter = 10         # consecutive in-tolerance iterations required to stop
nb_iter_in_tolerance = 0  # starting value of that consecutive counter
stop_tolerance = 5e-6     # threshold on the loss change between iterations

# Same stratified 5-fold split (same seed) as the scikit-learn baseline.
nb_splits = 5
skf = StratifiedKFold(n_splits=nb_splits, shuffle=True, random_state=42)

# Per-fold accuracies.
score_train, score_test = [], []

for train_index, test_index in skf.split(X, y):
    # Labels stay as single-column arrays here: fit_lr broadcasts them
    # against the design matrix.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    W, loss = fit_lr(
        X_train, y_train, nb_iter, stop_nb_iter, nb_iter_in_tolerance, stop_tolerance
    )

    score_train.append(accuracy_score(y_train, predict_lr(X_train, W)))
    score_test.append(accuracy_score(y_test, predict_lr(X_test, W)))

# Summarise the folds: mean and spread of the accuracy, in percent.
print(
    "Mean accuracy on train : {0:.1f}%".format(100 * np.mean(score_train)),
    "Standard deviation on train : {0:.1f}%".format(100 * np.std(score_train)),
    "Mean accuracy on test : {0:.1f}%".format(100 * np.mean(score_test)),
    "Standard deviation on test : {0:.1f}%".format(100 * np.std(score_test)),
    sep="\n",
)


  


The logistic regression has converged in 22401 iteration
The logistic regression has converged in 22281 iteration
The logistic regression has converged in 25796 iteration
The logistic regression has converged in 23741 iteration
The logistic regression has converged in 23590 iteration
Mean accuracy on train : 92.8%
Standard deviation on train : 0.3%
Mean accuracy on test : 92.5%
Standard deviation on test : 2.0%


In [12]:
from matplotlib import pyplot as plt

# `loss` is the history returned by the most recent fit_lr call, i.e. the
# last cross-validation fold. The first iterations are skipped so the early
# values do not dominate the y-axis.
n_skipped = 100

fig, ax = plt.subplots()
# Start the x-axis at n_skipped so each point sits at its true iteration
# index rather than being shifted back to 0.
ax.scatter(range(n_skipped, len(loss)), loss[n_skipped:])
ax.set_xlabel("Iteration")
ax.set_ylabel("Loss")
ax.set_title("Training loss (last cross-validation fold)")
plt.show()

<Figure size 640x480 with 1 Axes>