In [1]:
from sklearn.datasets import load_breast_cancer
import numpy as np

In [2]:
# Load the breast-cancer dataset as pandas objects: X holds the feature columns, y the target.
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

In [3]:
# Summary statistics (count, mean, std, quartiles) for every feature column.
X.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [4]:
# Column dtypes and non-null counts, to check for missing values.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
from tensorflow.keras.optimizers import SGD
import tensorflow as tf

In [8]:
# Standardise every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the complete dataset, i.e. before the
# train/test split, so test-set statistics leak into the preprocessing. X is
# also overwritten with the scaled array, so this cell is not idempotent.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [9]:
# Hold out 34% of the samples for evaluation. A fixed random_state makes the
# split (and therefore every loss reported below) reproducible across kernel
# restarts; stratify=y keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.66, random_state=0, stratify=y
)

In [10]:
# Convert all four splits to float32 tensors so they can be combined with the
# float32 coefficient variables used in the training cells.
X_train, X_test, y_train, y_test = (
    tf.constant(split, dtype=tf.float32)
    for split in (X_train, X_test, y_train, y_test)
)


In [11]:
def logistic_function(x, betas):
    """Return the predicted probabilities of a logistic-regression model.

    Args:
        x: Feature tensor whose last axis holds ``len(betas) - 1`` features
            (one row per sample in the calls made in this notebook).
        betas: 1-D coefficients; ``betas[0]`` is the intercept and
            ``betas[1:]`` are the per-feature weights.

    Returns:
        ``sigmoid(betas[0] + sum(x * betas[1:]))`` with the feature axis
        reduced, i.e. one probability per sample.
    """
    logits = betas[0] + tf.reduce_sum(x * betas[1:], axis=-1)
    # Use the built-in sigmoid rather than 1 / (1 + exp(-logits)): in float32
    # the explicit form overflows exp() for very negative logits, and the
    # resulting inf makes the back-propagated gradient NaN.
    return tf.math.sigmoid(logits)


In [12]:
def cross_entropy(y_true, prediction, eps=1e-7):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Args:
        y_true: Tensor of 0/1 labels.
        prediction: Tensor of predicted probabilities, same shape as ``y_true``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before the
            logarithm is taken.

    Returns:
        Scalar tensor with the mean cross-entropy over all elements.
    """
    # Clip to avoid log(0). Probabilities of exactly 0 or 1 should not occur in
    # theory, but they do appear because of limited float32 precision.
    clipped = tf.clip_by_value(prediction, eps, 1 - eps)
    positive_term = y_true * tf.math.log(clipped)
    negative_term = (1 - y_true) * tf.math.log(1 - clipped)
    return -tf.reduce_mean(positive_term + negative_term)

## Custom logistic regression + TensorFlow optimizer

In [13]:
# Hyper-parameters; n_steps, lr and l2_reg are reused by the training cells further down.
n_steps = 100
lr = 0.1
l2_reg = 0.01

# Refer to the Keras optimizer through `tf.keras` instead of the bare name
# `SGD`: a custom class called SGD is defined later in this notebook, and once
# that cell has run the bare name no longer accepts these arguments.
opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.)

# One intercept plus one weight per feature column.
betas = tf.Variable(tf.random.normal([X_train.shape[1] + 1], 0, 0.1), dtype=tf.float32)

for ii in range(n_steps):
    with tf.GradientTape() as tape:
        preds = logistic_function(X_train, betas)
        # Objective = cross-entropy + L2 penalty on all coefficients (intercept included).
        loss = cross_entropy(y_train, preds) + l2_reg*tf.reduce_sum(betas**2)
    gradients = tape.gradient(loss, betas)
    # With momentum 0 this is the plain step `betas.assign(betas - lr*gradients)`.
    opt.apply_gradients(zip([gradients], [betas]))
    print(f'Step {ii} loss: {loss}')


tf.Tensor(0.6428133, shape=(), dtype=float32)
tf.Tensor(0.5145679, shape=(), dtype=float32)
tf.Tensor(0.44082108, shape=(), dtype=float32)
tf.Tensor(0.3916763, shape=(), dtype=float32)
tf.Tensor(0.355891, shape=(), dtype=float32)
tf.Tensor(0.32835555, shape=(), dtype=float32)
tf.Tensor(0.30635336, shape=(), dtype=float32)
tf.Tensor(0.28828293, shape=(), dtype=float32)
tf.Tensor(0.27312836, shape=(), dtype=float32)
tf.Tensor(0.26020867, shape=(), dtype=float32)
tf.Tensor(0.24904707, shape=(), dtype=float32)
tf.Tensor(0.23929806, shape=(), dtype=float32)
tf.Tensor(0.23070396, shape=(), dtype=float32)
tf.Tensor(0.22306801, shape=(), dtype=float32)
tf.Tensor(0.2162369, shape=(), dtype=float32)
tf.Tensor(0.21008909, shape=(), dtype=float32)
tf.Tensor(0.20452678, shape=(), dtype=float32)
tf.Tensor(0.19947031, shape=(), dtype=float32)
tf.Tensor(0.194854, shape=(), dtype=float32)
tf.Tensor(0.19062322, shape=(), dtype=float32)
tf.Tensor(0.18673193, shape=(), dtype=float32)
tf.Tensor(0.18314137,

## Sklearn's logistic regression model

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
# Reference model: scikit-learn's LogisticRegression fitted on the same training split.
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [16]:
# Test-set cross-entropy of the scikit-learn model, using the second column of predict_proba.
cross_entropy(y_test, tf.constant(clf.predict_proba(X_test)[:, 1], dtype=tf.float32))

<tf.Tensor: shape=(), dtype=float32, numpy=0.10721653>

In [17]:
# Test-set cross-entropy of the custom model (coefficients `betas`), without the L2 penalty term.
cross_entropy(y_test, logistic_function(X_test, betas))


<tf.Tensor: shape=(), dtype=float32, numpy=0.14806245>

## Custom Stochastic Gradient Descent

In [18]:
# Mini-batch size used by the training loops below.
# NOTE(review): this is larger than the training set (n_batches evaluates to 0),
# so every epoch processes the whole training set as a single batch.
batch_size = 600

In [19]:
# Number of complete batches per epoch (floor division; a partial remainder is not counted).
n_batches = X_train.shape[0] // batch_size

In [20]:
# Display the number of complete batches.
n_batches

0

In [21]:
# Hand-written mini-batch gradient descent with classical momentum on the
# L2-regularised cross-entropy objective.
momentum_rate = 0.1
# One intercept plus one weight per feature column.
betas_sgd = tf.Variable(tf.random.normal([X_train.shape[1] + 1], 0, 0.1))
update = 0.
n_samples = X_train.shape[0]
for epoch in range(n_steps):
    # Reshuffle once per epoch. Features and labels are gathered with the same
    # permutation so rows stay aligned; local names leave X_train/y_train untouched.
    random_idx = np.random.permutation(n_samples)
    X_shuffled = tf.gather(X_train, random_idx)
    y_shuffled = tf.gather(y_train, random_idx)
    # Stepping by batch_size only ever produces non-empty batches (the last one
    # may be shorter). Iterating over range(n_batches + 1) instead yields an
    # empty final batch whenever n_samples is an exact multiple of batch_size,
    # and the mean over an empty batch is NaN.
    for start in range(0, n_samples, batch_size):
        X_batch = X_shuffled[start : start + batch_size, :]
        y_batch = y_shuffled[start : start + batch_size]
        with tf.GradientTape() as tape:
            y_pred = logistic_function(X_batch, betas_sgd)
            batch_loss = cross_entropy(y_batch, y_pred) + l2_reg*tf.reduce_sum(betas_sgd**2)
        batch_gradients = tape.gradient(batch_loss, betas_sgd)
        # Momentum: keep a fraction of the previous step and add the new gradient step.
        update = momentum_rate*update + lr*batch_gradients
        betas_sgd.assign(betas_sgd - update)
    # Reported loss is the plain cross-entropy on the full training set (no L2 term).
    print(f'Epoch {epoch} loss: {cross_entropy(y_train, logistic_function(X_train, betas_sgd))}')

Epoch 0 loss: 0.6359825730323792
Epoch 1 loss: 0.47558698058128357
Epoch 2 loss: 0.3921740651130676
Epoch 3 loss: 0.3417300283908844
Epoch 4 loss: 0.30728843808174133
Epoch 5 loss: 0.28185367584228516
Epoch 6 loss: 0.26206982135772705
Epoch 7 loss: 0.24611014127731323
Epoch 8 loss: 0.2328839749097824
Epoch 9 loss: 0.22169384360313416
Epoch 10 loss: 0.21206986904144287
Epoch 11 loss: 0.20368221402168274
Epoch 12 loss: 0.19629140198230743
Epoch 13 loss: 0.18971864879131317
Epoch 14 loss: 0.18382735550403595
Epoch 15 loss: 0.17851091921329498
Epoch 16 loss: 0.17368482053279877
Epoch 17 loss: 0.16928093135356903
Epoch 18 loss: 0.16524362564086914
Epoch 19 loss: 0.16152697801589966
Epoch 20 loss: 0.15809263288974762
Epoch 21 loss: 0.15490826964378357
Epoch 22 loss: 0.15194641053676605
Epoch 23 loss: 0.14918363094329834
Epoch 24 loss: 0.14659962058067322
Epoch 25 loss: 0.1441768854856491
Epoch 26 loss: 0.14190015196800232
Epoch 27 loss: 0.1397559642791748
Epoch 28 loss: 0.1377326250076294
Ep

## AdaGrad

In [22]:
# AdaGrad: each coefficient's step is divided by the square root of the sum of
# all its squared gradients seen so far.
# One intercept plus one weight per feature column.
betas_adagrad = tf.Variable(tf.random.normal([X_train.shape[1] + 1], 0, 0.1))
# Small constant added to the denominator so a coefficient whose accumulated
# squared gradient is still zero does not produce 0/0 = NaN.
adagrad_eps = 1e-7
G_diag = tf.Variable(tf.zeros(betas_adagrad.shape), trainable=False)
n_samples = X_train.shape[0]
for epoch in range(n_steps):
    # Reshuffle once per epoch. Features and labels are gathered with the same
    # permutation so rows stay aligned; local names leave X_train/y_train untouched.
    random_idx = np.random.permutation(n_samples)
    X_shuffled = tf.gather(X_train, random_idx)
    y_shuffled = tf.gather(y_train, random_idx)
    # Stepping by batch_size only ever produces non-empty batches (the last one
    # may be shorter). Iterating over range(n_batches + 1) instead yields an
    # empty final batch whenever n_samples is an exact multiple of batch_size,
    # and the mean over an empty batch is NaN.
    for start in range(0, n_samples, batch_size):
        X_batch = X_shuffled[start : start + batch_size, :]
        y_batch = y_shuffled[start : start + batch_size]
        with tf.GradientTape() as tape:
            y_pred = logistic_function(X_batch, betas_adagrad)
            batch_loss = cross_entropy(y_batch, y_pred) + l2_reg*tf.reduce_sum(betas_adagrad**2)
        batch_gradients = tape.gradient(batch_loss, betas_adagrad)
        G_diag.assign(G_diag + batch_gradients**2)
        update = lr*batch_gradients / (tf.math.sqrt(G_diag) + adagrad_eps)
        betas_adagrad.assign(betas_adagrad - update)
    # Reported loss is the plain cross-entropy on the full training set (no L2 term).
    print(f'Epoch {epoch} loss: {cross_entropy(y_train, logistic_function(X_train, betas_adagrad))}')

Epoch 0 loss: 0.34642294049263
Epoch 1 loss: 0.26040250062942505
Epoch 2 loss: 0.22292675077915192
Epoch 3 loss: 0.19932802021503448
Epoch 4 loss: 0.18254263699054718
Epoch 5 loss: 0.1698254942893982
Epoch 6 loss: 0.15978136658668518
Epoch 7 loss: 0.15160518884658813
Epoch 8 loss: 0.14479389786720276
Epoch 9 loss: 0.1390148252248764
Epoch 10 loss: 0.13403798639774323
Epoch 11 loss: 0.1296987682580948
Epoch 12 loss: 0.12587586045265198
Epoch 13 loss: 0.12247776240110397
Epoch 14 loss: 0.1194339320063591
Epoch 15 loss: 0.11668907850980759
Epoch 16 loss: 0.11419917643070221
Epoch 17 loss: 0.11192869395017624
Epoch 18 loss: 0.10984855890274048
Epoch 19 loss: 0.1079348474740982
Epoch 20 loss: 0.10616753995418549
Epoch 21 loss: 0.10452983528375626
Epoch 22 loss: 0.10300745815038681
Epoch 23 loss: 0.10158824920654297
Epoch 24 loss: 0.10026172548532486
Epoch 25 loss: 0.09901885688304901
Epoch 26 loss: 0.09785174578428268
Epoch 27 loss: 0.09675353020429611
Epoch 28 loss: 0.0957181304693222
Epoc

## RMSProp

In [24]:
# RMSProp: each coefficient's step is divided by the square root of an
# exponential moving average of its squared gradients.
# One intercept plus one weight per feature column.
betas_rms = tf.Variable(tf.random.normal([X_train.shape[1] + 1], 0, 0.1))
# Weight given to the previous running average; (1 - forgetting_factor) goes
# to the newest squared gradient.
# NOTE(review): at 0.01 the average is dominated by the latest gradient, so the
# step is close to lr * sign(gradient); values around 0.9 are more common.
# Confirm this is intended.
forgetting_factor = 0.01
# Small constant added to the denominator to avoid 0/0 = NaN for a zero gradient.
rms_eps = 1e-7
running_avg = tf.Variable(tf.zeros(betas_rms.shape), trainable=False)
n_samples = X_train.shape[0]
for epoch in range(n_steps):
    # Reshuffle once per epoch. Features and labels are gathered with the same
    # permutation so rows stay aligned; local names leave X_train/y_train untouched.
    random_idx = np.random.permutation(n_samples)
    X_shuffled = tf.gather(X_train, random_idx)
    y_shuffled = tf.gather(y_train, random_idx)
    # Stepping by batch_size only ever produces non-empty batches (the last one
    # may be shorter). Iterating over range(n_batches + 1) instead yields an
    # empty final batch whenever n_samples is an exact multiple of batch_size,
    # and the mean over an empty batch is NaN.
    for start in range(0, n_samples, batch_size):
        X_batch = X_shuffled[start : start + batch_size, :]
        y_batch = y_shuffled[start : start + batch_size]
        with tf.GradientTape() as tape:
            y_pred = logistic_function(X_batch, betas_rms)
            batch_loss = cross_entropy(y_batch, y_pred) + l2_reg*tf.reduce_sum(betas_rms**2)
        batch_gradients = tape.gradient(batch_loss, betas_rms)
        running_avg.assign(forgetting_factor*running_avg + (1-forgetting_factor)*batch_gradients**2)
        update = lr*batch_gradients / (tf.math.sqrt(running_avg) + rms_eps)
        betas_rms.assign(betas_rms - update)
    # Reported loss is the plain cross-entropy on the full training set (no L2 term).
    print(f'Epoch {epoch} loss: {cross_entropy(y_train, logistic_function(X_train, betas_rms))}')

Epoch 0 loss: 0.2946400046348572
Epoch 1 loss: 0.15876005589962006
Epoch 2 loss: 0.10068787634372711
Epoch 3 loss: 0.08193618059158325
Epoch 4 loss: 0.07754947245121002
Epoch 5 loss: 0.0792754590511322
Epoch 6 loss: 0.08013128489255905
Epoch 7 loss: 0.07874292880296707
Epoch 8 loss: 0.07913576066493988
Epoch 9 loss: 0.07830916345119476
Epoch 10 loss: 0.07836197316646576
Epoch 11 loss: 0.07794619351625443
Epoch 12 loss: 0.07772601395845413
Epoch 13 loss: 0.07762741297483444
Epoch 14 loss: 0.07718892395496368
Epoch 15 loss: 0.07734014093875885
Epoch 16 loss: 0.07672737538814545
Epoch 17 loss: 0.07707707583904266
Epoch 18 loss: 0.07632561028003693
Epoch 19 loss: 0.07683362066745758
Epoch 20 loss: 0.07597237825393677
Epoch 21 loss: 0.0766066461801529
Epoch 22 loss: 0.07565917819738388
Epoch 23 loss: 0.07639393955469131
Epoch 24 loss: 0.07537949830293655
Epoch 25 loss: 0.07619379460811615
Epoch 26 loss: 0.0751282274723053
Epoch 27 loss: 0.07600495964288712
Epoch 28 loss: 0.07490122318267822

## Adam

In [49]:
# Adam: bias-corrected exponential moving averages of the gradient (first
# moment) and of the squared gradient (second moment) set the step.
# One intercept plus one weight per feature column.
betas_adam = tf.Variable(tf.random.normal([X_train.shape[1] + 1], 0, 0.1))
beta_1 = 0.9      # decay rate of the first-moment average
beta_2 = 0.999    # decay rate of the second-moment average
epsilon = 1e-7    # added to the denominator to avoid division by zero
running_avg = tf.Variable(tf.zeros(betas_adam.shape), trainable=False)
running_var = tf.Variable(tf.zeros(betas_adam.shape), trainable=False)
# 1-based update counter used in the bias-correction terms.
training_iter = 1
n_samples = X_train.shape[0]
for epoch in range(n_steps):
    # Reshuffle once per epoch. Features and labels are gathered with the same
    # permutation so rows stay aligned; local names leave X_train/y_train untouched.
    random_idx = np.random.permutation(n_samples)
    X_shuffled = tf.gather(X_train, random_idx)
    y_shuffled = tf.gather(y_train, random_idx)
    # Stepping by batch_size only ever produces non-empty batches (the last one
    # may be shorter). Iterating over range(n_batches + 1) instead yields an
    # empty final batch whenever n_samples is an exact multiple of batch_size,
    # and the mean over an empty batch is NaN.
    for start in range(0, n_samples, batch_size):
        X_batch = X_shuffled[start : start + batch_size, :]
        y_batch = y_shuffled[start : start + batch_size]
        with tf.GradientTape() as tape:
            y_pred = logistic_function(X_batch, betas_adam)
            batch_loss = cross_entropy(y_batch, y_pred) + l2_reg*tf.reduce_sum(betas_adam**2)
        batch_gradients = tape.gradient(batch_loss, betas_adam)
        running_avg.assign(beta_1*running_avg + (1-beta_1)*batch_gradients)
        running_var.assign(beta_2*running_var + (1-beta_2)*batch_gradients**2)
        # Both averages start at zero; dividing by (1 - beta**t) corrects that bias.
        running_avg_hat = running_avg / (1 - beta_1**training_iter)
        running_var_hat = running_var / (1 - beta_2**training_iter)
        update = lr * running_avg_hat / (tf.math.sqrt(running_var_hat) + epsilon)
        betas_adam.assign(betas_adam - update)
        training_iter += 1
    # Reported loss is the plain cross-entropy on the full training set (no L2 term).
    print(f'Epoch {epoch} loss: {cross_entropy(y_train, logistic_function(X_train, betas_adam))}')

Epoch 0 loss: 0.22991017997264862
Epoch 1 loss: 0.15012414753437042
Epoch 2 loss: 0.11079052835702896
Epoch 3 loss: 0.08721912652254105
Epoch 4 loss: 0.07278712093830109
Epoch 5 loss: 0.06394709646701813
Epoch 6 loss: 0.058598510921001434
Epoch 7 loss: 0.055399179458618164
Epoch 8 loss: 0.05338878929615021
Epoch 9 loss: 0.05190142244100571
Epoch 10 loss: 0.05054578185081482
Epoch 11 loss: 0.04916718974709511
Epoch 12 loss: 0.04778120294213295
Epoch 13 loss: 0.046496693044900894
Epoch 14 loss: 0.04544699192047119
Epoch 15 loss: 0.04473799467086792
Epoch 16 loss: 0.044417038559913635
Epoch 17 loss: 0.04446619749069214
Epoch 18 loss: 0.04481950029730797
Epoch 19 loss: 0.0453931987285614
Epoch 20 loss: 0.046114273369312286
Epoch 21 loss: 0.046938054263591766
Epoch 22 loss: 0.0478539802134037
Epoch 23 loss: 0.04888114333152771
Epoch 24 loss: 0.0500560961663723
Epoch 25 loss: 0.05141536518931389
Epoch 26 loss: 0.05297780781984329
Epoch 27 loss: 0.05473363399505615
Epoch 28 loss: 0.0566454604

In [64]:
from abc import ABC, abstractmethod

class Optimizer(ABC):
    """Abstract base class for the hand-written optimizers.

    A subclass implements ``get_update``; ``apply_gradients`` then subtracts
    the returned step from the variable being trained.
    """

    def __init__(self, lr=0.1):
        """Store the learning rate ``lr``."""
        self.lr = lr

    @abstractmethod
    def get_update(self, gradients):
        """Return the step to subtract from the variable for ``gradients``."""
        raise NotImplementedError('This method is not implemented for the parent Optimizer class.')

    def apply_gradients(self, gradients, variables):
        """Apply one optimisation step in place.

        Args:
            gradients: Gradient of the loss with respect to ``variables``.
            variables: A single object exposing ``assign`` and supporting
                subtraction of the step returned by ``get_update``.
        """
        step = self.get_update(gradients)
        variables.assign(variables - step)


class ADAM(Optimizer):
    """Adam: steps scaled by bias-corrected first and second gradient moments.

    Args:
        beta_1: Decay rate of the running average of the gradients.
        beta_2: Decay rate of the running average of the squared gradients.
        var_shape: Shape of the variable being optimised; the moment buffers
            are allocated with this shape.
        epsilon: Constant added to the denominator to avoid division by zero.
        **kwargs: Forwarded to ``Optimizer`` (e.g. ``lr``).
    """

    # The default is written as the 1-tuple (31,): the spelling (31) is just
    # the integer 31, which only works because tf.zeros also accepts a scalar.
    def __init__(self, beta_1 = 0.9, beta_2 = 0.999, var_shape = (31,), epsilon = 1e-7, **kwargs):
        super().__init__(**kwargs)
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.running_avg = tf.Variable(tf.zeros(var_shape), trainable=False)
        self.running_var = tf.Variable(tf.zeros(var_shape), trainable=False)
        # 1-based count of updates, used by the bias-correction terms.
        self.training_iter = 1

    def get_update(self, gradients):
        """Update both moment estimates and return the step for ``gradients``."""
        self.running_avg.assign(self.beta_1*self.running_avg + (1-self.beta_1)*gradients)
        self.running_var.assign(self.beta_2*self.running_var + (1-self.beta_2)*gradients**2)
        # Both buffers start at zero; dividing by (1 - beta**t) corrects that bias.
        running_avg_hat = self.running_avg / (1-self.beta_1**self.training_iter)
        running_var_hat = self.running_var / (1-self.beta_2**self.training_iter)
        self.training_iter += 1
        return self.lr * running_avg_hat / (tf.math.sqrt(running_var_hat) + self.epsilon)


class RMSProp(Optimizer):
    """RMSProp: steps scaled by a running average of squared gradients.

    Args:
        forgetting_factor: Weight of the previous running average;
            ``1 - forgetting_factor`` is the weight of the new squared gradient.
            NOTE(review): with the default 0.01 the average is dominated by the
            latest gradient; values around 0.9 are more common. Confirm intent.
        var_shape: Shape of the variable being optimised.
        epsilon: Constant added to the denominator so that a zero running
            average does not produce 0/0 = NaN.
        **kwargs: Forwarded to ``Optimizer`` (e.g. ``lr``).
    """

    def __init__(self, forgetting_factor = 0.01, var_shape = (31,), epsilon = 1e-7, **kwargs):
        super().__init__(**kwargs)
        self.forgetting_factor = forgetting_factor
        self.epsilon = epsilon
        self.running_avg = tf.Variable(tf.zeros(var_shape), trainable=False)

    def get_update(self, gradients):
        """Update the running average and return the step for ``gradients``."""
        self.running_avg.assign(self.forgetting_factor*self.running_avg + (1-self.forgetting_factor)*gradients**2)
        return self.lr*gradients / (tf.math.sqrt(self.running_avg) + self.epsilon)


class AdaGrad(Optimizer):
    """AdaGrad: steps scaled by the accumulated sum of squared gradients.

    Args:
        var_shape: Shape of the variable being optimised.
        epsilon: Constant added to the denominator so that a coefficient whose
            accumulated squared gradient is zero does not produce 0/0 = NaN.
        **kwargs: Forwarded to ``Optimizer`` (e.g. ``lr``).
    """

    def __init__(self, var_shape = (31,), epsilon = 1e-7, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = epsilon
        self.G_diag = tf.Variable(tf.zeros(var_shape), trainable=False)

    def get_update(self, gradients):
        """Accumulate the squared gradients and return the step for ``gradients``."""
        self.G_diag.assign(self.G_diag + gradients**2)
        return self.lr*gradients / (tf.math.sqrt(self.G_diag) + self.epsilon)


class SGD(Optimizer):
    """Gradient descent with classical momentum.

    NOTE: this class reuses the name of the Keras ``SGD`` optimizer imported
    near the top of the notebook; once this cell has run, the bare name ``SGD``
    refers to this class.

    Args:
        momentum_rate: Fraction of the previous step carried into the next one.
        **kwargs: Forwarded to ``Optimizer`` (e.g. ``lr``).
    """

    def __init__(self, momentum_rate = 0.,  **kwargs):
        super().__init__(**kwargs)
        self.momentum_rate = momentum_rate
        # Previous step; starts at zero so the first step is a plain gradient step.
        self.update = 0.

    def get_update(self, gradients):
        """Return (and remember) the momentum-smoothed step for ``gradients``."""
        carried_over = self.momentum_rate*self.update
        gradient_step = self.lr*gradients
        self.update = carried_over + gradient_step
        return self.update

In [65]:
# Train the logistic regression with one of the optimizer classes defined above.
# The coefficient variable keeps its historical name `betas_adam` even though
# the optimizer applied to it is whichever one is selected below.
# One intercept plus one weight per feature column.
betas_adam = tf.Variable(tf.random.normal([X_train.shape[1] + 1], 0, 0.1))
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-7

adam = ADAM(lr=0.01, beta_1=beta_1, beta_2 = beta_2, epsilon=epsilon)
rmsprop = RMSProp(lr = lr, forgetting_factor = 0.01)
adagrad = AdaGrad(lr = lr)
sgd = SGD(lr = lr, momentum_rate= 0.1)
# Optimizer used for this run; swap in adam, rmsprop or adagrad to compare.
optimizer = sgd

n_samples = X_train.shape[0]
for epoch in range(n_steps):
    # Reshuffle once per epoch. Features and labels are gathered with the same
    # permutation so rows stay aligned; local names leave X_train/y_train untouched.
    random_idx = np.random.permutation(n_samples)
    X_shuffled = tf.gather(X_train, random_idx)
    y_shuffled = tf.gather(y_train, random_idx)
    # Stepping by batch_size only ever produces non-empty batches (the last one
    # may be shorter). Iterating over range(n_batches + 1) instead yields an
    # empty final batch whenever n_samples is an exact multiple of batch_size,
    # and the mean over an empty batch is NaN.
    for start in range(0, n_samples, batch_size):
        X_batch = X_shuffled[start : start + batch_size, :]
        y_batch = y_shuffled[start : start + batch_size]
        with tf.GradientTape() as tape:
            y_pred = logistic_function(X_batch, betas_adam)
            batch_loss = cross_entropy(y_batch, y_pred) + l2_reg*tf.reduce_sum(betas_adam**2)
        batch_gradients = tape.gradient(batch_loss, betas_adam)
        optimizer.apply_gradients(batch_gradients, betas_adam)
    # Reported loss is the plain cross-entropy on the full training set (no L2 term).
    print(f'Epoch {epoch} loss: {cross_entropy(y_train, logistic_function(X_train, betas_adam))}')

Epoch 0 loss: 0.5073832869529724
Epoch 1 loss: 0.4158197045326233
Epoch 2 loss: 0.3604734241962433
Epoch 3 loss: 0.3231218159198761
Epoch 4 loss: 0.2957425117492676
Epoch 5 loss: 0.27453312277793884
Epoch 6 loss: 0.2574610710144043
Epoch 7 loss: 0.24332869052886963
Epoch 8 loss: 0.23137733340263367
Epoch 9 loss: 0.22109933197498322
Epoch 10 loss: 0.21213982999324799
Epoch 11 loss: 0.20424221456050873
Epoch 12 loss: 0.1972152441740036
Epoch 13 loss: 0.19091317057609558
Epoch 14 loss: 0.18522241711616516
Epoch 15 loss: 0.1800529956817627
Epoch 16 loss: 0.1753324717283249
Epoch 17 loss: 0.17100180685520172
Epoch 18 loss: 0.16701219975948334
Epoch 19 loss: 0.16332300007343292
Epoch 20 loss: 0.15989995002746582
Epoch 21 loss: 0.15671396255493164
Epoch 22 loss: 0.15374021232128143
Epoch 23 loss: 0.15095725655555725
Epoch 24 loss: 0.14834654331207275
Epoch 25 loss: 0.14589188992977142
Epoch 26 loss: 0.1435791552066803
Epoch 27 loss: 0.14139588177204132
Epoch 28 loss: 0.13933102786540985
Epoch

In [27]:
# TODO: build a tidy experiment loop (different hyper-parameters + comparison with the Keras optimizers), visualise the loss function (either as a training curve or as a grid of final/mean values like Krubeal did), and compare the best optimizer (or several) with sklearn's LogisticRegression.fit implementation.