# Learning Rate Finder

**TODO:** FInidh this study later

refs:

1. How reliable: https://blog.dataiku.com/the-learning-rate-finder-technique-how-reliable-is-it
1. https://fastai1.fast.ai/callbacks.lr_finder.html
1. Good discussion: https://forums.fast.ai/t/new-lr-finder-output/89236/3
1. colab: https://walkwithfastai.com/lr_finder
1. Small discussion regards implementation: https://aidancoco.medium.com/optimizing-the-learning-rate-of-your-neural-networks-32e38addd8a3

**GOAL**

* Understand learning rate finder better.
* Why I can choose large values of lr than the lrf reccomendation?
* Is this behavor also on fastai or tensorflow?
* Implement increase by mini-batch and not epoch as mentioned in one of the ref above. DOes this make any different and why?


In [1]:
import numpy as np
import random as rand

import matplotlib
import matplotlib.pyplot as plt
# required for iteractive plot
%matplotlib notebook  

import seaborn as sns
from scipy import stats 

from sklearn.datasets import make_classification, make_blobs, make_regression
from sklearn.metrics import accuracy_score

from typing import List, Dict, Tuple, Callable

import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

## Keras implementation of learning rate finder

https://github.com/WittmannF/LRFinder

In [2]:
from keras.callbacks import Callback
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt

class LRFinder(Callback):
    def __init__(self, min_lr, max_lr, mom=0.9, stop_multiplier=None, 
                 reload_weights=True, batches_lr_update=5):
        self.min_lr = min_lr
        self.max_lr = max_lr
        self.mom = mom
        self.reload_weights = reload_weights
        self.batches_lr_update = batches_lr_update
        if stop_multiplier is None:
            self.stop_multiplier = -20*self.mom/3 + 10 # 4 if mom=0.9
                                                       # 10 if mom=0
        else:
            self.stop_multiplier = stop_multiplier
        
    def on_train_begin(self, logs={}):
        p = self.params
        try:
            n_iterations = p['epochs']*p['samples']//p['batch_size']
        except:
            n_iterations = p['steps']*p['epochs']
            
        self.learning_rates = np.geomspace(self.min_lr, self.max_lr, \
                                           num=n_iterations//self.batches_lr_update+1)
        self.losses=[]
        self.iteration=0
        self.best_loss=0
        if self.reload_weights:
            self.model.save_weights('tmp.hdf5')
        
    
    def on_batch_end(self, batch, logs={}):
        loss = logs.get('loss')
        
        if self.iteration!=0: # Make loss smoother using momentum
            loss = self.losses[-1]*self.mom+loss*(1-self.mom)
        
        if self.iteration==0 or loss < self.best_loss: 
                self.best_loss = loss
                
        if self.iteration%self.batches_lr_update==0: # Evaluate each lr over 5 epochs
            
            if self.reload_weights:
                self.model.load_weights('tmp.hdf5')
          
            lr = self.learning_rates[self.iteration//self.batches_lr_update]            
            K.set_value(self.model.optimizer.lr, lr)

            self.losses.append(loss)            

        if loss > self.best_loss*self.stop_multiplier: # Stop criteria
            self.model.stop_training = True
                
        self.iteration += 1
    
    def on_train_end(self, logs=None):
        if self.reload_weights:
                self.model.load_weights('tmp.hdf5')
                
        plt.figure(figsize=(12, 6))
        plt.plot(self.learning_rates[:len(self.losses)], self.losses)
        plt.xlabel("Learning Rate")
        plt.ylabel("Loss")
        plt.xscale('log')
        plt.show()

In [3]:
from keras.datasets import fashion_mnist

# 1. Input Data
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

mean, std = X_train.mean(), X_train.std()
X_train, X_test = (X_train-mean)/std, (X_test-mean)/std


In [5]:
# model = tf.keras.Sequential([
#                 tf.keras.layers.Dense(1, activation='relu',
#                           kernel_initializer=W01,
#                           bias_initializer=b01,
#                           input_shape=(2,))
#     ])

# sgd = tf.keras.optimizers.SGD(learning_rate=lr, 
#                               momentum=0.0,
#                               nesterov=False, name='SGD'
#                              )


# model.compile(loss='mse', optimizer=sgd, metrics=['mse'])

# _ = model.fit(X, y, epochs=n_iter, batch_size=bs, verbose=0, validation_split=0.0)


# 2. Define and Compile Model
model = tf.keras.Sequential([tf.keras.layers.Flatten(),
                    tf.keras.layers.Dense(512, activation='relu'),
                    tf.keras.layers.Dense(10, activation='softmax')])

model.compile(loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'], optimizer='adam')


# 3. Fit using Callback
lr_finder = LRFinder(min_lr=0.001, max_lr=1)

model.fit(X_train, y_train, batch_size=128, callbacks=[lr_finder])

ValueError: Weights for model sequential have not yet been created. Weights are created when the Model is first called on inputs or `build()` is called with an `input_shape`.

## Cots functions

### Linera regression

In [None]:
# X: matrix nxd
# y: column vector nx1
# theta: column vector dx1

def cost_func_linear_regression(theta: np.array, X: np.array, y: np.array) -> np.array:
    
    n = X.shape[0]
    
    # error: pred - y
    z = np.matmul(X,theta) - y

    return np.matmul(z.T,z)/2.0/n  # <== average of all loss MSE

def gradient_linear_regression(theta: np.array, X: np.array, y: np.array) -> np.array:
    
    n = X.shape[0]
    
    # error: pred - y
    z = np.matmul(X,theta) - y    
    grad = np.matmul(X.T,z)
        
    return grad/n

### Logistic Regression Sigmoid activation

* vectorized formula of the cost function: https://ml-cheatsheet.readthedocs.io/en/latest/logistic_regression.html

* Cost function is also calles **Cross-entropy** or **log loss**

**TODO:** Try to find a demonstartion of the vectorial version of the gradient using matrix derivative

In [None]:
# X: matrix nxd
# y: column vector nx1
# theta: column vector dx1

def cost_func_logistic_regression(theta: np.array, X: np.array, y: np.array) -> np.array:
    
    n = X.shape[0]
    z = np.matmul(X,theta)  # nx1
    h = 1.0/(1.0 + np.exp(-z))  # activation: nx1
    
    cost = (-1.0/n)*(np.matmul(y.T,np.log(h)) + np.matmul(1-y.T,np.log(1-h))) # scalar
    
    return cost

def gradient_logistic_regression(theta: np.array, X: np.array, y: np.array)-> np.array:
    
    z = np.matmul(X,theta)
    
    # pred
    h = 1.0/(1.0 + np.exp(-z))  # activation
    
    return np.matmul(X.T,h - y)

## Simulated Data

In [None]:
def make_2d_linear_data(slope: float, intercept: float, noise: float, n_sanples: int, random: int = 2021):
    
    x = np.linspace(-1.0, 1.0, n).reshape(n,1)
    
    np.random.seed(random)
    e = np.random.normal(0.0,noise,n).reshape(n,1)
    
    y = slope*x + intercept + e
    y = y.reshape(n,)

    _ones = np.ones((n,1))

    x.shape

    _ones.shape

    X = np.hstack((_ones, x))
    
    return X,y, e

slope = 5.0 
intercept = 4.0
noise = 0.1
n = 50

X,y, e = make_2d_linear_data(slope,intercept, noise, n)
theta_true = np.array([intercept,slope])

X.shape
y.shape

X[0:3]

print(f"True parameters: {theta_true}")
bayes_error = np.sum(np.matmul(e.T,e))/n  # <== mse
print(f"bayes error: {bayes_error:.2f}")

_ = plt.subplot(121);
_ = plt.scatter(X[:,1],y);

plt.show()

## TUNING Learning rate 

https://miguel-data-sc.github.io/2017-11-05-first/
https://arxiv.org/abs/1506.01186

1. learning rate finding 
1. learnig rate find vs batchs size

Needs to introduce learning rate shdedule $\eta = \eta(t)$ in mini batch algo

In [None]:
from abc import ABC, abstractmethod

class LearningRateScheduler(ABC):

    def __init__(self, lr0: float ):
        
        self._lr0 = lr0
        
    @abstractmethod
    def run(it: int) -> float: 
        return
    
class ConstLearningRateScheduler(LearningRateScheduler):
    
    def run(self, it: int) -> float: 
        return self._lr0
    
class FindLearningRateScheduler(LearningRateScheduler):
    
    def __init__(self, lr0: float, number_of_lr_per_order: int = 4 ):
    
        super().__init__(lr0)
        
        self._lr0 = lr0
        self._number_of_lr_per_order = number_of_lr_per_order
    
    def run(self, it: int) -> float: 
        
        lr = self._lr0*(10**(it/self._number_of_lr_per_order))
        
        return lr

In [None]:
lr_scheduler = ConstLearningRateScheduler(0.01)

lr_scheduler.run(10)
lr_scheduler.run(100)

In [None]:
n_iter = 55

lr_scheduler = FindLearningRateScheduler(1e-5,number_of_lr_per_order=10)

lrs = np.zeros((n_iter,))
for it in range(n_iter):
    
    lrs[it] = lr_scheduler.run(it)

fig, (ax1, ax2) = plt.subplots(2);
fig.suptitle('learning rate scheduler');
ax1.plot(range(n_iter),lrs,'b',label='');
ax1.set_yscale('log');
ax1.grid(True);
ax1.set_title('log scale');
ax2.plot(range(n_iter),lrs,'b',label='learning rate scheduler');
ax2.grid(True);

In [None]:
def create_mini_batches(data_shuffle, batch_size):
    
    n = data_shuffle.shape[0]
    mini_batches = []
    
    for k in np.arange(0, n, batch_size):
        
        X_mini = data_shuffle[k:k+batch_size,:-1]
        y_mini = data_shuffle[k:k+batch_size,-1]
        
        mini_batches.append((X_mini,y_mini))
    return mini_batches

def mini_batch_gradient_descent(X,y, cost_func, gradient_func, 
                                learning_rate_scheduler: Callable[[float,int], float],
                                n_iterations: int,
                                batch_size: int, 
                                init_guess: np.array,seed: int =1):
    
    n = X.shape[0] # training size
    d = X.shape[1]  # number of dimensions or paramters
    
    theta = init_guess

    Xy = np.hstack((X,y.reshape((n,1))))
        
    theta_history = []
    grad_history = []
    cost_history = []
 
    np.random.seed(seed)
    for it in range(0,n_iterations):
           
        data_shuffle = Xy.copy()
        np.random.shuffle(data_shuffle)
        
        cost = 0.0
        mini_batches = create_mini_batches(data_shuffle,batch_size)
        
        for X_mini, y_mini in mini_batches:

            # compute the gradients and cost function
            cost += cost_func(theta,X_mini,y_mini)
            gradient_at_theta = gradient_func(theta,X_mini,y_mini)  # gradient approximation using 1 data point
            

            # log history for oploting
            theta_history.append(theta.T)
            grad_history.append(gradient_at_theta)
            
           # update parameters
            learning_rate = learning_rate_scheduler.run(it)
            dtheta = -learning_rate* gradient_at_theta

            theta = theta + dtheta
            
        cost_history.append(cost/len(mini_batches)) # << average loss in the epoch 

    history = {'theta': theta_history, "cost": cost_history, "grad": grad_history}

    return theta, history

### Test on regression

In [None]:
slope = 5.0 
intercept = 4.0
noise = 0.1
n = 300

X,y,_ = make_2d_linear_data(slope,intercept, noise, n)
theta_true = np.array([intercept,slope])

print(f"True parameters: {theta_true}")
bayes_error = np.sum(np.matmul(e.T,e))/n  # <== mse
print(f"bayes error: {bayes_error:.2f}")

_ = plt.subplot(121);
_ = plt.scatter(X[:,1],y);

plt.show()

In [None]:
n_iter = 53
bs = 18

lr_scheduler = FindLearningRateScheduler(1e-6,number_of_lr_per_order=8)

init_guess = np.array([1.0,0.5])


theta, history = mini_batch_gradient_descent(X, y, cost_func_linear_regression, gradient_linear_regression, 
                                             lr_scheduler, n_iter,bs, init_guess)



theta0 = history['theta'][0]
grad0 =  history['grad'][0]
cost0 = history['cost'][0]

print(f"Initial Guess it: 0 => theta: {theta0}; grad: {grad0}; cost: {cost0:.2f}")

it = n_iter -1
theta = history['theta'][-1]
grad =  history['grad'][-1]
cost = history['cost'][-1]

print(f"it: {it} => theta: {theta}; grad: {grad}; cost: {cost:.2f}")

print()

In [None]:
lrs = np.zeros((n_iter,))
for it in range(n_iter):
    
    lrs[it] = lr_scheduler.run(it)

cost = history['cost'] 

plt.subplots()
plt.plot(lrs,cost,'b',label='cost');
plt.xscale('log');
plt.ylim(0.0,8.5);
plt.legend();
plt.title('Learning Rate Finder');
plt.grid(True);

In [None]:
X.shape
y.shape

In [None]:

W01 = tf.constant_initializer(0.5)
b01 = tf.constant_initializer(1.0)

model = tf.keras.Sequential([
                tf.keras.layers.Dense(1, activation='relu',
                          kernel_initializer=W01,
                          bias_initializer=b01,
                          input_shape=(2,))
    ])

sgd = tf.keras.optimizers.SGD(learning_rate=lr, 
                              momentum=0.0,
                              nesterov=False, name='SGD'
                             )


model.compile(loss='mse', optimizer=sgd, metrics=['mse'])

#_ = model.fit(X, y, epochs=n_iter, batch_size=bs, verbose=0, validation_split=0.0)

In [None]:
# 3. Fit using Callback
lr_finder = LRFinder(min_lr=1e-14, max_lr=0.001)


model.fit(X, y, batch_size=bs, callbacks=[lr_finder], epochs=2)

### testing logistic regression

In [None]:
# other options are also available
X, y = make_classification(
    n_samples=100, 
    n_classes=2,
    n_clusters_per_class=1,
    n_features=2,
    n_redundant = 0,
    n_informative = 2,
    n_repeated = 0,
    weights=[0.5,0.5], # balanced classes
    flip_y=0.001, # add noisy the default value for flip_y is 0.1%
    class_sep=1.7, # class_sep is 1.0. The lower the value, the harder classification is.
    random_state=2020) 

ax = sns.scatterplot(X[:,0],X[:,1],hue=y);
ax.set_title("Dataset");

In [None]:
lr = 0.1
n_iter = 200
bs = 100

lr_scheduler = ConstLearningRateScheduler(lr)

init_guess = np.array([1.0,0.5])

theta, history = mini_batch_gradient_descent(X, y, cost_func_linear_regression, gradient_linear_regression, 
                                             lr_scheduler, n_iter,bs, init_guess)


theta0 = history['theta'][0]
grad0 =  history['grad'][0]
cost0 = history['cost'][0]

print(f"Initial Guess it: 0 => theta: {theta0}; grad: {grad0}; cost: {cost0:.2f}")

it = n_iter -1
theta = history['theta'][-1]
grad =  history['grad'][-1]
cost = history['cost'][-1]

print(f"it: {it} => theta: {theta}; grad: {grad}; cost: {cost:.2f}")
print(f"Expected theta:{theta_true}")
print()

In [None]:
cost = history['cost']
    
    
m = len(cost)
x = np.linspace(0,n_iter,m)
    
fig,ax = plt.subplots()
_ = ax.plot(x,cost,'b',label='cost');
_ = ax.set_ylabel("Cost")
_ = ax.set_ylabel("Epoch")
ax.legend();

In [None]:
n_iter = 54
bs = 100

lr_scheduler = FindLearningRateScheduler(1e-5,number_of_lr_per_order=10)

init_guess = np.array([1.0,0.5])

theta, history = mini_batch_gradient_descent(X, y, cost_func_linear_regression, gradient_linear_regression, 
                                             lr_scheduler, n_iter,bs, init_guess)


theta0 = history['theta'][0]
grad0 =  history['grad'][0]
cost0 = history['cost'][0]

print(f"Initial Guess it: 0 => theta: {theta0}; grad: {grad0}; cost: {cost0:.2f}")

it = n_iter -1
theta = history['theta'][-1]
grad =  history['grad'][-1]
cost = history['cost'][-1]

print(f"it: {it} => theta: {theta}; grad: {grad}; cost: {cost:.2f}")
print(f"Expected theta:{theta_true}")
print()

In [None]:
lrs = np.zeros((n_iter,))
for it in range(n_iter):
    
    lrs[it] = lr_scheduler.run(it)


cost = history['cost'] 
print(f"first 5 lrs: {lrs[0:5]}")
print(f"last 5 lrs: {lrs[-5:]}")
print(f"last 5 cost: {cost[-5:]}")


plt.subplots()
plt.plot(lrs,cost,'b',label='cost');
plt.xscale('log');
plt.ylim(0.0,2.0);
plt.legend();
plt.title('Learning Rate Finder');
plt.grid(True);

In [None]:
lrs[-13:]
cost[-13:]

In [None]:
sim = [('manual', 0.01),
       ('tuned lr',0.2 )]

n_iter = 300
init_guess
bs

for name, lr in sim:

    lr_scheduler = ConstLearningRateScheduler(lr)
    
    theta, history[name] = mini_batch_gradient_descent(X, y, cost_func_linear_regression, gradient_linear_regression, 
                                             lr_scheduler, n_iter,bs, init_guess)


    print(f"{name}, lr: {lr}")
    theta0 = history[name]['theta'][0]
    grad0 =  history[name]['grad'][0]
    cost0 = history[name]['cost'][0]

    print(f"Initial Guess it: 0 => theta: {theta0}; grad: {grad0}; cost: {cost0:.2f}")

    it = n_iter -1
    theta = history[name]['theta'][-1]
    grad =  history[name]['grad'][-1]
    cost = history[name]['cost'][-1]

    print(f"it: {it} => theta: {theta}; grad: {grad}; cost: {cost:.2f}")
    
    z = np.matmul(X,theta)  # nx1
    h = 1.0/(1.0 + np.exp(-z))
    y_pred = h > 0.5

    acc = accuracy_score(y, y_pred)
    print(f"accuracy: {acc}")
    
    #print(f"Expected theta:{theta_true}")
    print()


In [None]:
colors = ['b','k','g']

fig,ax = plt.subplots()
for idx, hyper_params in enumerate(sim):
    
    name, lr = hyper_params= hyper_params
    
    cost = history[name]['cost']
    
    m = len(cost)
    x = np.linspace(0,n_iter,m)
    
    _ = ax.plot(x,cost,colors[idx],label=name);
    
#_ = ax.set_ylim([6, 8])
_ = ax.set_ylabel("Cost")
_ = ax.set_ylabel("Epoch")
ax.legend();