In [16]:
import numpy as np

# Normal Gradient Descent

In [84]:
class gradient_descent:

    def __init__(self,lr,epochs,weights,inputs,b,y):
        self.lr = lr
        self.epochs = epochs
        self.weights = weights
        self.inputs = inputs
        self.b = b
        self.y = y

    def activation(self,x):
        return 1 / (1 + np.exp(-(x * self.weights + self.b)))

    def error(self):
        err = 0
        for x,y in zip(self.inputs,self.y):
            f_x = self.activation(x)
            err += 0.5 * (f_x - y) ** 2
        return err

    def grad_b(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x)

    def grad_w(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x) * x

    def gd_algo(self):
        for i in range(self.epochs):
            dw,db = 0,0
            for x,y in zip(self.inputs,self.y):
                dw += self.grad_w(x,y)
                db += self.grad_b(x,y)
            self.weights = self.weights - self.lr * dw
            self.b = self.b - self.lr * db
        
        return self.weights,self.b

In [85]:
lr = .01
epochs = 200000
weights = 1
bias = -2
inputs = [2,-5,3,10]
output = [4,25,9,100]

gd = gradient_descent(lr,epochs,weights,inputs,b=bias,y=output)

#Without Gradient Descent the Loss is
gd.error()

5253.323119969317

In [86]:
new_w,new_b = gd.gd_algo()
print(new_w)
print(new_b)

5.201402921211379
-0.06102561975430424


In [87]:
#Loss after GD
gd = gradient_descent(lr,epochs,new_w,inputs,b=new_b,y=output)
gd.error()

5249.500098189401

# Stochastic Gradient Descent

In [69]:
class sgd:

    def __init__(self,inputs,outputs,weights,bias,learning_rate,epochs):
        self.x = inputs
        self.y = outputs
        self.weights = weights
        self.bias = bias
        self.lr = learning_rate
        self.epochs = epochs

    def activation(self,x):
        return 1 / (1 + np.exp(-(x * self.weights + self.bias)))

    def loss(self):
        err = 0
        for x,y in zip(self.x,self.y):
            f_x = self.activation(x)
            err += 0.5 * (f_x - y) ** 2
        return err

    def grad_b(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x)

    def grad_w(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x) * x
        
    def stochastic_gd(self):
        for i in range(self.epochs):
            d_w,d_b = 0,0
            for x,y in zip(self.x,self.y):
                d_w += self.grad_w(x,y)
                d_b += self.grad_b(x,y)
                self.weights = self.weights - self.lr * d_w
                self.bias = self.bias - self.lr * d_b
        return self.weights,self.bias      

In [70]:
lr = .01
epochs = 200000
weights = 1
bias = -2
inputs = [2,-5,3,10]
output = [4,25,9,100]

In [71]:
s_gd = sgd(inputs,output,weights,bias,lr,epochs)

In [72]:
#loss with out sgd
loss = s_gd.loss()
loss

5253.323119969317

In [73]:
w,b = s_gd.stochastic_gd()

In [74]:
#loss with sgd
loss = s_gd.loss()
loss

5249.500024967948

# Mini Batch Gradient Descent

In [78]:
class mini_batch_sgd:

    def __init__(self,inputs,outputs,weights,bias,learning_rate,batch_size,epochs):
        self.x = inputs
        self.y = outputs
        self.weights = weights
        self.bias = bias
        self.lr = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs

    def activation(self,x):
        return 1 / (1 + np.exp(-(x * self.weights + self.bias)))

    def loss(self):
        err = 0
        for x,y in zip(self.x,self.y):
            f_x = self.activation(x)
            err += 0.5 * (f_x - y) ** 2
        return err

    def grad_b(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x)

    def grad_w(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x) * x

    def mini_batch(self):
        for i in range(self.epochs):
            d_w,d_b,points_seen = 0,0,0
            for x,y in zip(self.x,self.y):
                d_w = self.grad_w(x,y)
                d_b = self.grad_b(x,y)
                points_seen += 1

                if points_seen % self.batch_size == 0:
                    self.weights = self.weights - self.lr * d_w
                    self.bias = self.bias - self.lr * d_b
                    d_w,d_b = 0,0
        return self.weights,self.bias

In [76]:
lr = .01
epochs = 200000
weights = 1
bias = -2
inputs = [2,-5,3,10]
output = [4,25,9,100]

In [80]:
mini_b = mini_batch_sgd(inputs,output,weights,bias,lr,batch_size=2, epochs=epochs)

In [81]:
#loss with out mini_batch
loss = s_gd.loss()
loss

5249.500024967948

In [82]:
w,b = mini_b.mini_batch()

In [83]:
#loss with sgd
loss = s_gd.loss()
loss

5249.500024967948

# Momentum Gradient Descent

In [95]:
class momentum_GD:

    def __init__(self,inputs,outputs,weights,bias,learning_rate,gamma,epochs):
        self.x = inputs
        self.y = outputs
        self.weights = weights
        self.bias = bias
        self.lr = learning_rate
        self.gamma = gamma
        self.epochs = epochs

    def activation(self,x):
        return 1 / (1 + np.exp(-(x * self.weights + self.bias)))

    def loss(self):
        err = 0
        for x,y in zip(self.x,self.y):
            f_x = self.activation(x)
            err += 0.5 * (f_x - y) ** 2
        return err

    def grad_b(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x)

    def grad_w(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x) * x

    def momentum(self):
        prev_w,prev_b = 0,0 
        for i in range(self.epochs):
            d_w,d_b = 0,0
            for x,y in zip(self.x,self.y):
                d_w = self.grad_w(x,y)
                d_b = self.grad_b(x,y)
            
            update_w = self.gamma * prev_w + self.lr * d_w
            update_b = self.gamma * prev_b + self.lr * d_b
            self.weights = self.weights - update_w
            self.bias = self.bias - update_b
            prev_w = update_w
            prev_b = update_w
        return self.weights,self.bias

In [96]:
lr = .01
epochs = 200000
weights = 1
bias = -2
inputs = [2,-5,3,10]
output = [4,25,9,100]

In [97]:
momentum = momentum_GD(inputs,output,weights,bias,lr,gamma=0.9,epochs=epochs)

In [98]:
#loss with out mini_batch
loss = momentum.loss()
loss

5253.323119969317

In [99]:
w,b = mini_b.mini_batch()
#loss with sgd
loss = s_gd.loss()
loss

5249.500024967948

# Mini Batch Momentum GD

In [105]:
class minibatch_momentum:

    def __init__(self,inputs,outputs,weights,bias,learning_rate,gamma,batch_size,epochs):
        self.x = inputs
        self.y = outputs
        self.weights = weights
        self.bias = bias
        self.lr = learning_rate
        self.gamma = gamma
        self.batch_size = batch_size
        self.epochs = epochs

    def activation(self,x):
        return 1 / (1 + np.exp(-(x * self.weights + self.bias)))

    def loss(self):
        err = 0
        for x,y in zip(self.x,self.y):
            f_x = self.activation(x)
            err += 0.5 * (f_x - y) ** 2
        return err

    def grad_b(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x)

    def grad_w(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x) * x

    def mini_momentum(self):
        prev_w,prev_b = 0,0 
        for i in range(self.epochs):
            d_w,d_b,points_seen = 0,0,0
            for x,y in zip(self.x,self.y):
                d_w = self.grad_w(x,y)
                d_b = self.grad_b(x,y)
                points_seen += 1

                if points_seen % self.batch_size == 0:
                    update_w = self.gamma * prev_w + self.lr * d_w
                    update_b = self.gamma * prev_b + self.lr * d_b
                    self.weights = self.weights - update_w
                    self.bias = self.bias - update_b
                    prev_w = update_w
                    prev_b = update_b
                    d_w,d_b = 0,0

        return self.weights,self.bias

In [106]:
lr = .01
epochs = 200000
weights = 1
bias = -2
inputs = [2,-5,3,10]
output = [4,25,9,100]

In [107]:
mini_momentum = minibatch_momentum(inputs,output,weights,bias,lr,gamma=0.9,batch_size=2,epochs=epochs)
#loss with out mini_batch
loss = mini_momentum.loss()
loss


5253.323119969317

In [108]:
w,b = mini_momentum.mini_momentum()
#loss with sgd
loss = mini_momentum.loss()
loss

5225.0000552166575

# Nestrov Momentum

In [10]:
class nestrov_GD:

    '''
    Implemented with Mini GD approach
    Can be implemented with simple GD
    '''

    def __init__(self,inputs,outputs,learning_rate,gamma,batch_size,epochs):
        self.x = inputs
        self.y = outputs
        # self.weights = weights
        # self.bias = bias
        self.lr = learning_rate
        self.gamma = gamma
        self.batch_size = batch_size
        self.epochs = epochs

    def activation(self,w,b,x):
        return 1 / (1 + np.exp(-(x * w + b)))

    def loss(self,w,b):
        err = 0
        for x,y in zip(self.x,self.y):
            f_x = self.activation(w,b,x)
            err += 0.5 * (f_x - y) ** 2
        return err

    def grad_b(self,w,b,x,y):
        f_x = self.activation(w,b,x)
        return (f_x - y) * f_x * (1 - f_x)

    def grad_w(self,w,b,x,y):
        f_x = self.activation(w,b,x)
        return (f_x - y) * f_x * (1 - f_x) * x

    def nest(self,w,b):
        prev_w,prev_b = 0,0
        for i in range(self.epochs):
            d_w,d_b,points_seen = 0,0,0

            #update the weights i.e lookahead
            w_lookahead = self.gamma * prev_w - self.lr * w
            b_lookahead = self.gamma * prev_b - self.lr * b

            for x,y in zip(self.x,self.y):
                d_w = self.grad_w(w_lookahead,b_lookahead,x,y)
                d_b = self.grad_b(w_lookahead,b_lookahead,x,y)
                points_seen += 1

                if points_seen % self.batch_size == 0:
                    update_w = self.gamma * prev_w + self.lr * d_w
                    update_b = self.gamma * prev_b + self.lr * d_b
                    w = w - update_w
                    b = b - update_b
                    prev_w = update_w
                    prev_b = update_b
        return w,b

In [11]:
lr = .01
epochs = 200000
weights = 1
bias = -2
inputs = [2,-5,3,10]
output = [4,25,9,100]

In [12]:
nestrov = nestrov_GD(inputs,output,lr,gamma=0.9,batch_size=2,epochs=epochs)
#loss with out mini_batch
loss = nestrov.loss(weights,bias)
loss

5253.323119969317

In [15]:
w,b = nestrov.nest(weights,bias)
#loss with sgd
loss = nestrov.loss(w,b)
loss

5225.0

# ADAGRAD

In [33]:
class adagrad:

    def __init__(self,inputs,outputs,weights,bias,learning_rate,gamma,batch_size,epochs):
        self.x = inputs
        self.y = outputs
        self.weights = weights
        self.bias = bias
        self.lr = learning_rate
        self.gamma = gamma
        self.batch_size = batch_size
        self.epochs = epochs

    def activation(self,x):
        return 1 / (1 + np.exp(-(x * self.weights + self.bias)))

    def loss(self):
        err = 0
        for x,y in zip(self.x,self.y):
            f_x = self.activation(x)
            err += 0.5 * (f_x - y) ** 2
        return err

    def grad_b(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x)

    def grad_w(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x) * x

    def adagrad(self,eps):
        prev_w,prev_b = 0,0
        for i in range(self.epochs):
            d_w,d_b,points_seen = 0,0,0
            for x,y in zip(self.x,self.y):
                d_w = self.grad_w(x,y)
                d_b = self.grad_b(x,y)
                points_seen += 1

                if points_seen % self.batch_size == 0:
                    prev_w = prev_w + d_w ** 2
                    prev_b = prev_b + d_b ** 2
                    self_weights = self.weights - (self.lr / np.sqrt(prev_w + eps)) * d_w  
                    self_bias = self.bias - (self.lr / np.sqrt(prev_b + eps)) * d_b
                    d_w,d_b = 0,0

        return self.weights,self.bias
        

In [34]:
lr = .01
epochs = 200000
weights = 1
bias = -2
inputs = [2,-5,3,10]
output = [4,25,9,100]

In [35]:
adagrad = adagrad(inputs,output,weights=weights,learning_rate=lr,bias=bias,gamma=0.9,batch_size=2,epochs=epochs)
#loss with out mini_batch
loss = adagrad.loss()
loss

5253.323119969317

In [36]:
w,b = adagrad.adagrad(eps=1e-8)
#loss with sgd
loss = adagrad.loss()
loss

5253.323119969317

# RMS PROP

In [39]:
class rmsprop:

    def __init__(self,inputs,outputs,weights,bias,learning_rate,gamma,batch_size,epochs):
        self.x = inputs
        self.y = outputs
        self.weights = weights
        self.bias = bias
        self.lr = learning_rate
        self.gamma = gamma
        self.batch_size = batch_size
        self.epochs = epochs

    def activation(self,x):
        return 1 / (1 + np.exp(-(x * self.weights + self.bias)))

    def loss(self):
        err = 0
        for x,y in zip(self.x,self.y):
            f_x = self.activation(x)
            err += 0.5 * (f_x - y) ** 2
        return err

    def grad_b(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x)

    def grad_w(self,x,y):
        f_x = self.activation(x)
        return (f_x - y) * f_x * (1 - f_x) * x

    def rmsprop(self,beta,eps):
        prev_w,prev_b = 0,0
        for i in range(self.epochs):
            d_w,d_b,points_seen = 0,0,0
            for x,y in zip(self.x,self.y):
                d_w = self.grad_w(x,y)
                d_b = self.grad_b(x,y)
                points_seen += 1

                if points_seen % self.batch_size == 0:
                    prev_w = beta * prev_w + (1-beta) * d_w ** 2
                    prev_b = beta * prev_b + (1-beta) * d_b ** 2
                    self_weights = self.weights - (self.lr / np.sqrt(prev_w + eps)) * d_w  
                    self_bias = self.bias - (self.lr / np.sqrt(prev_b + eps)) * d_b
                    d_w,d_b = 0,0

        return self.weights,self.bias


In [44]:
lr = .01
epochs = 200000
weights = 1
bias = -2
inputs = [2,-5,3,10]
output = [4,25,9,100]

In [45]:
rms_prop = rmsprop(inputs,output,weights=weights,learning_rate=lr,bias=bias,gamma=0.9,batch_size=2,epochs=epochs)
#loss with out mini_batch
loss = rms_prop.loss()
loss

5253.323119969317

In [46]:
w,b = rms_prop.rmsprop(beta=0.95,eps=1e-8)
loss = rms_prop.loss()
loss

5253.323119969317