In [1]:
import numpy as np
import pandas as pd

# Load the toy dataset from 'toys.csv' (path is relative to the current working directory).
# NOTE(review): none of the cells visible here read `df`; the globals X and Y used by the
# functions below are presumably derived from it elsewhere — confirm.
df = pd.read_csv('toys.csv')

In [6]:
def sigmoid(w,b,x):
    """Return the logistic function applied to the affine input ``w*x + b``."""
    z = w*x + b
    return 1.0/(1.0 + np.exp(-z))

def error(w,b):
    """Return the summed half squared error of the sigmoid neuron.

    Iterates over the paired samples of the module-level ``X`` and ``Y`` and
    adds ``0.5 * (sigmoid(w, b, x) - y) ** 2`` for each pair.
    """
    total = 0
    for sample, target in zip(X, Y):
        prediction = sigmoid(w, b, sample)
        total = total + 0.5*(prediction - target)**2
    return total

def grad_w(w,b,x,y):
    """Return the gradient of the half squared error w.r.t. ``w`` for one sample."""
    pred = sigmoid(w, b, x)
    residual = pred - y
    # residual * sigmoid'(z) * x, multiplied left to right
    return residual*pred*(1 - pred)*x

def grad_b(w,b,x,y):
    """Return the gradient of the half squared error w.r.t. ``b`` for one sample."""
    pred = sigmoid(w, b, x)
    residual = pred - y
    # residual * sigmoid'(z), multiplied left to right
    return residual*pred*(1 - pred)

In [7]:
def momentum_gd():
    """Run momentum-based batch gradient descent on the sigmoid neuron.

    Reads ``init_w``, ``init_b``, ``max_epochs``, ``X`` and ``Y`` from the
    enclosing (notebook) scope. Each epoch sums the per-sample gradients over
    all of ``X``/``Y`` and then takes one step along the velocity
    ``v = gamma * v_prev + eta * grad``.

    Returns:
        tuple: the final ``(w, b)``. (Previously the result was discarded and
        the function returned ``None``.)
    """
    w,b,eta = init_w,init_b,1.0
    prev_v_w,prev_v_b,gamma = 0,0,0.9
    for _ in range(max_epochs):
        # Full-batch gradient at the current parameters.
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw = dw + grad_w(w,b,x,y)
            db = db + grad_b(w,b,x,y)

        # Blend the previous velocity with the new gradient step.
        v_w = gamma*prev_v_w + eta*dw
        v_b = gamma*prev_v_b + eta*db
        w = w - v_w
        b = b - v_b
        prev_v_w = v_w
        prev_v_b = v_b
    return w, b

In [8]:
def NAG():
    """Run Nesterov accelerated gradient descent (full batch) on the sigmoid neuron.

    Reads ``init_w``, ``init_b``, ``max_epochs``, ``X`` and ``Y`` from the
    enclosing (notebook) scope. Each epoch evaluates the summed gradient at the
    look-ahead point ``(w - gamma*prev_v_w, b - gamma*prev_v_b)`` and then
    applies the velocity ``v = gamma * v_prev + eta * grad``.

    Returns:
        tuple: the final ``(w, b)``. (Previously the result was discarded and
        the function returned ``None``.)
    """
    w,b,eta = init_w,init_b,1.0
    prev_v_w,prev_v_b,gamma = 0,0,0.9
    for _ in range(max_epochs):
        dw,db = 0,0
        # Partial update: where the accumulated velocity alone would take us.
        lookahead_w = gamma*prev_v_w
        lookahead_b = gamma*prev_v_b
        for x,y in zip(X,Y):
            dw = dw + grad_w(w-lookahead_w,b-lookahead_b,x,y)
            db = db + grad_b(w-lookahead_w,b-lookahead_b,x,y)

        # Full update using the gradient measured at the look-ahead point.
        v_w = gamma*prev_v_w + eta*dw
        v_b = gamma*prev_v_b + eta*db
        w = w - v_w
        b = b - v_b
        prev_v_w = v_w
        prev_v_b = v_b
    return w, b

In [9]:
def mini_batch_gd():
    """Run mini-batch gradient descent (batch size 2) on the sigmoid neuron.

    Starts from ``w = b = -2`` and reads ``max_epochs``, ``X`` and ``Y`` from
    the enclosing (notebook) scope. Gradients are summed sample by sample and
    applied every ``mini_batch_size`` samples, after which the accumulators are
    cleared.

    Returns:
        tuple: the final ``(w, b)``. (Previously the result was discarded and
        the function returned ``None``.)
    """
    w,b,eta = -2,-2,1.0
    mini_batch_size,num_points_seen = 2,0
    for _ in range(max_epochs):
        # Accumulators are also cleared at every epoch start, so a trailing
        # partial batch (when len(X) is not a multiple of the batch size) is dropped.
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw = dw + grad_w(w,b,x,y)
            db = db + grad_b(w,b,x,y)
            num_points_seen += 1

            if num_points_seen % mini_batch_size == 0:
                w = w - eta*dw
                b = b - eta*db
                # Was `dw,db=0`, which raises TypeError (cannot unpack an int).
                dw,db = 0,0
    return w, b

In [10]:
def stochastic_gd():
    """Run stochastic gradient descent on the sigmoid neuron.

    Starts from ``w = b = -2`` and reads ``max_epochs``, ``X`` and ``Y`` from
    the enclosing (notebook) scope. The parameters are updated after every
    sample using that sample's gradient only.

    Returns:
        tuple: the final ``(w, b)``. (Previously the result was discarded and
        the function returned ``None``.)
    """
    w,b,eta = -2,-2,1.0
    for _ in range(max_epochs):
        for x,y in zip(X,Y):
            # Per-sample gradient. The earlier version kept adding to dw/db while
            # also stepping after each sample, so later steps re-applied gradients
            # computed at stale parameter values.
            dw = grad_w(w,b,x,y)
            db = grad_b(w,b,x,y)  # evaluated before w changes
            w = w - eta*dw
            b = b - eta*db
    return w, b
        

In [11]:
def accl_momentum_gd():
    """Run momentum gradient descent with a parameter update after every sample.

    Reads ``init_w``, ``init_b``, ``max_epochs``, ``X`` and ``Y`` from the
    enclosing (notebook) scope. Within an epoch the gradient accumulators keep
    growing while the velocity update ``v = gamma * v_prev + eta * grad`` is
    applied after each sample.

    Returns:
        tuple: the final ``(w, b)``. (Previously the result was discarded and
        the function returned ``None``.)
    """
    w,b,eta = init_w,init_b,1.0
    prev_v_w,prev_v_b,gamma = 0,0,0.9
    for _ in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            # NOTE(review): dw/db are running sums over the epoch, not per-sample
            # gradients, yet a step is taken for every sample. Kept as-is because
            # the intent is unclear — confirm whether `dw = grad_w(...)` was meant.
            dw = dw + grad_w(w,b,x,y)
            db = db + grad_b(w,b,x,y)

            v_w = gamma*prev_v_w + eta*dw
            v_b = gamma*prev_v_b + eta*db
            w = w - v_w
            b = b - v_b
            prev_v_w = v_w
            prev_v_b = v_b
    return w, b

In [12]:
def accl_NAG():
    """Run Nesterov accelerated gradient descent with an update after every sample.

    Reads ``init_w``, ``init_b``, ``max_epochs``, ``X`` and ``Y`` from the
    enclosing (notebook) scope. For every sample the gradient is evaluated at
    the look-ahead point ``(w - gamma*prev_v_w, b - gamma*prev_v_b)`` and the
    velocity ``v = gamma * v_prev + eta * grad`` is applied immediately.

    Returns:
        tuple: the final ``(w, b)``. (Previously the result was discarded and
        the function returned ``None``.)
    """
    w,b,eta = init_w,init_b,1.0
    prev_v_w,prev_v_b,gamma = 0,0,0.9
    for _ in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            # Partial update, recomputed per step. Previously this was computed
            # once per epoch and then clobbered by the full velocity below, so
            # every sample after the first looked ahead by prev_v instead of
            # gamma*prev_v.
            lookahead_w = gamma*prev_v_w
            lookahead_b = gamma*prev_v_b
            # NOTE(review): dw/db are running sums over the epoch while a step is
            # taken per sample. Kept as-is because the intent is unclear —
            # confirm whether per-sample gradients were meant.
            dw = dw + grad_w(w-lookahead_w,b-lookahead_b,x,y)
            db = db + grad_b(w-lookahead_w,b-lookahead_b,x,y)

            v_w = gamma*prev_v_w + eta*dw
            v_b = gamma*prev_v_b + eta*db
            w = w - v_w
            b = b - v_b
            prev_v_w = v_w
            prev_v_b = v_b
    return w, b

In [13]:
def line_serach():
    """Run batch gradient descent, picking the best learning rate each epoch.

    The (misspelled) name is kept so existing callers keep working.

    Reads ``init_w``, ``init_b``, ``max_epochs``, ``X`` and ``Y`` from the
    enclosing (notebook) scope. Each epoch computes the summed gradient, tries
    every candidate learning rate in ``etas`` and keeps the step whose
    ``error(w, b)`` is smallest.

    Returns:
        tuple: the final ``(w, b)``. (Previously the result was discarded and
        the function returned ``None``.)
    """
    w,b,etas = init_w,init_b,[0.1,0.5,1.0,5.0,10.0]
    for _ in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)

        # Infinity instead of the old sentinel 10000, which silently froze the
        # parameters whenever every candidate error exceeded it.
        min_error = float('inf')
        best_w,best_b = w,b
        for eta in etas:
            tmp_w = w - eta*dw
            tmp_b = b - eta*db
            tmp_error = error(tmp_w,tmp_b)  # evaluate once per candidate
            if tmp_error < min_error:
                best_w,best_b,min_error = tmp_w,tmp_b,tmp_error
        w,b = best_w,best_b
    return w, b

In [14]:
def adagrad():
    """Run AdaGrad (full batch) on the sigmoid neuron.

    Reads ``init_w``, ``init_b``, ``max_epochs``, ``X`` and ``Y`` from the
    enclosing (notebook) scope. Each parameter's step is scaled by the inverse
    square root of the running sum of its own squared gradients.

    Returns:
        tuple: the final ``(w, b)``. (Previously the result was discarded and
        the function returned ``None``.)
    """
    # Was bound to `etas` while the update read `eta` (NameError).
    w,b,eta = init_w,init_b,0.1
    v_w,v_b,eps = 0,0,1e-8
    for _ in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)

        # Per-parameter history of squared gradients.
        v_w = v_w + dw**2
        v_b = v_b + db**2  # was dw**2: the bias used the weight's history

        w = w - (eta/np.sqrt(v_w + eps))*dw
        b = b - (eta/np.sqrt(v_b + eps))*db
    return w, b

In [17]:
def rmsprop():
    """Run RMSProp (full batch) on the sigmoid neuron.

    Reads ``init_w``, ``init_b``, ``max_epochs``, ``X`` and ``Y`` from the
    enclosing (notebook) scope. Each parameter's step is scaled by the inverse
    square root of an exponentially decaying average of its squared gradients.

    Returns:
        tuple: the final ``(w, b)``. (Previously the result was discarded and
        the function returned ``None``.)
    """
    w,b,eta = init_w,init_b,0.1
    # `v_b` was previously initialised under the name `b_updates`, so the first
    # read of v_b raised UnboundLocalError.
    v_w,v_b,eps,beta1 = 0,0,1e-8,0.9
    for _ in range(max_epochs):
        # Was `dw,db=0`, which raises TypeError (cannot unpack an int).
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)

        # Decaying average of squared gradients.
        v_w = beta1*v_w + (1 - beta1)*dw**2
        v_b = beta1*v_b + (1 - beta1)*db**2

        w = w - (eta/np.sqrt(v_w + eps))*dw
        b = b - (eta/np.sqrt(v_b + eps))*db
    return w, b

In [19]:
def adam_optimizer():
    """Run Adam (full batch) on the sigmoid neuron.

    Reads ``init_w``, ``init_b``, ``max_epochs``, ``X`` and ``Y`` from the
    enclosing (notebook) scope. Keeps exponentially decaying averages of the
    gradient (``m``) and of the squared gradient (``v``), bias-corrects both
    for the current step and uses the corrected values for the update.

    Returns:
        tuple: the final ``(w, b)``. (Previously the result was discarded and
        the function returned ``None``.)
    """
    w,b,eta = init_w,init_b,0.1
    m_w,m_b,v_w,v_b = 0,0,0,0
    eps,beta1,beta2 = 1e-8,0.9,0.99
    for i in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)

        # Running first and second moments of the gradient.
        m_w = beta1*m_w + (1-beta1)*dw
        m_b = beta1*m_b + (1-beta1)*db

        v_w = beta2*v_w + (1-beta2)*(dw**2)
        v_b = beta2*v_b + (1-beta2)*(db**2)

        # Bias correction goes into separate variables: overwriting m/v (as
        # before) fed the corrected values back into the next epoch's averages.
        # `**` replaces math.pow, which needed a `math` import that no visible
        # cell provides.
        step = i + 1
        m_w_hat = m_w/(1 - beta1**step)
        m_b_hat = m_b/(1 - beta1**step)

        v_w_hat = v_w/(1 - beta2**step)
        v_b_hat = v_b/(1 - beta2**step)

        w = w - (eta/np.sqrt(v_w_hat + eps))*m_w_hat
        b = b - (eta/np.sqrt(v_b_hat + eps))*m_b_hat
    return w, b