In [None]:
import numpy as np

In [None]:
# Toy regression data: one row per sample, columns are the features x1 and x2.
x = np.array([
    [0, 1],
    [2, 6],
    [3, 8],
])
# One target value per row of x.
y = np.array([1, 1, 4])

# Design matrix with a leading column of ones (bias/intercept term):
# start from an all-ones float matrix and copy the features into columns 1..d.
x_b = np.ones((x.shape[0], x.shape[1] + 1))
x_b[:, 1:] = x

In [None]:
def cost_function(theta, x, y):
    """Return the halved mean squared error of the linear model ``x.dot(theta)``.

    Computes ``J = 1 / (2 * N) * sum((x.dot(theta) - y) ** 2)`` where ``N`` is
    the number of rows of ``x``.

    Parameters
    ----------
    theta : ndarray
        Model weights, e.g. shape ``(d,)`` or ``(d, 1)``.
    x : ndarray, shape (N, d)
        Design matrix, one row per sample.
    y : array-like
        Targets with one value per prediction; a flat ``(N,)`` vector and a
        ``(N, 1)`` column are both accepted regardless of the layout of
        ``theta``.

    Returns
    -------
    scalar
        The cost ``J`` for the given ``theta``.
    """
    y_hat = x.dot(theta)
    y = np.asarray(y)
    # Subtracting a flat (N,) vector from an (N, 1) column (or vice versa)
    # would broadcast to an (N, N) matrix and silently inflate the cost, so
    # give the targets the same layout as the predictions first.
    if y.shape != y_hat.shape and y.size == y_hat.size:
        y = y.reshape(y_hat.shape)
    c = (1 / (2 * x.shape[0])) * np.sum((y_hat - y) ** 2)  # x.shape[0] is N
    return c

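**Stochastic Gradient Descent (SGD)** — update `theta` after every single training sample, visiting the samples in a freshly shuffled order each epoch.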

In [None]:
def stochastic_gradient_descent(alpha, x, y, ep=0.001, max_iter=10000):
    """Fit linear-regression weights with per-sample (stochastic) gradient descent.

    Every epoch visits the samples in a fresh random order and updates
    ``theta`` after each single sample. After every update the cost over the
    whole data set is re-evaluated with ``cost_function``; training stops as
    soon as that cost changes by at most ``ep`` between two consecutive
    updates, or once ``max_iter`` updates have been performed.

    Parameters
    ----------
    alpha : float
        Learning rate.
    x : array-like, shape (N, d)
        Design matrix, one row per sample (include a column of ones if an
        intercept is wanted).
    y : array-like, N values
        Targets; a flat sequence or an ``(N, 1)`` column.
    ep : float, optional
        Convergence threshold on the absolute change of the cost.
    max_iter : int, optional
        Upper bound on the number of single-sample updates.

    Returns
    -------
    ndarray, shape (d, 1)
        The fitted weights.

    Raises
    ------
    ValueError
        If ``x`` is not 2-D, contains no samples, or ``y`` does not hold one
        target per row of ``x``.

    Notes
    -----
    The initial weights and the shuffling come from NumPy's global random
    state (``np.random``). Progress information is printed to stdout.
    """
    x = np.asarray(x)
    # A column layout keeps residuals (N, 1) so they never broadcast against
    # the (N, 1) predictions when the cost is evaluated.
    y = np.asarray(y).reshape(-1, 1)

    if x.ndim != 2:
        raise ValueError("x must be a 2-D array of shape (N, d)")
    N = x.shape[0]  # number of samples
    if N == 0:
        # Without samples no update ever happens and the loop would never end.
        raise ValueError("x must contain at least one sample")
    if y.shape[0] != N:
        raise ValueError("y must contain exactly one target per row of x")

    converged = False
    n_updates = 0
    print("Num of data =", N)

    # initial theta
    theta = np.random.random((x.shape[1], 1))
    print("Init theta.shape =", theta.shape)

    # total error, J(theta)
    J = cost_function(theta, x, y)
    print("First J =", J)

    while not converged:
        # Shuffle the data at the beginning of each epoch (x and y together,
        # so rows and targets stay paired).
        indices = np.random.permutation(N)
        x = x[indices]
        y = y[indices]

        for i in range(N):
            # Select one training example as a (1, d) row and a (1, 1) target.
            x_i = x[i].reshape(1, -1)
            y_i = y[i].reshape(-1, 1)

            # Gradient of the squared error of this single sample.
            diff = x_i.dot(theta) - y_i
            grad = x_i.T.dot(diff)

            # Update the parameters
            theta = theta - alpha * grad

            assert theta.shape == (x.shape[1], 1)

            # Compute the cost over the whole data set with the updated theta
            J2 = cost_function(theta, x, y)

            # Check for convergence
            if abs(J - J2) <= ep:
                print("       Converged, iterations: ", n_updates, "/", max_iter)
                converged = True
                break

            J = J2
            n_updates += 1

            # ">=" (not "==") so that max_iter < 1 cannot loop without bound.
            if n_updates >= max_iter:
                print('       Max iterations exceeded!')
                converged = True
                break

    return theta


In [None]:
# Driver cell: train the linear model with stochastic gradient descent on the
# toy data and predict one unseen point.
if __name__ == '__main__':
    print("start main")
    print(x_b.shape)
    # Targets as an (N, 1) column so they line up with the (N, 1) predictions.
    y = y.reshape(-1, 1)
    print(y.shape)

    # The trainer draws its initial theta and its shuffling order from NumPy's
    # global RNG; fix the seed so re-running this cell reproduces the result.
    np.random.seed(0)

    alpha = 0.01  # learning rate
    # Training process
    theta = stochastic_gradient_descent(alpha, x_b, y, ep=1e-12, max_iter=1_000_000)
    print("Theta =", theta)

    # Predict an unseen sample; it needs the same leading bias column as x_b.
    xtest = np.array([[4, 9]])
    xtest_b = np.c_[np.ones((xtest.shape[0], 1)), xtest]
    y_p = xtest_b.dot(theta)
    print("y predict =", y_p)

start main
(3, 3)
(3, 1)
Num of data = 3
Init theta.shape = (3, 1)
First J = 5.9193094502980985
       Converged, iterations:  202044 / 1000000
Theta = [[ 6.99670569]
 [14.99290677]
 [-5.99700918]]
y predict = [[12.99525014]]


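**Mini-Batch Gradient Descent** — update `theta` from the averaged gradient of a small batch of samples, re-shuffling the data at the start of every epoch.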

In [None]:
def mini_batch_gradient_descent(alpha, x, y, batch_size=2, ep=0.001, max_iter=10000):
    """Fit linear-regression weights with mini-batch gradient descent.

    Every epoch shuffles the data, walks through it in consecutive slices of
    ``batch_size`` rows and updates ``theta`` with the gradient averaged over
    each slice. At the end of each epoch the cost over the whole data set is
    evaluated with ``cost_function``; training stops when that cost changes
    by at most ``ep`` between two consecutive epochs, or after ``max_iter``
    epochs.

    Parameters
    ----------
    alpha : float
        Learning rate.
    x : array-like, shape (N, d)
        Design matrix, one row per sample (include a column of ones if an
        intercept is wanted).
    y : array-like, N values
        Targets; a flat sequence or an ``(N, 1)`` column.
    batch_size : int, optional
        Number of rows per mini-batch. The final batch of an epoch is shorter
        when ``N`` is not a multiple of ``batch_size``.
    ep : float, optional
        Convergence threshold on the absolute change of the cost.
    max_iter : int, optional
        Upper bound on the number of epochs.

    Returns
    -------
    ndarray, shape (d, 1)
        The fitted weights.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1, ``x`` is not 2-D or has no
        samples, or ``y`` does not hold one target per row of ``x``.

    Notes
    -----
    The initial weights and the shuffling come from NumPy's global random
    state (``np.random``). Progress information is printed to stdout.
    """
    if batch_size < 1:
        # A non-positive step would yield no batches at all (or an obscure
        # range() error), so no learning would take place.
        raise ValueError("batch_size must be at least 1")

    x = np.asarray(x)
    # A column layout keeps residuals (n, 1) so they never broadcast against
    # the (n, 1) predictions.
    y = np.asarray(y).reshape(-1, 1)

    if x.ndim != 2:
        raise ValueError("x must be a 2-D array of shape (N, d)")
    N = x.shape[0]  # number of samples
    if N == 0:
        raise ValueError("x must contain at least one sample")
    if y.shape[0] != N:
        raise ValueError("y must contain exactly one target per row of x")

    converged = False
    epoch = 0
    print("Num of data =", N)

    # initial theta
    theta = np.random.random((x.shape[1], 1))
    print("Init theta.shape =", theta.shape)

    # total error, J(theta)
    J = cost_function(theta, x, y)
    print("First J =", J)

    while not converged:
        # Shuffle the data at the beginning of each epoch (x and y together,
        # so rows and targets stay paired).
        indices = np.random.permutation(N)
        x = x[indices]
        y = y[indices]

        for start in range(0, N, batch_size):
            # Select mini-batch
            x_batch = x[start:start + batch_size]
            y_batch = y[start:start + batch_size]

            # Residuals of the current batch
            diff = x_batch.dot(theta) - y_batch

            # Average over the rows actually in this batch: the last batch of
            # an epoch can be shorter than batch_size, and dividing by the
            # nominal size would shrink its update.
            grad = x_batch.T.dot(diff) / x_batch.shape[0]

            # Update the parameters
            theta = theta - alpha * grad

            assert theta.shape == (x.shape[1], 1)

        # Compute the cost with the updated theta
        J2 = cost_function(theta, x, y)

        # Check for convergence
        if abs(J - J2) <= ep:
            print("       Converged, iterations: ", epoch, "/", max_iter)
            converged = True

        J = J2
        epoch += 1

        # ">=" (not "==") so that max_iter < 1 still terminates; skipped when
        # the run has just converged so only one status line is printed.
        if not converged and epoch >= max_iter:
            print('       Max iterations exceeded!')
            converged = True

    return theta


In [None]:
# Driver cell: train the linear model with mini-batch gradient descent on the
# toy data and predict one unseen point.
if __name__ == '__main__':
    print("start main")
    print(x_b.shape)
    # Targets as an (N, 1) column so they line up with the (N, 1) predictions.
    y = y.reshape(-1, 1)
    print(y.shape)

    # The trainer draws its initial theta and its shuffling order from NumPy's
    # global RNG; fix the seed so re-running this cell reproduces the result.
    np.random.seed(0)

    alpha = 0.01  # learning rate
    # Training process
    theta = mini_batch_gradient_descent(alpha, x_b, y, batch_size=2, ep=1e-12, max_iter=1_000_000)
    print("Theta =", theta)

    # Predict an unseen sample; it needs the same leading bias column as x_b.
    xtest = np.array([[4, 9]])
    xtest_b = np.c_[np.ones((xtest.shape[0], 1)), xtest]
    y_p = xtest_b.dot(theta)
    print("y predict =", y_p)

start main
(3, 3)
(3, 1)
Num of data = 3
Init theta.shape = (3, 1)
First J = 7.514786557797791
       Converged, iterations:  137180 / 1000000
Theta = [[ 6.99303046]
 [14.98478705]
 [-5.99354833]]
y predict = [[12.99024366]]
