## Load and Preprocess

In [1]:
from sklearn import datasets
import numpy

# Read the svmlight/libsvm-format file 'diabetes' from the working directory:
# the features come back sparse, the labels as a 1-D array.
x_sparse, y = datasets.load_svmlight_file('diabetes')
# Densify the features (todense() returns a numpy.matrix, so row slices
# taken from x later on remain 2-D).
x = x_sparse.todense()

print(f'Shape of x: {x.shape}')
print(f'Shape of y: {y.shape}')

Shape of x: (768, 8)
Shape of y: (768,)


In [2]:
# partition the data to training and test sets
n = x.shape[0]
n_train = 640
n_test = n - n_train

# Seed the global NumPy RNG so that Restart & Run All reproduces the same
# split (and therefore the same numbers in every later cell). Without a seed
# each run shuffles the rows differently.
numpy.random.seed(0)
rand_indices = numpy.random.permutation(n)
train_indices = rand_indices[0:n_train]
test_indices = rand_indices[n_train:n]

x_train = x[train_indices, :]
x_test = x[test_indices, :]
# labels become column vectors: one row per sample
y_train = y[train_indices].reshape(n_train, 1)
y_test = y[test_indices].reshape(n_test, 1)

print('Shape of x_train: ' + str(x_train.shape))
print('Shape of x_test: ' + str(x_test.shape))
print('Shape of y_train: ' + str(y_train.shape))
print('Shape of y_test: ' + str(y_test.shape))

Shape of x_train: (640, 8)
Shape of x_test: (128, 8)
Shape of y_train: (640, 1)
Shape of y_test: (128, 1)


In [3]:
# Standardization: shift and scale every feature column using the mean and
# standard deviation of the training set.
# NOTE(review): x_train and x_test are overwritten in place, so running this
# cell a second time re-standardizes already-standardized data.
import numpy

# calculate mu and sig using the training set only; the same statistics are
# reused for the test set below
d = x_train.shape[1]
mu = numpy.mean(x_train, axis=0).reshape(1, d)
sig = numpy.std(x_train, axis=0).reshape(1, d)

# transform the training features (the 1E-6 added to sig presumably guards
# against division by zero for a constant feature)
x_train = (x_train - mu) / (sig + 1E-6)

# transform the test features with the training-set mu and sig
x_test = (x_test - mu) / (sig + 1E-6)

# The test statistics printed below are only approximately 0 and 1, because
# mu and sig were estimated on the training rows.
print('test mean = ')
print(numpy.mean(x_test, axis=0))

print('test std = ')
print(numpy.std(x_test, axis=0))

test mean = 
[[0.02737791 0.15399625 0.07148086 0.02516393 0.12153415 0.20934169
  0.05839781 0.01043028]]
test std = 
[[0.99997612 0.98253508 1.03762608 1.06287525 1.12851295 0.87727793
  1.02666135 0.95764898]]


In [4]:
# Append a column of ones to each split (the usual way of folding an
# intercept term into the weight vector).
# Shapes are read before the extra column is attached, so d still counts
# only the original features after this cell.
n_train, d = x_train.shape
n_test, d = x_test.shape

x_train = numpy.concatenate([x_train, numpy.ones((n_train, 1))], axis=1)
x_test = numpy.concatenate([x_test, numpy.ones((n_test, 1))], axis=1)

print(f'Shape of x_train: {x_train.shape}')
print(f'Shape of x_test: {x_test.shape}')

Shape of x_train: (640, 9)
Shape of x_test: (128, 9)


## Stochastic GD

In [13]:
def stochastic_objective_gradient(w, xi, yi, lam):
    """Objective and gradient of L2-regularized logistic loss at one sample.

    Evaluates Q_i(w) = log(1 + exp(-yi * xi w)) + lam / 2 * ||w||^2 and its
    gradient with respect to w.

    Args:
        w: d-by-1 weight vector.
        xi: 1-by-d feature row of a single sample.
        yi: scalar label of that sample.
        lam: scalar regularization coefficient.

    Returns:
        (obj, g): obj is the scalar objective value, g the d-by-1 gradient.
    """
    yx = yi * xi  # 1-by-d
    # .item() pulls out the single entry of the product; calling float() on
    # an array with ndim > 0 is deprecated in recent NumPy releases.
    yxw = numpy.dot(yx, w).item()  # scalar margin

    # calculate objective function Q_i
    # logaddexp(0, -yxw) == log(1 + exp(-yxw)), but it does not overflow to
    # inf when the margin is a large negative number.
    loss = numpy.logaddexp(0.0, -yxw)  # scalar
    # numpy.square is element-wise for both ndarray and numpy.matrix inputs
    reg = lam / 2 * numpy.sum(numpy.square(w))  # scalar
    obj = loss + reg

    # calculate stochastic gradient
    # For a large positive margin exp(yxw) overflows to inf and the quotient
    # becomes 0, which is the correct limit, so only the warning is silenced.
    with numpy.errstate(over='ignore'):
        g_loss = -yx.T / (1 + numpy.exp(yxw))  # d-by-1
    g = g_loss + lam * w  # d-by-1

    return obj, g

In [66]:
# Inputs for a hand check of stochastic_objective_gradient: the first
# training sample, its label, and an all-zero weight vector.
n, d = x_train.shape
lam = 1E-6
xi = x_train[0, :] # 1-by-d matrix
# Index the single element directly: float() on the length-1 array
# y_train[0, :] is deprecated (array with ndim > 0 converted to a scalar).
yi = float(y_train[0, 0]) # scalar
w = numpy.zeros((d, 1))
xi, yi, w

(matrix([[-0.84036248, -0.22110404, -0.25380177,  0.92125727, -0.68894809,
           0.23557983,  0.22555012, -1.03259777,  1.        ]]),
 -1.0,
 array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]]))

In [67]:
# Evaluate the objective and gradient for the xi, yi, w and lam currently in
# the kernel; the cells that follow recompute the same quantities step by step.
stochastic_objective_gradient(w, xi, yi, lam)

(0.6931471805599453,
 matrix([[-0.42018124],
         [-0.11055202],
         [-0.12690088],
         [ 0.46062863],
         [-0.34447404],
         [ 0.11778992],
         [ 0.11277506],
         [-0.51629889],
         [ 0.5       ]]))

In [68]:
# Step 1 of the hand check: label-scaled features and the margin yi * xi w.
yx = yi * xi # 1-by-d matrix
# .item() extracts the only entry of the 1-by-1 product; float() on an array
# with ndim > 0 is deprecated in recent NumPy releases.
yxw = numpy.dot(yx, w).item() # scalar
yx, yxw

(matrix([[ 0.84036248,  0.22110404,  0.25380177, -0.92125727,  0.68894809,
          -0.23557983, -0.22555012,  1.03259777, -1.        ]]),
 0.0)

In [69]:
# Step 2 of the hand check: logistic loss plus L2 penalty.
# logaddexp(0, -yxw) equals log(1 + exp(-yxw)) but does not overflow to inf
# when yxw is a large negative number.
loss = numpy.logaddexp(0.0, -yxw) # scalar
reg = lam / 2 * numpy.sum(w * w) # scalar
obj = loss + reg
loss, reg, obj

(0.6931471805599453, 0.0, 0.6931471805599453)

In [70]:
# z = 1 / (1 + exp(yxw)): the scalar factor that multiplies -yx in the
# gradient of the loss term.
z = 1 / (1 + numpy.exp(yxw))
z

0.5

In [72]:
# Step 3 of the hand check: gradient of the loss term computed two ways.
g_loss = -yx.T / (1 + numpy.exp(yxw)) # d-by-1 matrix
g_loss_2 = z * -yx.T # same quantity via the precomputed factor z
g = g_loss + lam * w # d-by-1 matrix
# Dividing by (1 + exp(yxw)) and multiplying by its reciprocal may differ in
# the last floating-point bit, so compare with a tolerance instead of
# bit-for-bit equality.
print(numpy.allclose(g_loss, g_loss_2))
g_loss, g

True


(matrix([[-0.42018124],
         [-0.11055202],
         [-0.12690088],
         [ 0.46062863],
         [-0.34447404],
         [ 0.11778992],
         [ 0.11277506],
         [-0.51629889],
         [ 0.5       ]]),
 matrix([[-0.42018124],
         [-0.11055202],
         [-0.12690088],
         [ 0.46062863],
         [-0.34447404],
         [ 0.11778992],
         [ 0.11277506],
         [-0.51629889],
         [ 0.5       ]]))

## Mini-batch SGD

In [102]:
# Mini-batch inputs: the first b training rows, their labels as a b-by-1
# column, and an all-zero weight vector.
lam = 1E-6
b = 8
w = numpy.zeros((d, 1))
xi = x_train[:b, :]
yi = y_train[:b].reshape(b, 1)
xi, yi, w

(matrix([[-0.84036248, -0.22110404, -0.25380177,  0.92125727, -0.68894809,
           0.23557983,  0.22555012, -1.03259777,  1.        ],
         [ 0.05057952,  0.43532147,  0.16255999, -1.29792359, -0.68894809,
          -0.98796944, -0.58159803,  2.26284276,  1.        ],
         [ 0.34756018,  2.09201444,  0.37074087,  0.41401593,  1.15072785,
           1.48409963,  1.71543636,  1.67135343,  1.        ],
         [-0.54338181, -0.2523624 ,  0.31869565,  0.73104177, -0.68894809,
           0.49776936, -0.97303454, -1.03259777,  1.        ],
         [-0.24640115,  1.31055549, -0.87834441,  1.11147277, -0.68894809,
           0.68504733,  0.55629879, -0.77910235,  1.        ],
         [ 0.05057952, -0.12732897,  0.16255999, -0.53706158,  0.08424905,
          -1.20021768, -0.01720121,  0.31937783,  1.        ],
         [ 0.94152151, -0.40865419,  0.26665043, -1.29792359, -0.68894809,
          -0.26382784, -0.65138904, -0.18761302,  1.        ],
         [ 0.34756018, -1.47143835

In [103]:
# Element-wise product: the b-by-1 labels broadcast across the columns of the
# b-by-d feature block, so row i of yx is y_i * x_i.
yx = numpy.multiply(yi, xi)
# one margin y_i * x_i w per sample (b-by-1)
yxw = numpy.dot(yx, w)
yx, yxw

(matrix([[ 0.84036248,  0.22110404,  0.25380177, -0.92125727,  0.68894809,
          -0.23557983, -0.22555012,  1.03259777, -1.        ],
         [-0.05057952, -0.43532147, -0.16255999,  1.29792359,  0.68894809,
           0.98796944,  0.58159803, -2.26284276, -1.        ],
         [-0.34756018, -2.09201444, -0.37074087, -0.41401593, -1.15072785,
          -1.48409963, -1.71543636, -1.67135343, -1.        ],
         [-0.54338181, -0.2523624 ,  0.31869565,  0.73104177, -0.68894809,
           0.49776936, -0.97303454, -1.03259777,  1.        ],
         [ 0.24640115, -1.31055549,  0.87834441, -1.11147277,  0.68894809,
          -0.68504733, -0.55629879,  0.77910235, -1.        ],
         [ 0.05057952, -0.12732897,  0.16255999, -0.53706158,  0.08424905,
          -1.20021768, -0.01720121,  0.31937783,  1.        ],
         [-0.94152151,  0.40865419, -0.26665043,  1.29792359,  0.68894809,
           0.26382784,  0.65138904,  0.18761302, -1.        ],
         [ 0.34756018, -1.47143835

In [104]:
# Per-sample logistic losses (b-by-1). logaddexp(0, -yxw) equals
# log(1 + exp(-yxw)) element-wise but does not overflow to inf for large
# negative margins.
loss = numpy.logaddexp(0.0, -yxw)
reg = lam / 2 * numpy.sum(w * w)
# reg is added to each of the b losses before averaging, which works out to
# mean(loss) + reg
obj = (1/b) * numpy.sum(loss + reg)  # scalar
loss, reg, obj

(matrix([[0.69314718],
         [0.69314718],
         [0.69314718],
         [0.69314718],
         [0.69314718],
         [0.69314718],
         [0.69314718],
         [0.69314718]]),
 0.0,
 0.6931471805599453)

In [105]:
# Element-wise 1 / (1 + exp(yxw)): one gradient scaling factor per sample,
# with the same shape as yxw.
z = 1 / (1 + numpy.exp(yxw))
z

matrix([[0.5],
        [0.5],
        [0.5],
        [0.5],
        [0.5],
        [0.5],
        [0.5],
        [0.5]])

In [106]:
# Per-sample loss gradients: row i is z_i * (-y_i * x_i); z broadcasts across
# the feature columns.
g_loss = numpy.multiply(z, -yx)
g_loss

matrix([[-0.42018124, -0.11055202, -0.12690088,  0.46062863, -0.34447404,
          0.11778992,  0.11277506, -0.51629889,  0.5       ],
        [ 0.02528976,  0.21766074,  0.08128   , -0.64896179, -0.34447404,
         -0.49398472, -0.29079902,  1.13142138,  0.5       ],
        [ 0.17378009,  1.04600722,  0.18537044,  0.20700796,  0.57536393,
          0.74204981,  0.85771818,  0.83567672,  0.5       ],
        [ 0.27169091,  0.1261812 , -0.15934783, -0.36552088,  0.34447404,
         -0.24888468,  0.48651727,  0.51629889, -0.5       ],
        [-0.12320057,  0.65527775, -0.43917221,  0.55573638, -0.34447404,
          0.34252367,  0.2781494 , -0.38955117,  0.5       ],
        [-0.02528976,  0.06366448, -0.08128   ,  0.26853079, -0.04212452,
          0.60010884,  0.0086006 , -0.15968892, -0.5       ],
        [ 0.47076076, -0.20432709,  0.13332522, -0.64896179, -0.34447404,
         -0.13191392, -0.32569452, -0.09380651,  0.5       ],
        [-0.17378009,  0.73571918,  0.23099132, 

In [107]:
# Transpose so that each column holds one sample's gradient, then add the
# regularization gradient lam * w to every column (w broadcasts across them).
# NOTE(review): g_loss is rebound to its own transpose, so re-running this
# cell without re-running the previous one changes the shapes involved.
g_loss = g_loss.T + lam * w
g_loss.shape, g_loss.T

((9, 8),
 matrix([[-0.42018124, -0.11055202, -0.12690088,  0.46062863, -0.34447404,
           0.11778992,  0.11277506, -0.51629889,  0.5       ],
         [ 0.02528976,  0.21766074,  0.08128   , -0.64896179, -0.34447404,
          -0.49398472, -0.29079902,  1.13142138,  0.5       ],
         [ 0.17378009,  1.04600722,  0.18537044,  0.20700796,  0.57536393,
           0.74204981,  0.85771818,  0.83567672,  0.5       ],
         [ 0.27169091,  0.1261812 , -0.15934783, -0.36552088,  0.34447404,
          -0.24888468,  0.48651727,  0.51629889, -0.5       ],
         [-0.12320057,  0.65527775, -0.43917221,  0.55573638, -0.34447404,
           0.34252367,  0.2781494 , -0.38955117,  0.5       ],
         [-0.02528976,  0.06366448, -0.08128   ,  0.26853079, -0.04212452,
           0.60010884,  0.0086006 , -0.15968892, -0.5       ],
         [ 0.47076076, -0.20432709,  0.13332522, -0.64896179, -0.34447404,
          -0.13191392, -0.32569452, -0.09380651,  0.5       ],
         [-0.17378009,  0

In [100]:
# Average the per-sample gradients (one per column of g_loss) into a single
# mini-batch gradient. Divide by the batch size b rather than a literal 8 so
# the cell stays correct when b changes.
g = numpy.sum(g_loss, axis=1) / b
# Sanity check on the first component, computed from the first row of g_loss
# instead of hand-copied numbers, which go stale whenever the shuffle changes.
g1 = numpy.mean(g_loss[0, :])
g, g1

(matrix([[ 0.02488373],
         [ 0.31620393],
         [-0.02196674],
         [ 0.05967764],
         [-0.01946359],
         [ 0.15429946],
         [ 0.17896509],
         [ 0.19835687],
         [ 0.125     ]]),
 0.0248837325)

In [None]:
# One pass over the training set in mini-batches of size b.
# Batches are drawn from x_train / y_train (standardized, with the ones
# column) rather than the raw x / y, and each iteration takes its own
# [start, end) slice of a fresh shuffle instead of reusing the first b
# indices every time.
n_samples = x_train.shape[0]
iters = n_samples // b  # a trailing partial batch, if any, is dropped
batch_order = numpy.random.permutation(n_samples)
start = 0
for i in range(iters):
    end = start + b
    print(start, end)
    batch_indices = batch_order[start:end]
    xi = x_train[batch_indices, :]
    print(len(xi))
    yi = y_train[batch_indices].reshape((b, 1))
    start = end