## Load and Preprocess

In [1]:
from sklearn import datasets
import numpy

x_sparse, y = datasets.load_svmlight_file('diabetes')
x = x_sparse.todense()

print('Shape of x: ' + str(x.shape))
print('Shape of y: ' + str(y.shape))

Shape of x: (768, 8)
Shape of y: (768,)


In [2]:
# partition the data to training and test sets
n = x.shape[0]
n_train = 640
n_test = n - n_train

rand_indices = numpy.random.permutation(n)
train_indices = rand_indices[0:n_train]
test_indices = rand_indices[n_train:n]

x_train = x[train_indices, :]
x_test = x[test_indices, :]
y_train = y[train_indices].reshape(n_train, 1)
y_test = y[test_indices].reshape(n_test, 1)

print('Shape of x_train: ' + str(x_train.shape))
print('Shape of x_test: ' + str(x_test.shape))
print('Shape of y_train: ' + str(y_train.shape))
print('Shape of y_test: ' + str(y_test.shape))

Shape of x_train: (640, 8)
Shape of x_test: (128, 8)
Shape of y_train: (640, 1)
Shape of y_test: (128, 1)


In [3]:
# Standardization
import numpy

# calculate mu and sig using the training set
d = x_train.shape[1]
mu = numpy.mean(x_train, axis=0).reshape(1, d)
sig = numpy.std(x_train, axis=0).reshape(1, d)

# transform the training features
x_train = (x_train - mu) / (sig + 1E-6)

# transform the test features
x_test = (x_test - mu) / (sig + 1E-6)

print('test mean = ')
print(numpy.mean(x_test, axis=0))

print('test std = ')
print(numpy.std(x_test, axis=0))

test mean = 
[[ 0.06078954  0.0138906   0.23721259  0.05485782 -0.0182362   0.16715915
  -0.00124598  0.07095006]]
test std = 
[[0.99887728 0.97976865 0.89839885 1.12092429 1.018332   0.82670951
  0.83409264 1.04976618]]


In [4]:
n_train, d = x_train.shape
x_train = numpy.concatenate((x_train, numpy.ones((n_train, 1))), axis=1)

n_test, d = x_test.shape
x_test = numpy.concatenate((x_test, numpy.ones((n_test, 1))), axis=1)

print('Shape of x_train: ' + str(x_train.shape))
print('Shape of x_test: ' + str(x_test.shape))

Shape of x_train: (640, 9)
Shape of x_test: (128, 9)


## Stochastic GD

In [54]:
def stochastic_objective_gradient(w, xi, yi, lam):
    d = xi.shape[0]
    yx = yi * xi # 1-by-d matrix
    yxw = float(numpy.dot(yx, w)) # scalar
    
    # calculate objective function Q_i
    loss = numpy.log(1 + numpy.exp(-yxw)) # scalar
    reg = lam / 2 * numpy.sum(w * w) # scalar
    obj = loss + reg
    
    # calculate stochastic gradient
    g_loss = -yx.T / (1 + numpy.exp(yxw)) # d-by-1 matrix
    g = g_loss + lam * w # d-by-1 matrix
    
    return obj, g

In [55]:
lam = 1E-6
stepsize = 0.1
objval = 0

In [56]:
n, d = x_train.shape
xi = x_train[0, :] # 1-by-d matrix
yi = float(y_train[0, :]) # scalar
w1 = numpy.zeros((d, 1))
xi, yi, w1

(matrix([[-0.83481206,  1.6276372 ,  0.28947436, -1.30657715, -0.69807902,
           0.62291575, -1.12977615,  0.4203355 ,  1.        ]]),
 -1.0,
 array([[ 1.48076958e-02],
        [-7.95391065e-02],
        [-3.57784032e-02],
        [ 5.76488912e-02],
        [ 4.61795175e-02],
        [-4.33413730e-02],
        [ 9.25984864e-02],
        [-6.87438089e-02],
        [ 7.74521941e-05]]))

In [57]:
obj, g = stochastic_objective_gradient(w1, xi, yi, lam)
obj, g

(0.6931471805599453,
 matrix([[-0.41740603],
         [ 0.8138186 ],
         [ 0.14473718],
         [-0.65328857],
         [-0.34903951],
         [ 0.31145788],
         [-0.56488808],
         [ 0.21016775],
         [ 0.5       ]]))

In [58]:
w1 -= stepsize * g
objval += obj
w1

array([[ 0.0417406 ],
       [-0.08138186],
       [-0.01447372],
       [ 0.06532886],
       [ 0.03490395],
       [-0.03114579],
       [ 0.05648881],
       [-0.02101678],
       [-0.05      ]])

In [59]:
xi = x_train[1, :] # 1-by-d matrix
yi = float(y_train[1, :]) # scalar

In [60]:
obj, g = stochastic_objective_gradient(w1, xi, yi, lam)
obj, g

(0.694697335876756,
 matrix([[ 0.26932907],
         [-0.01842753],
         [ 0.21304685],
         [ 0.07679966],
         [-0.11275566],
         [ 0.12195585],
         [-0.36109679],
         [ 0.47727034],
         [-0.50077452]]))

In [61]:
w1 -= stepsize * g
objval += obj
objval /= 2
w1, objval

(array([[ 1.48076958e-02],
        [-7.95391065e-02],
        [-3.57784032e-02],
        [ 5.76488912e-02],
        [ 4.61795175e-02],
        [-4.33413730e-02],
        [ 9.25984864e-02],
        [-6.87438089e-02],
        [ 7.74521941e-05]]),
 0.6939222582183506)

## Mini-batch SGD

In [72]:
# Calculate the objective Q_I and the gradient of Q_I
# Inputs:
#     w: d-by-1 matrix
#     xi: b-by-d matrix
#     yi: b-by-1 matrix
#     lam: scalar, the regularization parameter
#     b: integer, the batch size
# Return:
#     obj: scalar, the objective Q_i
#     g: d-by-1 matrix, gradient of Q_i
def mb_stochastic_objective_gradient(w, xi, yi, lam, b):
    # Fill the function
    # Follow the implementation of stochastic_objective_gradient
    # Use matrix-vector multiplication; do not use FOR LOOP of vector-vector multiplications
    n,d = xi.shape
    yx = numpy.multiply(yi, xi)  # b by d matrix
    yxw = numpy.dot(yx, w)  # b x 1 vector

    # Solve objective function Q_i
    loss = numpy.log(1 + numpy.exp(-yxw)) # b x 1 vector
    reg = (lam / 2) * numpy.sum(w * w)  # scalar
    obj = (1/b) * numpy.sum(loss + reg)  # scalar

    # Calculate gradient
    z = (1 / (1 + numpy.exp(yxw)))  # b x 1 vector
    g_loss = numpy.multiply(z, -yx).T  # b x d matrix
    g = (1/b) * numpy.sum(g_loss, axis=1) # d x 1 vector
    g += (lam * w)
    return obj, g

In [73]:
lam = 1E-6
b = 1
start = 0
mbobjval = 0

In [74]:
end = start + b
xi = x_train[0:b, :]
yi = y_train[0:b, :]
w = numpy.zeros((d,1))
xi, yi, w

(matrix([[-0.83481206,  1.6276372 ,  0.28947436, -1.30657715, -0.69807902,
           0.62291575, -1.12977615,  0.4203355 ,  1.        ]]),
 array([[-1.]]),
 array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]]))

In [75]:
mbobj, mbg = mb_stochastic_objective_gradient(w, xi, yi, lam, b)
mbobj, mbg

(0.6931471805599453,
 matrix([[-0.41740603],
         [ 0.8138186 ],
         [ 0.14473718],
         [-0.65328857],
         [-0.34903951],
         [ 0.31145788],
         [-0.56488808],
         [ 0.21016775],
         [ 0.5       ]]))

In [76]:
w -= stepsize * mbg
mbobjval += mbobj
w, mbobj

(array([[ 0.0417406 ],
        [-0.08138186],
        [-0.01447372],
        [ 0.06532886],
        [ 0.03490395],
        [-0.03114579],
        [ 0.05648881],
        [-0.02101678],
        [-0.05      ]]),
 0.6931471805599453)

In [77]:
start = end
end = start + b
xi = x_train[start:end, :]
yi = y_train[start:end, :]
xi, yi, w

(matrix([[-0.537825  ,  0.03679791, -0.42543476, -0.15336164,  0.22516264,
          -0.24353455,  0.72107678, -0.95306448,  1.        ]]),
 array([[1.]]),
 array([[ 0.0417406 ],
        [-0.08138186],
        [-0.01447372],
        [ 0.06532886],
        [ 0.03490395],
        [-0.03114579],
        [ 0.05648881],
        [-0.02101678],
        [-0.05      ]]))

In [78]:
mbobj, mbg = mb_stochastic_objective_gradient(w, xi, yi, lam, b)
mbobj, mbg

(0.694697335876756,
 matrix([[ 0.26932907],
         [-0.01842753],
         [ 0.21304685],
         [ 0.07679966],
         [-0.11275566],
         [ 0.12195585],
         [-0.36109679],
         [ 0.47727034],
         [-0.50077452]]))

In [79]:
w -= stepsize * mbg
mbobjval += mbobj
mbobjval /= 2
w, mbobjval

(array([[ 1.48076958e-02],
        [-7.95391065e-02],
        [-3.57784032e-02],
        [ 5.76488912e-02],
        [ 4.61795175e-02],
        [-4.33413730e-02],
        [ 9.25984864e-02],
        [-6.87438089e-02],
        [ 7.74521941e-05]]),
 0.6939222582183506)

In [82]:
g, mbg

(matrix([[ 0.26932907],
         [-0.01842753],
         [ 0.21304685],
         [ 0.07679966],
         [-0.11275566],
         [ 0.12195585],
         [-0.36109679],
         [ 0.47727034],
         [-0.50077452]]),
 matrix([[ 0.26932907],
         [-0.01842753],
         [ 0.21304685],
         [ 0.07679966],
         [-0.11275566],
         [ 0.12195585],
         [-0.36109679],
         [ 0.47727034],
         [-0.50077452]]))

In [81]:
mbobjval == objval, numpy.array_equal(g, mbg), numpy.array_equal(w, w1)

(True, False, False)

In [37]:
g_loss = numpy.multiply(z, -yx)
g_loss

matrix([[-1.04582673,  0.57944573, -0.11684842,  0.63770399,  0.3453668 ,
          0.11338537,  0.26408208, -0.07607304, -0.5       ],
        [ 0.28779336, -0.31623852,  0.30181073, -0.41665653, -0.21050974,
          0.40804039, -0.34509038,  0.39744967, -0.5       ],
        [-0.15674667, -0.01234565, -0.43084279,  0.63770399,  0.3453668 ,
         -0.17500046,  0.27934956,  0.010022  , -0.5       ],
        [ 0.28779336,  0.4674852 ,  0.45880791,  0.63770399,  0.3453668 ,
          0.11338537,  0.50683501,  0.4835447 , -0.5       ],
        [-0.00856666,  0.17958669, -0.16918082,  0.01749192, -0.08891175,
          0.21996272,  0.54195022,  0.26830711, -0.5       ],
        [ 0.00856666,  1.00399607,  0.22151321,  0.57170955,  0.85758477,
          0.31919328, -0.31904501, -0.09611704,  0.5       ],
        [ 0.28779336,  0.85134988, -0.01218364, -0.35463533,  0.05874296,
          0.43311741,  0.43660461,  0.35440215, -0.5       ],
        [ 0.13961335,  0.75538371,  0.04014876, 

In [38]:
g_loss = g_loss.T + lam * w
numpy.array_equal(g_loss.T, mat)

True

In [20]:
g = numpy.average(g_loss, axis=1)
g1 = (-0.42018124,  0.02528976,  0.17378009,  0.27169091, -0.12320057, -0.02528976,  0.47076076, -0.17378009)
g1 = sum(g1) / 8
g, g1

(matrix([[-0.02494749],
         [ 0.43858289],
         [ 0.03665312],
         [ 0.18755359],
         [ 0.22536851],
         [ 0.1963183 ],
         [ 0.20493193],
         [ 0.22275409],
         [-0.375     ]]),
 0.0248837325)

In [21]:
iters = int(n / b)
start = 0
for i in range(iters):
    end = start + b
    print(start, end)
    xi = x[rand_indices[0:b], :]
    print(len(xi))
    yi = y[rand_indices[0:b]].reshape((b, 1))
    start = end

0 8
8
8 16
8
16 24
8
24 32
8
32 40
8
40 48
8
48 56
8
56 64
8
64 72
8
72 80
8
80 88
8
88 96
8
96 104
8
104 112
8
112 120
8
120 128
8
128 136
8
136 144
8
144 152
8
152 160
8
160 168
8
168 176
8
176 184
8
184 192
8
192 200
8
200 208
8
208 216
8
216 224
8
224 232
8
232 240
8
240 248
8
248 256
8
256 264
8
264 272
8
272 280
8
280 288
8
288 296
8
296 304
8
304 312
8
312 320
8
320 328
8
328 336
8
336 344
8
344 352
8
352 360
8
360 368
8
368 376
8
376 384
8
384 392
8
392 400
8
400 408
8
408 416
8
416 424
8
424 432
8
432 440
8
440 448
8
448 456
8
456 464
8
464 472
8
472 480
8
480 488
8
488 496
8
496 504
8
504 512
8
512 520
8
520 528
8
528 536
8
536 544
8
544 552
8
552 560
8
560 568
8
568 576
8
576 584
8
584 592
8
592 600
8
600 608
8
608 616
8
616 624
8
624 632
8
632 640
8
