### Project
#### Author: Zhili Zhang - Net ID: zz2382

In [1]:
import numpy as np
import numpy.random as random
import itertools
import warnings
from math import exp
from model import Model
warnings.filterwarnings('ignore')

In [2]:
# Input data X: the four corner points (+/-1, +/-1), one sample per row
X = np.array([[1, 1],
              [1, -1],
              [-1, 1],
              [-1, -1]])

# Input target Y: one label per row of X
Y = np.array([-1, 1, 1, -1])

In [3]:
# Initial weights.
# w01 is (3, 4) and w12 is (5, 1); presumably row 0 of each holds the bias
# weights, matching Model(N0=2, N1=4, N2=1) below -- confirm against model.Model.
# Fix: the first entry was typed as `-0,3378` (comma instead of a decimal point),
# which produced a 5-element first row and therefore a ragged array.
w01 = np.array([[-0.3378, 0.2771, 0.2859, -0.3329],
                [0.1970, 0.3191, -0.1448, 0.3594],
                [0.3099, 0.1904, -0.0347, -0.4861]])
w12 = np.array([[-0.1401], [0.4919], [-0.2913],
                [-0.3979], [0.3581]])

In [4]:
# Build the network from the local `model` module.
# NOTE(review): N0/N1/N2 look like the layer sizes and x0 like the bias input;
# confirm the meaning of every keyword against model.Model.
model = Model(
    N0=2,
    N1=4,
    N2=1,
    x0=1.0,
    lr=0.2,
    tol=0.05,
    zeta=1,
    epoch_thres=1000,
    cost='L2',
    transfer='tansig',
)

In [None]:
# Train the Model instance built above on the four samples in X / Y.
# NOTE(review): this cell carries no execution count, so it was not run in the saved notebook.
model.train(X, Y)

#### We define transfer function, train, predict and mse as below.

In [5]:
# transfer function (tansig is the hyperbolic tangent) and its derivative.
# They are defined here because predict/train use them as default arguments;
# without them the `def` statements below raise NameError on a fresh kernel.
def tansig(n):
    """Hyperbolic-tangent sigmoid of the net input ``n``; output lies in [-1, 1]."""
    return np.tanh(n)


def d_tansig(n):
    """Derivative of ``tansig`` with respect to the net input ``n``: 1 - tanh(n)**2."""
    return 1.0 - np.tanh(n) ** 2


# predict function for classification only
def predict(X, w1, w2, transfer=tansig):
    """Run the forward pass of the two-layer network on every row of X.

    Parameters
    ----------
    X : array-like, (n_samples, n_rows_of_w1)
        Inputs, multiplied directly by ``w1``. No bias column is added to X
        here, so X must already match the number of rows of ``w1``.
    w1 : array-like, (n_rows_of_w1, n_hidden)
        Input-to-hidden weights.
    w2 : array-like, first dimension n_hidden + 1
        Hidden-to-output weights; entry 0 multiplies the constant 1 that is
        prepended to the hidden activations (i.e. the output bias).
    transfer : callable
        Scalar activation function; it is wrapped with ``np.vectorize`` so it
        may be a plain Python function.

    Returns
    -------
    ndarray
        Network output for each sample.
    """
    transfer = np.vectorize(transfer)
    a1 = transfer(np.dot(X, w1))
    # prepend a column of ones so that w2[0] acts as the output-layer bias
    bias = np.ones((a1.shape[0], 1))
    a1 = np.concatenate([bias, a1], axis=-1)
    return transfer(np.dot(a1, w2))

In [6]:
# compute the mean squared error
def mse(ys, T):
    """Return the squared error between ``ys`` and ``T`` summed and divided by ``len(ys)``.

    Parameters
    ----------
    ys : array-like
        Predictions; the first axis indexes the samples.
    T : array-like
        Targets.

    Notes
    -----
    When the two shapes differ only by singleton axes (for example a column
    vector ``(n, 1)`` against a flat vector ``(n,)``), ``T`` is reshaped to
    ``ys.shape`` first. Without this, NumPy broadcasting would expand the
    difference to ``(n, n)`` and silently return a wrong error.
    """
    ys = np.asarray(ys)
    T = np.asarray(T)
    if ys.shape != T.shape and np.squeeze(ys).shape == np.squeeze(T).shape:
        T = T.reshape(ys.shape)
    return np.sum((ys - T) ** 2) / ys.shape[0]

In [7]:
# X:(4,3), Y(4), w1:(3,2)[[b1,b2], [w11,w12], [w21,w22]], w2:(3,)[b,w1,w2]
#
def train(X, Y, w1, w2, lr, tol=0.001, transfer=tansig, d_transfer=d_tansig,
          max_iter=10000):
    """Train the two-layer network with per-sample gradient descent (backpropagation).

    Every iteration draws one random sample, runs the forward pass, computes
    the layer sensitivities for a squared-error cost and updates both weight
    arrays, until the stopping rule below fires or ``max_iter`` is reached.

    Parameters
    ----------
    X : ndarray, (n_samples, n_rows_of_w1)
        Inputs, multiplied directly by ``w1``. Per the shape note above, row 0
        of ``w1`` holds the hidden biases, so column 0 of X is presumably a
        constant 1 -- confirm against the data actually passed in.
    Y : ndarray, (n_samples,)
        Targets.
    w1 : ndarray, (n_rows_of_w1, n_hidden)
        Initial input-to-hidden weights (copied, never modified in place).
    w2 : ndarray, (n_hidden + 1,)
        Initial hidden-to-output weights, bias first (copied).
    lr : float
        Learning rate.
    tol : float
        Threshold on the absolute value of every gradient entry (before it is
        scaled by ``lr``).
    transfer, d_transfer : callable
        Scalar activation and its derivative with respect to the net input;
        both are wrapped with ``np.vectorize``.
    max_iter : int
        Upper bound on the number of iterations.

    Returns
    -------
    (w1, w2, j)
        Trained weights and the number of iterations performed.
    """
    w1 = np.copy(w1)
    w2 = np.copy(w2)
    transfer = np.vectorize(transfer)
    # Fix: use the derivative that was passed in. The original always
    # vectorized the global d_tansig and ignored this parameter.
    d_transfer = np.vectorize(d_transfer)
    # Fix: sample from however many rows X has instead of a hard-coded 4.
    n_samples = len(X)

    # converge flag, when delta is lower than tolerance, then converge True
    converge = False
    counter = 0  # consecutive iterations whose deltas were all below tol
    j = 0  # number of iterations

    while (not converge) and (j < max_iter):
        # length-1 index array, so a0 is (1, n_rows_of_w1) and Y[i] is (1,)
        i = random.choice(n_samples, 1)
        a0 = X[i]

        # forward pass
        n1 = np.dot(a0, w1).reshape(-1)           # hidden net inputs
        a1 = transfer(n1)
        b_a1 = np.concatenate([[1], a1], axis=0)  # hidden activations with bias input
        n2 = np.dot(b_a1, w2)
        a2 = transfer(n2)

        # backward pass: sensitivities of the output and hidden layers
        s2 = (a2 - Y[i]) * d_transfer(n2)
        # s1i = f'(n1i)*w1i*s2
        s1 = d_transfer(n1) * w2[1:] * s2

        # w2_ij = w2_ij - lr * a1_i * s2_j
        # b2_j  = b2_j - lr * s2_j
        delta_2 = b_a1 * s2
        # w1_ij = w1_ij - lr * a0_i * s1_j
        # b1_j  = b1_j - lr * s1_j
        delta_1 = a0.reshape(-1, 1) * s1.reshape(1, -1)

        # check if every entry in delta is below the tolerance
        if np.all(np.absolute(delta_1) < tol) and \
                np.all(np.absolute(delta_2) < tol):
            counter += 1
        else:
            counter = 0
        # the converge condition satisfied when 5 consecutive iters has
        # delta less than tolerance
        converge = counter >= 5
        if not converge:
            w2 = w2 - lr * delta_2
            w1 = w1 - lr * delta_1

        j += 1

    return w1, w2, j

#### Note for train method

The 'converge' flag in my code is a custom (hand-rolled) stopping criterion, which is initialised to 'False'.

The parameter `tol` (tolerance) serves as a threshold on the deltas, i.e. the gradient entries before they are scaled by the learning rate. If all entries of both deltas have an absolute value less than the tolerance, I count the current iteration as a 'converged iteration'.

Only after 5 consecutive 'converged iterations' do we set the flag to True and exit the training while-loop.

### Experiment with fixed learning_rate

I designed the experiments below. Each experiment uses a different combination of learning rate and tolerance. For each experiment, one set of random initial weights is drawn and the training is run 20 times from it (the runs differ in the random order in which samples are presented). We want to see in how many of the 20 runs the training reaches a 'good' result, and in how many it even reaches a 'great' result.

We set the standard of 'good' as $mse < 0.01$. We set the standard of 'great' as $mse < 0.001$

We also present the average mse and average converging steps required for each training process.

In [8]:
# Fixed learning rate experiment: alpha = 2, tolerance = 1e-2
alpha, tol = 2., 1e-2
# one random starting point in [-0.5, 0.5), shared by all 20 runs below
w1 = np.random.rand(3,2) - 0.5
w2 = np.random.rand(3) - 0.5

good = great = 0
steps, errs = [], []
for i in range(20):
    tw1, tw2, step = train(X, Y, w1, w2, alpha, tol=tol)
    y_pred = predict(X, tw1, tw2)
    err = mse(y_pred, Y)
    steps.append(step)
    errs.append(err)
    # 'good' means mse < 1e-2; 'great' additionally requires mse < 1e-3
    is_good = err < 1e-2
    is_great = is_good and err < 1e-3
    good += int(is_good)
    great += int(is_great)
    print("Iteration: {0}; err: {1:.4f}; y_pred: {2}".format(i, err, str(y_pred)))
print("lr: {}; tol: {}; Num Good: {}; Num Great: {}".format(alpha, str(tol), good, great))
print("Mean Err: {}; mean steps required: {}".format(np.mean(errs), np.mean(steps)))

Iteration: 0; err: 0.3291; y_pred: [-0.1374507  -0.96572327 -0.93708932  0.86769724]
Iteration: 1; err: 0.0123; y_pred: [ 0.86337    -0.92668873 -0.91737658  0.8640729 ]
Iteration: 2; err: 0.9118; y_pred: [-0.90564771 -0.95044103 -0.90190383  0.93944545]
Iteration: 3; err: 0.9883; y_pred: [ 0.92283478 -0.91038136  0.98278757  0.91258493]
Iteration: 4; err: 0.0162; y_pred: [ 0.87792137 -0.92524576 -0.91806773  0.80584675]
Iteration: 5; err: 0.0083; y_pred: [ 0.91361685 -0.88984608 -0.91413077  0.92119003]
Iteration: 6; err: 0.0128; y_pred: [ 0.85367251 -0.92217385 -0.91394946  0.8725187 ]
Iteration: 7; err: 0.0080; y_pred: [ 0.91183518 -0.90302651 -0.91132075  0.91745892]
Iteration: 8; err: 0.0075; y_pred: [ 0.90417823 -0.92520698 -0.93418092  0.89514346]
Iteration: 9; err: 0.0105; y_pred: [ 0.90591277 -0.91659638 -0.92137204  0.85913092]
Iteration: 10; err: 0.0100; y_pred: [ 0.88201224 -0.93742618 -0.9146069   0.87868409]
Iteration: 11; err: 0.0115; y_pred: [ 0.84900325 -0.91485859 -0.

In [9]:
# Fixed learning rate experiment: alpha = 2, tolerance = 1e-3
alpha, tol = 2., 1e-3
# one random starting point in [-0.5, 0.5), shared by all 20 runs below
w1 = np.random.rand(3,2) - 0.5
w2 = np.random.rand(3) - 0.5

good = great = 0
steps, errs = [], []
for i in range(20):
    tw1, tw2, step = train(X, Y, w1, w2, alpha, tol=tol)
    y_pred = predict(X, tw1, tw2)
    err = mse(y_pred, Y)
    steps.append(step)
    errs.append(err)
    # 'good' means mse < 1e-2; 'great' additionally requires mse < 1e-3
    is_good = err < 1e-2
    is_great = is_good and err < 1e-3
    good += int(is_good)
    great += int(is_great)
    print("Iteration: {0}; err: {1:.4f}; y_pred: {2}".format(i, err, str(y_pred)))
print("lr: {}; tol: {}; Num Good: {}; Num Great: {}".format(alpha, str(tol), good, great))
print("Mean Err: {}; mean steps required: {}".format(np.mean(errs), np.mean(steps)))

Iteration: 0; err: 0.0025; y_pred: [ 0.9448515  -0.95357364 -0.97066529  0.93588835]
Iteration: 1; err: 0.0015; y_pred: [ 0.97078674 -0.95648644 -0.95333756  0.96941368]
Iteration: 2; err: 0.7554; y_pred: [ 0.76241633  0.7216027  -0.97100059  0.97685362]
Iteration: 3; err: 0.0012; y_pred: [ 0.95829069 -0.97105067 -0.96867291  0.96235834]
Iteration: 4; err: 0.0013; y_pred: [ 0.96908013 -0.95802075 -0.96033054  0.96950708]
Iteration: 5; err: 0.0014; y_pred: [ 0.95972456 -0.97012544 -0.96978238  0.9544192 ]
Iteration: 6; err: 0.0013; y_pred: [ 0.9575887  -0.97021851 -0.97049665  0.95939925]
Iteration: 7; err: 0.0014; y_pred: [ 0.97114192 -0.95478121 -0.9582561   0.96874081]
Iteration: 8; err: 0.0014; y_pred: [ 0.9583858  -0.96880711 -0.96916305  0.95659243]
Iteration: 9; err: 0.0016; y_pred: [ 0.95250719 -0.97498974 -0.96889032  0.94989263]
Iteration: 10; err: 0.0013; y_pred: [ 0.9596436  -0.96935882 -0.97020677  0.9605528 ]
Iteration: 11; err: 0.0012; y_pred: [ 0.96182881 -0.96954609 -0.

In [10]:
# Fixed learning rate experiment: alpha = 2, tolerance = 1e-4
alpha, tol = 2., 1e-4
# one random starting point in [-0.5, 0.5), shared by all 20 runs below
w1 = np.random.rand(3,2) - 0.5
w2 = np.random.rand(3) - 0.5

good = great = 0
steps, errs = [], []
for i in range(20):
    tw1, tw2, step = train(X, Y, w1, w2, alpha, tol=tol)
    y_pred = predict(X, tw1, tw2)
    err = mse(y_pred, Y)
    steps.append(step)
    errs.append(err)
    # 'good' means mse < 1e-2; 'great' additionally requires mse < 1e-3
    is_good = err < 1e-2
    is_great = is_good and err < 1e-3
    good += int(is_good)
    great += int(is_great)
    print("Iteration: {0}; err: {1:.4f}; y_pred: {2}".format(i, err, str(y_pred)))
print("lr: {}; tol: {}; Num Good: {}; Num Great: {}".format(alpha, str(tol), good, great))
print("Mean Err: {}; mean steps required: {}".format(np.mean(errs), np.mean(steps)))

Iteration: 0; err: 0.0001; y_pred: [ 0.99019341 -0.98641146 -0.98669527  0.98999418]
Iteration: 1; err: 0.0001; y_pred: [ 0.98593498 -0.99003661 -0.98970808  0.98624158]
Iteration: 2; err: 0.0001; y_pred: [ 0.98578869 -0.99009807 -0.99019876  0.98635153]
Iteration: 3; err: 0.0001; y_pred: [ 0.99000071 -0.98695149 -0.98587519  0.99016736]
Iteration: 4; err: 0.0001; y_pred: [ 0.98678942 -0.99021209 -0.99008821  0.98641683]
Iteration: 5; err: 0.0001; y_pred: [ 0.99003626 -0.98674255 -0.98575526  0.99013693]
Iteration: 6; err: 0.0001; y_pred: [ 0.98999271 -0.98645155 -0.98693973  0.9903774 ]
Iteration: 7; err: 0.0001; y_pred: [ 0.99004213 -0.98716335 -0.98672785  0.9902982 ]
Iteration: 8; err: 0.0001; y_pred: [ 0.98993822 -0.98623319 -0.98653499  0.99050146]
Iteration: 9; err: 0.0001; y_pred: [ 0.98676006 -0.98999223 -0.99006616  0.98662026]
Iteration: 10; err: 0.0001; y_pred: [ 0.99006546 -0.98659235 -0.98622849  0.99011264]
Iteration: 11; err: 0.0001; y_pred: [ 0.99004531 -0.98629452 -0.

In [11]:
# Fixed learning rate experiment: alpha = 3, tolerance = 1e-2
alpha, tol = 3., 1e-2
# one random starting point in [-0.5, 0.5), shared by all 20 runs below
w1 = np.random.rand(3,2) - 0.5
w2 = np.random.rand(3) - 0.5

good = great = 0
steps, errs = [], []
for i in range(20):
    tw1, tw2, step = train(X, Y, w1, w2, alpha, tol=tol)
    y_pred = predict(X, tw1, tw2)
    err = mse(y_pred, Y)
    steps.append(step)
    errs.append(err)
    # 'good' means mse < 1e-2; 'great' additionally requires mse < 1e-3
    is_good = err < 1e-2
    is_great = is_good and err < 1e-3
    good += int(is_good)
    great += int(is_great)
    print("Iteration: {0}; err: {1:.4f}; y_pred: {2}".format(i, err, str(y_pred)))
print("lr: {}; tol: {}; Num Good: {}; Num Great: {}".format(alpha, str(tol), good, great))
print("Mean Err: {}; mean steps required: {}".format(np.mean(errs), np.mean(steps)))

Iteration: 0; err: 1.0153; y_pred: [ 0.87657377 -0.92925742  0.99807689  0.77970129]
Iteration: 1; err: 0.9730; y_pred: [ 0.96298823  0.97029592 -0.92826723  0.94094071]
Iteration: 2; err: 0.8366; y_pred: [ 0.78456309  0.81523059 -0.93478404  0.97671959]
Iteration: 3; err: 0.9482; y_pred: [-0.94089965 -0.87879756 -0.93346371  0.91877141]
Iteration: 4; err: 0.0216; y_pred: [ 0.81457646 -0.79313072 -0.91917749  0.94681936]
Iteration: 5; err: 0.9431; y_pred: [ 0.90787277  0.93699905 -0.9332033   0.91434268]
Iteration: 6; err: 1.0024; y_pred: [ 0.95066437 -0.89909061  0.99921717  0.99826735]
Iteration: 7; err: 1.0331; y_pred: [ 0.9594954   0.99813321 -0.62832098  0.99772421]
Iteration: 8; err: 0.9864; y_pred: [ 0.91209788 -0.93862118 -0.93478503 -0.98239746]
Iteration: 9; err: 0.0235; y_pred: [ 0.79851508 -0.78866203 -0.94138699  0.92629535]
Iteration: 10; err: 0.0110; y_pred: [ 0.8691115  -0.92513347 -0.92725571  0.87303243]
Iteration: 11; err: 0.9939; y_pred: [ 0.98645308  0.98828796 -0.

In [12]:
# Fixed learning rate experiment: alpha = 3, tolerance = 1e-3
alpha, tol = 3., 1e-3
# one random starting point in [-0.5, 0.5), shared by all 20 runs below
w1 = np.random.rand(3,2) - 0.5
w2 = np.random.rand(3) - 0.5

good = great = 0
steps, errs = [], []
for i in range(20):
    tw1, tw2, step = train(X, Y, w1, w2, alpha, tol=tol)
    y_pred = predict(X, tw1, tw2)
    err = mse(y_pred, Y)
    steps.append(step)
    errs.append(err)
    # 'good' means mse < 1e-2; 'great' additionally requires mse < 1e-3
    is_good = err < 1e-2
    is_great = is_good and err < 1e-3
    good += int(is_good)
    great += int(is_great)
    print("Iteration: {0}; err: {1:.4f}; y_pred: {2}".format(i, err, str(y_pred)))
print("lr: {}; tol: {}; Num Good: {}; Num Great: {}".format(alpha, str(tol), good, great))
print("Mean Err: {}; mean steps required: {}".format(np.mean(errs), np.mean(steps)))

Iteration: 0; err: 0.0014; y_pred: [ 0.96886191 -0.95471441 -0.9584671   0.97104974]
Iteration: 1; err: 0.9763; y_pred: [ 0.97021905 -0.96759612  0.97544055  0.97234314]
Iteration: 2; err: 0.0012; y_pred: [ 0.95958024 -0.97694339 -0.96845082  0.96154005]
Iteration: 3; err: 0.9680; y_pred: [ 0.97272656 -0.97183864 -0.963274   -0.96704873]
Iteration: 4; err: 0.0021; y_pred: [ 0.94748236 -0.98131564 -0.97010404  0.93344605]
Iteration: 5; err: 0.9742; y_pred: [ 0.98831195  0.9738565  -0.9861663   0.97684048]
Iteration: 6; err: 0.0009; y_pred: [ 0.97250301 -0.96660776 -0.96227911  0.98313338]
Iteration: 7; err: 0.9453; y_pred: [ 0.97358782 -0.9910548  -0.93956862 -0.9434244 ]
Iteration: 8; err: 0.0013; y_pred: [ 0.96972564 -0.96062323 -0.95542676  0.96992545]
Iteration: 9; err: 0.0010; y_pred: [ 0.96896918 -0.96306184 -0.96269485  0.97932203]
Iteration: 10; err: 0.8746; y_pred: [-0.86969659 -0.97273816 -0.97933375  0.96033392]
Iteration: 11; err: 0.0011; y_pred: [ 0.95901099 -0.97001205 -0.

In [13]:
# Fixed learning rate experiment: alpha = 3, tolerance = 1e-4
alpha, tol = 3., 1e-4
# one random starting point in [-0.5, 0.5), shared by all 20 runs below
w1 = np.random.rand(3,2) - 0.5
w2 = np.random.rand(3) - 0.5

good = great = 0
steps, errs = [], []
for i in range(20):
    tw1, tw2, step = train(X, Y, w1, w2, alpha, tol=tol)
    y_pred = predict(X, tw1, tw2)
    err = mse(y_pred, Y)
    steps.append(step)
    errs.append(err)
    # 'good' means mse < 1e-2; 'great' additionally requires mse < 1e-3
    is_good = err < 1e-2
    is_great = is_good and err < 1e-3
    good += int(is_good)
    great += int(is_great)
    print("Iteration: {0}; err: {1:.4f}; y_pred: {2}".format(i, err, str(y_pred)))
print("lr: {}; tol: {}; Num Good: {}; Num Great: {}".format(alpha, str(tol), good, great))
print("Mean Err: {}; mean steps required: {}".format(np.mean(errs), np.mean(steps)))

Iteration: 0; err: 0.0001; y_pred: [ 0.98999788 -0.98654805 -0.98605265  0.99004494]
Iteration: 1; err: 0.0001; y_pred: [ 0.98974162 -0.98601573 -0.98593038  0.99027955]
Iteration: 2; err: 0.9726; y_pred: [ 0.99410718 -0.97191025 -0.9964371  -0.9721612 ]
Iteration: 3; err: 0.0001; y_pred: [ 0.99011229 -0.98596172 -0.98644466  0.99011865]
Iteration: 4; err: 0.0067; y_pred: [ 0.99377797 -0.86060232 -0.91375287  0.99169962]
Iteration: 5; err: 0.9749; y_pred: [ 0.99625311  0.97459954 -0.99535088  0.97435663]
Iteration: 6; err: 0.0001; y_pred: [ 0.9864801  -0.99002947 -0.99112344  0.98707674]
Iteration: 7; err: 0.0002; y_pred: [ 0.9852887  -0.99041169 -0.98956369  0.98641208]
Iteration: 8; err: 0.0001; y_pred: [ 0.98680042 -0.99029433 -0.99007515  0.98708193]
Iteration: 9; err: 0.0002; y_pred: [ 0.98554899 -0.99001205 -0.98936071  0.98523786]
Iteration: 10; err: 0.0001; y_pred: [ 0.98615136 -0.99040545 -0.99003612  0.98673443]
Iteration: 11; err: 0.0017; y_pred: [ 0.93437671 -0.9930943  -0.

### Sort out the experiment results
lr: 2.0; tol: 0.01; Num Good: 7; Num Great: 0; Mean err: 0.194; average converge steps: 278

lr: 2.0; tol: 0.001; Num Good: 18; Num Great: 0; Mean err: 0.079; average converge steps: 781

lr: 2.0; tol: 0.0001; Num Good: 20; Num Great: 20; Mean err: 1.4e-4; average converge steps: 5985

lr: 3.0; tol: 0.01; Num Good: 3; Num Great: 0; Mean err: 0.672; average converge steps: 104

lr: 3.0; tol: 0.001; Num Good: 12; Num Great: 2; Mean err: 0.294; average converge steps: 443

lr: 3.0; tol: 0.0001; Num Good: 17; Num Great: 15; Mean err: 0.099; average converge steps: 3152

When lr is fixed, 3 patterns are found:

- When learning rate is lower, there are relatively more good and great results, and lower mean err.
- When tolerance is lower, there are more good and great results, and lower mean err
- When the learning rate or the tolerance is lower, it takes more steps to converge.

### Experiments with adaptive learning rate

I also apply adaptive learning rate in following codes.
During each iteration of training, we calculate the prediction results and the mean squared error over all samples. Then we set the learning rate as:
$$ lr = base\_lr * (1+mse) $$
$base\_lr$ is the preset lr as parameter.

In [14]:
# X:(4,3), Y(4), w1:(3,2)[[b1,b2], [w11,w12], [w21,w22]], w2:(3,)[b,w1,w2]
#
def adaptive_train(X, Y, w1, w2, base_lr, tol=0.001, transfer=tansig, d_transfer=d_tansig,
                   max_iter=10000):
    """Per-sample backpropagation with an error-dependent learning rate.

    Same update rule and stopping rule as the fixed-lr training, except that
    after every weight update the learning rate is reset to
    ``base_lr * (1 + mse)``, where ``mse`` is the current error over all of X.

    Parameters
    ----------
    X : ndarray, (n_samples, n_rows_of_w1)
        Inputs, multiplied directly by ``w1``. Per the shape note above, row 0
        of ``w1`` holds the hidden biases, so column 0 of X is presumably a
        constant 1 -- confirm against the data actually passed in.
    Y : ndarray, (n_samples,)
        Targets.
    w1 : ndarray, (n_rows_of_w1, n_hidden)
        Initial input-to-hidden weights (copied, never modified in place).
    w2 : ndarray, (n_hidden + 1,)
        Initial hidden-to-output weights, bias first (copied).
    base_lr : float
        Learning rate used for the first step and as the scale afterwards.
    tol : float
        Threshold on the absolute value of every gradient entry (before it is
        scaled by the learning rate).
    transfer, d_transfer : callable
        Scalar activation and its derivative with respect to the net input;
        both are wrapped with ``np.vectorize``.
    max_iter : int
        Upper bound on the number of iterations.

    Returns
    -------
    (w1, w2, j)
        Trained weights and the number of iterations performed.
    """
    w1 = np.copy(w1)
    w2 = np.copy(w2)
    # keep the un-vectorized activation: predict() vectorizes it itself
    raw_transfer = transfer
    transfer = np.vectorize(transfer)
    # Fix: use the derivative that was passed in. The original always
    # vectorized the global d_tansig and ignored this parameter.
    d_transfer = np.vectorize(d_transfer)
    # Fix: sample from however many rows X has instead of a hard-coded 4.
    n_samples = len(X)

    # converge flag, when delta is lower than tolerance, then converge True
    converge = False
    counter = 0  # consecutive iterations whose deltas were all below tol
    j = 0  # number of iterations
    lr = base_lr

    while (not converge) and (j < max_iter):
        # length-1 index array, so a0 is (1, n_rows_of_w1) and Y[i] is (1,)
        i = random.choice(n_samples, 1)
        a0 = X[i]

        # forward pass
        n1 = np.dot(a0, w1).reshape(-1)           # hidden net inputs
        a1 = transfer(n1)
        b_a1 = np.concatenate([[1], a1], axis=0)  # hidden activations with bias input
        n2 = np.dot(b_a1, w2)
        a2 = transfer(n2)

        # backward pass: sensitivities of the output and hidden layers
        s2 = (a2 - Y[i]) * d_transfer(n2)
        # s1i = f'(n1i)*w1i*s2
        s1 = d_transfer(n1) * w2[1:] * s2

        # w2_ij = w2_ij - lr * a1_i * s2_j
        # b2_j  = b2_j - lr * s2_j
        delta_2 = b_a1 * s2
        # w1_ij = w1_ij - lr * a0_i * s1_j
        # b1_j  = b1_j - lr * s1_j
        delta_1 = a0.reshape(-1, 1) * s1.reshape(1, -1)

        # check if every entry in delta is below the tolerance
        if np.all(np.absolute(delta_1) < tol) and \
                np.all(np.absolute(delta_2) < tol):
            counter += 1
        else:
            counter = 0
        # the converge condition satisfied when 5 consecutive iters has
        # delta less than tolerance
        converge = counter >= 5
        if not converge:
            w2 = w2 - lr * delta_2
            w1 = w1 - lr * delta_1
            # Fix: evaluate the error with the same activation used for
            # training (the original always used predict's default).
            # The new lr is only needed when another step follows.
            err = mse(predict(X, w1, w2, transfer=raw_transfer), Y)
            lr = base_lr * (1 + err)

        j += 1

    return w1, w2, j

In [17]:
# Adaptive learning rate experiment: base alpha = 2, tolerance = 1e-3
alpha, tol = 2., 1e-3
# one random starting point in [-0.5, 0.5), shared by all 20 runs below
w1 = np.random.rand(3,2) - 0.5
w2 = np.random.rand(3) - 0.5

good = great = 0
steps, errs = [], []
for i in range(20):
    tw1, tw2, step = adaptive_train(X, Y, w1, w2, alpha, tol=tol)
    y_pred = predict(X, tw1, tw2)
    err = mse(y_pred, Y)
    steps.append(step)
    errs.append(err)
    # 'good' means mse < 1e-2; 'great' additionally requires mse < 1e-3
    is_good = err < 1e-2
    is_great = is_good and err < 1e-3
    good += int(is_good)
    great += int(is_great)
    print("Iteration: {0}; err: {1:.4f}; y_pred: {2}".format(i, err, str(y_pred)))
print("lr: {}; tol: {}; Num Good: {}; Num Great: {}".format(alpha, str(tol), good, great))
print("Mean Err: {}; mean steps required: {}".format(np.mean(errs), np.mean(steps)))

Iteration: 0; err: 0.9989; y_pred: [ 0.99473446  0.99838327 -0.9985442   0.95555841]
Iteration: 1; err: 1.9994; y_pred: [-0.99975738 -0.99974845 -0.99971013 -0.99968146]
Iteration: 2; err: 1.9595; y_pred: [-0.99885923 -0.99905756  0.96019714  0.98832522]
Iteration: 3; err: 0.0016; y_pred: [ 0.95575232 -0.96488795 -0.96968995  0.95418295]
Iteration: 4; err: 1.0146; y_pred: [ 0.75453181 -0.99846294 -0.99989765 -0.99956641]
Iteration: 5; err: 1.9340; y_pred: [-0.99953257 -0.9841586  -0.9736421  -0.93315715]
Iteration: 6; err: 1.9990; y_pred: [-0.99971533 -0.99983636  0.99933363  0.9993021 ]
Iteration: 7; err: 0.0023; y_pred: [ 0.97654888 -0.92801187 -0.94513351  0.98350704]
Iteration: 8; err: 1.0029; y_pred: [ 0.99848814 -0.91882028  0.99999768  0.92877074]
Iteration: 9; err: 1.0007; y_pred: [ 0.99892939 -0.9450138   0.99996598  0.99992143]
Iteration: 10; err: 1.9952; y_pred: [ 0.99954698 -0.9954428   0.99954101 -0.9956703 ]
Iteration: 11; err: 1.9726; y_pred: [-0.99686694  0.97559572 -0.

In [18]:
# Adaptive learning rate experiment: base alpha = 2, tolerance = 1e-4
alpha, tol = 2., 1e-4
# one random starting point in [-0.5, 0.5), shared by all 20 runs below
w1 = np.random.rand(3,2) - 0.5
w2 = np.random.rand(3) - 0.5

good = great = 0
steps, errs = [], []
for i in range(20):
    tw1, tw2, step = adaptive_train(X, Y, w1, w2, alpha, tol=tol)
    y_pred = predict(X, tw1, tw2)
    err = mse(y_pred, Y)
    steps.append(step)
    errs.append(err)
    # 'good' means mse < 1e-2; 'great' additionally requires mse < 1e-3
    is_good = err < 1e-2
    is_great = is_good and err < 1e-3
    good += int(is_good)
    great += int(is_great)
    print("Iteration: {0}; err: {1:.4f}; y_pred: {2}".format(i, err, str(y_pred)))
print("lr: {}; tol: {}; Num Good: {}; Num Great: {}".format(alpha, str(tol), good, great))
print("Mean Err: {}; mean steps required: {}".format(np.mean(errs), np.mean(steps)))

Iteration: 0; err: 1.7378; y_pred: [-0.99996845 -0.99601393 -0.99987655 -0.71792245]
Iteration: 1; err: 0.9846; y_pred: [ 0.98433732 -0.99668392  0.98445889  0.99739495]
Iteration: 2; err: 1.0001; y_pred: [ 0.98658673 -0.98612653 -0.99242339 -0.99999662]
Iteration: 3; err: 0.9788; y_pred: [ 0.97799152  0.97854059 -0.99396111  0.99402679]
Iteration: 4; err: 1.9969; y_pred: [ 0.99991432 -0.99867731  0.99986002 -0.99699795]
Iteration: 5; err: 1.0072; y_pred: [ 0.91082214  0.99996675 -0.85481633  0.99085549]
Iteration: 6; err: 1.0002; y_pred: [ 0.98227868 -0.98245245 -0.98260131 -0.99999847]
Iteration: 7; err: 0.9959; y_pred: [ 0.99163339  0.99582627 -0.99205645  0.9993632 ]
Iteration: 8; err: 1.0009; y_pred: [ 0.96597666 -0.9999057  -0.94787398 -0.99995306]
Iteration: 9; err: 1.0014; y_pred: [ 0.95138046 -0.9535952   0.99997479  0.96453833]
Iteration: 10; err: 0.9196; y_pred: [ 0.99096158 -0.91583664 -0.99848063 -0.91603073]
Iteration: 11; err: 0.0001; y_pred: [ 0.98656036 -0.99030333 -0.

In [20]:
# Adaptive learning rate experiment: base alpha = 1.5, tolerance = 1e-4
alpha, tol = 1.5, 1e-4
# one random starting point in [-0.5, 0.5), shared by all 20 runs below
w1 = np.random.rand(3,2) - 0.5
w2 = np.random.rand(3) - 0.5

good = great = 0
steps, errs = [], []
for i in range(20):
    tw1, tw2, step = adaptive_train(X, Y, w1, w2, alpha, tol=tol)
    y_pred = predict(X, tw1, tw2)
    err = mse(y_pred, Y)
    steps.append(step)
    errs.append(err)
    # 'good' means mse < 1e-2; 'great' additionally requires mse < 1e-3
    is_good = err < 1e-2
    is_great = is_good and err < 1e-3
    good += int(is_good)
    great += int(is_great)
    print("Iteration: {0}; err: {1:.4f}; y_pred: {2}".format(i, err, str(y_pred)))
print("lr: {}; tol: {}; Num Good: {}; Num Great: {}".format(alpha, str(tol), good, great))
print("Mean Err: {}; mean steps required: {}".format(np.mean(errs), np.mean(steps)))

Iteration: 0; err: 0.0002; y_pred: [ 0.99012159 -0.98580541 -0.98571111  0.98944793]
Iteration: 1; err: 0.0001; y_pred: [ 0.99025572 -0.98677886 -0.98651359  0.98982934]
Iteration: 2; err: 0.9318; y_pred: [ 0.96246551 -0.99291873 -0.9280235  -0.92887984]
Iteration: 3; err: 0.0001; y_pred: [ 0.99046972 -0.9864826  -0.98636748  0.98999222]
Iteration: 4; err: 0.0001; y_pred: [ 0.99014596 -0.98644564 -0.98666977  0.99005014]
Iteration: 5; err: 0.0001; y_pred: [ 0.98646235 -0.99036706 -0.98999664  0.98654901]
Iteration: 6; err: 0.0001; y_pred: [ 0.99005196 -0.98610562 -0.98616744  0.98997421]
Iteration: 7; err: 0.0001; y_pred: [ 0.99031502 -0.98654654 -0.98566914  0.99001908]
Iteration: 8; err: 0.0001; y_pred: [ 0.99003423 -0.98658251 -0.98678217  0.99008007]
Iteration: 9; err: 0.5094; y_pred: [ 0.9922989  -0.99123164  0.13254636  0.13122135]
Iteration: 10; err: 0.0001; y_pred: [ 0.98617222 -0.99008322 -0.99067254  0.98682075]
Iteration: 11; err: 0.0001; y_pred: [ 0.99061155 -0.98596507 -0.

### Sort out the experiment results
lr: 2.0; tol: 0.001; Num Good: 4; Num Great: 0; Mean err: 1.17; average converge steps: 196

lr: 2.0; tol: 0.0001; Num Good: 2; Num Great: 2; Mean err: 1.03; average converge steps: 1439

lr: 1.5; tol: 0.0001; Num Good: 14; Num Great: 14; Mean err: 0.25; average converge steps: 5976


Compared with fixed learning rate:

- Adaptive learning rate results in faster convergence.
- The results are less stable: more runs converge to a 'bad' solution and, as a result, give bad predictions.
- The corresponding mean err is also larger than its counterpart in the fixed lr experiments.