# Data Processing

In [27]:
import numpy as np
# Seed the global NumPy RNG so that "Restart Kernel -> Run All" regenerates
# exactly the same data set (and hence the same fitted coefficients).
SEED = 42
N_SAMPLES = 100
np.random.seed(SEED)

# Two features, each drawn uniformly from [0, 2).
x1 = 2*np.random.rand(N_SAMPLES, 1)
x2 = 2*np.random.rand(N_SAMPLES, 1)
# Bias column of ones, so the intercept is learned as the first weight.
x0 = np.ones((N_SAMPLES, 1))

# Design matrix with columns [1, x1, x2]: N_SAMPLES x 3 2-d array.
X = np.concatenate((x0, x1, x2), axis=1)
# Targets: y = 3*x1 + 4*x2 + standard-normal noise + 2, as an N_SAMPLES x 1 array.
Y = 3*x1 + 4*x2 + np.random.randn(N_SAMPLES, 1) + 2
#print(X)
#print(Y)

# (1) Normal Equation

In [28]:
from numpy.linalg import inv       # used to get inverse matrix
XT = X.T
# Normal equation: minW = (X^T X)^(-1) X^T Y.
# Solve the linear system (X^T X) minW = X^T Y directly instead of forming the
# explicit inverse and multiplying by it: this needs fewer matrix products and
# is numerically better behaved when X^T X is close to singular.
minW = np.linalg.solve(np.dot(XT, X), np.dot(XT, Y))
print(minW)

[[2.18659808]
 [2.78058884]
 [3.98352281]]


# (2) Gradient Descent (Batch)

In [29]:
# Mean Square Error
def MSE(w, x, y):
    """Mean squared error of the linear model ``x . w`` against the targets ``y``.

    Parameters
    ----------
    w : weight vector with one entry per column of ``x``
        (np.matrix column or plain ndarray).
    x : 2-D design matrix, one sample per row.
    y : targets, one entry per row of ``x``.

    Returns
    -------
    The average of ``(x[i] . w - y[i])**2`` over all rows, as a single-element
    result (a 1x1 np.matrix when ``w`` is an np.matrix).
    """
    length = len(x)
    total = 0.0
    for i in range(0, length):
        # Use an explicit inner product. ``x[i]*w`` is a matrix product only
        # when w is an np.matrix; for an ndarray it broadcasts element-wise
        # and yields a wrong, non-scalar result.
        residual = np.dot(x[i], w) - y[i]
        total += residual**2
    total /= length
    return total

# Derivative of MSE to w
def dMSE(w, x, y):
    """Gradient of the mean squared error with respect to the weights ``w``.

    Computes ``(2/n) * sum_i (x[i] . w - y[i]) * x[i]`` for all ``n`` rows of
    ``x`` at once, for any number of feature columns.

    Parameters
    ----------
    w : weight vector with one entry per column of ``x``
        (np.matrix column or plain ndarray).
    x : 2-D design matrix, one sample per row.
    y : targets, one entry per row of ``x``.

    Returns
    -------
    np.matrix of shape (n_features, 1): a column vector, so it can be
    subtracted directly from a column-vector ``w``.
    """
    x_arr = np.asarray(x, dtype=float)
    length = len(x_arr)
    # Flatten predictions and targets to 1-D so the subtraction is element-wise
    # per sample regardless of whether w / y are matrices, columns or 1-D arrays.
    predictions = np.asarray(np.dot(x_arr, w), dtype=float).reshape(-1)
    residual = predictions - np.asarray(y, dtype=float).reshape(-1)
    # residual . x sums residual[i] * x[i] over all samples in one product.
    grad = np.dot(residual, x_arr) * (2.0 / length)
    # asmatrix turns the 1-D gradient into a 1 x n_features row; transpose it
    # to the column layout used for w.
    return np.asmatrix(grad).T

### First, try learning rate = 1 as a starting point, because the random features are small (each is drawn from [0, 2))

In [30]:
w = np.matrix('1.0;1.0;1.0')   # initial weights: start every coefficient at 1.0
step = 1    # learning rate

# Batch gradient descent: keep stepping while the MSE strictly decreases.
# The MSE of each iterate is computed once and reused for both the loop test
# and the next comparison baseline.
# Note: when the loop exits, w is the first iterate whose MSE did NOT improve
# on oldMSE, so with a too-large step the printed values show the overshoot.
oldMSE = float('inf')         # guaranteed to be larger than the first MSE
curMSE = MSE(w, X, Y)
while curMSE < oldMSE :
    oldMSE = curMSE
    w = w - dMSE(w, X, Y) * step
    curMSE = MSE(w, X, Y)
print(w)
print(curMSE)

[[12.87436246]
 [14.3479013 ]
 [14.74575746]]
[[1179.3519162]]


### The MSE with rate = 1 is extremely large, so reduce the rate to 0.1

In [31]:
w = np.matrix('1.0;1.0;1.0')   # initial weights: start every coefficient at 1.0
step = 0.1    # learning rate

# Batch gradient descent: keep stepping while the MSE strictly decreases.
# The MSE of each iterate is computed once and reused for both the loop test
# and the next comparison baseline.
# Note: when the loop exits, w is the first iterate whose MSE did NOT improve
# on oldMSE.
oldMSE = float('inf')         # guaranteed to be larger than the first MSE
curMSE = MSE(w, X, Y)
while curMSE < oldMSE :
    oldMSE = curMSE
    w = w - dMSE(w, X, Y) * step
    curMSE = MSE(w, X, Y)
print(w)
print(curMSE)

[[2.18659834]
 [2.78058871]
 [3.9835227 ]]
[[0.82383836]]



### Rate = 0.1 works well, so reduce it further to 0.01 to see whether the coefficients become more accurate

In [33]:
# !!! With such a small step this loop needs many iterations and may run for a while
w = np.matrix('1.0;1.0;1.0')   # initial weights: start every coefficient at 1.0
step = 0.01    # learning rate

# Batch gradient descent: keep stepping while the MSE strictly decreases.
# The MSE of each iterate is computed once and reused for both the loop test
# and the next comparison baseline, which halves the number of MSE evaluations.
# Note: when the loop exits, w is the first iterate whose MSE did NOT improve
# on oldMSE.
oldMSE = float('inf')         # guaranteed to be larger than the first MSE
curMSE = MSE(w, X, Y)
while curMSE < oldMSE :
    oldMSE = curMSE
    w = w - dMSE(w, X, Y) * step
    curMSE = MSE(w, X, Y)
print(w)
print(curMSE)

[[2.1865992 ]
 [2.78058828]
 [3.98352236]]
[[0.82383836]]


### Rate = 0.1 and rate = 0.01 give almost the same result, which means rate = 0.1 is already small enough. Next, test a larger rate: 0.5

In [32]:
w = np.matrix('1.0;1.0;1.0')   # initial weights: start every coefficient at 1.0
step = 0.5    # learning rate

# Batch gradient descent: keep stepping while the MSE strictly decreases.
# The MSE of each iterate is computed once and reused for both the loop test
# and the next comparison baseline.
# Note: when the loop exits, w is the first iterate whose MSE did NOT improve
# on oldMSE, so with a too-large step the printed values show the overshoot.
oldMSE = float('inf')         # guaranteed to be larger than the first MSE
curMSE = MSE(w, X, Y)
while curMSE < oldMSE :
    oldMSE = curMSE
    w = w - dMSE(w, X, Y) * step
    curMSE = MSE(w, X, Y)
print(w)
print(curMSE)

[[6.93718123]
 [7.67395065]
 [7.87287873]]
[[198.00315784]]


# Conclusion: learning rate = 0.1
# Firstly, it gives an accurate result, almost identical to that of the normal equation
# Secondly, it runs very fast