In [3]:
import copy, math
import numpy as np
import matplotlib.pyplot as plt

In [4]:
X_train = np.array([[2104, 5, 1, 45], [1416, 3, 2, 40], [852, 2, 1, 35]])
y_train = np.array([460, 232, 178])

In [8]:
print(f"X Shape: {X_train.shape}, X Type: {type(X_train)}")
print(X_train)
print(f"y Shape: {y_train.shape}, y Type: {type(y_train)}")
print(y_train)

X Shape: (3, 4), X Type: <class 'numpy.ndarray'>
[[2104    5    1   45]
 [1416    3    2   40]
 [ 852    2    1   35]]
y Shape: (3,), y Type: <class 'numpy.ndarray'>
[460 232 178]


In [52]:
b_init = 785.1811367994083
w_init = np.array([ 0.39133535, 18.75376741, -53.36032453, -26.42131618])
print(f"w_init shape: {w_init.shape}, b_init type: {type(b_init)}")

w_init shape: (4,), b_init type: <class 'float'>


In [53]:
def predict_single_loop(x, w, b):
    """
    linear regression，单变量预测
    参数：
        x(ndarray): (n,)数据，多个特征
        w: (n,)参数
        b: 参数
    返回值：
        p：预测值
    """
    n = x.shape[0]
    p = 0
    for i in range(n):
        p_i = x[i] * w[i]
        p += p + p_i
    p = p + b
    return p

In [54]:
# 第一行数据
x_vec = X_train[0, :]
f_wb = predict_single_loop(x_vec, w_init, b_init)
f_wb

6451.533219039409

In [55]:
def compute_cost(X, y, w, b): 
    """
    compute cost
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters  
      b (scalar)       : model parameter
      
    Returns:
      cost (scalar): cost
    """
    m = X.shape[0]
    cost = 0.0
    for i in range(m):                                
        f_wb_i = np.dot(X[i], w) + b           #(n,)(n,) = scalar (see np.dot)
        cost = cost + (f_wb_i - y[i])**2       #scalar
    cost = cost / (2 * m)                      #scalar    
    return cost

In [56]:
cost = compute_cost(X_train, y_train, w_init, b_init)
print(f"cost at optimal w is {cost}")

cost at optimal w is 1.5578904045996674e-12


In [57]:
# 多变量梯度下降
def compute_gradient(X, y, w, b):
    m, n = X.shape
    # 创建了一个名为dj_dw的NumPy数组，并将其初始化为全零。
    # 数组的大小是(n,)，其中n是一个变量或常数，表示数组的长度或维度。
    dj_dw = np.zeros((n,))
    dj_db = 0.
    
    for i in range(m):
        err = (np.dot(X[i], w) + b) - y[i]
        for j in range(n):
            dj_dw[j] = dj_dw[j] + err * X[i, j]
        dj_db = dj_db + err
    dj_dw = dj_dw/m
    dj_db = dj_db/m
    
    return dj_dw, dj_db

In [58]:
tmp_dj_db, tmp_dj_dw = compute_gradient(X_train, y_train, w_init, b_init)
print(f'dj_db at initial w,b: {tmp_dj_db}')
print(f'dj_dw at initial w,b: \n {tmp_dj_dw}')

dj_db at initial w,b: [-2.72623574e-03 -6.27197255e-06 -2.21745574e-06 -6.92403377e-05]
dj_dw at initial w,b: 
 -1.6739251122999121e-06


In [59]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): 
    J_history = []
    w = copy.deepcopy(w_in)
    b = b_in
    
    for i in range(num_iters):
        # 计算梯度
        dj_db,dj_dw = gradient_function(X, y, w, b) 
        
        # 通过alpha和梯度更新系数
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        
        # 保存每一步的j
        if i<100000: 
            J_history.append( cost_function(X, y, w, b))
            
        # 每10次打印cost
        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i}: Cost {J_history[-1]}   ")
            
    return w, b, J_history

In [60]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): 
    """
    Performs batch gradient descent to learn w and b. Updates w and b by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X (ndarray (m,n))   : Data, m examples with n features
      y (ndarray (m,))    : target values
      w_in (ndarray (n,)) : initial model parameters  
      b_in (scalar)       : initial model parameter
      cost_function       : function to compute cost
      gradient_function   : function to compute the gradient
      alpha (float)       : Learning rate
      num_iters (int)     : number of iterations to run gradient descent
      
    Returns:
      w (ndarray (n,)) : Updated values of parameters 
      b (scalar)       : Updated value of parameter 
      """
    
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in
    
    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db,dj_dw = gradient_function(X, y, w, b)   ##None

        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw               ##None
        b = b - alpha * dj_db               ##None
      
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            J_history.append( cost_function(X, y, w, b))

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i}: Cost {J_history[-1]}   ")
        
    return w, b, J_history #return final w,b and J history for graphing

In [61]:
# 初始化
initial_w = np.zeros_like(w_init)
initial_b = 0.

# 梯度设定
iterations = 1000
alpha = 5.0e-7

# 梯度下降计算
w_final, b_final, J_hist = gradient_descent(X_train, y_train, initial_w, initial_b,
                                                    compute_cost, compute_gradient, 
                                                    alpha, iterations)

Iteration 0: Cost [49376.16867696 49445.9121573  49446.02082549 49444.32523138]   


ValueError: setting an array element with a sequence.