### 1.用Numpy实现

In [2]:
# 导入数据集用的是sklearn的diabetes数据集

In [29]:
from sklearn.datasets import load_diabetes
from sklearn.utils import shuffle
import numpy as np

In [30]:
diabetes = load_diabetes()

In [31]:
data, target = diabetes.data, diabetes.target

In [32]:
len(data), len(target)

(442, 442)

In [33]:
# 打乱数据集
X, y = shuffle(data, target, random_state = 13)

In [34]:
# 划分训练集和测试集，按照8：2的比例
x_train, y_train = X[:int(X.shape[0] * 0.8)], y[:int(X.shape[0] * 0.8)]
x_test, y_test = X[int(X.shape[0] * 0.8):], y[int(X.shape[0] * 0.8):]

In [35]:
y_train.shape, y_test.shape

((353,), (89,))

In [36]:
# 将训练集标签和测试集标签改为列向量的形式
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))

In [37]:
print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)
print('x_test shape:', x_test.shape)
print('y_test shape:', y_test.shape)

x_train shape: (353, 10)
y_train shape: (353, 1)
x_test shape: (89, 10)
y_test shape: (89, 1)


In [38]:
# 定义回归模型函数的本体
def linear_loss(X, y, w, b):
    num_train = X.shape[0]
    num_tagret = X.shape[1]
    y_hat = np.dot(X, w) + b
    # 回归一般用的是均方误差
    loss = ((y_hat - y) ** 2 ) / num_train
    # 基于均方误差对去权重求一阶导数
    dw = np.dot(X.T, (y_hat - y)) / num_train
    # 基于均方误差求偏置的一阶导数
    db = np.sum(y_hat - y) / num_train
    return y_hat, loss, dw, db

In [39]:
# 初始化模型的参数
def initialize_params(dims):
    w = np.zeros((dims, 1))
    b = 0
    return w, b

In [40]:
# 定义线性回归模型的训练过程
def linear_train(X, y, learning_rate = 0.01, epochs = 10000):
    loss_num = []
    w,b = initialize_params(X.shape[1])# 列
    for i in range(1, epochs):
        y_hat, loss, dw, db = linear_loss(X, y, w, b)
        w += - learning_rate * dw
        b += -learning_rate * db
        loss_num.append(loss)
        if i % 10000 == 0:
            print('epochs %d loss %f'%(i, loss))
        params = {
            'w': w,
            'b': b
        }
        grads = {
            'dw': dw,
            'db': db
        }
        return loss_num, params, grads

In [41]:
# 模型进行训练
loss_num, params, grads = linear_train(x_train, y_train, 0.01, 100000)

In [42]:
params

{'w': array([[ 0.00855811],
        [ 0.00075944],
        [ 0.01655004],
        [ 0.01568291],
        [ 0.00710851],
        [ 0.00466418],
        [-0.01434783],
        [ 0.01624442],
        [ 0.0206719 ],
        [ 0.01195389]]),
 'b': 1.4996317280453257}

In [43]:
# 进行预测
def predict(X, params):
    w = params['w']
    b = params['b']
    # 预测
    y_pred = np.dot(X, w) + b
    return y_pred

In [44]:
y_pred = predict(x_test, params)

In [48]:
# 定义R**2 函数。来看预测的结果是否好
def r2_score(y_test, y_pred):
    y_avg = np.mean(y_test)
    # 总离差平方和
    ss_tot = np.sum((y_test - y_avg) ** 2)
    # 残差平方和
    ss_res = np.sum((y_test - y_pred) ** 2)
    # R**2的计算
    r2 = 1 - (ss_res / ss_tot)
    return r2

In [49]:
print(r2_score(y_test, y_pred))

-3.465289262600014


In [50]:
# 基于sklearn的模型实现

In [51]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [52]:
reger = linear_model.LinearRegression()

In [53]:
reger.fit(x_train, y_train)

LinearRegression()

In [54]:
y_pred = reger.predict(x_test)

In [55]:
print('均方误差为:', mean_squared_error(y_test, y_pred))

均方误差为: 3371.8842111461654


In [56]:
print('R^2系数为:', r2_score(y_test, y_pred))

R^2系数为: 0.5392080506325068
