# 线性回归 (Linear Regression) - 动手实践

在这个 Notebook 中，我们将从零实现线性回归算法，并通过代码加深理解。

## 目标
- 理解线性回归的数学原理
- 实现梯度下降法求解
- 对比闭式解与梯度下降
- 在真实数据集上应用

## 1. 导入必要的库

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Configure fonts for Chinese text: try SimHei first, then DejaVu Sans, and
# turn off the Unicode minus sign for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

## 2. 生成示例数据

In [None]:
# Seed NumPy's global RNG so every run draws the same samples.
np.random.seed(42)

# Ground-truth line and noise scale used to synthesise the data set.
n_samples = 100
true_slope, true_intercept = 2.5, 1.0
noise_level = 0.5

# Draw the features first and the noise second: both come from the same global
# RNG stream, so this order determines the generated values.
X = 2 * np.random.rand(n_samples, 1)  # features, uniform on [0, 2)
noise = noise_level * np.random.randn(n_samples, 1)  # scaled Gaussian noise
y = true_slope * X + true_intercept + noise

print(f"数据形状: X={X.shape}, y={y.shape}")
print(f"真实参数: slope={true_slope}, intercept={true_intercept}")

## 3. 可视化数据

In [None]:
# Scatter plot of the raw samples.
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.6)
# Set both axis labels and the title on the current axes in one call.
plt.gca().set(xlabel='X (特征)', ylabel='y (目标)', title='线性回归示例数据')
plt.grid(True, alpha=0.3)
plt.show()

## 4. 方法一：闭式解 (Ordinary Least Squares)

一元线性回归（只有一个特征）的闭式解公式，其中 $m$ 为斜率，$b$ 为截距，$\bar{x}$、$\bar{y}$ 分别为 $x$ 与 $y$ 的均值：
$$m = \frac{\sum(x_i - \bar{x})(y_i - \bar{y})}{\sum(x_i - \bar{x})^2}, \quad b = \bar{y} - m\bar{x}$$

In [None]:
def closed_form_solution(X, y):
    """
    Compute the ordinary-least-squares fit for one-feature linear regression.

    Uses slope = sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x) ** 2)
    and intercept = mean_y - slope * mean_x.

    Args:
        X: Feature values, array-like of shape (n_samples, 1) or (n_samples,).
        y: Target values, array-like of shape (n_samples, 1) or (n_samples,).

    Returns:
        slope, intercept: Regression coefficients as Python floats.

    Raises:
        ValueError: If X and y are empty, differ in size, or all X values
            are identical (the slope is then undefined).
    """
    # Flatten both inputs so that a column vector paired with a 1-D vector
    # cannot silently broadcast into an (n, n) matrix in the product below.
    x = np.asarray(X, dtype=float).ravel()
    t = np.asarray(y, dtype=float).ravel()

    if x.size != t.size:
        raise ValueError(
            f"X and y must have the same number of samples, got {x.size} and {t.size}"
        )
    if x.size == 0:
        raise ValueError("X and y must not be empty")

    mean_x = x.mean()
    mean_y = t.mean()

    # Slope: covariance of x and y divided by the variance of x.
    x_centered = x - mean_x
    numerator = np.sum(x_centered * (t - mean_y))
    denominator = np.sum(x_centered ** 2)
    if denominator == 0:
        raise ValueError("X has zero variance; the slope is undefined")

    # The sums and means above are scalars, so they are used directly;
    # indexing them with [0] would raise IndexError.
    slope = numerator / denominator

    # Intercept: the fitted line passes through (mean_x, mean_y).
    intercept = mean_y - slope * mean_x

    return float(slope), float(intercept)

# Fit the line with the closed-form formula and report the coefficients.
slope_cf, intercept_cf = closed_form_solution(X=X, y=y)
print(f"闭式解结果: y = {slope_cf:.4f}x + {intercept_cf:.4f}")

## 5. 方法二：梯度下降法 (Gradient Descent)

In [None]:
def gradient_descent(X, y, learning_rate=0.01, n_iterations=1000):
    """
    Fit ``y ≈ slope * x + intercept`` by batch gradient descent on the MSE.

    Both parameters start at 0.0 and are updated once per iteration using the
    gradient computed over all samples.

    Args:
        X: Feature array; it is flattened before use.
        y: Target array; it is flattened before use.
        learning_rate: Step size applied to each gradient.
        n_iterations: Number of update steps to perform.

    Returns:
        slope, intercept, history: The final parameters, plus a dict with
        keys 'slope', 'intercept' and 'loss' holding one entry per iteration.
        Each 'loss' entry is the MSE measured before that iteration's update;
        the 'slope' and 'intercept' entries are the values after the update.
    """
    # Flattened views of the data do not change between iterations.
    features = X.flatten()
    targets = y.flatten()
    n = len(X)

    slope, intercept = 0.0, 0.0
    history = {'slope': [], 'intercept': [], 'loss': []}

    for _ in range(n_iterations):
        # Residuals of the current model: prediction minus target.
        errors = slope * features + intercept - targets

        # Partial derivatives of the MSE with respect to each parameter.
        slope_gradient = (2/n) * np.sum(errors * features)
        intercept_gradient = (2/n) * np.sum(errors)

        # Step against the gradient.
        slope -= learning_rate * slope_gradient
        intercept -= learning_rate * intercept_gradient

        # Keep the trajectory for later plotting.
        history['slope'].append(slope)
        history['intercept'].append(intercept)
        history['loss'].append(np.mean(errors ** 2))

    return slope, intercept, history

# Fit the same data with gradient descent, keeping the per-iteration history.
slope_gd, intercept_gd, history = gradient_descent(
    X,
    y,
    learning_rate=0.1,
    n_iterations=500,
)
print(f"梯度下降结果: y = {slope_gd:.4f}x + {intercept_gd:.4f}")

## 6. 可视化梯度下降过程

In [None]:
# One row, two panels: loss curve on the left, parameter traces on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: the loss recorded at every iteration.
axes[0].plot(history['loss'])
axes[0].set(xlabel='迭代次数', ylabel='MSE Loss', title='损失函数下降过程')
axes[0].grid(True, alpha=0.3)

# Right panel: how slope and intercept evolve over the iterations.
axes[1].plot(history['slope'], label='斜率 (slope)', linewidth=2)
axes[1].plot(history['intercept'], label='截距 (intercept)', linewidth=2)
axes[1].set(xlabel='迭代次数', ylabel='参数值', title='参数收敛过程')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. 对比结果

In [None]:
# Predictions of both fitted models on the training inputs (used for the MSE).
y_pred_cf = slope_cf * X + intercept_cf
y_pred_gd = slope_gd * X + intercept_gd

# Plot the data together with every fitted line.
plt.figure(figsize=(12, 6))
plt.scatter(X, y, alpha=0.5, label='数据点')

# Draw each line through the two endpoints of the feature range instead of
# through the samples. The samples are not sorted by X, so a line plotted
# through them retraces itself many times and the dashed pattern of the
# gradient-descent line fills in until it looks solid.
X_range = np.array([[0], [2]])

# Closed-form fit
plt.plot(X_range, slope_cf * X_range + intercept_cf, 'r-', linewidth=2,
         label=f'闭式解: y={slope_cf:.2f}x+{intercept_cf:.2f}')

# Gradient-descent fit
plt.plot(X_range, slope_gd * X_range + intercept_gd, 'b--', linewidth=2,
         label=f'梯度下降: y={slope_gd:.2f}x+{intercept_gd:.2f}')

# Ground-truth line the data was generated from
y_true = true_slope * X_range + true_intercept
plt.plot(X_range, y_true, 'g:', linewidth=2, label=f'真实: y={true_slope}x+{true_intercept}')

plt.xlabel('X (特征)')
plt.ylabel('y (目标)')
plt.title('线性回归拟合结果对比')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Mean squared error of each fit on the training data
mse_cf = mean_squared_error(y, y_pred_cf)
mse_gd = mean_squared_error(y, y_pred_gd)
print(f"闭式解 MSE: {mse_cf:.6f}")
print(f"梯度下降 MSE: {mse_gd:.6f}")

## 8. 使用 scikit-learn

在实际项目中，我们通常使用 scikit-learn 库：

In [None]:
# Build the estimator and fit it in one step (fit returns the estimator).
model = LinearRegression().fit(X, y)

# Report the learned parameters. The indexing below expects coef_ to be 2-D
# and intercept_ to be 1-D, as is the case when y is a column vector.
print("scikit-learn 结果:")
print(f"  斜率 (coef_): {model.coef_[0, 0]:.6f}")
print(f"  截距 (intercept_): {model.intercept_[0]:.6f}")
print(f"  R² 得分: {model.score(X, y):.6f}")

# Predict on the training inputs and compute the error once for both metrics.
y_pred_sk = model.predict(X)
mse_sk = mean_squared_error(y, y_pred_sk)

print(f"\nMSE: {mse_sk:.6f}")
print(f"RMSE: {np.sqrt(mse_sk):.6f}")

## 9. 练习：真实数据集

试试在真实数据集上应用线性回归：

In [None]:
# Load the California housing data set (example).
from sklearn.datasets import fetch_california_housing

# Fetch the data; indexing the columns with a list keeps the feature matrix
# 2-D, so no reshaping is needed before fitting or predicting.
housing = fetch_california_housing()
X_housing = housing.data[:, [0]]  # use only the first feature
y_housing = housing.target

# Hold out 20% of the samples for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_housing,
    y_housing,
    test_size=0.2,
    random_state=42,
)

# Fit on the training split.
model_housing = LinearRegression()
model_housing.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_test = model_housing.predict(X_test)
print(f"R² 得分: {model_housing.score(X_test, y_test):.4f}")
print(f"MSE: {mean_squared_error(y_test, y_pred_test):.2f}")