In [15]:
import pandas as pd
import os
import csv

import math
import numpy as np
import matplotlib.pyplot as plt

读取训练集和测试集的csv文件

In [16]:
train_data = pd.read_csv('./airfoil_noise_samples.csv')
test_data = pd.read_csv('./airfoil_noise_test.csv')
train_data.head()

Unnamed: 0,Frequency,Angle,Displacement,Chord length,Velocity,Thickness,Sound Pressure
0,2175.611424,15.138865,21.075119,0.088194,66.764401,0.044771,122.365215
1,2962.92362,13.400893,13.200889,0.048462,78.221903,0.011041,129.296236
2,4430.810843,2.164599,13.959536,0.226743,57.053201,0.011499,121.82738
3,4939.695645,13.857682,18.203793,0.021705,23.896377,0.021475,114.998132
4,2193.979785,9.298757,11.007713,0.052737,38.917034,0.001741,125.639641


定义特征和目标变量

In [17]:
X_train = train_data.iloc[:, :-1]
Y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]
Y_test = test_data.iloc[:, -1]

数据归一化，防止数据过拟合

In [18]:
#归一化
X_train = (X_train - X_train.mean()) / X_train.std()
X_test = (X_test - X_test.mean()) / X_test.std()
X_train.head()

Unnamed: 0,Frequency,Angle,Displacement,Chord length,Velocity,Thickness
0,-0.275324,1.240614,1.602819,-0.558959,0.93419,2.181587
1,0.00762,0.92848,-0.074895,-0.98941,1.63119,-0.232215
2,0.53515,-1.089521,0.086745,0.942044,0.343425,-0.19943
3,0.718034,1.010518,0.991043,-1.27929,-1.673619,0.514487
4,-0.268723,0.19175,-0.542182,-0.943098,-0.759861,-0.897771


使用Lasso regression训练

In [19]:
#设置学习率和迭代次数
learning_rate = 0.001
epochs = 4000
alphas = [0.001, 0.01, 0.1, 1, 10]

# Lasso cost
def lasso_cost(X, y, w, b, alpha):
    m = len(y)
    predictions = X.dot(w) + b
    cost = (1/2*m) * np.sum(np.square(predictions-y))
    l1_penalty = alpha * (np.sum(np.abs(w)) + np.abs(b))
    return cost + l1_penalty

# Lasso gradient
def lasso_gradient(X, y, w, b, alpha):
    m = len(y)
    h = X.dot(w) + b
    dw = (1/m) * X.T.dot(h - y) + alpha * np.sign(w)
    db = (1/m) * np.sum(h - y) + alpha * np.sign(b)
    return dw, db

# Gradient descent
def gradient_descent(X, y, alpha, learning_rate, epochs):
    w = np.zeros(X.shape[1])
    b = 0
    for _ in range(epochs):
        dw, db = lasso_gradient(X, y, w, b, alpha)
        w -= learning_rate * dw
        b -= learning_rate * db
    return w, b

# 5-Fold CV
def cross_validate(X, y, alphas, learning_rate, epochs):
    fold_size = len(y) // 5
    best_alpha = None
    best_mse = float('inf')
    
    for alpha in alphas:
        mse_sum = 0
        for i in range(5):
            X_train = np.vstack([X[:i*fold_size], X[(i+1)*fold_size:]])
            y_train = np.hstack([y[:i*fold_size], y[(i+1)*fold_size:]])
            X_val = X[i*fold_size:(i+1)*fold_size]
            y_val = y[i*fold_size:(i+1)*fold_size]
            
            w, b = gradient_descent(X_train, y_train, alpha, learning_rate, epochs)
            mse = np.mean((X_val.dot(w) + b - y_val)**2)
            mse_sum += mse
        
        if mse_sum < best_mse:
            best_mse = mse_sum
            best_alpha = alpha
    
    return best_alpha

# 测试模型
best_alpha = cross_validate(X_train, Y_train, alphas, learning_rate, epochs)
print("best_alpha:", best_alpha)

best_alpha: 0.001


In [20]:
w, b = gradient_descent(X_train, Y_train, best_alpha, learning_rate, epochs)
# 测试模型
Y_pred_test = np.dot(X_test, w) + b

使用R^2分数和MSE来评估模型在测试集上的表现：

In [21]:

mse = np.mean((Y_test - Y_pred_test) ** 2)
r2_score = 1 - np.sum((Y_test - Y_pred_test) ** 2) / np.sum((Y_test - np.mean(Y_test)) ** 2)

# 输出结果
print(f"MSE on test set: {mse}")
print(f"R^2 Score on test set: {r2_score}")

MSE on test set: 7.608439599011104
R^2 Score on test set: 0.6484749242580675
