In [1]:
import pandas as pd

# 加载数据集
file_path = './瓜子二手车.csv'
data = pd.read_csv(file_path)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# 选择相关特征和目标变量
features = data.drop(columns=['售价'])
target = data['售价']

# 分别处理分类和数值列
categorical_cols = features.select_dtypes(include=['object', 'bool']).columns
numerical_cols = features.select_dtypes(include=['float64', 'int64']).columns

# 数值数据预处理：填补缺失值和标准化数据
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# 分类数据预处理：填补缺失值和应用独热编码
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# 捆绑数值和分类数据的预处理
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# 将数据集分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

# 对训练数据应用预处理流水线
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# 检查处理后的特征形状
X_train_preprocessed.shape, X_test_preprocessed.shape


((6602, 5156), (1651, 5156))

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

# 将稀疏矩阵转换为NumPy数组
X_train_preprocessed_np = X_train_preprocessed.toarray()
X_test_preprocessed_np = X_test_preprocessed.toarray()

# 定义模型
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_preprocessed_np.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # 输出层用于预测
])

# 编译模型
model.compile(optimizer=Adam(), loss='mean_squared_error')

# 训练模型
history = model.fit(X_train_preprocessed_np, y_train, validation_split=0.2, epochs=100, batch_size=128, verbose=1)

# 在测试集上评估模型
test_loss = model.evaluate(X_test_preprocessed_np, y_test, verbose=1)

# 进行预测以计算R²分数
y_pred = model.predict(X_test_preprocessed_np).flatten()
r2_score = 1 - sum((y_test - y_pred) ** 2) / sum((y_test - np.mean(y_test)) ** 2)

# 计算RMSE
rmse = np.sqrt(test_loss)

print(f"Test Loss (MSE): {test_loss}")
print(f"RMSE: {rmse}")
print(f"R² Score: {r2_score}")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 140.9957 - val_loss: 21.4482
Epoch 2/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 19.2122 - val_loss: 11.8037
Epoch 3/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 8.1466 - val_loss: 6.9257
Epoch 4/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 3.7923 - val_loss: 4.5736
Epoch 5/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.7769 - val_loss: 3.9092
Epoch 6/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.0967 - val_loss: 2.8970
Epoch 7/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.5799 - val_loss: 2.5302
Epoch 8/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.4237 - val_loss: 2.3522
Epoch 9/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m