In [44]:
import pandas as pd

# 文件路径
train_path = "/home/lc/code/ai/house_pred/train.csv"
test_path = "/home/lc/code/ai/house_pred/test.csv"
# 读取文件
train_data = pd.read_csv(train_path, index_col="Id")
test_data = pd.read_csv(test_path, index_col="Id")

In [45]:
# 选择预测目标和特征变量
y = train_data.SalePrice
X = train_data.drop(["SalePrice"], axis="columns")

In [46]:
from sklearn.impute import SimpleImputer

# 分离数值型和字符型数据
numeric_X = X.select_dtypes(include=["int64", "float64"])
numeric_X_test = test_data.select_dtypes(include=["int64", "float64"])

str_X = X.select_dtypes(include=["object"])
str_X_test = test_data.select_dtypes(include=["object"])

# 对数值型数据进行mean处理
numeric_imputer = SimpleImputer(strategy="mean")
numeric_imputed_X = pd.DataFrame(
    numeric_imputer.fit_transform(numeric_X), columns=numeric_X.columns, index=X.index
)
numeric_imputed_X_test = pd.DataFrame(
    numeric_imputer.transform(numeric_X_test),
    columns=numeric_X.columns,
    index=numeric_X_test.index,
)

In [47]:
# 对字符型数据进行处理
str_imputer = SimpleImputer(strategy="most_frequent")
str_imputed_X = pd.DataFrame(
    str_imputer.fit_transform(str_X), columns=str_X.columns, index=X.index
)
str_imputed_X_test = pd.DataFrame(
    str_imputer.transform(str_X_test), columns=str_X.columns, index=str_X_test.index
)

# 数据合并
imputed_X = pd.concat([str_imputed_X, numeric_imputed_X], axis="columns")
imputed_X_test = pd.concat([str_imputed_X_test, numeric_imputed_X_test], axis="columns")

# 确保列名一致
imputed_X.columns = X.columns
imputed_X_test.columns = test_data.columns

In [48]:
# 划分训练集和测试集
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(imputed_X, y, random_state=0)

In [49]:
# 选择基数较少的列进行独热编码
categorical_cols = [
    cols
    for cols in train_X.columns
    if train_X[cols].nunique() < 10 and train_X[cols].dtype == "object"
]

from sklearn.preprocessing import OneHotEncoder

# 创建并拟合独热编码器
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
onehot_encoder.fit(train_X[categorical_cols])

# 对训练集进行独热编码
onehot_train_col = pd.DataFrame(
    onehot_encoder.transform(train_X[categorical_cols]), index=train_X.index
)

# 对验证集进行独热编码
onehot_val_col = pd.DataFrame(
    onehot_encoder.transform(val_X[categorical_cols]), index=val_X.index
)

# 对测试集进行独热编码
onehot_test_col = pd.DataFrame(
    onehot_encoder.transform(imputed_X_test[categorical_cols]),
    index=imputed_X_test.index,
)

# 删除原来的分类列
train_X_col = train_X.drop(categorical_cols, axis="columns")
val_X_col = val_X.drop(categorical_cols, axis="columns")
test_X_col = imputed_X_test.drop(categorical_cols, axis="columns")

# 合并独热编码后的列和原来的数值列
train_X = pd.concat([onehot_train_col, train_X_col], axis="columns")
val_X = pd.concat([onehot_val_col, val_X_col], axis="columns")
X_test = pd.concat([onehot_test_col, test_X_col], axis="columns")

# 确保列名都是字符串
train_X.columns = train_X.columns.astype(str)
val_X.columns = val_X.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [50]:
imputed_X.dtypes

MSSubClass        object
MSZoning          object
LotFrontage       object
LotArea           object
Street            object
                  ...   
MiscVal          float64
MoSold           float64
YrSold           float64
SaleType         float64
SaleCondition    float64
Length: 79, dtype: object

In [51]:
# 找出不属于float64类型的数据
train_X_dtype = [col for col in train_X.columns if train_X[col].dtype != "float64"]
print(train_X_dtype)
# TODO:删除object类型的列，还未学习到处理该列的过程
cols_to_drop = ["Utilities", "HouseStyle", "OverallQual"]
train_X = train_X.drop(cols_to_drop, axis="columns")
val_X = val_X.drop(cols_to_drop, axis="columns")
X_test = X_test.drop(cols_to_drop, axis="columns")

['Utilities', 'HouseStyle', 'OverallQual']


In [52]:
# 检查数据框的所有列类型
num_cols = val_X.select_dtypes(include=["number"])

# 判断是否所有列都是数值型
is_all_numeric = len(num_cols.columns) == len(val_X.columns)

print("所有列是否都是数值型:", is_all_numeric)

所有列是否都是数值型: True


In [53]:
from sklearn.ensemble import RandomForestRegressor
import optuna
import numpy as np
from sklearn.model_selection import cross_val_score


# 建立随机森林模型
def randomtree(trial):
    # 进行超参数搜索
    # 树的数量
    n_estimators = trial.suggest_int("n_estimators", 10, 300)
    # 树的深度
    max_depth = trial.suggest_int("max_depth", 2, 32)
    # 每棵树使用的最大特征数（使用浮点数）
    max_features = trial.suggest_float("max_features", 0.1, 1.0)
    # 内部节点划分所需的最小样本数
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    # 叶节点所需的最小样本数
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )

    # 计算交叉验证的平均误差
    MAE = -np.mean(
        cross_val_score(
            model, train_X, train_y, cv=5, scoring="neg_mean_absolute_error"
        )
    )

    return MAE


# 创建Optuna对象选择最佳超参数，设定优化方向为最小化 MAE
param_study = optuna.create_study(direction="minimize")

# 定义训练次数
param_study.optimize(randomtree, n_trials=100)

# 输出最优超参数
print("最优超参数为：", param_study.best_params)
print("最小误差为：", param_study.best_value)

[32m[I 2024-09-24 12:25:11,500][0m A new study created in memory with name: no-name-46e8b441-6b0e-4e74-a7bd-5dcab39be60c[0m
[32m[I 2024-09-24 12:25:13,454][0m Trial 0 finished with value: 18084.38733262585 and parameters: {'n_estimators': 138, 'max_depth': 11, 'max_features': 0.4172286310596467, 'min_samples_split': 18, 'min_samples_leaf': 3}. Best is trial 0 with value: 18084.38733262585.[0m
[32m[I 2024-09-24 12:25:16,061][0m Trial 1 finished with value: 18441.25606142413 and parameters: {'n_estimators': 178, 'max_depth': 23, 'max_features': 0.5041668777986215, 'min_samples_split': 14, 'min_samples_leaf': 9}. Best is trial 0 with value: 18084.38733262585.[0m
[32m[I 2024-09-24 12:25:16,553][0m Trial 2 finished with value: 19860.56324062345 and parameters: {'n_estimators': 35, 'max_depth': 6, 'max_features': 0.5814407295858671, 'min_samples_split': 2, 'min_samples_leaf': 12}. Best is trial 0 with value: 18084.38733262585.[0m
[32m[I 2024-09-24 12:25:19,614][0m Trial 3 finis

最优超参数为： {'n_estimators': 199, 'max_depth': 19, 'max_features': 0.336962229416895, 'min_samples_split': 5, 'min_samples_leaf': 1}
最小误差为： 16982.810666280282


In [54]:
# 选择最佳超参数
best_params = param_study.best_params

# 定义模型
best_model = RandomForestRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    max_features=best_params["max_features"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    random_state=0,
)

best_model.fit(train_X, train_y)
# 验证集进行预测
best_pred = best_model.predict(X_test)
# 保存输出
output = pd.DataFrame({"Id": X_test.index, "SalePrice": best_pred})

output.to_csv("submit.csv", index=False)

In [55]:
output

Unnamed: 0,Id,SalePrice
0,1461,128578.086009
1,1462,154034.017286
2,1463,181128.766881
3,1464,187357.927054
4,1465,199931.002056
...,...,...
1454,2915,88085.053403
1455,2916,89395.901085
1456,2917,155574.145166
1457,2918,112822.293317
