In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Libraries used by the rest of the notebook: data handling, the model,
# and missing-value imputation.
import pandas as pd
import numpy as np
# NOTE(review): train_test_split is not used in the cells shown — confirm it is still needed.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer  # required by the mean-imputation step further down
# NOTE(review): mean_absolute_error is not used in the cells shown — confirm it is still needed.
from sklearn.metrics import mean_absolute_error # optional: for scoring a held-out validation split

print("库导入完成。")

In [None]:
# Locations of the competition CSVs and of the submission file written at the end.
train_path = '/kaggle/input/home-data-for-ml-course/train.csv'
test_path = '/kaggle/input/home-data-for-ml-course/test.csv'
submission_path = 'submission.csv'

print("开始加载数据...")
try:
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
except FileNotFoundError:
    print("错误：找不到 train.csv 或 test.csv 文件。")
    print("请确保文件路径正确，并且数据已添加到您的环境中。")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down rather than stopping with a readable traceback, and
    # every later cell would then fail on the missing train_data/test_data.
    raise
else:
    # Only reached when both files were read successfully.
    print("数据加载成功！")
    print(f"训练数据形状: {train_data.shape}")
    print(f"测试数据形状: {test_data.shape}")

In [None]:
# Target: the sale price column of the training frame.
y = train_data['SalePrice']
print("目标变量 'SalePrice' 已分离。")

# --- Feature selection (numeric columns only, to keep things simple) ---
# Keep the test Ids aside for the submission file; Id itself is not a feature.
test_ids = test_data['Id']

# Training features: everything except the target and the Id column.
X = train_data.drop(columns=['SalePrice', 'Id'])
# Test features: everything except the Id column.
X_test = test_data.drop(columns='Id')

# Restrict both frames to their numeric-dtype columns.
X_numerical = X.select_dtypes(include=[np.number])
X_test_numerical = X_test.select_dtypes(include=[np.number])

print(f"已选择 {len(X_numerical.columns)} 个数值特征。")
# To inspect the selection, look at X_numerical.columns.tolist()

In [None]:
# Keep only the numeric columns present in BOTH the training and test frames.
# The intersection is built by walking the training columns in order rather
# than by intersecting two sets: iteration order of a set of strings can differ
# between interpreter sessions (hash randomization), which would shuffle the
# feature layout from run to run and can change the fitted model despite the
# fixed random_state used later.
test_numeric_cols = set(X_test_numerical.columns)
shared_cols = [col for col in X_numerical.columns if col in test_numeric_cols]
X_numerical = X_numerical[shared_cols]
X_test_numerical = X_test_numerical[shared_cols]

print(f"对齐后的特征数量: {len(shared_cols)}")


In [None]:
# Fill missing values with the per-column mean ('median' is another strategy to try).
imputer = SimpleImputer(strategy='mean')

print("开始使用均值填充缺失值...")
# The fill values are learned from the training features only ...
X_imputed = imputer.fit_transform(X_numerical)
# ... and then re-applied unchanged to the test features (transform, no refit).
X_test_imputed = imputer.transform(X_test_numerical)

# The imputer returns NumPy arrays; wrap them back into DataFrames with the
# shared column names (optional, but convenient downstream).
X_imputed_df = pd.DataFrame(data=X_imputed, columns=shared_cols)
X_test_imputed_df = pd.DataFrame(data=X_test_imputed, columns=shared_cols)

print("使用均值插补处理缺失值完成。")


In [None]:
print("数据预处理阶段完成。")

# ---------------------------------------------------------------------------
# 4. Model selection & training
# ---------------------------------------------------------------------------
print("\n选择并训练模型...")

# A random forest regressor is a reasonable starting point.
rf_params = {
    'n_estimators': 100,  # number of trees
    'random_state': 0,    # fixed seed for repeatable results
    'n_jobs': -1,         # use every available CPU core
}
model = RandomForestRegressor(**rf_params)

print("开始训练随机森林回归模型...")
# Fit on the imputed training features against the sale-price target.
model.fit(X_imputed_df, y)
print("模型训练完成。")


In [None]:
# Run the fitted model over the imputed test features.
print("\n在测试数据上生成预测...")
predictions = model.predict(X_test_imputed_df)
print("预测生成完成。")

In [None]:
print("\n创建提交文件...")

# --- Assemble the submission table: one row per test Id with its prediction ---
submission_columns = {
    'Id': test_ids,
    'SalePrice': predictions,
}
submission_df = pd.DataFrame(submission_columns)

# --- (Optional) sanity check ---
# If needed, predictions could be clamped to non-negative values here, e.g.
# submission_df['SalePrice'] = submission_df['SalePrice'].clip(lower=0)

# --- Write the CSV; index=False keeps the DataFrame index out of the file ---
submission_df.to_csv(submission_path, index=False)

print(f"提交文件已成功创建于: {submission_path}")
print("提交文件前几行内容:")
print(submission_df.head())