In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  # 新增：缺失值填充器
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score

# ----------------------
# 1. 加载数据并分离特征和目标
# ----------------------
train_data = pd.read_csv('/kaggle/input/playground-series-s5e3/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s5e3/test.csv')

X_train = train_data.drop(['id', 'day', 'rainfall'], axis=1)  # 训练特征（不含id、day、目标）
y_train = train_data['rainfall']                              # 训练目标（降雨量）
X_test = test_data.drop(['id', 'day'], axis=1)                 # 测试特征（不含id、day）


# ----------------------
# 2. 划分训练集和验证集
# ----------------------
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


# ----------------------
# 3. 预处理：填充缺失值 + 标准化（关键！）
# ----------------------
# 初始化填充器（用训练集的中位数填充缺失值）
imputer = SimpleImputer(strategy='median')  # 可选 'mean'（均值）、'most_frequent'（众数）

# 用训练集拟合填充器（记录训练集的中位数）
imputer.fit(X_tr)  # 注意：仅用训练集拟合，避免数据泄露

# 填充训练集、验证集、测试集的缺失值
X_tr_imputed = imputer.transform(X_tr)  # 训练集填充
X_val_imputed = imputer.transform(X_val)  # 验证集填充（使用训练集的中位数）
X_test_imputed = imputer.transform(X_test)  # 测试集填充（使用训练集的中位数）

# 标准化填充后的数据（仅对线性模型需要）
scaler = StandardScaler()
scaler.fit(X_tr_imputed)  # 用训练集填充后的数据拟合标准化器

# 标准化训练集、验证集、测试集
X_tr_scaled = scaler.transform(X_tr_imputed)
X_val_scaled = scaler.transform(X_val_imputed)
X_test_scaled = scaler.transform(X_test_imputed)  # 测试集标准化（使用训练集的均值/标准差）


# ----------------------
# 4. 定义模型并训练
# ----------------------
models = [
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge(alpha=1.0)),
    ('Lasso', Lasso(alpha=0.1)),
    ('RandomForest', RandomForestRegressor(n_estimators=200, random_state=42)),
    ('XGBoost', XGBRegressor(n_estimators=200, random_state=42, verbosity=0)),
    ('LightGBM', LGBMRegressor(n_estimators=200, random_state=42))
]

best_r2 = -np.inf
best_model = None
best_name = ""

for name, model in models:
    # 线性模型使用标准化后的数据，树模型使用原始填充后的数据（可选优化）
    if name in ['LinearRegression', 'Ridge', 'Lasso']:
        model.fit(X_tr_scaled, y_tr)
        val_pred = model.predict(X_val_scaled)
    else:
        model.fit(X_tr_imputed, y_tr)  # 树模型直接使用填充后的原始数据（无需标准化）
        val_pred = model.predict(X_val_imputed)
    
    # 确保预测值非负（降雨量不能为负）
    val_pred = np.maximum(val_pred, 0)
    
    # 计算R²分数
    current_r2 = r2_score(y_val, val_pred)
    
    # 更新最优模型
    if current_r2 > best_r2:
        best_r2 = current_r2
        best_model = model
        best_name = name

    print(f"模型：{name}，验证集R²: {current_r2:.4f}")


# ----------------------
# 5. 用最优模型预测测试集（关键：测试集已填充+标准化）
# ----------------------
if best_name in ['LinearRegression', 'Ridge', 'Lasso']:
    # 线性模型使用标准化后的测试数据（已填充缺失值）
    test_pred = best_model.predict(X_test_scaled)
else:
    # 树模型使用填充后的原始测试数据（无需标准化）
    test_pred = best_model.predict(X_test_imputed)

# 确保预测值非负
test_pred = np.maximum(test_pred, 0)


# ----------------------
# 6. 保存结果
# ----------------------
res_df = pd.DataFrame({'id': test_data['id'], 'rainfall': test_pred})
res_df.to_csv('/kaggle/working/aab.csv', index=False)
print("预测结果已保存至 /kaggle/working/aac.csv")

模型：LinearRegression，验证集R²: 0.4088
模型：Ridge，验证集R²: 0.4088
模型：Lasso，验证集R²: 0.3354
模型：RandomForest，验证集R²: 0.3895
模型：XGBoost，验证集R²: 0.2860
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1310
[LightGBM] [Info] Number of data points in the train set: 1752, number of used features: 10
[LightGBM] [Info] Start training from score 0.759703
模型：LightGBM，验证集R²: 0.3511
预测结果已保存至 /kaggle/working/aac.csv
