# In [None]:
import pandas as pd
import numpy as np
import cudf  
from cuml.ensemble import RandomForestRegressor  
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
import os
import matplotlib.pyplot as plt
import pickle


# Run-wide settings.
# 'num_estimators', 'max_depth' and 'random_state' are passed to the random
# forests, 'random_state' also seeds the K-fold split, and 'save_path' is the
# output directory for models, scalers, CSV files and figures.
# The remaining keys are not read by the code visible in this file.
config = dict(
    num_estimators=100,
    max_depth=10,
    random_state=42,
    batch_size=4,
    save_path='./saved_models/dual_guass_rf',
    num_epochs=1,
    patience=30,
    head_target_col='head',
    conc_target_col='concentration',
)


# Report which device torch can see; the first CUDA device is preferred and
# the CPU is the fallback.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f"使用设备: {device}")


def _to_host_array(values):
    """Return *values* as a flat NumPy array in host memory."""
    if hasattr(values, 'to_pandas'):
        # Objects exposing ``to_pandas`` (e.g. cuDF) are copied to the host
        # through pandas.
        values = values.to_pandas().values
    elif hasattr(values, 'values') and not callable(values.values):
        # pandas objects. This must be tested before ``get``: a pandas Series
        # also has ``get(key)``, so calling ``get()`` on it raises TypeError.
        values = values.values
    elif hasattr(values, 'get'):
        # Array types whose ``get()`` returns a host copy (e.g. CuPy).
        values = values.get()
    return np.asarray(values).ravel()


def compute_metrics(y_true, y_pred):
    """Compute regression metrics for host or device array inputs.

    Both inputs are converted to flat NumPy arrays, then every pair in which
    either value is NaN or infinite is discarded before scoring.

    Args:
        y_true: Ground-truth values (list, NumPy array, pandas, cuDF or CuPy).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        A dict with keys ``'mse'``, ``'rmse'``, ``'mae'`` and ``'r2'``.
        When no finite pair remains, the error metrics are ``inf`` and
        ``'r2'`` is ``-inf``.

    Raises:
        ValueError: If the two inputs do not have the same number of elements.
    """
    y_true = _to_host_array(y_true)
    y_pred = _to_host_array(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true 与 y_pred 长度不一致: {y_true.shape[0]} vs {y_pred.shape[0]}"
        )

    # isfinite is False for NaN and for +/-inf, i.e. exactly the pairs to drop.
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    y_true_clean = y_true[mask]
    y_pred_clean = y_pred[mask]

    if len(y_true_clean) == 0:
        return {'mse': np.inf, 'rmse': np.inf, 'mae': np.inf, 'r2': -np.inf}

    mse = mean_squared_error(y_true_clean, y_pred_clean)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true_clean, y_pred_clean)
    r2 = r2_score(y_true_clean, y_pred_clean)

    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}

def _add_lag_features(data):
    """Add one- and two-step lagged head/concentration columns per (row, col) cell.

    ``data`` must already be ordered by ``time_step`` within each cell.
    Where a cell has no earlier record to look back to, the lag is filled
    with that cell's first value.
    """
    cells = data.groupby(['row', 'col'], sort=False)
    lagged = {}
    for source_col, suffix in (('head', 'head'), ('concentration', 'conc')):
        first_value = cells[source_col].transform('first')
        for lag, prefix in ((1, 'prev'), (2, 'prev2')):
            shifted = cells[source_col].shift(lag)
            lagged[f'{prefix}_{suffix}'] = shifted.fillna(first_value).astype(np.float32)
    # The results are index-aligned Series, so each value lands on its own row.
    for name, values in lagged.items():
        data[name] = values
    return data


def _scale_continuous_features(features, feature_cols):
    """Standardize the non-mask columns of *features* and return (scaled, scaler)."""
    mask_cols = {'well_mask', 'chd_mask', 'lytyp', 'conc_mask'}
    continuous = [i for i, col in enumerate(feature_cols) if col not in mask_cols]
    scaler = StandardScaler()
    scaled = features.copy()
    scaled[:, continuous] = scaler.fit_transform(features[:, continuous])
    # Replace any non-finite values left after scaling.
    scaled = np.nan_to_num(scaled, nan=0.0, posinf=1e6, neginf=-1e6)
    return scaled, scaler


def preprocess_data_for_rf(data):
    """Build scaled feature matrices and targets for the head and concentration models.

    Steps: validate the columns, drop rows with missing values, cast dtypes,
    shift ``time_step`` so it starts at 0, sort by (row, col, time_step), add
    lagged head/concentration features per cell, and standardize the
    continuous features (the uint8 mask columns are left unscaled).

    Args:
        data: pandas DataFrame holding every column in ``required_cols``.

    Returns:
        A 9-tuple ``(X_head_scaled, X_conc_scaled, y_head, y_conc,
        head_scaler, conc_scaler, head_feature_cols, conc_feature_cols,
        index_data)``. ``index_data`` holds the ``row``, ``col`` and
        ``time_step`` of every sample, in the same order as the matrices.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If no row is left after dropping missing values.
    """
    required_cols = [
        'row', 'col', 'time_step', 'x', 'y', 'top', 'bottom', 'K', 'recharge', 'ET',
        'river_stage', 'river_cond', 'river_rbot', 'well_rate', 'well_mask',
        'chd_mask', 'lytyp', 'conc_mask', 'head', 'concentration'
    ]

    missing_cols = [col for col in required_cols if col not in data.columns]
    if missing_cols:
        raise KeyError(f"缺少必要的列: {missing_cols}")

    print(f"原始数据形状: {data.shape}")

    # Drop incomplete rows BEFORE casting: a NaN in a mask column cannot be
    # cast to uint8 and would abort the whole run.
    data = data.dropna(subset=required_cols)
    print(f"处理缺失值后数据形状: {data.shape}")
    if data.empty:
        raise ValueError("处理缺失值后没有可用数据")

    data = data.astype({
        'x': np.float32, 'y': np.float32, 'top': np.float32,
        'bottom': np.float32, 'K': np.float32, 'recharge': np.float32,
        'ET': np.float32, 'river_stage': np.float32, 'river_cond': np.float32,
        'river_rbot': np.float32, 'well_rate': np.float32, 'well_mask': np.uint8,
        'chd_mask': np.uint8, 'lytyp': np.uint8, 'head': np.float32,
        'concentration': np.float32, 'conc_mask': np.uint8
    })

    # Make time steps start at zero.
    data['time_step'] = data['time_step'] - data['time_step'].min()

    # Sort so each cell's records are contiguous and time-ordered. The index
    # is reset so row labels equal row positions from here on.
    data = data.sort_values(['row', 'col', 'time_step']).reset_index(drop=True)

    base_feature_cols = [
        'x', 'y', 'top', 'bottom', 'K', 'recharge', 'ET',
        'river_stage', 'river_cond', 'river_rbot', 'well_rate', 'well_mask',
        'chd_mask', 'lytyp'
    ]
    conc_base_feature_cols = base_feature_cols + ['conc_mask']

    print("计算历史特征...")
    data = _add_lag_features(data)
    print("历史特征计算完成")

    # The head model sees the base features plus its own two lags.
    head_feature_cols = base_feature_cols + ['prev_head', 'prev2_head']
    X_head = data[head_feature_cols].values.astype(np.float32)

    # The concentration model additionally sees conc_mask and the concentration lags.
    conc_feature_cols = conc_base_feature_cols + ['prev_head', 'prev2_head', 'prev_conc', 'prev2_conc']
    X_conc = data[conc_feature_cols].values.astype(np.float32)

    print(f"水头模型特征维度: {X_head.shape[1]} (期望16维)")
    print(f"浓度模型特征维度: {X_conc.shape[1]} (期望19维)")

    X_head_scaled, head_scaler = _scale_continuous_features(X_head, head_feature_cols)
    X_conc_scaled, conc_scaler = _scale_continuous_features(X_conc, conc_feature_cols)

    y_head = data['head'].values.astype(np.float32)
    y_conc = data['concentration'].values.astype(np.float32)

    index_data = data[['row', 'col', 'time_step']].copy()

    print("数据预处理完成!")
    print(f"最终特征形状 - 水头: {X_head_scaled.shape}, 浓度: {X_conc_scaled.shape}")
    print(f"目标变量形状 - 水头: {y_head.shape}, 浓度: {y_conc.shape}")

    return (X_head_scaled, X_conc_scaled, y_head, y_conc,
            head_scaler, conc_scaler, head_feature_cols, conc_feature_cols, index_data)

def _average_metrics(metrics_list):
    """Average every metric key over the per-fold metric dictionaries."""
    return {key: np.mean([m[key] for m in metrics_list]) for key in metrics_list[0]}


def _attach_pred_head(base_features, pred_head):
    """Return a copy of *base_features* with the head prediction added as ``pred_head``."""
    augmented = base_features.copy()
    # Assign by position, not by index label. If the prediction carries a
    # different index than the feature frame, label-based assignment would
    # misalign the rows and introduce NaNs.
    augmented['pred_head'] = getattr(pred_head, 'values', pred_head)
    return augmented


def _print_metric_block(title, head_metrics, conc_metrics):
    """Print the averaged head and concentration metrics under *title*."""
    print(title)
    print("\n🔹 水头指标:")
    for k, v in head_metrics.items():
        print(f"{k.upper():<5}: {v:.4f}")
    print("\n🔹 浓度指标:")
    for k, v in conc_metrics.items():
        print(f"{k.upper():<5}: {v:.4f}")


def train_rf(data_path='conc_dual_guass.csv', n_splits=10):
    """Train and evaluate the head and concentration forests with K-fold CV.

    In each fold a head forest is fitted first. Its predictions are appended
    as the ``pred_head`` feature and a concentration forest is fitted on the
    augmented features. Metrics are computed on the training and test parts
    of every fold and averaged over the folds.

    Files written under ``config['save_path']``: the models of the LAST fold
    only, both scalers, per-sample test predictions, feature-importance
    tables, and two figures.

    Args:
        data_path: CSV file to read with ``pd.read_csv``.
        n_splits: Number of cross-validation folds.

    Returns:
        ``(avg_head_train_metrics, avg_conc_train_metrics,
        avg_head_test_metrics, avg_conc_test_metrics)``; each is a dict of
        metric name to value averaged over the folds.
    """
    cleaned_data = pd.read_csv(data_path)

    (X_head, X_conc_base, y_head, y_conc, scaler_head, scaler_conc,
     head_feature_cols, conc_feature_cols, index_data) = preprocess_data_for_rf(cleaned_data)

    # Convert to cuDF once; the folds only slice these objects.
    X_head_cudf = cudf.DataFrame(X_head)
    X_conc_base_cudf = cudf.DataFrame(X_conc_base)
    y_head_cudf = cudf.Series(y_head)
    y_conc_cudf = cudf.Series(y_conc)

    print(f"初始化 {n_splits} 折交叉验证...")
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=config['random_state'])

    head_train_metrics_all = []
    conc_train_metrics_all = []
    head_test_metrics_all = []
    conc_test_metrics_all = []
    predictions_all = []

    # Importance sums, plus how many folds actually contributed to each sum.
    conc_feature_names = conc_feature_cols + ['pred_head']
    head_feature_importance = np.zeros(X_head.shape[1])
    conc_feature_importance = np.zeros(len(conc_feature_names))
    head_importance_folds = 0
    conc_importance_folds = 0

    # The split depends only on the sample count, so the host array is enough.
    for fold, (train_idx, test_idx) in enumerate(kf.split(X_head), start=1):
        print(f"\n处理第 {fold}/{n_splits} 折...")

        # Reset indices so every split frame is labelled by position.
        X_head_train = X_head_cudf.iloc[train_idx].reset_index(drop=True)
        X_head_test = X_head_cudf.iloc[test_idx].reset_index(drop=True)
        y_head_train = y_head_cudf.iloc[train_idx].reset_index(drop=True)
        y_head_test = y_head_cudf.iloc[test_idx].reset_index(drop=True)

        X_conc_train_base = X_conc_base_cudf.iloc[train_idx].reset_index(drop=True)
        X_conc_test_base = X_conc_base_cudf.iloc[test_idx].reset_index(drop=True)
        y_conc_train = y_conc_cudf.iloc[train_idx].reset_index(drop=True)
        y_conc_test = y_conc_cudf.iloc[test_idx].reset_index(drop=True)

        print(f"第 {fold} 折 - 训练集形状: X_head_train={X_head_train.shape}, y_head_train={len(y_head_train)}")
        print(f"第 {fold} 折 - 测试集形状: X_head_test={X_head_test.shape}, y_head_test={len(y_head_test)}")

        print(f"第 {fold} 折 - 检查 X_head_train 中的 NaN 或无穷值:")
        print("NaN:", X_head_train.isnull().sum().sum())
        print("无穷值:", (X_head_train == float('inf')).sum().sum() + (X_head_train == -float('inf')).sum().sum())

        head_rf = RandomForestRegressor(
            n_estimators=config['num_estimators'],
            max_depth=config['max_depth'],
            random_state=config['random_state'],
            n_streams=1
        )

        print(f"第 {fold} 折 - 训练水头随机森林模型...")
        head_rf.fit(X_head_train, y_head_train)

        # Best effort: a failure here is reported and training continues.
        try:
            head_importance = head_rf.get_feature_importances().to_numpy()
            head_feature_importance += head_importance
            head_importance_folds += 1

            print(f"第 {fold} 折 - 水头模型的特征重要性 (前5):")
            head_importance_df = pd.DataFrame({
                'feature': head_feature_cols,
                'importance': head_importance
            }).sort_values('importance', ascending=False)
            print(head_importance_df.head(5))
        except Exception as e:
            print(f"无法获取水头模型的特征重要性: {e}")
            print("跳过特征重要性分析，继续训练...")

        print(f"第 {fold} 折 - 准备浓度训练数据...")
        head_train_pred = head_rf.predict(X_head_train)
        X_conc_train = _attach_pred_head(X_conc_train_base, head_train_pred)

        train_nan_count = X_conc_train['pred_head'].isnull().sum()
        print(f"第 {fold} 折 - pred_head_train 中的 NaN:", train_nan_count)
        if train_nan_count > 0:
            print(f"第 {fold} 折 - 警告: X_conc_train['pred_head'] 包含 {train_nan_count} 个 NaN 值，填充为均值")
            X_conc_train['pred_head'] = X_conc_train['pred_head'].fillna(X_conc_train['pred_head'].mean())

        conc_rf = RandomForestRegressor(
            n_estimators=config['num_estimators'],
            max_depth=config['max_depth'],
            random_state=config['random_state'],
            n_streams=1
        )

        print(f"第 {fold} 折 - 训练浓度随机森林模型...")
        head_test_pred = head_rf.predict(X_head_test)
        X_conc_test = _attach_pred_head(X_conc_test_base, head_test_pred)

        test_nan_count = X_conc_test['pred_head'].isnull().sum()
        if test_nan_count > 0:
            print(f"第 {fold} 折 - 警告: X_conc_test['pred_head'] 包含 {test_nan_count} 个 NaN 值，填充为均值")
            X_conc_test['pred_head'] = X_conc_test['pred_head'].fillna(X_conc_test['pred_head'].mean())

        conc_rf.fit(X_conc_train, y_conc_train)

        try:
            conc_importance = conc_rf.get_feature_importances().to_numpy()
            conc_feature_importance += conc_importance
            conc_importance_folds += 1

            print(f"第 {fold} 折 - 浓度模型的特征重要性 (前5):")
            conc_importance_df = pd.DataFrame({
                'feature': conc_feature_names,
                'importance': conc_importance
            }).sort_values('importance', ascending=False)
            print(conc_importance_df.head(5))
        except Exception as e:
            print(f"无法获取浓度模型的特征重要性: {e}")
            print("跳过特征重要性分析，继续训练...")

        # Evaluate on the training part; the head predictions computed above are reused.
        conc_train_pred = conc_rf.predict(X_conc_train)
        head_train_metrics_all.append(compute_metrics(y_head_train, head_train_pred))
        conc_train_metrics_all.append(compute_metrics(y_conc_train, conc_train_pred))

        # Evaluate on the held-out part.
        conc_test_pred = conc_rf.predict(X_conc_test)
        head_test_metrics_all.append(compute_metrics(y_head_test, head_test_pred))
        conc_test_metrics_all.append(compute_metrics(y_conc_test, conc_test_pred))

        # Keep per-sample test predictions together with their grid location.
        index_test = index_data.iloc[test_idx]
        predictions_all.append(pd.DataFrame({
            'fold': np.full(len(test_idx), fold),
            'row': index_test['row'].values,
            'col': index_test['col'].values,
            'time_step': index_test['time_step'].values,
            'pred_conc': conc_test_pred.to_pandas().values,
            'true_conc': y_conc_test.to_pandas().values,
            'pred_head': head_test_pred.to_pandas().values,
            'true_head': y_head_test.to_pandas().values
        }))

    # NOTE: head_rf / conc_rf here are the models of the last fold only.
    save_dir = config['save_path']
    os.makedirs(save_dir, exist_ok=True)
    head_model_path = os.path.join(save_dir, 'head_rf_model.pkl')
    conc_model_path = os.path.join(save_dir, 'conc_rf_model.pkl')
    try:
        head_rf.save(head_model_path)
        conc_rf.save(conc_model_path)
    except Exception as e:
        print(f"无法使用 cuML 原生方法保存模型: {e}")
        print("尝试使用 pickle 保存...")
        try:
            with open(head_model_path, 'wb') as f:
                pickle.dump(head_rf, f)
            with open(conc_model_path, 'wb') as f:
                pickle.dump(conc_rf, f)
        except Exception as e2:
            print(f"无法使用 pickle 保存模型: {e2}")
            print("模型将不会被保存。")

    with open(os.path.join(save_dir, 'scaler_head.pkl'), 'wb') as f:
        pickle.dump(scaler_head, f)
    with open(os.path.join(save_dir, 'scaler_conc.pkl'), 'wb') as f:
        pickle.dump(scaler_conc, f)

    predictions_df = pd.concat(predictions_all, ignore_index=True)
    predictions_df.to_csv(os.path.join(save_dir, 'test_predictions_rf_kfold.csv'), index=False)

    try:
        # Average over the folds that actually reported importances.
        if head_importance_folds:
            head_feature_importance /= head_importance_folds
        if conc_importance_folds:
            conc_feature_importance /= conc_importance_folds

        head_importance_final = pd.DataFrame({
            'feature': head_feature_cols,
            'importance': head_feature_importance
        }).sort_values('importance', ascending=False)

        conc_importance_final = pd.DataFrame({
            'feature': conc_feature_names,
            'importance': conc_feature_importance
        }).sort_values('importance', ascending=False)

        head_importance_final.to_csv(os.path.join(save_dir, 'head_feature_importance.csv'), index=False)
        conc_importance_final.to_csv(os.path.join(save_dir, 'conc_feature_importance.csv'), index=False)

        plt.figure(figsize=(12, 8))

        plt.subplot(2, 1, 1)
        head_top10 = head_importance_final.head(10)
        plt.barh(head_top10['feature'], head_top10['importance'])
        plt.title('水头模型特征重要性 (前10)')
        plt.xlabel('重要性')
        plt.gca().invert_yaxis()  # most important feature on top

        plt.subplot(2, 1, 2)
        conc_top10 = conc_importance_final.head(10)
        plt.barh(conc_top10['feature'], conc_top10['importance'])
        plt.title('浓度模型特征重要性 (前10)')
        plt.xlabel('重要性')
        plt.gca().invert_yaxis()

        plt.tight_layout()
        plt.savefig(os.path.join(save_dir, 'feature_importance.png'), dpi=300)

    except Exception as e:
        print(f"无法保存或绘制特征重要性: {e}")
        print("跳过特征重要性分析，继续评估...")

    avg_head_train_metrics = _average_metrics(head_train_metrics_all)
    avg_conc_train_metrics = _average_metrics(conc_train_metrics_all)
    avg_head_test_metrics = _average_metrics(head_test_metrics_all)
    avg_conc_test_metrics = _average_metrics(conc_test_metrics_all)

    _print_metric_block(f"\n📊 平均训练集结果 ({n_splits}折交叉验证):",
                        avg_head_train_metrics, avg_conc_train_metrics)
    _print_metric_block(f"\n📊 平均测试集结果 ({n_splits}折交叉验证):",
                        avg_head_test_metrics, avg_conc_test_metrics)

    # Scatter plots of predicted against true values over all test folds.
    try:
        plt.figure(figsize=(12, 10))

        plt.subplot(2, 1, 1)
        all_pred_head = predictions_df['pred_head'].values
        all_true_head = predictions_df['true_head'].values
        plt.scatter(all_true_head, all_pred_head, alpha=0.3)
        min_val = min(all_true_head.min(), all_pred_head.min())
        max_val = max(all_true_head.max(), all_pred_head.max())
        plt.plot([min_val, max_val], [min_val, max_val], 'r--')  # y = x reference line
        plt.title(f'水头预测 vs 真实值 (R² = {avg_head_test_metrics["r2"]:.4f})')
        plt.xlabel('真实水头')
        plt.ylabel('预测水头')

        plt.subplot(2, 1, 2)
        all_pred_conc = predictions_df['pred_conc'].values
        all_true_conc = predictions_df['true_conc'].values
        plt.scatter(all_true_conc, all_pred_conc, alpha=0.3)
        min_val = min(all_true_conc.min(), all_pred_conc.min())
        max_val = max(all_true_conc.max(), all_pred_conc.max())
        plt.plot([min_val, max_val], [min_val, max_val], 'r--')
        plt.title(f'浓度预测 vs 真实值 (R² = {avg_conc_test_metrics["r2"]:.4f})')
        plt.xlabel('真实浓度')
        plt.ylabel('预测浓度')

        plt.tight_layout()
        plt.savefig(os.path.join(save_dir, 'predictions_vs_truth.png'), dpi=300)
    except Exception as e:
        print(f"无法绘制预测图表: {e}")
        print("跳过图表绘制，继续返回结果...")

    return avg_head_train_metrics, avg_conc_train_metrics, avg_head_test_metrics, avg_conc_test_metrics


# Run the cross-validated training pipeline and keep the four averaged
# metric dictionaries at module level for later inspection.
(
    avg_head_train_metrics,
    avg_conc_train_metrics,
    avg_head_test_metrics,
    avg_conc_test_metrics,
) = train_rf()