# 实时负载预测与资源管理 - 高级建模 (main3)

本笔记本将使用扩展特征集实现更先进的预测模型。基于处理过的数据，我们将探索更复杂的特征工程和模型架构。

In [1]:
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import os
import joblib
import warnings
warnings.filterwarnings('ignore')

# 机器学习库
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA

# 深度学习库
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout
    from tensorflow.keras.callbacks import EarlyStopping
    tf_available = True
except ImportError:
    print("警告: TensorFlow不可用，LSTM模型将不可用")
    tf_available = False

# Configure the plotting style
plt.style.use('ggplot')
# The seaborn theme is applied after the ggplot style sheet
sns.set(style="whitegrid")
# IPython magic (not plain Python): render figures inline in the notebook
%matplotlib inline

# Configure pandas display options: show every column, at most 100 rows,
# and use a wide console so frames are not wrapped
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

## 1. 数据加载与探索

In [None]:
# Load the dataset with the extended feature set
data_path = 'processed_all_fields_data/c7_user_DrrEIEW_timeseries.csv'

try:
    df = pd.read_csv(data_path)
    print(f"成功读取数据，形状: {df.shape}")

    # Preview the first rows
    display(df.head())

    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in display() would render a stray "None"
    df.info()

    # Summary statistics of the numeric columns
    display(df.describe())

except Exception as e:
    # Best-effort: report the problem. Note that the following cells need `df`
    # and will fail if loading did not succeed.
    print(f"读取数据时出错: {e}")

### 1.1 检查缺失值

In [None]:
# Count missing values per column and express them as a percentage of all rows
missing_values = df.isnull().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# One row per column, worst-affected columns first
missing_df = pd.DataFrame({'缺失值数量': missing_values, '缺失百分比': missing_percentage})
missing_df = missing_df.sort_values('缺失百分比', ascending=False)

# Only show the columns that actually contain missing values
display(missing_df.loc[missing_df['缺失值数量'] > 0])

### 1.2 时间特征检查与转换

In [None]:
# Locate raw timestamp columns: the name contains "time" but not "dt"
time_columns = [
    col for col in df.columns
    if 'time' in col.lower() and 'dt' not in col.lower()
]
print(f"时间相关列: {time_columns}")

for col in time_columns:
    # Only int64/float64 columns are converted; everything else is left alone
    if col not in df.columns or df[col].dtype not in ('int64', 'float64'):
        continue
    # Values are interpreted as microsecond offsets
    df[f'{col}_dt'] = pd.to_datetime(df[col], unit='us')
    print(f"转换列 {col} 为日期时间格式")

# Build a time-indexed, chronologically sorted view when time_dt is available
if 'time_dt' not in df.columns:
    print("没有找到time_dt列，检查时间列转换")
else:
    df_ts = df.set_index('time_dt').sort_index()
    print("已将时间列设为索引并排序")
    display(df_ts.head())

### 1.3 目标变量的分布与时间序列可视化

In [None]:
# Target variables to predict (CPU and memory usage)
target_vars = ['average_usage_cpu', 'average_usage_memory']

# Keep only the targets that are actually present in the dataset
target_vars = [var for var in target_vars if var in df.columns]

if not target_vars:
    print("目标变量不存在于数据集中，请检查列名")
else:
    n_targets = len(target_vars)

    # Distribution of every target: histogram with a KDE overlay
    fig, axes = plt.subplots(n_targets, 1, figsize=(12, 5*n_targets))
    if n_targets == 1:
        # Wrap the single Axes so it can be iterated like the multi-plot case
        axes = [axes]

    for ax, var in zip(axes, target_vars):
        sns.histplot(df[var], ax=ax, kde=True)
        ax.set_title(f'{var} Distribution')
        ax.set_xlabel(var)

    plt.tight_layout()
    plt.show()

    # Every target plotted against time_dt, when that column exists
    if 'time_dt' in df.columns:
        fig, axes = plt.subplots(n_targets, 1, figsize=(16, 6*n_targets))
        if n_targets == 1:
            axes = [axes]

        for ax, var in zip(axes, target_vars):
            ax.plot(df['time_dt'], df[var])
            ax.set_title(f'{var} Time Series')
            ax.set_xlabel('Time')
            ax.set_ylabel(var)

        plt.tight_layout()
        plt.show()

## 2. 特征工程

基于之前的模型结果和数据分析，我们将创建更多高级特征来提升模型性能。

### 2.1 时间特征创建

In [None]:
# Build calendar-based features from a datetime column
def create_time_features(df, time_col='time_dt'):
    """Derive calendar and cyclical features from a datetime column.

    Args:
        df: Input DataFrame.
        time_col: Name of the column holding the timestamps.

    Returns:
        A copy of ``df`` with ``hour_of_day``, ``day_of_week``,
        ``day_of_month``, ``month``, ``is_weekend``, one-hot ``day_part_*``
        columns and sine/cosine encodings of the hour and the weekday.
        If ``time_col`` is not a column of ``df``, ``df`` itself is returned
        unchanged.
    """
    print("\n创建时间特征...")

    # Nothing to do without the time column
    if time_col not in df.columns:
        print(f"列 {time_col} 不存在")
        return df

    # Work on a copy so the caller's frame is left untouched
    out = df.copy()
    out[time_col] = pd.to_datetime(out[time_col])
    print(f"转换 {time_col} 为datetime类型")

    # Plain calendar components
    stamps = out[time_col].dt
    out['hour_of_day'] = stamps.hour
    out['day_of_week'] = stamps.dayofweek
    out['day_of_month'] = stamps.day
    out['month'] = stamps.month

    def flag_weekend(day):
        # 0 = weekday, 1 = weekend (day_of_week 5 or 6)
        return 1 if day >= 5 else 0

    out['is_weekend'] = out['day_of_week'].apply(flag_weekend)

    # Half-open hour ranges [start, end); any other hour counts as night
    part_bounds = (('morning', 5, 12), ('afternoon', 12, 17), ('evening', 17, 22))

    def label_day_part(hour):
        for label, start, end in part_bounds:
            if start <= hour < end:
                return label
        return 'night'

    out['day_part'] = out['hour_of_day'].apply(label_day_part)

    # One-hot encode the day part (the day_part column itself is replaced)
    out = pd.get_dummies(out, columns=['day_part'], prefix='day_part')

    # Cyclical encodings: hour on a 24-step circle, weekday on a 7-step circle
    hour_angle = 2 * np.pi * out['hour_of_day'] / 24
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    weekday_angle = 2 * np.pi * out['day_of_week'] / 7
    out['day_sin'] = np.sin(weekday_angle)
    out['day_cos'] = np.cos(weekday_angle)

    return out

# Run the time-feature builder when the datetime column is available
if 'time_dt' not in df.columns:
    print("无法创建时间特征，缺少time_dt列")
else:
    df = create_time_features(df)
    print("已创建时间特征")

    # Preview a handful of the newly created time features
    time_feature_cols = [
        col
        for col in ['hour_of_day', 'day_of_week', 'is_weekend', 'hour_sin', 'hour_cos']
        if col in df.columns
    ]

    if time_feature_cols:
        display(df[time_feature_cols].head())

### 2.2 滑动窗口特征（滞后特征）

In [None]:
# Lag features computed on the chronologically sorted series
def create_lag_features(df, target_cols, lag_periods=(1, 3, 6, 12, 24), sort_col='time_dt'):
    """Add lagged copies of the target columns.

    Args:
        df: Input DataFrame; it is not modified.
        target_cols: Names of the columns to lag.
        lag_periods: Lags to create, counted in rows of the sorted frame
            (not in time units), so they only correspond to fixed time
            offsets when the samples are evenly spaced.
        sort_col: Column used to order the rows before shifting.

    Returns:
        A copy of ``df`` sorted by ``sort_col`` with one
        ``{target}_lag_{lag}`` column per target/lag pair. The first ``lag``
        rows of each such column are NaN.
    """
    # Shifting is positional, so the rows must be in time order first
    df_sorted = df.sort_values(by=sort_col).copy()

    for target in target_cols:
        for lag in lag_periods:
            df_sorted[f'{target}_lag_{lag}'] = df_sorted[target].shift(lag)

    return df_sorted

# Rolling-window statistics computed on the chronologically sorted series
def create_rolling_features(df, target_cols, windows=(3, 6, 12, 24), sort_col='time_dt'):
    """Add rolling mean/std/min/max columns for the target columns.

    Args:
        df: Input DataFrame; it is not modified.
        target_cols: Names of the columns to aggregate.
        windows: Window sizes, counted in rows of the sorted frame.
        sort_col: Column used to order the rows before rolling.

    Returns:
        A copy of ``df`` sorted by ``sort_col`` with
        ``{target}_rolling_{mean,std,min,max}_{window}`` columns.

    Note:
        Each window ends at, and includes, the current row, so these columns
        contain the current value of the target itself.
    """
    # Rolling windows are positional, so the rows must be in time order first
    df_sorted = df.sort_values(by=sort_col).copy()

    for target in target_cols:
        for window in windows:
            # Build the rolling view once and reuse it for all four statistics.
            # min_periods=1 yields values from the first row on; the standard
            # deviation of a single observation is still NaN.
            rolling = df_sorted[target].rolling(window=window, min_periods=1)
            df_sorted[f'{target}_rolling_mean_{window}'] = rolling.mean()
            df_sorted[f'{target}_rolling_std_{window}'] = rolling.std()
            df_sorted[f'{target}_rolling_min_{window}'] = rolling.min()
            df_sorted[f'{target}_rolling_max_{window}'] = rolling.max()

    return df_sorted

# Build lag and rolling-window features for every target
if not ('time_dt' in df.columns and target_vars):
    print("无法创建时间序列特征，缺少必要的列")
else:
    # Lagged copies of the targets
    df = create_lag_features(df, target_vars)
    print("已创建滞后特征")

    # Rolling statistics of the targets
    df = create_rolling_features(df, target_vars)
    print("已创建滚动窗口特征")

    # Preview the first ten rows of up to five of the new columns
    lag_cols = [col for col in df.columns if 'lag_' in col or 'rolling_' in col]
    lag_cols = lag_cols[:5]
    if lag_cols:
        display(df[lag_cols].head(10))

### 2.3 资源使用率特征

In [None]:
# Utilization ratios: how much of the requested resources is actually used
def create_utilization_features(df):
    """Add usage-to-request ratios for CPU and memory.

    For each resource whose ``average_usage_*`` and ``resource_request_*``
    columns both exist, a ``*_utilization_ratio`` column is added: usage
    divided by request, with infinite results replaced by NaN and values
    capped at 1.0. When both ratios are available,
    ``resource_balance_ratio`` (CPU ratio divided by memory ratio, infinite
    results replaced by NaN) is added as well.

    Returns:
        A copy of ``df`` with the new columns; ``df`` is not modified.
    """
    result = df.copy()
    infinities = [np.inf, -np.inf]

    for resource in ('cpu', 'memory'):
        usage_col = f'average_usage_{resource}'
        request_col = f'resource_request_{resource}'
        if request_col not in df.columns or usage_col not in df.columns:
            continue

        # usage / request; a zero request gives inf, which becomes NaN
        ratio = (result[usage_col] / result[request_col]).replace(infinities, np.nan)
        # Cap at 1 (100% utilization)
        result[f'{resource}_utilization_ratio'] = ratio.clip(upper=1.0)

    have_both = all(
        name in result.columns
        for name in ('cpu_utilization_ratio', 'memory_utilization_ratio')
    )
    if have_both:
        # Balance indicator: close to 1 when CPU and memory are used evenly
        balance = result['cpu_utilization_ratio'] / result['memory_utilization_ratio']
        result['resource_balance_ratio'] = balance.replace(infinities, np.nan)

    return result

# Add the utilization ratios to the working frame
df = create_utilization_features(df)
print("已创建资源使用率特征")

# Preview whichever of the new ratio columns were created
utilization_cols = [
    col
    for col in ['cpu_utilization_ratio', 'memory_utilization_ratio', 'resource_balance_ratio']
    if col in df.columns
]
if utilization_cols:
    display(df[utilization_cols].head())

In [None]:
# Show the working DataFrame as the cell output
df

### 2.4 任务特性特征

In [None]:
# Task-characteristic columns that may be present in the dataset
task_features = [
    col
    for col in ['priority', 'scheduling_class', 'collection_type', 'vertical_scaling', 'instance_index', 'failed']
    if col in df.columns
]

if not task_features:
    print("未找到任务特性特征")
else:
    print(f"发现任务特性特征: {task_features}")

    # A column counts as categorical if it is object-typed or has fewer than
    # 10 distinct values
    categorical_features = [
        col for col in task_features
        if df[col].dtype == 'object' or df[col].nunique() < 10
    ]

    if categorical_features:
        print(f"将进行独热编码的分类特征: {categorical_features}")
        df = pd.get_dummies(df, columns=categorical_features, prefix=categorical_features)

    # Preview the first five columns named after an encoded feature
    dummy_prefixes = tuple(f"{feat}_" for feat in categorical_features)
    new_cols = [col for col in df.columns if col.startswith(dummy_prefixes)]
    if new_cols:
        display(df[new_cols[:5]].head())

### 2.5 内存和CPU效率指标

In [None]:
# CPU and memory efficiency metrics that may be present in the dataset
efficiency_features = [
    col
    for col in ['cycles_per_instruction', 'memory_accesses_per_instruction',
                'assigned_memory', 'page_cache_memory']
    if col in df.columns
]

if not efficiency_features:
    print("未找到效率指标特征")
else:
    print(f"发现效率指标特征: {efficiency_features}")

    # Missing-value count and percentage per metric
    missing = df[efficiency_features].isnull().sum()
    missing_pct = (missing / len(df)) * 100

    for col in efficiency_features:
        miss = missing[col]
        pct = missing_pct[col]
        print(f"{col}: {miss} 缺失值 ({pct:.2f}%)")

        # Impute with the median only when less than half of the values are missing
        if pct >= 50:
            continue
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)
        print(f"  - 使用中位数 {median_val:.6f} 填充缺失值")

    # Summary of the metrics after imputation
    display(df[efficiency_features].describe())

    # Composite metric: cycles per instruction relative to memory accesses per
    # instruction (the small constant avoids division by zero)
    if 'cycles_per_instruction' in df.columns and 'memory_accesses_per_instruction' in df.columns:
        denominator = df['memory_accesses_per_instruction'] + 0.001
        df['compute_intensity'] = df['cycles_per_instruction'] / denominator
        print("已创建计算密集型指标")

    # Composite metric: share of assigned memory used as page cache
    if 'assigned_memory' in df.columns and 'page_cache_memory' in df.columns:
        denominator = df['assigned_memory'] + 0.0001
        df['cache_ratio'] = df['page_cache_memory'] / denominator
        print("已创建缓存使用比例")

    # Summary of whichever composite metrics were created
    new_metrics = [col for col in ['compute_intensity', 'cache_ratio'] if col in df.columns]
    if new_metrics:
        display(df[new_metrics].describe())

## 3. 准备模型训练数据

In [None]:
def prepare_data_for_modeling(df, target_vars):
    """Clean a feature frame so it can be passed to the models.

    Steps, in order:
      1. Impute missing values: median for numeric columns, most frequent
         value for all other columns.
      2. Drop columns whose name contains "id", "name" or "user", raw
         timestamp columns (name contains "time" but not "dt") and columns
         that hold no value at all. Target columns are never dropped because
         of their name.
      3. Label-encode the remaining object/string columns (targets excluded).

    Args:
        df: Input DataFrame; it is not modified.
        target_vars: Names of the target columns.

    Returns:
        A new, cleaned DataFrame.
    """
    print("\n准备模型训练数据...")

    # Work on a copy so the caller's DataFrame is not modified in place
    df = df.copy()

    # Columns without a single value cannot be imputed (their mode is empty
    # and their median is NaN); remember them now and drop them below
    null_cols = df.columns[df.isnull().all()].tolist()

    print("处理缺失值...")
    for col in df.columns:
        if col in null_cols or not df[col].isnull().any():
            continue
        is_number = (pd.api.types.is_numeric_dtype(df[col])
                     and not pd.api.types.is_bool_dtype(df[col]))
        if is_number:
            # Numeric columns: fill with the median
            df[col] = df[col].fillna(df[col].median())
        else:
            # Other columns: fill with the most frequent value
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # High-cardinality identifier columns, selected by name
    id_cols = [col for col in df.columns
               if 'id' in col.lower() or 'name' in col.lower() or 'user' in col.lower()]

    # Raw timestamp columns (the converted *dt columns are kept)
    timestamp_cols = [col for col in df.columns
                      if 'time' in col.lower() and 'dt' not in col.lower()]

    # Targets are protected from the name-based rules above
    cols_to_drop = [col for col in id_cols + timestamp_cols if col not in target_vars]
    cols_to_drop.extend(null_cols)

    # De-duplicate (keeping order) so the reported count matches what is dropped
    cols_to_drop = list(dict.fromkeys(cols_to_drop))

    df = df.drop(columns=cols_to_drop, errors='ignore')
    print(f"删除了 {len(cols_to_drop)} 列")

    # Label-encode textual columns; targets are left as they are
    for col in list(df.columns):
        if col in target_vars:
            continue
        is_text = (pd.api.types.is_object_dtype(df[col])
                   or pd.api.types.is_string_dtype(df[col]))
        if is_text:
            df[col] = pd.factorize(df[col])[0]

    print("数据准备完成")
    return df

# Clean the engineered frame and rebind df to the returned result
df = prepare_data_for_modeling(df, target_vars)

## 4. 为每个目标变量构建模型

In [13]:
# 5. Visualize prediction results
def visualize_predictions_separate(y_true, predictions_dict, title_prefix="Prediction Comparison"):
    """Plot predictions against the true values, all models together and one by one.

    One overview chart with every model is drawn first, followed by one chart
    per model. Each chart is saved as a PNG in the working directory (file
    name built from ``title_prefix`` and the model name, spaces replaced by
    underscores) and then shown.

    Args:
        y_true: True target values.
        predictions_dict: Mapping of model name to its predictions.
        title_prefix: Text used at the start of chart titles and file names.
    """
    file_stem = title_prefix.replace(' ', '_')

    def render_chart(series, chart_title, png_path):
        # series: iterable of (model name, predictions, matplotlib format string)
        plt.figure(figsize=(12, 6))
        plt.plot(range(len(y_true)), y_true, 'k-', label='True Value')
        for name, values, fmt in series:
            plt.plot(range(len(values)), values, fmt, label=f'{name} Prediction')

        plt.title(chart_title)
        plt.xlabel('Time')
        plt.ylabel('Target Value')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(png_path)
        plt.show()

    # Overview: every model on the same axes
    render_chart(
        [(name, values, '--') for name, values in predictions_dict.items()],
        f"{title_prefix} - Overview",
        f"{file_stem}_overview.png",
    )

    # One dedicated chart per model
    for model_name, preds in predictions_dict.items():
        render_chart(
            [(model_name, preds, 'r--')],
            f"{title_prefix} - {model_name}",
            f"{file_stem}_{model_name.replace(' ', '_')}.png",
        )

    print(f"Created {len(predictions_dict) + 1} prediction comparison charts")

# 4. Model building and evaluation
def evaluate_models(df, target_var, test_size=0.2, random_state=42):
    """Train several models for one target and compare them on a hold-out tail.

    The rows are split by position: the first ``1 - test_size`` share is used
    for training and the remaining tail for testing, so the caller must pass
    ``df`` in chronological order. ARIMA, Random Forest and XGBoost are
    always attempted; an LSTM is added when TensorFlow is available and the
    training set has more than 50 rows. ARIMA and LSTM failures are reported
    and skipped. The predictions of every successfully trained model are
    plotted with ``visualize_predictions_separate``.

    Args:
        df: Model-ready DataFrame containing ``target_var`` and the features.
        target_var: Name of the column to predict.
        test_size: Share of rows (taken from the end) used as the test set.
        random_state: Seed passed to Random Forest and XGBoost.

    Returns:
        A tuple ``(results_df, feature_importance)``: the per-model
        RMSE/MAE/R² table sorted by RMSE, and the Random Forest feature
        importances sorted in descending order.

    NOTE(review): every column except the target, ``time_*`` columns and
    datetime-typed columns is used as a feature. Columns computed from the
    current value of the target (for example rolling statistics or usage
    ratios) would leak the target into the features - confirm which columns
    the caller passes in.
    """
    print(f"\nEvaluating prediction models for {target_var}...")

    # Prepare features and target
    y = df[target_var]
    X = df.drop(columns=[col for col in df.columns
                         if col == target_var or col.startswith('time_')])
    # Datetime-like columns (e.g. other "*_dt" columns) cannot be standardized
    # or fed to the models, so they are left out of the feature matrix
    X = X.select_dtypes(exclude=['datetime', 'datetimetz', 'timedelta'])

    print(f"Feature count: {X.shape[1]}")
    print(f"Sample count: {X.shape[0]}")

    # Chronological split: the last test_size share of the rows is the test
    # set, so the future is never used to predict the past
    split_idx = int(len(X) * (1 - test_size))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")

    # Standardize the features; the scaler is fitted on the training part only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Metrics and test-set predictions of every model that trained successfully
    model_results = []
    predictions_dict = {}

    def record(model_name, preds):
        # Score one model on the test set and keep its predictions for plotting
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        mae = mean_absolute_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        print(f"{model_name} - RMSE: {rmse:.6f}, MAE: {mae:.6f}, R²: {r2:.6f}")
        model_results.append({"model": model_name, "rmse": rmse, "mae": mae, "r2": r2})
        predictions_dict[model_name] = np.asarray(preds)

    # 1. ARIMA model
    try:
        print("\nTraining ARIMA model...")
        # Simplified ARIMA: only the target's own series is used.
        # More complex cases may need SARIMAX.
        arima_model = ARIMA(y_train, order=(5,1,0)).fit()

        # Forecast as many steps as there are test rows
        arima_preds = arima_model.forecast(steps=len(y_test))
        record("ARIMA", arima_preds)
    except Exception as e:
        # Best-effort: the remaining models are still evaluated
        print(f"ARIMA model training failed: {e}")

    # 2. Random Forest
    print("\nTraining Random Forest...")
    rf = RandomForestRegressor(n_estimators=100, random_state=random_state)
    rf.fit(X_train_scaled, y_train)
    record("Random Forest", rf.predict(X_test_scaled))

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 feature importance:")
    print(feature_importance.head(10))

    # 3. XGBoost
    print("\nTraining XGBoost...")
    xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=random_state)
    xgb_model.fit(X_train_scaled, y_train)
    record("XGBoost", xgb_model.predict(X_test_scaled))

    # 4. LSTM (if TensorFlow is available)
    if tf_available and len(X_train) > 50:  # Ensure there's enough data
        try:
            print("\nTraining LSTM model...")

            # LSTM input has the shape (samples, time steps, features); each
            # sample is made of the LOOKBACK rows preceding the predicted row
            LOOKBACK = 5

            def create_sequences(X, y, time_steps=LOOKBACK):
                X_seq, y_seq = [], []
                for i in range(len(X) - time_steps):
                    X_seq.append(X[i:i + time_steps])
                    y_seq.append(y[i + time_steps])
                return np.array(X_seq), np.array(y_seq)

            # Create sequences
            X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train.values)

            # Build LSTM model
            lstm_model = Sequential([
                LSTM(50, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2]), return_sequences=True),
                Dropout(0.2),
                LSTM(50, activation='relu'),
                Dropout(0.2),
                Dense(1)
            ])

            lstm_model.compile(optimizer='adam', loss='mse')

            # Early stopping
            early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

            # Train
            lstm_model.fit(
                X_train_seq, y_train_seq,
                epochs=50,
                batch_size=32,
                validation_split=0.2,
                callbacks=[early_stop],
                verbose=1
            )

            # Test sequences: the last LOOKBACK training rows are prepended so
            # that the first test row also has a full history window
            X_full = np.vstack((X_train_scaled[-LOOKBACK:], X_test_scaled))
            X_test_seq = np.array([X_full[i:i + LOOKBACK] for i in range(len(X_test))])

            # Predict
            lstm_preds = lstm_model.predict(X_test_seq).flatten()
            record("LSTM", lstm_preds)

        except Exception as e:
            # Best-effort: the other models' results are still reported
            print(f"LSTM model training failed: {e}")

    # Summarize results
    results_df = pd.DataFrame(model_results)
    results_df = results_df.sort_values('rmse')

    print("\nModel performance summary:")
    print(results_df)

    # Plot the predictions of every model that produced some
    visualize_predictions_separate(
        y_true=y_test,
        predictions_dict=predictions_dict,
        title_prefix=f"{target_var} Prediction Comparison"
    )

    # Return the evaluation table and the Random Forest feature importances
    return results_df, feature_importance


In [None]:
for target_var in target_vars:
    # Keep only the rows where this target has a value
    df_clean = df.dropna(subset=[target_var])

    # Lag and rolling features derived from the other targets are excluded
    other_targets = [t for t in target_vars if t != target_var]
    excluded_prefixes = []
    for other_target in other_targets:
        excluded_prefixes.append(f"{other_target}_lag_")
        excluded_prefixes.append(f"{other_target}_rolling_")
    excluded_prefixes = tuple(excluded_prefixes)

    cols_to_drop = [col for col in df_clean.columns if col.startswith(excluded_prefixes)]
    df_model = df_clean.drop(columns=cols_to_drop, errors='ignore')

    # Train and evaluate the models for this target
    results_df, feature_importance = evaluate_models(df_model, target_var)

    # Persist the metrics table and the feature importances as CSV files
    results_df.to_csv(f"model_results_{target_var}.csv", index=False)
    feature_importance.to_csv(f"feature_importance_{target_var}.csv", index=False)

print("建模完成！")