In [1]:
import sys
import os
from pathlib import Path
# Put the parent of the current working directory at the front of sys.path so
# that the `src` package imported below can be resolved.
# NOTE(review): assumes the kernel's working directory sits directly under the
# project root -- confirm when running from another location.
current_dir = os.getcwd()  
project_root = os.path.dirname(current_dir)  
sys.path.insert(0, project_root) 
import numpy as np
import pandas as pd
# Project-local imports; they must stay after the sys.path insert above.
from src.data.data_preprocessor import DataPreprocessor
from src.config.config import Config
# Single preprocessor instance shared by the cells below.
data_preprocessor = DataPreprocessor()

def detect_and_remove_anomalies(df, target_col, country_col='Country Name', year_col='Year',
                                z_threshold=2.0, ratio_threshold=2.0,
                                min_years=3, return_anomalies=False, min_methods=1):
    """
    Detect anomalous values of ``target_col`` country by country and drop them.

    Each country's rows are sorted by ``year_col`` and checked by three detectors:

    1. Z-score: ``|x - mean| / std`` (computed over that country's values)
       is greater than ``z_threshold``.
    2. Neighbour ratio: the ratio to the previous row AND the ratio to the next
       row both lie outside ``[1 / ratio_threshold, ratio_threshold]``. The
       first and last row of a country have only one neighbour and are
       therefore never flagged by this detector.
    3. Gap-adjusted ratio: the previous row is more than one year away and the
       ratio to it, divided by that year gap, is greater than ``ratio_threshold``.

    A row is an anomaly when at least ``min_methods`` detectors flag it. The
    default of 1 is the union of the three detectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    target_col : str
        Name of the numeric column to check.
    country_col : str
        Name of the column used to group rows.
    year_col : str
        Name of the column used to order rows within a group.
    z_threshold : float
        Z-score above which a row is flagged by detector 1.
    ratio_threshold : float
        Ratio bound used by detectors 2 and 3.
    min_years : int
        Groups with fewer rows than this are skipped and kept unchanged.
    return_anomalies : bool
        When True, also return a frame describing the removed rows.
    min_methods : int
        Number of detectors (1, 2 or 3) that must agree before a row is removed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the anomalous rows (original index labels kept).
    (pandas.DataFrame, pandas.DataFrame)
        When ``return_anomalies`` is True: the cleaned frame and the removed
        rows with extra columns ``z_score``, ``prev_ratio``, ``next_ratio`` and
        ``year_gap``, ordered by group and then by year. The second frame is an
        empty ``DataFrame`` when nothing was flagged.

    Raises
    ------
    ValueError
        If ``min_methods`` is not 1, 2 or 3.
    """
    if min_methods not in (1, 2, 3):
        raise ValueError(f"min_methods must be 1, 2 or 3, got {min_methods!r}")

    flagged_labels = []   # index labels of every row to drop
    anomaly_frames = []   # per-country detail frames, filled only on request

    for _, country_data in df.groupby(country_col):
        # Too few observations for the statistics below to be meaningful.
        if len(country_data) < min_years:
            continue

        country_data = country_data.sort_values(year_col)
        values = country_data[target_col]

        # Detector 1: distance from the country's own mean in standard deviations.
        # A constant series gives std == 0 and NaN scores, which never compare
        # greater than the threshold.
        z_scores = np.abs((values - values.mean()) / values.std())
        z_flag = z_scores > z_threshold

        # Detector 2: the value is out of proportion to BOTH neighbours.
        prev_ratios = values / values.shift(1)
        next_ratios = values / values.shift(-1)
        prev_out = (prev_ratios > ratio_threshold) | (prev_ratios < 1 / ratio_threshold)
        next_out = (next_ratios > ratio_threshold) | (next_ratios < 1 / ratio_threshold)
        ratio_flag = prev_out & next_out

        # Detector 3: after a gap of several years, scale the ratio by the gap
        # length before comparing it with the threshold.
        year_gaps = country_data[year_col].diff()
        gap_flag = (prev_ratios / year_gaps > ratio_threshold) & (year_gaps > 1)

        # Count how many detectors flagged each row.
        votes = z_flag.astype(int) + ratio_flag.astype(int) + gap_flag.astype(int)
        is_anomaly = votes >= min_methods
        if not is_anomaly.any():
            continue

        flagged_labels.extend(country_data.index[is_anomaly.to_numpy()])

        if return_anomalies:
            anomalies = country_data.loc[is_anomaly].copy()
            anomalies['z_score'] = z_scores[is_anomaly]
            anomalies['prev_ratio'] = prev_ratios[is_anomaly]
            anomalies['next_ratio'] = next_ratios[is_anomaly]
            anomalies['year_gap'] = year_gaps[is_anomaly]
            anomaly_frames.append(anomalies)

    # drop() returns a new frame, so the caller's data is left untouched.
    df_clean = df.drop(index=flagged_labels)

    if not return_anomalies:
        return df_clean

    # Concatenate once at the end; pd.concat rejects an empty list.
    all_anomalies = pd.concat(anomaly_frames) if anomaly_frames else pd.DataFrame()
    return df_clean, all_anomalies

In [None]:
"""加载1990-2022数据"""
# Read the historical sheet, restricted to the configured feature columns.
_feature_cfg = Config.FEATURE_CONFIG
historical_df = pd.read_excel(
    _feature_cfg['historical_data_path'],
    sheet_name=_feature_cfg['historical_sheet'],
    usecols=_feature_cfg['usecols'],
)

# Stage 1: run the historical preprocessing and store the resulting feature
# table as global_features.csv inside the configured features directory.
all_countries_df = data_preprocessor.process_historical_data(historical_df)

features_path = Path(Config.PATH_CONFIG['features_dir']) / 'global_features.csv'
all_countries_df.to_csv(features_path, index=False)

In [None]:
# Stage 2: process the MSW data.
target_col = Config.DATA_CONFIG['target_column']

# Load the sheet that holds the target column next to the shared feature columns.
msw_df = pd.read_excel(
    Config.FEATURE_CONFIG['historical_msw_data_path'],
    sheet_name=Config.FEATURE_CONFIG['historical_msw_sheet'],
    usecols=Config.FEATURE_CONFIG['usecols'] + [target_col]
)

# Dry run: only inspect what the detector would remove; msw_df itself stays
# untouched. The tuple is indexed instead of unpacked into `_`, because
# assigning `_` shadows IPython's last-output variable.
anomalies = detect_and_remove_anomalies(msw_df, target_col, return_anomalies=True)[1]
print(f"检测到 {len(anomalies)} 条异常记录:")
if len(anomalies) > 0:
    display(anomalies.sort_values(['Country Name', 'Year']))
else:
    print("没有检测到异常记录")

# Per-country distribution summary.
print("\n===== 各国数据分布情况分析 =====")
for country, country_data in msw_df.groupby('Country Name'):
    if len(country_data) < 3:  # skip countries with too few data points
        continue

    country_data = country_data.sort_values('Year')
    target_values = country_data[target_col]

    # Z-score of every observation relative to the country's own mean/std.
    z_scores = np.abs((target_values - target_values.mean()) / target_values.std())

    # Relative change with respect to the previous row.
    growth_ratios = target_values.pct_change()

    print(f"\n国家: {country}, 数据点数: {len(country_data)}")
    print(f"平均值: {target_values.mean():.2f}, 标准差: {target_values.std():.2f}")
    print(f"最大Z-score: {z_scores.max():.2f}")
    print(f"最大年度增长率: {growth_ratios.max():.2%}")
    print(f"最小年度增长率: {growth_ratios.min():.2%}")

    # Exploratory screen only: these cut-offs (z > 2, growth > +50 % or
    # < -30 %) are independent of the thresholds used by
    # detect_and_remove_anomalies.
    potential_anomalies = country_data[(z_scores > 2.0) | (growth_ratios > 0.5) | (growth_ratios < -0.3)]
    if len(potential_anomalies) > 0:
        print("潜在异常点:")
        display(potential_anomalies[['Year', target_col]])


In [None]:
# Apply the anomaly filter with its default thresholds and report the row
# counts before and after. Relies on msw_df and target_col from the cell above.
df_clean = detect_and_remove_anomalies(msw_df, target_col)
print(f"原始数据: {len(msw_df)} 行, 清洗后: {len(df_clean)} 行")

In [None]:
# merge_features returns two frames, unpacked here as the training set and the
# prediction set; the row counts of both are printed below.
# NOTE(review): presumably the split is rows with vs. without a target value --
# confirm in DataPreprocessor.merge_features.
train_df, predict_df = data_preprocessor.merge_features(df_clean)
print(f"用于训练的数据: {len(train_df)} 行, 预测的历史数据: {len(predict_df)} 行,总共: {len(train_df)+len(predict_df)}行")

In [None]:
# Write the final datasets into the configured features directory.
# utf-8-sig prepends a BOM to each file; the index is not written.
output_dir = Path(Config.PATH_CONFIG['features_dir'])
for frame, filename in (
    (train_df, 'training_data.csv'),
    (predict_df, 'prediction_data.csv'),
):
    frame.to_csv(output_dir / filename, index=False, encoding='utf-8-sig')

In [None]:
"""加载2022-2050数据"""
# Scenario projections: same feature columns as the historical sheet plus the
# 'Scenario' label.
future_df = pd.read_excel(
    Config.FEATURE_CONFIG['future_data_path'],
    sheet_name=Config.FEATURE_CONFIG['future_sheet'],
    usecols=Config.FEATURE_CONFIG['usecols']+['Scenario'],
)

# Historical data (1990-2022), read again so this cell does not depend on the
# frame loaded earlier in the notebook.
historical_df = pd.read_excel(
    Config.FEATURE_CONFIG['historical_data_path'],
    sheet_name=Config.FEATURE_CONFIG['historical_sheet'],
    usecols=Config.FEATURE_CONFIG['usecols'],
)

# Before calling the processing function, drop the future rows whose year
# overlaps with the historical data.
last_historical_year = historical_df['Year'].max()
print(f"历史数据最后一年: {last_historical_year}")
print(f"过滤前未来数据形状: {future_df.shape}")

is_after_history = future_df['Year'] > last_historical_year
future_df_filtered = future_df.loc[is_after_history].copy()
print(f"过滤掉年份 <= {last_historical_year} 后，未来数据形状: {future_df_filtered.shape}")

if not future_df_filtered.empty:
    # Process the filtered frame, never the raw future_df.
    processed_data_paths = data_preprocessor.process_future_data(historical_df, future_df_filtered)
    print("未来数据特征生成完成。各场景文件路径:")
    print(processed_data_paths)
else:
    # Nothing left after filtering: report it and leave the cell without
    # processing. Raising or exiting here (e.g. sys.exit(1)) is an option.
    print("错误：过滤重叠年份后，没有有效的未来数据。请检查数据范围。")

历史数据最后一年: 2022
过滤前未来数据形状: (66755, 9)
过滤掉年份 <= 2022 后，未来数据形状: (65910, 9)
加载特征工程参数从: e:\code\jupyter\固废产生\SW-Prediction\cw\src\features\featurefile\feature_params.pkl
合并历史数据和未来数据...
合并后数据形状: (71341, 9)
按国家和年份排序...
应用特征工程转换...
{'0': {'count': 1792.0, 'mean': 19381.77171316964, 'std': 21426.7508082626, 'min': 893.1, '25%': 5644.435, '50%': 12494.025, '75%': 25882.927499999998, 'max': 120647.82}, '1': {'count': 1847.0, 'mean': 16535.889209528967, 'std': 18754.914493681954, 'min': 430.41, '25%': 2640.6850000000004, '50%': 8200.79, '75%': 26637.364999999998, 'max': 112445.42}, '2': {'count': 1792.0, 'mean': 15587.329481026785, 'std': 15428.149034069726, 'min': 537.27, '25%': 3450.96, '50%': 9832.51, '75%': 23609.6575, 'max': 64623.13}}
特征转换后数据形状: (71341, 67)
筛选未来数据...
筛选出的未来数据形状: (65910, 67)
按场景拆分并保存未来特征文件...
  处理场景: SSP1
    已保存场景 'SSP1' 的处理后特征 (13182 行) 至: e:\code\jupyter\固废产生\SW-Prediction\cw\src\features\featurefile\future_features_SSP1.csv
  处理场景: SSP2
    已保存场景 'SSP2' 的处理后特征 (13182 行) 至