In [1]:
import sys
import os
from pathlib import Path
# Make the project root (the parent of the kernel's working directory)
# importable so the `src.*` imports below resolve.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
# Re-running this cell must not pile up duplicate entries: drop any existing
# occurrence first, then put the project root at the front of the search path
# (same precedence as a plain insert(0, ...), but idempotent).
sys.path[:] = [entry for entry in sys.path if entry != project_root]
sys.path.insert(0, project_root)
import numpy as np
import pandas as pd
from src.data.data_preprocessor import DataPreprocessor
from src.config.config import Config
# Single DataPreprocessor instance reused by the processing cells that follow.
data_preprocessor = DataPreprocessor()

In [None]:
"""加载1990-2022数据"""
# Stage 1: historical (1990-2022) data.
# Read only the configured sheet and columns from the historical workbook.
historical_df = pd.read_excel(
    Config.FEATURE_CONFIG['historical_data_path'],
    sheet_name=Config.FEATURE_CONFIG['historical_sheet'],
    usecols=Config.FEATURE_CONFIG['usecols'],
)

# Run the preprocessor's historical-data pipeline over the loaded frame.
all_countries_df = data_preprocessor.process_historical_data(historical_df)

# Persist the result. The features directory is created first so that a
# missing folder does not make the write fail after processing has finished
# (to_csv does not create parent directories).
features_path = Path(Config.PATH_CONFIG['features_dir']) / 'global_features.csv'
features_path.parent.mkdir(parents=True, exist_ok=True)
all_countries_df.to_csv(features_path, index=False)

In [None]:
# Stage 2: MSW data.
# Resolve the workbook location, sheet and column selection first; the column
# list is the shared feature columns plus the configured target column.
msw_path = Config.FEATURE_CONFIG['historical_msw_data_path']
msw_sheet = Config.FEATURE_CONFIG['historical_msw_sheet']
msw_columns = Config.FEATURE_CONFIG['usecols'] + [Config.DATA_CONFIG['target_column']]

# Load the data that carries the MSW target.
msw_df = pd.read_excel(msw_path, sheet_name=msw_sheet, usecols=msw_columns)

# merge_features yields two frames, kept here as the training and prediction sets.
train_df, predict_df = data_preprocessor.merge_features(msw_df)

In [None]:
# Save the final datasets.
# Both files go to the features directory with identical CSV options, so the
# write is expressed once and looped over (file name, frame) pairs.
final_output_dir = Path(Config.PATH_CONFIG['features_dir'])
# Create the directory up front; to_csv does not create missing parent folders.
final_output_dir.mkdir(parents=True, exist_ok=True)
for output_name, output_df in (
    ('training_data.csv', train_df),
    ('prediction_data.csv', predict_df),
):
    # utf-8-sig prepends a UTF-8 BOM to the written file.
    output_df.to_csv(
        final_output_dir / output_name,
        index=False,
        encoding='utf-8-sig'
    )

In [2]:
"""加载2022-2050数据"""
# Load the future (2022-2050) data: the shared feature columns plus the
# 'Scenario' label.
future_df = pd.read_excel(
    Config.FEATURE_CONFIG['future_data_path'],
    sheet_name=Config.FEATURE_CONFIG['future_sheet'],
    usecols=Config.FEATURE_CONFIG['usecols'] + ['Scenario'],
)

# Load the historical (1990-2022) data. It is read again here, so this cell
# defines `historical_df` itself instead of relying on the earlier loading cell.
historical_df = pd.read_excel(
    Config.FEATURE_CONFIG['historical_data_path'],
    sheet_name=Config.FEATURE_CONFIG['historical_sheet'],
    usecols=Config.FEATURE_CONFIG['usecols'],
)

# Before calling the processing function, drop future rows whose year overlaps
# the historical range (anything at or before the last historical year).
last_historical_year = historical_df['Year'].max()
print(f"历史数据最后一年: {last_historical_year}")
print(f"过滤前未来数据形状: {future_df.shape}")

future_df_filtered = future_df[future_df['Year'] > last_historical_year].copy()
print(f"过滤掉年份 <= {last_historical_year} 后，未来数据形状: {future_df_filtered.shape}")

# Fail loudly when nothing is left after filtering. Merely printing the error
# would let later cells continue with `processed_data_paths` undefined, or
# worse, with a stale value from a previous run of this cell.
if future_df_filtered.empty:
    raise ValueError("错误：过滤重叠年份后，没有有效的未来数据。请检查数据范围。")

# Generate the future features from the filtered frame and report what the
# preprocessor returned.
processed_data_paths = data_preprocessor.process_future_data(historical_df, future_df_filtered)
print("未来数据特征生成完成。各场景文件路径:")
print(processed_data_paths)

历史数据最后一年: 2022
过滤前未来数据形状: (66755, 8)
过滤掉年份 <= 2022 后，未来数据形状: (65910, 8)
加载特征工程参数从: e:\code\jupyter\固废产生\SW-Prediction\msw\src\features\featurefile\feature_params.pkl
合并历史数据和未来数据...
合并后数据形状: (71341, 8)
按国家和年份排序...
应用特征工程转换...
特征转换后数据形状: (71341, 62)
筛选未来数据...
筛选出的未来数据形状: (65910, 62)
按场景拆分并保存未来特征文件...
  处理场景: SSP1
    已保存场景 'SSP1' 的处理后特征 (13182 行) 至: e:\code\jupyter\固废产生\SW-Prediction\msw\src\features\featurefile\future_features_SSP1.csv
  处理场景: SSP2
    已保存场景 'SSP2' 的处理后特征 (13182 行) 至: e:\code\jupyter\固废产生\SW-Prediction\msw\src\features\featurefile\future_features_SSP2.csv
  处理场景: SSP3
    已保存场景 'SSP3' 的处理后特征 (13182 行) 至: e:\code\jupyter\固废产生\SW-Prediction\msw\src\features\featurefile\future_features_SSP3.csv
  处理场景: SSP4
    已保存场景 'SSP4' 的处理后特征 (13182 行) 至: e:\code\jupyter\固废产生\SW-Prediction\msw\src\features\featurefile\future_features_SSP4.csv
  处理场景: SSP5
    已保存场景 'SSP5' 的处理后特征 (13182 行) 至: e:\code\jupyter\固废产生\SW-Prediction\msw\src\features\featurefile\future_features_SSP5.csv

未来数据