In [1]:
import sys
import os
from pathlib import Path
# Put the parent of the notebook's working directory at the front of the
# import path; the `src.*` imports below are presumably resolved from there.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
# Only insert when the root is not already first, so re-running this cell
# does not stack duplicate entries on sys.path.
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)
import numpy as np
import pandas as pd
from src.data.data_preprocessor import DataPreprocessor
from src.config.config import Config
# Single DataPreprocessor instance reused by the processing cells of this notebook.
data_preprocessor = DataPreprocessor()

def detect_anomalies(df, target_col, threshold=5.0):
    """Flag records whose target value jumps sharply versus an adjacent record.

    Rows are ordered by 'Country Name' and 'Year'. Within each country, every
    value is divided by the value of the previous record and by the value of
    the next record. A row is reported when either ratio is greater than
    ``threshold`` or smaller than ``1 / threshold``.

    Args:
        df: DataFrame with 'Country Name', 'Year' and ``target_col`` columns.
            The input frame is not modified.
        target_col: Name of the column whose changes are inspected.
        threshold: A ratio above this value, or below its reciprocal, marks
            the row as anomalous.

    Returns:
        A new DataFrame with one row per flagged record and the columns
        'Country Name', 'prev_year', 'Year', 'next_year', 'prev_gap',
        'next_gap', 'prev_value', ``target_col``, 'next_value', 'prev_ratio'
        and 'next_ratio'. The prev_*/next_* columns describe the actual
        neighbouring records of the same country in the sorted data and are
        NaN at a country's first/last record.
    """
    ordered = df.sort_values(['Country Name', 'Year'])
    per_country = ordered.groupby('Country Name')

    # Neighbour lookups are done on the full sorted frame. Shifting after the
    # anomaly filter would pair each row with the nearest *flagged* row
    # rather than with its real predecessor/successor.
    prev_year = per_country['Year'].shift(1)
    next_year = per_country['Year'].shift(-1)
    prev_value = per_country[target_col].shift(1)
    next_value = per_country[target_col].shift(-1)

    ordered = ordered.assign(
        prev_year=prev_year,
        next_year=next_year,
        prev_value=prev_value,
        next_value=next_value,
        # Gap in years to each neighbour; the data may skip years.
        prev_gap=ordered['Year'] - prev_year,
        next_gap=next_year - ordered['Year'],
        # Ratio of the current value to each neighbour's value.
        prev_ratio=ordered[target_col] / prev_value,
        next_ratio=ordered[target_col] / next_value,
    )

    lower_bound = 1 / threshold

    def _out_of_band(ratio):
        # Comparisons against NaN are False, so missing neighbours never flag.
        return (ratio > threshold) | (ratio < lower_bound)

    is_anomaly = _out_of_band(ordered['prev_ratio']) | _out_of_band(ordered['next_ratio'])

    output_columns = [
        'Country Name',
        'prev_year', 'Year', 'next_year',
        'prev_gap', 'next_gap',
        'prev_value', target_col, 'next_value',
        'prev_ratio', 'next_ratio',
    ]
    # Every selected row has at least one non-NaN ratio, so no NaN filtering
    # is required; the copy gives callers a frame they can safely extend.
    return ordered.loc[is_anomaly, output_columns].copy()

In [2]:
"""加载1990-2022数据"""
# Read the historical workbook using the path, sheet and column selection
# declared in the feature configuration.
historical_read_options = {
    'sheet_name': Config.FEATURE_CONFIG['historical_sheet'],
    'usecols': Config.FEATURE_CONFIG['usecols'],
}
historical_df = pd.read_excel(
    Config.FEATURE_CONFIG['historical_data_path'],
    **historical_read_options
)

"""执行完整处理流程"""
# Stage 1: process the historical data.
all_countries_df = data_preprocessor.process_historical_data(historical_df)

# Write the processed frame to global_features.csv inside the features directory.
# NOTE(review): this file is written with pandas' default encoding, while the
# other CSV exports in this notebook pass encoding='utf-8-sig' - confirm intended.
features_dir = Path(Config.PATH_CONFIG['features_dir'])
features_path = features_dir / 'global_features.csv'
all_countries_df.to_csv(features_path, index=False)

In [3]:
# Stage 2: process the MSW data.
"""加载包含MSW的目标数据"""
# Read the shared feature columns plus the configured target column and 'IW'.
target_column = Config.DATA_CONFIG['target_column']
msw_columns = Config.FEATURE_CONFIG['usecols'] + [target_column] + ['IW']
msw_df = pd.read_excel(
    Config.FEATURE_CONFIG['historical_msw_data_path'],
    sheet_name=Config.FEATURE_CONFIG['historical_msw_sheet'],
    usecols=msw_columns
)

# Look for abnormal changes in the configured target column.
# NOTE(review): the original comment here said "IW", but the column that is
# actually checked is Config.DATA_CONFIG['target_column'] - confirm which is intended.
anomalies = detect_anomalies(msw_df, target_column)

# Report the outcome.
if anomalies.empty:
    print("未发现异常变化")
else:
    print(f"发现{len(anomalies)}条异常变化:")
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    # Rank rows by the larger of their two ratios, biggest first.
    ratio_columns = anomalies[['prev_ratio', 'next_ratio']]
    anomalies['max_ratio'] = ratio_columns.max(axis=1)
    ranked_anomalies = anomalies.sort_values('max_ratio', ascending=False)
    display(ranked_anomalies)

未发现异常变化


In [4]:
# Combine the MSW frame with the features; the preprocessor hands back two
# frames, which are saved below as the training and prediction datasets.
train_df, predict_df = data_preprocessor.merge_features(msw_df)

# Save the final datasets into the features directory (UTF-8 with BOM).
features_dir = Path(Config.PATH_CONFIG['features_dir'])
datasets_to_save = (
    (train_df, 'training_data.csv'),
    (predict_df, 'prediction_data.csv'),
)
for dataset, filename in datasets_to_save:
    dataset.to_csv(features_dir / filename, index=False, encoding='utf-8-sig')

In [5]:
"""加载2023-2050数据"""
# Both workbooks are read with the same column selection from the feature config.
feature_cfg = Config.FEATURE_CONFIG
shared_columns = feature_cfg['usecols']

# Future-period workbook.
future_df = pd.read_excel(
    feature_cfg['future_data_path'],
    sheet_name=feature_cfg['future_sheet'],
    usecols=shared_columns
)

"""加载1990-2022数据"""
# Historical workbook, read again from disk for this step.
historical_df = pd.read_excel(
    feature_cfg['historical_data_path'],
    sheet_name=feature_cfg['historical_sheet'],
    usecols=shared_columns
)

# Build the future prediction frame from the historical and future inputs.
future_prediction_df = data_preprocessor.process_future_data(historical_df, future_df)

# Save the final dataset into the features directory (UTF-8 with BOM).
future_output_path = Path(Config.PATH_CONFIG['features_dir']) / 'future_prediction_data.csv'
future_prediction_df.to_csv(future_output_path, index=False, encoding='utf-8-sig')