In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import os
from pycaret.regression import *
import sys
sys.path.append('src')
from data.data_loader import DataLoader
# 修复相对导入错误，使用绝对导入路径
from src.features.feature_engineering import FeatureEngineering

# Instantiate the project's data-loading helper (used here only for its dataset analysis).
data_loader = DataLoader()

# Read each CSV relative to the working directory and run the loader's
# analysis on it right away. The analysis calls stay in the same order as
# before: training set, out-of-country test set, out-of-time test set.
train_data = pd.read_csv('src/data/train_data.csv')
data_loader.analyze_datasets(train_data)

country_test_data = pd.read_csv('src/data/country_test_data.csv')
data_loader.analyze_datasets(country_test_data)

time_test_data = pd.read_csv('src/data/time_test_data.csv')
data_loader.analyze_datasets(time_test_data)

In [None]:
# Build the feature-engineering pipeline object.
feature_engineering = FeatureEngineering()

# Fit the pipeline on the training set and transform it in one step.
# The call returns a pair; the second element rebinds `target_column`
# (presumably the name of the transformed target -- TODO confirm in FeatureEngineering).
train_data_processed, target_column = feature_engineering.fit_transform(
    train_data, target_column='MSW', categorical_columns=['Region', 'Income Group'])

# Apply the already-fitted pipeline to the out-of-country test set.
country_test_data_processed, _ = feature_engineering.transform(country_test_data, target_column='MSW')

# Apply the already-fitted pipeline to the out-of-time test set.
time_test_data_processed, _ = feature_engineering.transform(time_test_data, target_column='MSW')

In [None]:
# Load the previously trained model.
# The path is relative to the working directory, like the 'src/data/...' CSV
# paths used when loading the data, so the notebook no longer depends on one
# machine's absolute drive layout. PyCaret's load_model takes the model name
# without a file extension.
model_file = os.path.join('src', 'models', 'catboost')
model = load_model(model_file)

# Score both hold-out sets; the predictions are read back from the
# 'prediction_label' column of the returned frames in the evaluation cell.
country_test_predictions = predict_model(model, data=country_test_data_processed)
time_test_predictions = predict_model(model, data=time_test_data_processed)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score


def _mse_and_r2(frame, actual_col, predicted_col):
    """Return ``(MSE, R²)`` of ``frame[predicted_col]`` against ``frame[actual_col]``."""
    actual, predicted = frame[actual_col], frame[predicted_col]
    return mean_squared_error(actual, predicted), r2_score(actual, predicted)


# Log-space metrics: transformed target column vs. the model's raw output.
time_test_mse, time_test_r2 = _mse_and_r2(time_test_predictions, 'MSW_log', 'prediction_label')
country_test_mse, country_test_r2 = _mse_and_r2(country_test_predictions, 'MSW_log', 'prediction_label')

# Add two columns to each prediction frame:
#   MSW_pred      - prediction mapped back with expm1
#                   (assumes the target was built with log1p -- TODO confirm)
#   Error_percent - absolute percentage error against the observed MSW
for scored in (time_test_predictions, country_test_predictions):
    scored['MSW_pred'] = np.expm1(scored['prediction_label'])
    scored['Error_percent'] = np.abs((scored['MSW_pred'] - scored['MSW']) / scored['MSW']) * 100

# Original-space metrics: observed MSW vs. back-transformed prediction.
time_original_mse, time_original_r2 = _mse_and_r2(time_test_predictions, 'MSW', 'MSW_pred')
country_original_mse, country_original_r2 = _mse_and_r2(country_test_predictions, 'MSW', 'MSW_pred')

# Mean absolute percentage error (MAPE) per test set.
time_mape = time_test_predictions['Error_percent'].mean()
country_mape = country_test_predictions['Error_percent'].mean()

# Report the results.
print("\n===== 模型评估结果 =====")
print("\n时间外样本测试集评估:")
print(f"对数空间 - MSE: {time_test_mse:.4f}, R²: {time_test_r2:.4f}")
print(f"原始空间 - MSE: {time_original_mse:.2f}, R²: {time_original_r2:.4f}")
print(f"MAPE: {time_mape:.2f}%")

print("\n国家外样本测试集评估:")
print(f"对数空间 - MSE: {country_test_mse:.4f}, R²: {country_test_r2:.4f}")
print(f"原始空间 - MSE: {country_original_mse:.2f}, R²: {country_original_r2:.4f}")
print(f"MAPE: {country_mape:.2f}%")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Out-of-time test set: per-country comparison of training history,
# observed test values and model predictions.

# Attach the back-transformed predictions to a copy of the processed test frame.
# expm1 is used as the inverse transform (assumes a log1p target -- TODO confirm).
time_test_results = time_test_data_processed.copy()
time_test_results['MSW_pred'] = np.expm1(time_test_predictions['prediction_label'])

# Pick up to ten countries with the most rows in the test set.
sample_countries = time_test_results['Country Name'].value_counts().head(10).index.tolist()

# Combined train + test history. Both inputs are now the *processed* frames;
# previously the processed training frame was concatenated with the raw,
# unprocessed test frame. The plots below do not read this frame.
full_data = pd.concat([train_data_processed, time_test_data_processed])

# Derive the grid from the number of countries actually selected instead of a
# fixed 6x2 layout, which left an empty row for ten (or fewer) countries.
n_cols = 2
n_rows = max(1, (len(sample_countries) + n_cols - 1) // n_cols)
fig = plt.figure(figsize=(18, 3 * n_rows))

# NOTE(review): titles/labels contain CJK text; confirm the active matplotlib
# font can render it, otherwise the glyphs show up as empty boxes.
for i, country in enumerate(sample_countries):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)

    # Rows for this country, ordered by year, from the training and test frames.
    country_train = train_data_processed[train_data_processed['Country Name'] == country].sort_values('Year')
    country_test = time_test_results[time_test_results['Country Name'] == country].sort_values('Year')

    # Training-period history.
    ax.plot(country_train['Year'], country_train['MSW'], 'o-', color='blue', label='历史数据')

    # Observed values in the test period.
    ax.plot(country_test['Year'], country_test['MSW'], 'o-', color='green', label='实际值')

    # Model predictions in the test period.
    ax.plot(country_test['Year'], country_test['MSW_pred'], 'x--', color='red', label='预测值')

    ax.set_title(f'{country}的MSW时间序列')
    ax.set_xlabel('年份')
    ax.set_ylabel('MSW')
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

# Distribution of the signed prediction error (in percent) over all countries.
# Positive values mean the model over-predicted, negative values under-predicted.
predicted_msw = time_test_results['MSW_pred']
observed_msw = time_test_results['MSW']
time_test_results['Error_percent'] = ((predicted_msw - observed_msw) / observed_msw) * 100

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(time_test_results['Error_percent'], bins=20, alpha=0.7)
ax.axvline(x=0, color='r', linestyle='--')  # zero-error reference line
ax.set_xlabel('预测误差百分比 (%)')
ax.set_ylabel('频数')
ax.set_title('时间外样本测试集 - 预测误差百分比分布')
ax.grid(True, linestyle='--', alpha=0.7)
plt.show()

# Scatter of the signed percentage error against the year of each test row.
# Reads the 'Error_percent' column that must already exist on time_test_results.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(time_test_results['Year'], time_test_results['Error_percent'], alpha=0.6)
ax.axhline(y=0, color='r', linestyle='--')  # zero-error reference line
ax.set_xlabel('年份')
ax.set_ylabel('预测误差百分比 (%)')
ax.set_title('时间外样本测试集 - 预测误差与年份的关系')
ax.grid(True, linestyle='--', alpha=0.7)
plt.show()