In [1]:
# 导入必要的库
import pandas as pd
import numpy as np
import os
from pycaret.regression import *
import sys
sys.path.append('src')
from data.data_loader import DataLoader
from features.feature_engineering import FeatureEngineering

# Create the project's data-loading helper; it is used below to print
# a summary of each dataset.
data_loader = DataLoader()

# Read the training set and the two test sets (country-based and
# time-based, going by the file names) from CSV in a single step.
train_data, country_test_data, time_test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'src/data/train_data.csv',
        'src/data/country_test_data.csv',
        'src/data/time_test_data.csv',
    )
)

# Sanity check: print the summary statistics of every dataset.
data_loader.analyze_datasets(train_data)
data_loader.analyze_datasets(country_test_data)
data_loader.analyze_datasets(time_test_data)



数据集统计信息:
总数据条数: 1476
国家总数: 59
包含的国家: Albania, Argentina, Australia, Austria, Bangladesh, Belgium, Bosnia and Herzegovina, Bulgaria, Canada, China, Colombia, Congo, Dem. Rep., Croatia, Cyprus, Denmark, Egypt, Arab Rep., Ethiopia, Finland, France, Germany, Greece, Hungary, Iceland, India, Indonesia, Iran, Islamic Rep., Iraq, Ireland, Italy, Japan, Korea, Rep., Latvia, Lithuania, Luxembourg, Malta, Mexico, Montenegro, Morocco, Netherlands, Nigeria, North Macedonia, Norway, Pakistan, Peru, Poland, Portugal, Romania, Russian Federation, Saudi Arabia, Slovak Republic, Spain, Sweden, Tanzania, Thailand, Turkiye, Uganda, Ukraine, United Kingdom, United States

数据集统计信息:
总数据条数: 283
国家总数: 10
包含的国家: Algeria, Brazil, Czechia, Estonia, Malaysia, Philippines, Serbia, Slovenia, South Africa, Switzerland

数据集统计信息:
总数据条数: 223
国家总数: 59
包含的国家: Albania, Argentina, Australia, Austria, Bangladesh, Belgium, Bosnia and Herzegovina, Bulgaria, Canada, China, Colombia, Congo, Dem. Rep., Croatia, Cyprus, Denmark,

In [None]:
# Build the feature-engineering helper. It is fitted on the training set
# and then re-applied to both test sets.
feature_engineering = FeatureEngineering()

# Fit on the training data and transform it; the call also returns the
# name of the target column to use downstream.
train_data_processed, target_column = feature_engineering.fit_transform(
    train_data, target_column='MSW', categorical_columns=['Region', 'Income Group']
)

# Apply the already-fitted transformation to the country-based test set ...
country_test_data_processed, _ = feature_engineering.transform(
    country_test_data, target_column='MSW'
)

# ... and to the time-based test set.
time_test_data_processed, _ = feature_engineering.transform(
    time_test_data, target_column='MSW'
)

# Columns handed to the experiment: everything except the raw 'MSW'
# column and the 'Country Name' / 'Year' identifier columns.
feature_cols = [
    col
    for col in train_data_processed.columns
    if col not in ('MSW', 'Country Name', 'Year')
]

train = train_data_processed[feature_cols]

In [3]:
# Load the previously trained models.
# The directory is given relative to the project root, consistent with the
# relative 'src/...' paths used when loading the CSV data, so the notebook
# is not tied to one machine's absolute path.
models_dir = os.path.join('src', 'models')
model_files = [
    os.path.join(models_dir, model_name)
    for model_name in ("model_1", "model_2", "model_3")
]

loaded_models = []
for model_file in model_files:
    model = load_model(model_file)
    loaded_models.append(model)
    print(f"已加载模型: {model_file}")

# Set up the PyCaret experiment.
# - session_id pins the random seed so that a fresh "Restart & Run All"
#   reproduces the same results; 1830 is the seed reported by the
#   previously recorded run of this cell.
# - Splitting and cross-validation are not shuffled and use the
#   'timeseries' fold strategy, so row order is preserved.
s = setup(
    data=train,
    target=target_column,
    session_id=1830,
    train_size=0.8,
    categorical_features=['Region', 'Income Group'],
    fold_strategy='timeseries',
    fold=5,
    data_split_shuffle=False,
    fold_shuffle=False,
    normalize=True,
    normalize_method='minmax',
)

# Hyper-parameter tuning: optimise R2 for each loaded model with Optuna.
# NOTE(review): PyCaret documents `early_stopping` as a bool or a strategy
# name ('asha', 'hyperband', 'median'); the integer 20 may have been meant
# for `early_stopping_max_iters` -- confirm. The value is left unchanged.
tuned_models = []
for model in loaded_models:
    tuned_model = tune_model(
        model,
        n_iter=50,
        optimize='R2',
        search_library='optuna',
        early_stopping=20,
    )
    tuned_models.append(tuned_model)

Transformation Pipeline and Model Successfully Loaded
已加载模型: E:\code\jupyter\固废产生\SW-Prediction\src\models\model_1
Transformation Pipeline and Model Successfully Loaded
已加载模型: E:\code\jupyter\固废产生\SW-Prediction\src\models\model_2
Transformation Pipeline and Model Successfully Loaded
已加载模型: E:\code\jupyter\固废产生\SW-Prediction\src\models\model_3


Unnamed: 0,Description,Value
0,Session id,1830
1,Target,MSW_None
2,Target type,Regression
3,Original data shape,"(1476, 23)"
4,Transformed data shape,"(1476, 32)"
5,Transformed train set shape,"(1180, 32)"
6,Transformed test set shape,"(296, 32)"
7,Numeric features,20
8,Categorical features,2
9,Preprocess,True


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,21975196.3199,2685307074144279.0,51819948.6119,-0.0542,1.2419,1.3872
1,21281262.5672,1158308234583562.0,34033927.6985,0.7031,1.0098,0.9551
2,2501432.9943,14417160428390.049,3796993.6039,0.9585,0.2057,0.1539
3,3522402.4524,23552710093794.99,4853113.4433,0.8336,1.3609,5.2895
4,2406268.4819,11848340716761.527,3442141.8792,0.9263,0.3253,0.2515
Mean,10337312.5631,778686703993357.5,19589225.0474,0.6735,0.8287,1.6074
Std,9229906.2691,1050874883296629.6,19873322.9742,0.3745,0.475,1.8965


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,21616971.0686,2687150429486317.0,51837731.7163,-0.0549,1.1672,0.8727
1,16576280.5757,1287032979351065.2,35875241.8717,0.6701,0.7472,0.9257
2,4176037.8028,43805210570991.625,6618550.4887,0.874,0.278,0.2298
3,3034030.8687,19646793934163.207,4432470.4098,0.8612,1.138,2.5504
4,2749887.6446,16332299134478.076,4041323.9334,0.8984,0.3586,0.2484
Mean,9630641.5921,810793542495403.0,20561063.684,0.6497,0.7378,0.9654
Std,7906032.4516,1057626788129407.6,19698634.5385,0.3616,0.3741,0.8459


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,21646710.867,2649327310823788.0,51471616.5554,-0.0401,1.2575,1.2059
1,18097799.6435,1406150727966570.0,37498676.3495,0.6396,0.5975,0.5677
2,3440735.6339,29526324479551.867,5433813.07,0.9151,0.2761,0.1955
3,2864567.8046,21232456696579.79,4607868.9973,0.85,0.8052,1.1096
4,2163101.2751,9327773797758.49,3054140.4352,0.942,0.3438,0.2736
Mean,9642583.0448,823112918752849.8,20413223.0815,0.6613,0.656,0.6705
Std,8437260.479,1059247875897535.8,20159693.5041,0.3664,0.3548,0.4179


In [4]:
# Blend the tuned models into a single ensemble.
ensemble_model = blend_models(tuned_models)

# Open PyCaret's interactive evaluation widget for the ensemble.
evaluate_model(ensemble_model)

# Finalize the ensemble (PyCaret refits it on the full dataset, hold-out
# included) and score it on both external test sets. The prediction frames
# are kept in variables so they can be inspected in later cells.
final_model = finalize_model(ensemble_model)
country_test_predictions = predict_model(final_model, data=country_test_data_processed)
time_test_predictions = predict_model(final_model, data=time_test_data_processed)

# Persist the finalized model, i.e. the same model whose test metrics are
# reported above. Previously the un-finalized ensemble was saved, so the
# artifact on disk did not match the evaluated model.
save_model(final_model, os.path.join(models_dir, 'ensemble_model'))

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,21302501.9519,2670024709983936.5,51672281.8345,-0.0482,1.173,1.0952
1,17287070.1263,1201128727306073.5,34657304.0975,0.6921,0.6793,0.718
2,2644819.4762,17039911865185.787,4127942.8127,0.951,0.2121,0.1605
3,2842303.5682,16502887065908.066,4062374.56,0.8834,1.1831,2.9599
4,2267879.6296,10177978857919.639,3190294.4782,0.9367,0.3123,0.2451
Mean,9268914.9504,782974843015804.6,19542039.5566,0.683,0.712,1.0357
Std,8286042.4669,1049491347523943.0,20027070.005,0.3771,0.4111,1.0196


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,4608952.1583,56218794458248.64,7497919.3419,0.8353,0.4115,0.4202


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,3167203.0288,45377205307606.82,6736260.4839,0.9863,0.217,0.1565


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Population', 'GDP PPP 2017',
                                              'GDP PPP/capita 2017',
                                              'year_trend', 'year_trend_squared',
                                              'year_trend_cubed',
                                              'year_trend_log',
                                              'year_since_2000',
                                              'year_normalized',
                                              'trend_region_Europe & Central '
                                              'Asia',
                                              'trend_region_Latin America & '
                                              'Caribbean',
                                              'trend_region_East Asia & Pacif...
                                              ('Gradient Boosting Regressor',
     