In [1]:
import sys
import os
# Prepend the parent of the working directory to sys.path
# (presumably so the `src.*` imports further down resolve -- confirm project layout).
current_dir = os.getcwd()  # current working directory of the kernel
project_root = os.path.dirname(current_dir)  # one level above the working directory
sys.path.insert(0, project_root) # index 0: searched before every other sys.path entry
from pathlib import Path
import warnings
import pandas as pd
from pycaret.regression import *
from src.config.config import Config
from src.data.data_loader import DataLoader
from src.models.model_evaluator import ModelEvaluator

def train_models(train: pd.DataFrame, val: pd.DataFrame, target_column: str):
    """Configure the experiment on the shared evaluator and train the top models.

    Uses the notebook-level ``model_evaluator`` object, which has to exist
    before this function is called.

    Args:
        train: Training frame, forwarded as ``train_data``.
        val: Validation frame, forwarded as ``val_data``.
        target_column: Name of the column to predict.

    Returns:
        The value returned by ``model_evaluator.train_top_models``.
    """
    experiment_kwargs = {
        'train_size': Config.MODEL_CONFIG['train_size'],
        'train_data': train,
        'val_data': val,
        'target_column': target_column,
        'categorical_features': Config.FEATURE_CONFIG['categorical_columns'],
    }
    model_evaluator.setup_experiment(**experiment_kwargs)

    # Number of models to keep comes from the project configuration.
    top_n = Config.MODEL_CONFIG['n_top_models']
    return model_evaluator.train_top_models(n_models=top_n)

def tune(models: dict):
    """Tune the given models through the shared evaluator.

    Uses the notebook-level ``model_evaluator`` object.

    Args:
        models: Models to tune; passed unchanged to
            ``model_evaluator.tune_models``.

    Returns:
        The value returned by ``model_evaluator.tune_models``.
    """
    return model_evaluator.tune_models(models)

def ensemble(models: dict):
    """Combine the given models into an ensemble through the shared evaluator.

    Uses the notebook-level ``model_evaluator`` object.

    Args:
        models: Models to combine; passed unchanged to
            ``model_evaluator.ensemble_models``.

    Returns:
        The value returned by ``model_evaluator.ensemble_models``.
    """
    return model_evaluator.ensemble_models(models)
    
def save_models(models: dict):
    """Hand the given models to the shared evaluator for saving.

    Uses the notebook-level ``model_evaluator`` object.

    Args:
        models: Models to save; passed unchanged to
            ``model_evaluator.saved_models``.

    Returns:
        None. Whatever ``saved_models`` returns is discarded.
    """
    model_evaluator.saved_models(models)
    return None

In [2]:
# Example usage: read the feature table and create the helper objects.
features_path = Path(Config.PATH_CONFIG['features_dir']) / 'training_data.csv'
df = pd.read_csv(features_path)
data_loader = DataLoader()
model_evaluator = ModelEvaluator()

# First split: three frames from split_data_by_countries, sized and seeded
# from the project configuration.
country_split_kwargs = {
    'train_size': Config.DATA_CONFIG['country_train_size'],
    'val_size': Config.DATA_CONFIG['country_val_size'],
    'random_state': Config.DATA_CONFIG['random_state'],
}
train_data, val_data, country_test_data = data_loader.split_data_by_countries(
    df, **country_split_kwargs
)

# Second split: the training frame from above is divided again by
# split_data_by_time; `train_data` is rebound to the first part.
train_data, time_test_data = data_loader.split_data_by_time(
    train_data,
    test_size=Config.DATA_CONFIG['time_test_size'],
)

# Print a heading and the loader's analysis for each split, in order.
for _heading, _frame in (
    ('\n训练集统计信息:', train_data),
    ('\n验证集统计信息:', val_data),
    ('\n外部国家测试集统计信息:', country_test_data),
    ('\n外部时间测试集统计信息:', time_test_data),
):
    print(_heading)
    data_loader.analyze_datasets(_frame)

# Stack the validation frame and the external-country frame into one test frame.
test_data = pd.concat([val_data, country_test_data])
print('\n合并，作为测试集统计信息:')
data_loader.analyze_datasets(test_data)


训练集统计信息:
总数据条数: 1399
国家总数: 57
包含的国家: Albania, Argentina, Austria, Bangladesh, Belgium, Bosnia and Herzegovina, Brazil, Bulgaria, Canada, China, Colombia, Congo, Dem. Rep., Croatia, Cyprus, Czechia, Denmark, Estonia, Ethiopia, Greece, Hungary, Iceland, Indonesia, Iran, Islamic Rep., Iraq, Ireland, Italy, Japan, Korea, Rep., Latvia, Lithuania, Luxembourg, Malaysia, Malta, Mexico, Montenegro, Morocco, Netherlands, Nigeria, North Macedonia, Norway, Peru, Philippines, Poland, Romania, Russian Federation, Saudi Arabia, Serbia, Slovak Republic, Slovenia, South Africa, Spain, Sweden, Switzerland, Tanzania, Turkiye, Uganda, United Kingdom

特征数量: 21
特征列表: MSW, MSW_log, development_stage, gdp_10y_ma, gdp_5y_ma, gdp_acceleration, gdp_growth_rate, gdp_per_capita_growth, gdp_per_capita_ma, gdp_per_capita_power, gdp_pop_interaction, gdp_pop_nonlinear, gdp_trend, pop_density_trend, pop_growth_rate, pop_trend, similar_countries_gdp_growth, stage_avg_gdp_growth, year_trend, year_trend_log, year_trend_s

In [3]:
# Write the three splits to CSV files under the configured data directory.
output_dir = Config.PATH_CONFIG['data_dir']
for _frame, _filename in (
    (train_data, 'train.csv'),
    (test_data, 'test.csv'),
    (time_test_data, 'time_test_data.csv'),
):
    _frame.to_csv(os.path.join(output_dir, _filename), index=False)

target_column = Config.DATA_CONFIG['target_column']
method = Config.FEATURE_CONFIG['target_transform_method']

# Keep every column except the configured target and the two identifier columns.
excluded_cols = (target_column, 'Country Name', 'Year')
feature_cols = [name for name in train_data.columns if name not in excluded_cols]

train, test = train_data[feature_cols], test_data[feature_cols]
# NOTE(review): `time_test_data` is rebound to the reduced frame here, so
# re-running this cell on its own writes time_test_data.csv without the
# excluded columns.
time_test_data = time_test_data[feature_cols]

# Name of the column used as the modelling target: "<target>_<method>".
transformed_column = f'{target_column}_{method}'


In [4]:
# Train the models; `test` (validation + external-country rows, feature
# columns only) is passed as the validation frame.
models = train_models(train=train, val=test, target_column=transformed_column)

Unnamed: 0,Description,Value
0,Session id,888
1,Target,MSW_log
2,Target type,Regression
3,Original data shape,"(1769, 25)"
4,Transformed data shape,"(1769, 38)"
5,Transformed train set shape,"(1399, 38)"
6,Transformed test set shape,"(370, 38)"
7,Numeric features,21
8,Categorical features,3
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,0.3203,0.1685,0.4028,0.8855,0.0251,0.0211,0.012
gbr,Gradient Boosting Regressor,0.3255,0.2031,0.4294,0.8558,0.0273,0.0217,0.12
ridge,Ridge Regression,0.3321,0.2065,0.4238,0.8423,0.0268,0.0222,0.434
lightgbm,Light Gradient Boosting Machine,0.3515,0.238,0.4627,0.8385,0.029,0.0233,0.098
xgboost,Extreme Gradient Boosting,0.3541,0.2396,0.4625,0.8382,0.0291,0.0235,0.096
rf,Random Forest Regressor,0.3558,0.2353,0.4631,0.8357,0.0291,0.0236,0.068
ada,AdaBoost Regressor,0.368,0.2447,0.473,0.8259,0.03,0.0246,0.042
catboost,CatBoost Regressor,0.3669,0.315,0.4935,0.7699,0.0315,0.0249,0.492
et,Extra Trees Regressor,0.3822,0.3031,0.5203,0.8077,0.0333,0.0259,0.04
dt,Decision Tree Regressor,0.4016,0.2858,0.5228,0.812,0.033,0.0269,0.016


In [5]:
# Tune every model returned by the training step.
tune_models = tune(models=models)

当前调优模型: br , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4338,0.2447,0.4946,0.7748,0.0315,0.0297
1,0.2723,0.1124,0.3353,0.9665,0.02,0.0173
2,0.3127,0.2272,0.4766,0.9228,0.0315,0.022
3,0.2618,0.0834,0.2888,0.9181,0.0172,0.0168
4,0.3144,0.1693,0.4114,0.8465,0.0249,0.0197
Mean,0.319,0.1674,0.4014,0.8857,0.025,0.0211
Std,0.0611,0.0626,0.0794,0.0675,0.0058,0.0047


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Bayesian Ridge,0.448,0.6665,0.8164,0.5747,0.0444,0.0252


当前调优模型: gbr , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4445,0.2823,0.5313,0.7402,0.0344,0.0305
1,0.3132,0.2081,0.4562,0.9379,0.0313,0.0216
2,0.2614,0.1113,0.3336,0.9622,0.021,0.0175
3,0.2873,0.1403,0.3746,0.8622,0.023,0.0188
4,0.2125,0.0718,0.2679,0.9349,0.0155,0.0131
Mean,0.3038,0.1627,0.3927,0.8875,0.025,0.0203
Std,0.0778,0.0745,0.0923,0.0809,0.0069,0.0058


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.3781,0.3217,0.5672,0.7947,0.03,0.0211


当前调优模型: ridge , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4795,0.3297,0.5742,0.6966,0.0371,0.0333
1,0.2306,0.1057,0.3251,0.9685,0.02,0.0151
2,0.2589,0.1405,0.3749,0.9522,0.0246,0.0179
3,0.2575,0.081,0.2846,0.9204,0.017,0.0165
4,0.3156,0.1724,0.4152,0.8437,0.0252,0.0197
Mean,0.3084,0.1659,0.3948,0.8763,0.0248,0.0205
Std,0.0899,0.0876,0.1,0.0996,0.0069,0.0066


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Ridge Regression,0.4453,0.6457,0.8036,0.588,0.0436,0.0251


当前调优模型: lightgbm , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.53,0.4578,0.6766,0.5786,0.0439,0.0366
1,0.3466,0.2638,0.5136,0.9213,0.0352,0.0239
2,0.2835,0.1681,0.41,0.9428,0.0242,0.0181
3,0.264,0.1099,0.3316,0.892,0.0205,0.0173
4,0.2042,0.0665,0.2578,0.9397,0.0148,0.0125
Mean,0.3257,0.2132,0.4379,0.8549,0.0277,0.0217
Std,0.1118,0.139,0.1464,0.1393,0.0105,0.0083




Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.3165,0.191,0.437,0.8781,0.0231,0.0179


当前调优模型: xgboost , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4744,0.3516,0.593,0.6764,0.038,0.0324
1,0.3181,0.205,0.4527,0.9389,0.0307,0.0218
2,0.2414,0.0989,0.3146,0.9664,0.0192,0.016
3,0.2976,0.1432,0.3784,0.8594,0.0231,0.0195
4,0.2489,0.0853,0.2921,0.9226,0.017,0.0154
Mean,0.3161,0.1768,0.4061,0.8727,0.0256,0.021
Std,0.0843,0.0969,0.1089,0.1043,0.0077,0.0061


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,0.4369,0.4395,0.6629,0.7196,0.0352,0.0244


当前调优模型: rf , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4971,0.4155,0.6446,0.6176,0.0415,0.0342
1,0.2824,0.2048,0.4525,0.9389,0.0314,0.0201
2,0.3291,0.1822,0.4268,0.9381,0.0256,0.0214
3,0.3013,0.1413,0.3759,0.8612,0.0233,0.0198
4,0.2185,0.0814,0.2853,0.9262,0.0163,0.0133
Mean,0.3257,0.205,0.437,0.8564,0.0276,0.0218
Std,0.0931,0.1133,0.1185,0.1228,0.0084,0.0068


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,0.32,0.2188,0.4677,0.8604,0.0244,0.0179


当前调优模型: ada , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5088,0.4181,0.6466,0.6151,0.0417,0.035
1,0.3471,0.2326,0.4823,0.9306,0.0326,0.0239
2,0.3009,0.1514,0.3892,0.9485,0.0241,0.0199
3,0.305,0.1496,0.3868,0.853,0.0239,0.0201
4,0.2315,0.0855,0.2923,0.9225,0.0168,0.0141
Mean,0.3387,0.2075,0.4395,0.8539,0.0278,0.0226
Std,0.0928,0.1152,0.1198,0.1237,0.0086,0.0069


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,AdaBoost Regressor,0.3543,0.2603,0.5102,0.8339,0.0268,0.0199


当前调优模型: catboost , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5437,0.4745,0.6888,0.5632,0.0451,0.038
1,0.3419,0.2421,0.4921,0.9278,0.0336,0.0235
2,0.2101,0.0839,0.2897,0.9715,0.0174,0.0137
3,0.2636,0.1099,0.3315,0.8921,0.0203,0.0172
4,0.1929,0.0589,0.2428,0.9465,0.014,0.0118
Mean,0.3104,0.1939,0.409,0.8602,0.0261,0.0208
Std,0.1276,0.1539,0.1631,0.1507,0.0116,0.0094


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.4115,0.3901,0.6246,0.7511,0.033,0.0229


当前调优模型: et , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5216,0.5339,0.7307,0.5086,0.0484,0.0374
1,0.4057,0.3401,0.5832,0.8986,0.0375,0.027
2,0.3781,0.3043,0.5516,0.8965,0.0365,0.0264
3,0.2691,0.1023,0.3199,0.8995,0.0196,0.0176
4,0.2175,0.0757,0.2751,0.9314,0.0157,0.0132
Mean,0.3584,0.2713,0.4921,0.8269,0.0315,0.0243
Std,0.1068,0.1683,0.1706,0.1597,0.0121,0.0084


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.3982,0.3193,0.565,0.7963,0.0297,0.0223


当前调优模型: dt , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4491,0.3009,0.5485,0.7231,0.0354,0.0307
1,0.3184,0.2358,0.4856,0.9297,0.0326,0.0221
2,0.4959,0.3503,0.5918,0.8809,0.0364,0.0328
3,0.352,0.1982,0.4452,0.8054,0.0268,0.0228
4,0.2425,0.0985,0.3139,0.9106,0.0183,0.015
Mean,0.3716,0.2367,0.477,0.8499,0.0299,0.0247
Std,0.0909,0.0867,0.0959,0.0763,0.0067,0.0064


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Decision Tree Regressor,0.3731,0.2525,0.5025,0.8389,0.027,0.0214


In [6]:
# Short names of the tuned models to carry into the ensemble.
include_models = {'gbr', 'lightgbm', 'xgboost'}

# Filtered copy of the tuned models, in the iteration order of `tune_models`.
model_dict = {
    model_id: tune_models[model_id]
    for model_id in tune_models
    if model_id in include_models
}

In [7]:
# Build the ensemble from the selected tuned models.
# Note: this rebinds `models`, which previously held the result of train_models.
models = ensemble(models=model_dict)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4685,0.3421,0.5849,0.6851,0.0378,0.0322
1,0.3087,0.2088,0.457,0.9377,0.0315,0.0214
2,0.2431,0.1095,0.3309,0.9628,0.0198,0.0159
3,0.2745,0.1239,0.352,0.8783,0.0217,0.018
4,0.213,0.0684,0.2616,0.9379,0.015,0.0131
Mean,0.3016,0.1706,0.3973,0.8804,0.0252,0.0201
Std,0.0893,0.0972,0.1128,0.1015,0.0083,0.0066




Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
0,Gradient Boosting Regressor,0.3038,0.1627,0.3927,0.8875,0.025,0.0203,0.654
3,Voting Regressor,0.3016,0.1706,0.3973,0.8804,0.0252,0.0201,0.114
2,Extreme Gradient Boosting,0.3161,0.1768,0.4061,0.8727,0.0256,0.021,0.09
1,Light Gradient Boosting Machine,0.3257,0.2132,0.4379,0.8549,0.0277,0.0217,0.57


In [8]:
# Save the models. save_models() has no return statement, so `save_model` is None.
save_model = save_models(models=models)

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
