In [1]:
import sys
import os

# Make the repository root importable so that the `src.*` packages resolve.
# The root is taken to be the parent of the current working directory, i.e.
# the notebook is assumed to run one level below the root (TODO confirm).
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
# Prepend only when the root is not already the first entry, so re-running
# this cell keeps its priority without stacking duplicate sys.path entries.
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)
from pathlib import Path
import warnings
import pandas as pd
from pycaret.regression import *
from src.config.config import Config
from src.data.data_loader import DataLoader
from src.models.model_evaluator import ModelEvaluator

def train_models(train: pd.DataFrame,val: pd.DataFrame,target_column: str):
    """Configure the experiment and train the top candidate models.

    Relies on the notebook-level ``model_evaluator`` instance and on
    ``Config`` for the remaining settings.

    Args:
        train: Frame forwarded to the evaluator as ``train_data``.
        val: Frame forwarded to the evaluator as ``val_data``.
        target_column: Name of the column to predict.

    Returns:
        The result of ``model_evaluator.train_top_models`` (the original
        inline comment describes it as a dictionary of the best models).
    """
    setup_kwargs = {
        'train_size': Config.MODEL_CONFIG['train_size'],
        'train_data': train,
        'val_data': val,
        'target_column': target_column,
        'categorical_features': Config.FEATURE_CONFIG['categorical_columns'],
    }
    model_evaluator.setup_experiment(**setup_kwargs)

    # Read the model count only after setup, mirroring the original order.
    top_n = Config.MODEL_CONFIG['n_top_models']
    return model_evaluator.train_top_models(n_models=top_n)

def tune(models: dict):
    """Tune the given models via the notebook-level ``model_evaluator``.

    Args:
        models: Models to tune; forwarded unchanged to
            ``model_evaluator.tune_models``.

    Returns:
        Whatever ``model_evaluator.tune_models`` returns for ``models``.
    """
    return model_evaluator.tune_models(models)

def ensemble(models: dict):
    """Combine the given models via the notebook-level ``model_evaluator``.

    Args:
        models: Models to combine; forwarded unchanged to
            ``model_evaluator.ensemble_models``.

    Returns:
        Whatever ``model_evaluator.ensemble_models`` returns for ``models``.
    """
    return model_evaluator.ensemble_models(models)
    
def save_models(models: dict):
    """Persist the given models via the notebook-level ``model_evaluator``.

    Args:
        models: Models to save; forwarded unchanged to
            ``model_evaluator.saved_models``.

    Returns:
        Whatever ``model_evaluator.saved_models`` returns. The result is
        propagated because the notebook binds it
        (``save_model = save_models(models)``); previously this wrapper
        always yielded ``None``.
    """
    return model_evaluator.saved_models(models)

In [2]:
# Example usage: load the feature table and create the helper objects.
features_path = Path(Config.PATH_CONFIG['features_dir']) / 'training_data.csv'
df = pd.read_csv(features_path)
data_loader = DataLoader()
model_evaluator = ModelEvaluator()

# Split by country into train / validation / held-out-country sets.
data_cfg = Config.DATA_CONFIG
train_data, val_data, country_test_data = data_loader.split_data_by_countries(
    df,
    train_size=data_cfg['country_train_size'],
    val_size=data_cfg['country_val_size'],
    random_state=data_cfg['random_state'],
)

# Split the training portion again by time; `train_data` is rebound to the
# remaining training part.
train_data, time_test_data = data_loader.split_data_by_time(
    train_data,
    test_size=data_cfg['time_test_size'],
)

# Report statistics for every split, in the same order as before.
for heading, dataset in (
    ('\n训练集统计信息:', train_data),
    ('\n验证集统计信息:', val_data),
    ('\n外部国家测试集统计信息:', country_test_data),
    ('\n外部时间测试集统计信息:', time_test_data),
):
    print(heading)
    data_loader.analyze_datasets(dataset)

# Merge the validation set and the held-out-country set into one test set.
test_data = pd.concat([val_data, country_test_data])
print('\n合并，作为测试集统计信息:')
data_loader.analyze_datasets(test_data)


训练集统计信息:
总数据条数: 1399
国家总数: 57
包含的国家: Albania, Argentina, Austria, Bangladesh, Belgium, Bosnia and Herzegovina, Brazil, Bulgaria, Canada, China, Colombia, Congo, Dem. Rep., Croatia, Cyprus, Czechia, Denmark, Estonia, Ethiopia, Greece, Hungary, Iceland, Indonesia, Iran, Islamic Rep., Iraq, Ireland, Italy, Japan, Korea, Rep., Latvia, Lithuania, Luxembourg, Malaysia, Malta, Mexico, Montenegro, Morocco, Netherlands, Nigeria, North Macedonia, Norway, Peru, Philippines, Poland, Romania, Russian Federation, Saudi Arabia, Serbia, Slovak Republic, Slovenia, South Africa, Spain, Sweden, Switzerland, Tanzania, Turkiye, Uganda, United Kingdom

特征数量: 24
特征列表: MSW, MSW_log, development_stage, gdp_10y_ma, gdp_5y_ma, gdp_acceleration, gdp_growth_rate, gdp_per_capita_growth, gdp_per_capita_ma, gdp_pop_interaction, gdp_trend, pop_density_trend, pop_growth_rate, pop_trend, region_avg_gdp, region_gdp_per_capita, similar_gdp_growth, stage_avg_gdp_growth, stage_weight, weighted_gdp, weighted_pop, year_trend

In [3]:
# Persist each split as CSV in the configured data directory.
data_dir = Config.PATH_CONFIG['data_dir']
for file_name, frame in (
    ('train.csv', train_data),
    ('test.csv', test_data),
    ('time_test_data.csv', time_test_data),
):
    frame.to_csv(os.path.join(data_dir, file_name), index=False)

target_column = Config.DATA_CONFIG['target_column']

# Exclude the raw target and the listed identifier/grouping columns.
excluded_cols = (target_column, 'Country Name', 'Year', 'Region', 'Income Group')
feature_cols = [col for col in train_data.columns if col not in excluded_cols]

train = train_data[feature_cols]
test = test_data[feature_cols]
# NOTE(review): `time_test_data` is rebound to the reduced column set here,
# so re-running this cell would write the reduced frame to time_test_data.csv.
time_test_data = time_test_data[feature_cols]

# Name of the transformed target column ('MSW_log' in the recorded run).
method = Config.FEATURE_CONFIG['target_transform_method']
transformed_column = f'{target_column}_{method}'


In [4]:
# Train the candidate models; the merged test split is passed as the
# validation frame and the transformed column is the prediction target.
models = train_models(train=train, val=test, target_column=transformed_column)

Unnamed: 0,Description,Value
0,Session id,888
1,Target,MSW_log
2,Target type,Regression
3,Original data shape,"(1769, 26)"
4,Transformed data shape,"(1769, 30)"
5,Transformed train set shape,"(1399, 30)"
6,Transformed test set shape,"(370, 30)"
7,Numeric features,24
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.2901,0.1607,0.3824,0.8899,0.0242,0.0193,0.038
omp,Orthogonal Matching Pursuit,0.2951,0.162,0.378,0.8799,0.0237,0.0195,0.338
gbr,Gradient Boosting Regressor,0.3253,0.1864,0.4185,0.8848,0.0264,0.0216,0.12
lightgbm,Light Gradient Boosting Machine,0.3433,0.2236,0.4536,0.8606,0.0283,0.0226,0.096
rf,Random Forest Regressor,0.3532,0.219,0.4513,0.8587,0.0282,0.0233,0.068
xgboost,Extreme Gradient Boosting,0.3526,0.2196,0.4537,0.8654,0.0285,0.0234,0.09
catboost,CatBoost Regressor,0.3448,0.2825,0.4724,0.7877,0.0304,0.0236,0.484
ada,AdaBoost Regressor,0.3555,0.2273,0.4646,0.8442,0.0293,0.0236,0.038
huber,Huber Regressor,0.368,0.2781,0.4797,0.7792,0.0294,0.0237,0.012
br,Bayesian Ridge,0.3839,0.3041,0.5036,0.77,0.0309,0.0248,0.012


In [5]:
# Tune every model returned by the training step.
tune_models = tune(models=models)

当前调优模型: et , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4838,0.4093,0.6397,0.6233,0.0423,0.0342
1,0.3144,0.2202,0.4693,0.9343,0.032,0.0218
2,0.173,0.0602,0.2454,0.9795,0.015,0.0114
3,0.2329,0.0792,0.2814,0.9222,0.0174,0.0153
4,0.2017,0.0702,0.265,0.9363,0.0151,0.0122
Mean,0.2811,0.1678,0.3802,0.8791,0.0244,0.019
Std,0.1118,0.1342,0.1526,0.1294,0.011,0.0084


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.3172,0.2107,0.459,0.8656,0.024,0.0178


当前调优模型: omp , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3619,0.1815,0.426,0.8329,0.0276,0.025
1,0.3295,0.1669,0.4085,0.9502,0.0263,0.0218
2,0.1766,0.0464,0.2155,0.9842,0.0141,0.0122
3,0.1777,0.0457,0.2138,0.9551,0.0129,0.0115
4,0.3099,0.1229,0.3506,0.8885,0.0205,0.019
Mean,0.2711,0.1127,0.3229,0.9222,0.0203,0.0179
Std,0.0785,0.0577,0.0918,0.0544,0.006,0.0053


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Orthogonal Matching Pursuit,0.2261,0.0748,0.2736,0.9522,0.0151,0.0132


当前调优模型: gbr , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4793,0.3499,0.5915,0.6779,0.0385,0.0331
1,0.3041,0.2187,0.4677,0.9348,0.0326,0.0213
2,0.2439,0.096,0.3098,0.9674,0.0191,0.0162
3,0.2984,0.1358,0.3685,0.8667,0.0225,0.0195
4,0.207,0.0725,0.2692,0.9343,0.0155,0.0127
Mean,0.3065,0.1746,0.4013,0.8762,0.0256,0.0206
Std,0.0935,0.1008,0.1162,0.1044,0.0086,0.0069


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.3207,0.235,0.4848,0.85,0.0255,0.0179


当前调优模型: lightgbm , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5421,0.4647,0.6817,0.5722,0.0443,0.0375
1,0.3131,0.2756,0.525,0.9178,0.0365,0.0223
2,0.3089,0.1642,0.4052,0.9442,0.0241,0.02
3,0.2794,0.1276,0.3572,0.8747,0.022,0.0183
4,0.226,0.0811,0.2849,0.9264,0.0164,0.0139
Mean,0.3339,0.2227,0.4508,0.8471,0.0287,0.0224
Std,0.1086,0.137,0.1394,0.1393,0.0102,0.008




Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.3113,0.1866,0.432,0.8809,0.0227,0.0175


当前调优模型: rf , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.49,0.3791,0.6157,0.651,0.0397,0.0337
1,0.3401,0.2254,0.4747,0.9328,0.0323,0.0235
2,0.3339,0.1948,0.4414,0.9338,0.0269,0.0218
3,0.2889,0.136,0.3688,0.8664,0.0228,0.019
4,0.1905,0.058,0.2408,0.9474,0.0138,0.0116
Mean,0.3287,0.1987,0.4283,0.8663,0.0271,0.0219
Std,0.0968,0.1068,0.1234,0.1113,0.0088,0.0071


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,0.3154,0.1853,0.4304,0.8818,0.0227,0.0178


In [6]:
# Manual checkpoint: compare the tuning results above and adjust
# `include_models` in the next cell if needed. Listing the tuned model keys
# replaces an undefined-name statement that aborted "Run All" with a NameError.
list(tune_models)

In [7]:
# Short names of the tuned models that should take part in the ensemble.
include_models = {'rf','gbr','lightgbm','et'}

# Keep only the selected entries, preserving the order of `tune_models`.
model_dict = {}
for short_name, tuned_model in tune_models.items():
    if short_name in include_models:
        model_dict[short_name] = tuned_model

In [8]:
# Ensemble the selected tuned models. NOTE(review): this rebinds `models`,
# which previously held the result of the training step.
models = ensemble(models=model_dict)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4879,0.3842,0.6199,0.6463,0.0404,0.0339
1,0.2876,0.2153,0.464,0.9358,0.0324,0.0204
2,0.2399,0.1028,0.3206,0.9651,0.0193,0.0157
3,0.266,0.111,0.3331,0.891,0.0206,0.0174
4,0.1995,0.0649,0.2547,0.9412,0.0146,0.0122
Mean,0.2962,0.1756,0.3985,0.8759,0.0254,0.0199
Std,0.1003,0.1156,0.1299,0.1172,0.0095,0.0075




Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
0,Extra Trees Regressor,0.2811,0.1678,0.3802,0.8791,0.0244,0.019,0.484
1,Gradient Boosting Regressor,0.3065,0.1746,0.4013,0.8762,0.0256,0.0206,0.464
4,Voting Regressor,0.2962,0.1756,0.3985,0.8759,0.0254,0.0199,0.706
3,Random Forest Regressor,0.3287,0.1987,0.4283,0.8663,0.0271,0.0219,0.468
2,Light Gradient Boosting Machine,0.3339,0.2227,0.4508,0.8471,0.0287,0.0224,0.47


In [9]:
# Persist the result of the ensemble step via the shared evaluator.
save_model = save_models(models=models)

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
