In [1]:
import sys
import os
current_dir = os.getcwd()  
project_root = os.path.dirname(current_dir)  
sys.path.insert(0, project_root) 
from pathlib import Path
import warnings
import pandas as pd
import numpy as np
from pycaret.regression import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, RFE, mutual_info_regression
from src.config.config import Config
from src.data.data_loader import DataLoader
from src.models.model_evaluator import ModelEvaluator

def train_models(train: pd.DataFrame,val: pd.DataFrame,target_column: str):
    """训练模型流程
    Args:
        data_path: 可选的数据文件路径
    """
    model_evaluator.setup_experiment(
        train_size=Config.MODEL_CONFIG['train_size'],
        train_data=train,
        val_data = val,
        target_column = target_column,
        categorical_features = Config.FEATURE_CONFIG['categorical_columns']
    )

    # 训练模型并返回最佳模型字典
    return model_evaluator.train_top_models(n_models=Config.MODEL_CONFIG['n_top_models'])

def tune(models: dict):
    """模型调优和集成"""
    # 对top模型进行调优
    tuned_models = model_evaluator.tune_models(models)

    return tuned_models

def ensemble(models: dict):
    """模型集成"""
    # 对top模型进行集成
    blended_model = model_evaluator.ensemble_models(models)

    return blended_model    
    # 模型
    
def save_models(models: dict):

    model_evaluator.saved_models(models)

def select_features(train_data: pd.DataFrame, test_data: pd.DataFrame, 
                   time_test_data: pd.DataFrame, n_features: int = 15) -> tuple:
    """选择最重要的特征
    
    Args:
        train_data: 训练数据
        test_data: 测试数据
        time_test_data: 时间测试数据
        n_features: 要选择的特征数量，默认为15
        
    Returns:
        tuple: (处理后的训练集, 测试集, 时间测试集)
    """
    # 排除不需要的列
    target_transformed = f"{Config.DATA_CONFIG['target_column']}_{Config.FEATURE_CONFIG['target_transform_method']}"
    feature_cols = [col for col in train_data.columns 
                    if col not in [Config.DATA_CONFIG['target_column'], target_transformed,
                                 'Country Name', 'Year', 'Region', 'Income Group']]
    
    # 自动调整特征数量，确保不超过样本量的1/10
    sample_count = len(train_data)
    max_features = min(n_features, sample_count // 10)
    if max_features < n_features:
        print(f"警告: 样本量({sample_count})较小，特征数量已从{n_features}自动调整为{max_features}")
        n_features = max_features
    
    # 准备数据
    X = train_data[feature_cols]
    y = train_data[target_transformed]
    
    # 1. 基于模型的特征选择 (SelectFromModel)
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    selector1 = SelectFromModel(rf_model, threshold='median')
    selector1.fit(X, y)
    model_selected_features = X.columns[selector1.get_support()].tolist()
    
    # 计算特征重要性分数
    rf_model.fit(X, y)
    model_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # 2. 递归特征消除 (RFE)
    rfe_selector = RFE(estimator=RandomForestRegressor(n_estimators=50, random_state=42), 
                      n_features_to_select=n_features, step=1)
    rfe_selector.fit(X, y)
    rfe_selected_features = X.columns[rfe_selector.support_].tolist()
    
    # 计算RFE排名
    rfe_ranking = pd.DataFrame({
        'feature': feature_cols,
        'ranking': rfe_selector.ranking_
    }).sort_values('ranking')
    
    # 3. 基于互信息的特征选择 (mutual_info_regression)
    mi_scores = mutual_info_regression(X, y)
    mi_selected_features = pd.DataFrame({
        'feature': feature_cols,
        'mi_score': mi_scores
    }).sort_values('mi_score', ascending=False).head(n_features)['feature'].tolist()
    
    # 4. 添加LASSO特征选择 (适合小样本量)
    from sklearn.linear_model import LassoCV
    lasso = LassoCV(cv=5, random_state=42, max_iter=2000)
    lasso.fit(X, y)
    lasso_importance = np.abs(lasso.coef_)
    lasso_selected_features = pd.DataFrame({
        'feature': feature_cols,
        'importance': lasso_importance
    }).sort_values('importance', ascending=False).head(n_features)['feature'].tolist()
    
    # 统计每个特征被选中的次数（投票）
    feature_votes = {}
    for feature in feature_cols:
        feature_votes[feature] = 0
        if feature in model_selected_features:
            feature_votes[feature] += 1
        if feature in rfe_selected_features:
            feature_votes[feature] += 1
        if feature in mi_selected_features:
            feature_votes[feature] += 1
        if feature in lasso_selected_features:
            feature_votes[feature] += 1
    
    # 按投票数排序选择特征
    selected_features = pd.DataFrame({
        'feature': list(feature_votes.keys()),
        'votes': list(feature_votes.values())
    }).sort_values(['votes', 'feature'], ascending=[False, True]).head(n_features)['feature'].tolist()
    
    # 打印各方法选择的特征
    print("\n基于模型重要性选择的特征:")
    for i, row in model_importance.head(10).iterrows():
        print(f"{i+1}. {row['feature']} (重要性: {row['importance']:.4f})")
        
    print("\n基于递归特征消除选择的特征:")
    for i, feature in enumerate(rfe_selected_features[:10], 1):
        print(f"{i}. {feature} (排名: {rfe_ranking[rfe_ranking['feature']==feature]['ranking'].values[0]})")
        
    print("\n基于互信息选择的特征:")
    for i, feature in enumerate(mi_selected_features[:10], 1):
        mi_value = mi_scores[feature_cols.index(feature)]
        print(f"{i}. {feature} (互信息: {mi_value:.4f})")
        
    print("\n基于LASSO选择的特征:")
    for i, feature in enumerate(lasso_selected_features[:10], 1):
        lasso_value = lasso_importance[feature_cols.index(feature)]
        print(f"{i}. {feature} (LASSO系数: {lasso_value:.4f})")
    
    # 打印最终选择的特征
    print(f"\n最终选择的{len(selected_features)}个特征 (基于投票):")
    for i, feature in enumerate(selected_features, 1):
        votes = feature_votes[feature]
        print(f"{i}. {feature} (得票: {votes}/4)")
    
    # 更新数据集，保留目标变量和转换后的目标变量
    selected_columns = selected_features + [target_transformed]    
    train = train_data[selected_columns]
    test = test_data[selected_columns]
    time_test = time_test_data[selected_columns]
    
    return train, test, time_test

In [2]:
# 示例用法
df = pd.read_csv(Path(Config.PATH_CONFIG['features_dir'])  / 'training_data.csv')
data_loader = DataLoader()
model_evaluator = ModelEvaluator()

# 划分数据集
train_data, val_data, country_test_data = data_loader.split_data_by_countries(
    df,
    train_size=Config.DATA_CONFIG['country_train_size'],
    val_size=Config.DATA_CONFIG['country_val_size'],
    random_state=Config.DATA_CONFIG['random_state']
)

train_data, time_test_data = data_loader.split_data_by_time(train_data,
    test_size=Config.DATA_CONFIG['time_test_size'])

# 数据分析
print('\n训练集统计信息:')
data_loader.analyze_datasets(train_data)
print('\n验证集统计信息:')
data_loader.analyze_datasets(val_data)
print('\n外部国家测试集统计信息:')
data_loader.analyze_datasets(country_test_data)
print('\n外部时间测试集统计信息:')
data_loader.analyze_datasets(time_test_data)
#合并验证集和外部国家测试集作为测试集
test_data = pd.concat([val_data, country_test_data])
print('\n合并，作为测试集统计信息:')
data_loader.analyze_datasets(test_data)

train_data.to_csv(os.path.join(Config.PATH_CONFIG['data_dir'], 'train.csv'), index=False)
test_data.to_csv(os.path.join(Config.PATH_CONFIG['data_dir'], 'test.csv'), index=False)
time_test_data.to_csv(os.path.join(Config.PATH_CONFIG['data_dir'], 'time_test_data.csv'), index=False)


训练集统计信息:
总数据条数: 3268
国家总数: 136
包含的国家: Albania, Argentina, Armenia, Aruba, Australia, Austria, Bahamas, The, Bangladesh, Barbados, Belarus, Belgium, Belize, Benin, Botswana, Brazil, Bulgaria, Burkina Faso, Burundi, Cabo Verde, Chad, Chile, China, Colombia, Comoros, Congo, Dem. Rep., Costa Rica, Croatia, Cyprus, Denmark, Dominican Republic, Ecuador, Egypt, Arab Rep., El Salvador, Equatorial Guinea, Ethiopia, Fiji, Finland, France, Georgia, Germany, Ghana, Greece, Grenada, Guatemala, Guinea, Guinea-Bissau, Guyana, Haiti, Honduras, Hungary, Iceland, India, Indonesia, Iran, Islamic Rep., Ireland, Italy, Jamaica, Japan, Jordan, Kazakhstan, Kenya, Kiribati, Korea, Rep., Kyrgyz Republic, Lao PDR, Latvia, Lebanon, Lesotho, Liberia, Libya, Lithuania, Luxembourg, Madagascar, Malawi, Maldives, Mali, Malta, Mauritania, Mauritius, Mexico, Moldova, Mongolia, Morocco, Mozambique, Myanmar, Namibia, Nepal, Netherlands, New Zealand, Nicaragua, Niger, Nigeria, North Macedonia, Norway, Oman, Pakistan, Pap

In [3]:
# 排除不需要的列
feature_cols = [col for col in train_data.columns 
                if col not in [Config.DATA_CONFIG['target_column'], 'Country Name', 'Year','Region', 'Income Group']]

train = train_data[feature_cols]
test = test_data[feature_cols]
time_test_data = time_test_data[feature_cols] 

target_column = Config.DATA_CONFIG['target_column']
method = Config.FEATURE_CONFIG['target_transform_method']
transformed_column = f'{target_column}_{method}'

In [4]:
# 使用特征选择方法
selected_features, test_selected, time_test_selected = select_features(train_data, test_data, time_test_data, n_features=30)

# 强制添加时间相关特征
time_features = [col for col in train_data.columns if any(term in col for term in 
                ['year', 'time', 'decade', 'country_specific', 'growth_from_first', 'acceleration'])]

for col in time_features:
    if col not in selected_features.columns:
        selected_features[col] = train_data[col]
        test_selected[col] = test_data[col]
        time_test_selected[col] = time_test_data[col]
        print(f"强制添加时间特征: {col}")

# 使用增强后的特征集训练模型
train, test, time_test_data = selected_features, test_selected, time_test_selected


基于模型重要性选择的特征:
6. gdp_ppp_2017_squared (重要性: 0.1876)
8. gdp_ppp_2017_exp (重要性: 0.1718)
2. GDP PPP 2017 (重要性: 0.1682)
7. gdp_ppp_2017_cubic (重要性: 0.1619)
5. gdp_ppp_2017_log (重要性: 0.1474)
49. gdp_population_interaction (重要性: 0.1003)
41. 1_population_relative_gdp_pc (重要性: 0.0057)
31. income_group_population_rank (重要性: 0.0050)
19. population_cubic (重要性: 0.0050)
42. 2_population_relative_gdp_pc (重要性: 0.0049)

基于递归特征消除选择的特征:
1. Population (排名: 1)
2. GDP PPP 2017 (排名: 1)
3. Urban population % (排名: 1)
4. gdp_ppp_2017_log (排名: 1)
5. gdp_ppp_2017_squared (排名: 1)
6. gdp_ppp_2017_cubic (排名: 1)
7. gdp_ppp_2017_exp (排名: 1)
8. population_log (排名: 1)
9. population_squared (排名: 1)
10. population_cubic (排名: 1)

基于互信息选择的特征:
1. gdp_ppp_2017_log (互信息: 2.3811)
2. gdp_ppp_2017_squared (互信息: 2.3766)
3. gdp_ppp_2017_cubic (互信息: 2.3698)
4. gdp_ppp_2017_exp (互信息: 2.3656)
5. gdp_population_interaction (互信息: 1.9882)
6. GDP PPP 2017 (互信息: 1.8890)
7. population_squared (互信息: 1.8118)
8. population_exp (互信息: 1.8099)


In [5]:
# 训练模型
models = train_models(train,test,transformed_column)

Unnamed: 0,Description,Value
0,Session id,456
1,Target,CW_log
2,Target type,Regression
3,Original data shape,"(4184, 34)"
4,Transformed data shape,"(4184, 34)"
5,Transformed train set shape,"(3268, 34)"
6,Transformed test set shape,"(916, 34)"
7,Numeric features,33
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.3706,0.3226,0.5544,0.9384,0.0374,0.0266,0.05
br,Bayesian Ridge,0.3992,0.3445,0.5766,0.9228,0.0384,0.0286,0.008
huber,Huber Regressor,0.4068,0.406,0.6194,0.9044,0.0428,0.0299,0.016
ridge,Ridge Regression,0.4171,0.3971,0.6206,0.9181,0.0416,0.0303,0.436
gbr,Gradient Boosting Regressor,0.4147,0.391,0.6207,0.9202,0.0407,0.0292,0.456
rf,Random Forest Regressor,0.4338,0.4099,0.6343,0.9185,0.0424,0.0306,0.2
lightgbm,Light Gradient Boosting Machine,0.4337,0.4239,0.6457,0.914,0.0429,0.0308,0.098
ada,AdaBoost Regressor,0.4621,0.4454,0.6502,0.9144,0.0431,0.0327,0.098
xgboost,Extreme Gradient Boosting,0.4588,0.4471,0.6663,0.9057,0.0438,0.0322,0.108
catboost,CatBoost Regressor,0.4194,0.4677,0.678,0.9058,0.0452,0.0303,0.75


In [6]:
# 定义需要包含的模型简称集合
include_models = {'rf','xgboost','et','lightgbm','gbr','catboost','knn'}

# 生成过滤后的有序字典
model_dict = {
    k: v for k, v in models.items()
    if k in include_models
}

In [7]:
print(f'当前调优模型: {", ".join(model_dict.keys())}')

# 调优
tune_models = tune(model_dict)

当前调优模型: et, gbr, rf, lightgbm, xgboost, catboost
当前调优模型: et , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2343,0.0906,0.3009,0.969,0.0208,0.0174
1,0.3883,0.3437,0.5862,0.935,0.0361,0.0263
2,0.3286,0.3141,0.5604,0.9372,0.0371,0.0226
3,0.4163,0.3473,0.5894,0.9443,0.052,0.0347
4,0.419,0.4669,0.6833,0.9169,0.0385,0.0278
Mean,0.3573,0.3125,0.5441,0.9405,0.0369,0.0258
Std,0.0696,0.1227,0.1285,0.0169,0.0099,0.0058


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.2565,0.1058,0.3252,0.9654,0.0224,0.0184


当前调优模型: gbr , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3225,0.1738,0.4169,0.9404,0.0292,0.0233
1,0.4158,0.3937,0.6274,0.9255,0.0387,0.0282
2,0.3365,0.3214,0.5669,0.9358,0.037,0.0227
3,0.4627,0.4284,0.6545,0.9314,0.0559,0.0382
4,0.4432,0.5138,0.7168,0.9085,0.0404,0.0292
Mean,0.3961,0.3662,0.5965,0.9283,0.0402,0.0283
Std,0.0566,0.1144,0.1019,0.0111,0.0087,0.0056


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.2629,0.1072,0.3274,0.9649,0.0218,0.0184


当前调优模型: rf , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3437,0.2104,0.4587,0.9279,0.0321,0.0248
1,0.3915,0.355,0.5959,0.9328,0.0365,0.0264
2,0.3385,0.3265,0.5714,0.9347,0.0372,0.0229
3,0.4411,0.3957,0.629,0.9366,0.0548,0.0365
4,0.4505,0.494,0.7029,0.9121,0.0397,0.0297
Mean,0.3931,0.3563,0.5916,0.9288,0.04,0.0281
Std,0.0469,0.0924,0.0798,0.0089,0.0078,0.0048


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,0.2855,0.1188,0.3447,0.9611,0.0232,0.0201


当前调优模型: lightgbm , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3066,0.1422,0.3771,0.9513,0.0257,0.0222
1,0.4082,0.3717,0.6097,0.9297,0.0381,0.0279
2,0.3687,0.3437,0.5862,0.9313,0.0385,0.0249
3,0.4271,0.3472,0.5892,0.9444,0.0518,0.0358
4,0.4227,0.546,0.739,0.9028,0.0406,0.0275
Mean,0.3866,0.3502,0.5802,0.9319,0.0389,0.0277
Std,0.045,0.1282,0.1161,0.0166,0.0083,0.0046




Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.2614,0.104,0.3225,0.9659,0.0212,0.0182


当前调优模型: xgboost , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3494,0.2162,0.465,0.9259,0.0307,0.0247
1,0.4369,0.4118,0.6417,0.9221,0.0411,0.0306
2,0.3527,0.363,0.6025,0.9274,0.0394,0.0239
3,0.4293,0.3953,0.6288,0.9367,0.0538,0.0357
4,0.444,0.5399,0.7348,0.9039,0.0411,0.0292
Mean,0.4025,0.3853,0.6146,0.9232,0.0412,0.0288
Std,0.0422,0.1037,0.0871,0.0108,0.0074,0.0043


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extreme Gradient Boosting,0.2429,0.0925,0.3041,0.9697,0.0197,0.0168


当前调优模型: catboost , 结果如下:


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3207,0.1568,0.396,0.9463,0.0279,0.0238
1,0.398,0.3558,0.5965,0.9327,0.0366,0.0266
2,0.4151,0.3908,0.6251,0.9219,0.0412,0.0279
3,0.5432,0.5391,0.7342,0.9136,0.0628,0.0459
4,0.4433,0.5552,0.7451,0.9012,0.0421,0.0294
Mean,0.424,0.3995,0.6194,0.9231,0.0421,0.0307
Std,0.0721,0.1446,0.1261,0.0155,0.0115,0.0078


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.2758,0.1203,0.3469,0.9606,0.0232,0.0193


In [8]:
# 集成
models = ensemble(tune_models)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2933,0.1287,0.3587,0.9559,0.0246,0.0213
1,0.399,0.3622,0.6018,0.9315,0.0371,0.0272
2,0.3379,0.3277,0.5725,0.9345,0.0375,0.0228
3,0.441,0.3848,0.6203,0.9383,0.0541,0.037
4,0.4293,0.5041,0.71,0.9103,0.0395,0.0282
Mean,0.3801,0.3415,0.5727,0.9341,0.0386,0.0273
Std,0.0562,0.1219,0.1164,0.0146,0.0094,0.0055




Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
0,Extra Trees Regressor,0.3573,0.3125,0.5441,0.9405,0.0369,0.0258,0.588
6,Voting Regressor,0.3801,0.3415,0.5727,0.9341,0.0386,0.0273,0.904
3,Light Gradient Boosting Machine,0.3866,0.3502,0.5802,0.9319,0.0389,0.0277,0.082
2,Random Forest Regressor,0.3931,0.3563,0.5916,0.9288,0.04,0.0281,0.458
1,Gradient Boosting Regressor,0.3961,0.3662,0.5965,0.9283,0.0402,0.0283,0.694
4,Extreme Gradient Boosting,0.4025,0.3853,0.6146,0.9232,0.0412,0.0288,0.104
5,CatBoost Regressor,0.424,0.3995,0.6194,0.9231,0.0421,0.0307,0.104


In [None]:
for model_name, model in models.items():
    print(f"\n{'-'*30}")
    print(f"正在分析模型: {model_name.upper()}")
    print(f"{'-'*30}")
    
    # SHAP值解释（需要安装shap包）
    try:
        interpret_model(model)
        plot_model(model, plot = 'feature')
    except:
        print(f"{model_name} 不支持SHAP分析")

In [None]:
# 定义需要包含的模型简称集合
include_models = {'rf','xgboost','et','lightgbm','gbr','catboost','knn'}

# 生成过滤后的有序字典
seleced_model_dict = {
    k: v for k, v in tune_models.items()
    if k in include_models
}
# 集成
seleced_models = ensemble(seleced_model_dict)

In [9]:
seleced_models = models
save_model = save_models(seleced_models)

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
