In [1]:
import sys
import os
current_dir = os.getcwd()  
project_root = os.path.dirname(current_dir)  
sys.path.insert(0, project_root) 
from pathlib import Path
import warnings
import pandas as pd
from pycaret.regression import *
from src.config.config import Config
from src.data.data_loader import DataLoader
from src.models.model_evaluator import ModelEvaluator

def train_models(train: pd.DataFrame, val: pd.DataFrame, target_column: str):
    """Configure the experiment on the given splits and train the top models.

    Uses the notebook-level ``model_evaluator`` instance and the shared
    ``Config`` settings; both must exist before this function is called.

    Args:
        train: Training frame, forwarded to the evaluator as ``train_data``.
        val: Validation frame, forwarded to the evaluator as ``val_data``.
        target_column: Name of the target column, forwarded unchanged.

    Returns:
        The result of ``model_evaluator.train_top_models``; the notebook
        iterates over it with ``.items()``, i.e. treats it as a mapping.
    """
    evaluator = model_evaluator

    # Collect every setup argument in one place before handing it over.
    experiment_kwargs = dict(
        train_size=Config.MODEL_CONFIG['train_size'],
        train_data=train,
        val_data=val,
        target_column=target_column,
        categorical_features=Config.FEATURE_CONFIG['categorical_columns'],
    )
    evaluator.setup_experiment(**experiment_kwargs)

    # How many models to keep is read from the shared configuration.
    top_n = Config.MODEL_CONFIG['n_top_models']
    return evaluator.train_top_models(n_models=top_n)

def tune(models: dict):
    """Tune the supplied models through the notebook-level evaluator.

    Args:
        models: Models to tune, passed straight to
            ``model_evaluator.tune_models``.

    Returns:
        Whatever ``model_evaluator.tune_models`` returns for ``models``.
    """
    return model_evaluator.tune_models(models)

def ensemble(models: dict):
    """Combine the supplied models through the notebook-level evaluator.

    Args:
        models: Models to combine, passed straight to
            ``model_evaluator.ensemble_models``.

    Returns:
        Whatever ``model_evaluator.ensemble_models`` returns for ``models``.
    """
    return model_evaluator.ensemble_models(models)
    
def save_models(models: dict):
    """Persist the supplied models through the notebook-level evaluator.

    Args:
        models: Models to save, passed straight to
            ``model_evaluator.saved_models``.

    Returns:
        Whatever ``model_evaluator.saved_models`` returns. Previously the
        result was dropped and ``None`` was always returned, even though the
        notebook binds the return value of this function.
    """
    # NOTE(review): the evaluator method is spelled `saved_models`; confirm
    # against ModelEvaluator that this is the intended name.
    return model_evaluator.saved_models(models)

In [2]:
# Example usage: read the feature table and create the shared helpers.
df = pd.read_csv(Path(Config.PATH_CONFIG['features_dir'], 'training_data.csv'))
data_loader = DataLoader()
model_evaluator = ModelEvaluator()

# Split the data set: first by country, then split the training part by time.
train_data, val_data, country_test_data = data_loader.split_data_by_countries(
    df,
    train_size=Config.DATA_CONFIG['country_train_size'],
    val_size=Config.DATA_CONFIG['country_val_size'],
    random_state=Config.DATA_CONFIG['random_state'],
)
train_data, time_test_data = data_loader.split_data_by_time(
    train_data,
    test_size=Config.DATA_CONFIG['time_test_size'],
)

# Report statistics for every split, in a fixed order.
for split_label, split_frame in (
    ('\n训练集统计信息:', train_data),
    ('\n验证集统计信息:', val_data),
    ('\n外部国家测试集统计信息:', country_test_data),
    ('\n外部时间测试集统计信息:', time_test_data),
):
    print(split_label)
    data_loader.analyze_datasets(split_frame)

# The validation set and the external-country test set together form the test set.
test_data = pd.concat([val_data, country_test_data])
print('\n合并，作为测试集统计信息:')
data_loader.analyze_datasets(test_data)


训练集统计信息:
总数据条数: 1399
国家总数: 57
包含的国家: Albania, Argentina, Austria, Bangladesh, Belgium, Bosnia and Herzegovina, Brazil, Bulgaria, Canada, China, Colombia, Congo, Dem. Rep., Croatia, Cyprus, Czechia, Denmark, Estonia, Ethiopia, Greece, Hungary, Iceland, Indonesia, Iran, Islamic Rep., Iraq, Ireland, Italy, Japan, Korea, Rep., Latvia, Lithuania, Luxembourg, Malaysia, Malta, Mexico, Montenegro, Morocco, Netherlands, Nigeria, North Macedonia, Norway, Peru, Philippines, Poland, Romania, Russian Federation, Saudi Arabia, Serbia, Slovak Republic, Slovenia, South Africa, Spain, Sweden, Switzerland, Tanzania, Turkiye, Uganda, United Kingdom

特征数量: 30
特征列表: MSW, MSW_log, economic_stage, gdp_ppp_2017_growth, gdp_ppp_2017_log, gdp_ppp_2017_momentum, gdp_ppp_2017_relative_change, gdp_ppp_2017_relative_position, gdp_ppp_2017_squared, gdp_ppp_per_capita_2017_growth, gdp_ppp_per_capita_2017_log, gdp_ppp_per_capita_2017_momentum, gdp_ppp_per_capita_2017_relative_change, gdp_ppp_per_capita_2017_relative_

In [3]:
# Write the three splits to disk before any columns are dropped.
for csv_name, frame_to_save in (
    ('train.csv', train_data),
    ('test.csv', test_data),
    ('time_test_data.csv', time_test_data),
):
    frame_to_save.to_csv(os.path.join(Config.PATH_CONFIG['data_dir'], csv_name), index=False)

target_column = Config.DATA_CONFIG['target_column']

# Keep every column except the raw target and the identifier/grouping columns.
feature_cols = [
    col for col in train_data.columns
    if col not in (target_column, 'Country Name', 'Year', 'Region', 'Income Group')
]

train = train_data[feature_cols]
test = test_data[feature_cols]
# NOTE(review): this rebinds `time_test_data` to the column-filtered frame, so
# re-running this cell writes the filtered frame to time_test_data.csv.
time_test_data = time_test_data[feature_cols]

# Name of the transformed target column: "<target>_<transform method>".
method = Config.FEATURE_CONFIG['target_transform_method']
transformed_column = f'{target_column}_{method}'


In [4]:
# Train the models; the combined test frame is passed as the validation data.
models = train_models(train=train, val=test, target_column=transformed_column)

Unnamed: 0,Description,Value
0,Session id,888
1,Target,MSW_log
2,Target type,Regression
3,Original data shape,"(1769, 32)"
4,Transformed data shape,"(1769, 32)"
5,Transformed train set shape,"(1399, 32)"
6,Transformed test set shape,"(370, 32)"
7,Numeric features,31
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.3975,0.2906,0.4997,0.8669,0.0296,0.0249,0.04
ada,AdaBoost Regressor,0.4176,0.2999,0.5079,0.8649,0.0299,0.0261,0.038
gbr,Gradient Boosting Regressor,0.4144,0.2904,0.5161,0.8553,0.0308,0.0262,0.138
lightgbm,Light Gradient Boosting Machine,0.4261,0.3108,0.5333,0.8543,0.0315,0.0268,0.112
xgboost,Extreme Gradient Boosting,0.4449,0.3207,0.5507,0.8372,0.0326,0.028,0.098
catboost,CatBoost Regressor,0.4382,0.3574,0.5616,0.8116,0.0341,0.0279,0.588
rf,Random Forest Regressor,0.4606,0.3375,0.5638,0.8331,0.0337,0.0292,0.068
dt,Decision Tree Regressor,0.4862,0.3974,0.6177,0.8077,0.037,0.0309,0.01
omp,Orthogonal Matching Pursuit,0.4993,0.5762,0.6316,0.7427,0.0386,0.0317,0.354
ridge,Ridge Regression,0.457,0.6045,0.6884,0.686,0.0399,0.0286,0.434


In [5]:
# Short names of the models that should be kept.
include_models = {'rf', 'xgboost', 'et', 'lightgbm', 'gbr', 'catboost'}

# Filtered mapping; entries keep the order they have in `models`.
model_dict = {
    short_name: models[short_name]
    for short_name in models
    if short_name in include_models
}

In [None]:
print('当前调优模型: ' + ", ".join(model_dict))

# Hyper-parameter tuning of the selected models.
# NOTE(review): nothing later in the visible notebook reads `tune_models`.
tune_models = tune(models=model_dict)

当前调优模型: et, gbr, lightgbm, xgboost, catboost, rf
当前调优模型: et , 结果如下:


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Ensembling.
# NOTE(review): this combines `model_dict` (the untuned selection), not the
# result of the tuning cell — confirm that is intended.
# NOTE(review): `models` is rebound here, so re-running the filtering cell
# afterwards would filter this result instead of the trained models.
models = ensemble(models=model_dict)

In [None]:
for model_name, model in models.items():
    print(f"\n{'-'*30}")
    print(f"正在分析模型: {model_name.upper()}")
    print(f"{'-'*30}")

    # SHAP interpretation (requires the shap package) plus feature importance.
    try:
        interpret_model(model)
        plot_model(model, plot = 'feature')
    except Exception as exc:
        # Best-effort analysis: move on to the next model, but report the
        # cause instead of hiding it. Catching `Exception` rather than using a
        # bare `except` lets KeyboardInterrupt/SystemExit stop the loop.
        print(f"{model_name} 不支持SHAP分析")
        print(f"  {type(exc).__name__}: {exc}")

In [None]:
# Persist the models. The result is intentionally not bound to `save_model`:
# that name is pycaret's own function (brought in by the star import), and
# assigning to it would replace that function for the rest of the session.
save_models(models)