In [1]:
import sys
from pathlib import Path

# Add the project root (two levels above the current working directory) to the
# Python import path so the `src.*` imports below resolve (Jupyter only).
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry again.
project_root = str(Path.cwd().parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

from src.models.model_evaluator import ModelEvaluator
import os
import pandas as pd
from pycaret.regression import *
from src.config.config import Config
from src.data.data_loader import DataLoader
from src.features.feature_engineering import FeatureEngineering
from src.models.model_evaluator import ModelEvaluator

In [3]:
# Initialise the pipeline components.
data_cfg = Config.DATA_CONFIG
data_path = data_cfg['data_path']
data_loader = DataLoader(data_path)
feature_engineering = FeatureEngineering()
model_evaluator = ModelEvaluator()

# Load the 'msw_result' sheet using the configured target and feature columns.
df = data_loader.load_data(
    sheet_name='msw_result',
    target_column=data_cfg['target_column'],
    feature_columns=data_cfg['feature_columns'],
)

# Split the dataset in two stages: first by country, then the remaining
# (non-held-out) portion by time. The intermediate result gets its own name so
# `train_data` is bound exactly once, to the final training split.
non_holdout_data, country_test_data = data_loader.split_data_by_countries(
    df,
    test_size=data_cfg['test_size'],
    random_state=data_cfg['random_state'],
)
train_data, time_test_data = data_loader.split_data_by_time(non_holdout_data)


# Save the three splits as CSV files (without the index) in the configured
# data directory. The directory is created first, because DataFrame.to_csv
# does not create missing parent directories and would otherwise fail on a
# fresh checkout.
data_dir = Config.PATH_CONFIG['data_dir']
Path(data_dir).mkdir(parents=True, exist_ok=True)
for csv_name, split_frame in (
    ('train_data.csv', train_data),
    ('country_test_data.csv', country_test_data),
    ('time_test_data.csv', time_test_data),
):
    split_frame.to_csv(os.path.join(data_dir, csv_name), index=False)

# Data analysis: print a heading for each split, then let the loader report
# its statistics (training set, country test set, time test set, in that order).
for heading, dataset in (
    ('\n训练集统计信息:', train_data),
    ('\n国家测试集统计信息:', country_test_data),
    ('\n时间测试集统计信息:', time_test_data),
):
    print(heading)
    data_loader.analyze_datasets(dataset)

# Feature engineering: fit on the training split and transform it. The call
# returns the processed frame together with the target column name to model.
train_data_processed, target_column = feature_engineering.fit_transform(
    train_data,
    target_column=Config.DATA_CONFIG['target_column'],
    categorical_columns=Config.FEATURE_CONFIG['categorical_columns'],
)

# Set up the experiment. Only the configured raw target and the two identifier
# columns are dropped from the frame handed to the evaluator.
# NOTE(review): the `target_column` returned above (MSW_log in the recorded
# run) is presumably a different column from the configured one and therefore
# stays in `feature_cols`, which setup needs in order to find it - confirm.
excluded_columns = {Config.DATA_CONFIG['target_column'], 'Country Name', 'Year'}
feature_cols = [
    column
    for column in train_data_processed.columns
    if column not in excluded_columns
]

model_evaluator.setup_experiment(
    train_size=Config.MODEL_CONFIG['train_size'],
    train_data=train_data_processed[feature_cols],
    target_column=target_column,
)





训练集统计信息:

数据集统计信息:
总数据条数: 1476
国家总数: 59
包含的国家: Albania, Argentina, Australia, Austria, Bangladesh, Belgium, Bosnia and Herzegovina, Bulgaria, Canada, China, Colombia, Congo, Dem. Rep., Croatia, Cyprus, Denmark, Egypt, Arab Rep., Ethiopia, Finland, France, Germany, Greece, Hungary, Iceland, India, Indonesia, Iran, Islamic Rep., Iraq, Ireland, Italy, Japan, Korea, Rep., Latvia, Lithuania, Luxembourg, Malta, Mexico, Montenegro, Morocco, Netherlands, Nigeria, North Macedonia, Norway, Pakistan, Peru, Poland, Portugal, Romania, Russian Federation, Saudi Arabia, Slovak Republic, Spain, Sweden, Tanzania, Thailand, Turkiye, Uganda, Ukraine, United Kingdom, United States

国家测试集统计信息:

数据集统计信息:
总数据条数: 283
国家总数: 10
包含的国家: Algeria, Brazil, Czechia, Estonia, Malaysia, Philippines, Serbia, Slovenia, South Africa, Switzerland

时间测试集统计信息:

数据集统计信息:
总数据条数: 223
国家总数: 59
包含的国家: Albania, Argentina, Australia, Austria, Bangladesh, Belgium, Bosnia and Herzegovina, Bulgaria, Canada, China, Colombia, Congo, De

Unnamed: 0,Description,Value
0,Session id,123
1,Target,MSW_log
2,Target type,Regression
3,Original data shape,"(1476, 23)"
4,Transformed data shape,"(1476, 32)"
5,Transformed train set shape,"(1180, 32)"
6,Transformed test set shape,"(296, 32)"
7,Numeric features,20
8,Categorical features,2
9,Preprocess,True


读取models目录模型文件： ['lightgbm', 'gbr', 'rf', 'xgboost']
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\_parallel_backends.py", line 273, in _wrap_func_call
    return func()
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\utils\parallel.py", line 123, in __call__
    return self.function(*args, **kwargs)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\ensemble\_base.py", line 46, in _fit_single_estimator
    estimator.fit(X, y)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\pycaret\internal\pipeline.py", line 270, in fit
    X, y, _ = self._fit(X, y, **fit_params_steps)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\pycaret\internal\pipeline.py", line 246, in _fit
    fitted_transformer = self._memory_fit(
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\pycaret\internal\pipeline.py", line 68, in _fit_one
    transformer.fit(*args, **fit_params)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\pycaret\internal\preprocess\transformers.py", line 229, in fit
    self.transformer.fit(*args, **fit_params)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\impute\_base.py", line 390, in fit
    X = self._validate_input(X, in_fit=True)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\impute\_base.py", line 344, in _validate_input
    raise ve
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\impute\_base.py", line 327, in _validate_input
    X = self._validate_data(
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\base.py", line 565, in _validate_data
    X = check_array(X, input_name="X", **check_params)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\utils\validation.py", line 778, in check_array
    dtype_orig = np.result_type(*dtypes_orig)
  File "<__array_function__ internals>", line 200, in result_type
ValueError: at least one array or dtype is required
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\pycaret\internal\pipeline.py", line 275, in fit
    fitted_estimator = self._memory_fit(
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\pycaret\internal\pipeline.py", line 68, in _fit_one
    transformer.fit(*args, **fit_params)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\ensemble\_voting.py", line 598, in fit
    return super().fit(X, y, sample_weight)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\ensemble\_voting.py", line 81, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\sklearn\utils\parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\parallel.py", line 1952, in __call__
    return output if self.return_generator else list(output)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\parallel.py", line 1595, in _get_outputs
    yield from self._retrieve()
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\parallel.py", line 1699, in _retrieve
    self._raise_error_fast()
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\parallel.py", line 1734, in _raise_error_fast
    error_job.get_result(self.timeout)
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\parallel.py", line 736, in get_result
    return self._return_or_raise()
  File "d:\Anaconda\envs\pycaret3.0\lib\site-packages\joblib\parallel.py", line 754, in _return_or_raise
    raise self._result
ValueError: at least one array or dtype is required


In [None]:
# Blend the four base regressors with PyCaret.
# NOTE(review): lightgbm, gbr, rf and xgboost are not assigned in any visible
# cell; they must already exist in the kernel for this cell to run - confirm
# where they are created.
base_estimators = [lightgbm, gbr, rf, xgboost]
blend_models(estimator_list=base_estimators)

In [None]:

# Build one ensemble per method, printing a banner before each run.
# The ModelEvaluator instance created during initialisation is named
# `model_evaluator`; the previous reference to `evaluator` is not defined in
# any visible cell and raises NameError on a fresh kernel.
# NOTE(review): create_ensemble is assumed to be a ModelEvaluator method - confirm.
for method in ('blend', 'stack'):
    print(f"\n{'='*30} 测试 {method.upper()} 集成 {'='*30}")
    model_evaluator.create_ensemble(ensemble_method=method)