In [None]:
#导入常用库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sns
# Let matplotlib render Chinese text: use the SimHei font, and turn off the
# Unicode minus sign (presumably because SimHei lacks that glyph — TODO confirm).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Warning handling
import warnings 
# warnings.filterwarnings('always') # show every warning
# NOTE(review): the blanket 'ignore' filter below silences every warning raised
# after this point, including deprecation / FutureWarning messages from libraries.
warnings.filterwarnings('ignore') # suppress every warning


def data_rename(data):
    '''
    Rename the dataset's English column headers to their Chinese equivalents.

    The current column names are printed first. The rename is applied in place,
    so the frame passed in is modified; columns that are not listed in the
    mapping keep their names.

    :param data: DataFrame whose columns should be renamed
    :return: the same DataFrame object, now carrying the renamed columns
    '''
    # Show the column names as they are before renaming
    current_columns = data.columns.tolist()
    print(current_columns)
    # English header -> Chinese header
    english_to_chinese = {
        'City Name': '城市名称',
        'Type': '类型',
        'Package': '包装',
        'Variety': '品种',
        'Sub Variety': '子品种',
        'Grade': '等级',
        'Date': '日期',
        'Low Price': '最低价格',
        'High Price': '最高价格',
        'Mostly Low': '主要最低价',
        'Mostly High': '主要最高价',
        'Origin': '产地',
        'Origin District': '产地区域',
        'Item Size': '物品尺寸',
        'Color': '颜色',
        'Environment': '环境',
        'Unit of Sale': '销售单位',
        'Quality': '质量',
        'Condition': '状况',
        'Appearance': '外观',
        'Storage': '储存',
        'Crop': '作物',
        'Repack': '重新包装',
        'Trans Mode': '运输模式',
        'Unnamed: 24': '未命名: 24',
        'Unnamed: 25': '未命名: 25',
    }
    data.rename(columns=english_to_chinese, inplace=True)
    return data
def date_chuli0(data):
    '''
    Expand the '日期' column into calendar parts and add the mean price.

    '日期' is parsed with the month/day/two-digit-year format and replaced by the
    columns '年份' (year), '月份' (month), '日' (day of month) and '星期'
    (weekday, Monday = 0). '均价' is the mean of '最低价格' and '最高价格'.
    The frame is modified in place.

    :param data: DataFrame with the Chinese column names '日期', '最低价格', '最高价格'
    :return: the same DataFrame object after the transformation
    '''
    parsed = pd.to_datetime(data['日期'], format='%m/%d/%y')
    calendar_parts = {
        "年份": parsed.dt.year,
        "月份": parsed.dt.month,
        "日": parsed.dt.day,
        "星期": parsed.dt.weekday,
    }
    for column_name, values in calendar_parts.items():
        data[column_name] = values
    # The raw date column is no longer needed once its parts are extracted
    del data['日期']
    data['均价'] = data['最低价格'].add(data['最高价格']).div(2)
    return data

def data_tezheng(data):
    """
    Impute, clean and de-outlier the dataset.

    Steps, in order: fill gaps with data_Hot_Deck_Imputation, drop the '重新包装'
    column, drop columns that are entirely empty, drop columns with fewer than
    20% non-null values, recompute '均价' as the mean of '最低价格' and '最高价格',
    drop rows that still contain a null, and remove rows whose '均价' lies
    outside the 1.5 * IQR fences.

    data: dataset with the Chinese column names
    return: the cleaned dataset; every surviving column is kept (the narrower
        feature selection is currently not applied)
    Author's caveat: without the date column, similar or identical records can
    repeat, which may leak data; the target may need de-duplication (delete or
    average).
    """
    data = data_Hot_Deck_Imputation(data)  # hot-deck imputation
    # Author's rationale: '重新包装' is almost entirely 'N' (only 5 'E' records,
    # which also miss other features), so the column is not used.
    data.drop(columns=['重新包装'], inplace=True)
    # Columns without a single value
    data = data.dropna(axis=1, how='all')
    # Columns with too few values: thresh is the minimum number of non-null
    # entries a column needs in order to be kept
    min_non_null = len(data)*0.2
    data = data.dropna(thresh=min_non_null, axis=1)
    data['均价'] = (data['最低价格'] + data['最高价格']) / 2
    # Rows that still hold a null after imputation
    data = data.dropna(axis=0, how='any')
    # Outlier removal on the mean price using the 1.5 * IQR rule
    mean_price = data['均价']
    lower_quartile = mean_price.quantile(0.25)
    upper_quartile = mean_price.quantile(0.75)
    spread = upper_quartile - lower_quartile
    too_low = mean_price < (lower_quartile - 1.5 * spread)
    too_high = mean_price > (upper_quartile + 1.5 * spread)
    return data[~(too_low | too_high)]

def data_Hot_Deck_Imputation(data):
    """
    Hot-deck imputation of the '产地' (origin) and '品种' (variety) columns.

    A missing value is replaced by the most frequent value found among records
    that share the same donor keys:

    1. '产地' from records with equal '城市名称', '包装', '物品尺寸' and '品种'.
    2. '品种' from records with equal '城市名称', '包装', '物品尺寸', '产地' and '最低价格'.
    3. '品种' again with the relaxed key set (without '最低价格').

    Null counts are printed before and after each step. Values for which no
    donor group has a mode remain null.

    :param data: DataFrame with the Chinese column names; modified in place
    :return: the same DataFrame object with the two columns filled
    """
    def most_common(values):
        # Compute the mode once; an all-null group has no mode and stays NaN.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    def fill_from_donors(target, keys):
        # Assign the filled column back rather than calling
        # data[target].fillna(..., inplace=True): that chained in-place form acts
        # on a temporary object under pandas Copy-on-Write and silently leaves
        # the frame unchanged.
        donor_values = data.groupby(keys)[target].transform(most_common)
        data[target] = data[target].fillna(donor_values)

    print("="*50,'热卡填补法-手动',"="*50)
    print('检查填充前空值情况：',data['产地'].isnull().sum())
    # Fill the origin from records that agree on city, package, size and variety
    fill_from_donors('产地', ['城市名称','包装','物品尺寸','品种'])
    print('检查填充后是否还有空值：',data['产地'].isnull().sum())
    print('检查填充前空值情况：',data['品种'].isnull().sum())
    # Fill the variety from records that agree on city, package, size, origin and low price
    fill_from_donors('品种', ['城市名称','包装','物品尺寸','产地','最低价格'])
    print('检查填充后是否还有空值：',data['品种'].isnull().sum())
    # Second pass for the variety with the price key dropped
    fill_from_donors('品种', ['城市名称','包装','物品尺寸','产地'])
    print('放宽填要求后是否还有空值：',data['品种'].isnull().sum())
    return data

def data_str_bianma():
    """
    Placeholder for encoding non-numeric / discrete labels (one-hot encoding).

    Not implemented yet: takes no arguments and returns None.
    """
    return None





In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory
data = pd.read_csv('../data/US-pumpkins.csv')



In [None]:

# Preprocess step by step: translate the column names, expand the date column,
# then run feature analysis / cleaning (imputation, null handling, outliers).
data = data_rename(data)
data = date_chuli0(data)
data = data_tezheng(data)
# Show the processed frame as the cell output
data
# TODO: save the processed dataset (no save step is implemented yet)

In [None]:
# Names of all columns whose dtype is 'object'
column_name_list = list(filter(lambda column: data[column].dtype == 'object', data.columns))
column_name_list

In [None]:
# !pip install category-encoders

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from category_encoders import TargetEncoder
import lightgbm as lgb
import xgboost as xgb

def encode_features(x, encoding_method='one_hot', categorical_features=None):
    """Build the (unfitted) ColumnTransformer that encodes the categorical columns.

    Both encoders tolerate categories that were not seen during fitting, which
    can happen inside cross-validation when a category occurs only in the
    held-out fold: one-hot encoding emits an all-zero row for it, ordinal
    encoding maps it to -1.

    :param x: feature table; not inspected here, kept for call compatibility
    :param encoding_method: 'one_hot' or 'ordinal'
    :param categorical_features: columns to encode; defaults to
        ['物品尺寸', '包装', '品种', '产地']
    :return: ColumnTransformer that encodes the categorical columns and passes
        every other column through unchanged
    :raises ValueError: if ``encoding_method`` is not supported
    """
    if categorical_features is None:
        categorical_features = ['物品尺寸', '包装', '品种', '产地']
    else:
        categorical_features = list(categorical_features)

    if encoding_method == 'one_hot':
        encoder = OneHotEncoder(handle_unknown='ignore')
    elif encoding_method == 'ordinal':
        # Without this, a category unseen at fit time raises during transform
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    else:
        raise ValueError("不支持的编码方法")

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', encoder, categorical_features)
        ],
        remainder='passthrough'
    )
    return preprocessor

def get_model(model_name, random_state=42):
    """Return a new, unfitted regressor for ``model_name``.

    Supported names: 'decision_tree', 'random_forest', 'linear_regression',
    'ridge', 'lasso', 'lgbm', 'xgboost' — the same names that have an entry in
    get_param_grid. Only the requested model is instantiated.

    :param model_name: key of the model to create
    :param random_state: seed passed to every model that accepts one
    :return: the regressor instance
    :raises ValueError: if ``model_name`` is not supported
    """
    # Factories instead of instances: building all models on every call is
    # wasted work, and the name check can run before any model is constructed.
    factories = {
        'decision_tree': lambda: DecisionTreeRegressor(random_state=random_state),  # regression tree
        'random_forest': lambda: RandomForestRegressor(random_state=random_state),  # random forest
        'linear_regression': lambda: LinearRegression(),  # ordinary least squares
        'ridge': lambda: Ridge(random_state=random_state),  # L2-regularised linear model
        'lasso': lambda: Lasso(random_state=random_state),  # L1-regularised linear model
        'lgbm': lambda: lgb.LGBMRegressor(random_state=random_state),  # LightGBM regressor
        'xgboost': lambda: xgb.XGBRegressor(random_state=random_state),  # XGBoost regressor
    }

    if model_name not in factories:
        raise ValueError(f"不支持的模型: {model_name}")
    return factories[model_name]()

def get_param_grid(model_name):
    """Return the hyper-parameter search grid for ``model_name``.

    Keys carry the 'regressor__' prefix so they address the pipeline step named
    'regressor'. An unknown model name yields an empty grid.
    """
    # Value lists shared by several models
    tree_counts = [50, 100, 200]
    learning_rates = [0.01, 0.05, 0.1]
    alphas = [0.01, 0.1, 1.0, 10.0, 100.0]
    row_fractions = [0.8, 1.0]

    # Grids keyed by the bare parameter name; the step prefix is added on return
    bare_grids = {
        'decision_tree': {
            'max_depth': [5, 10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        },
        'random_forest': {
            'n_estimators': tree_counts,
            'max_depth': [5, 10, 20, None],
            'max_features': ['sqrt', 'log2', None],
        },
        'linear_regression': {
            'fit_intercept': [True, False],
        },
        'ridge': {
            'alpha': alphas,
            'solver': ['auto', 'svd', 'cholesky'],
        },
        'lasso': {
            'alpha': alphas,
            'selection': ['cyclic', 'random'],
        },
        'lgbm': {
            'n_estimators': tree_counts,
            'learning_rate': learning_rates,
            'num_leaves': [31, 63, 127],
            'max_depth': [5, 10, 20, -1],
            'subsample': row_fractions,
        },
        'xgboost': {
            'n_estimators': tree_counts,
            'learning_rate': learning_rates,
            'max_depth': [3, 6, 9],
            'subsample': row_fractions,
            'colsample_bytree': [0.8, 1.0],
            'gamma': [0, 0.1, 0.2],
        },
    }

    selected = bare_grids.get(model_name, {})
    return {f'regressor__{option}': choices for option, choices in selected.items()}


In [None]:

def find_best_hyperparameters(x, y, preprocessor, model_name, n_splits=5):
    """Grid-search the hyper-parameters of ``model_name`` inside a pipeline.

    The pipeline is preprocessor -> regressor; for the linear models
    ('linear_regression', 'ridge', 'lasso') a StandardScaler(with_mean=False)
    step is placed between the two. Candidates are scored with negative mean
    squared error over ``n_splits`` folds.

    :return: tuple (best fitted pipeline, dict of the best parameters)
    """
    regressor = get_model(model_name)
    search_space = get_param_grid(model_name)

    # Assemble the steps first so the optional scaler lands before the regressor
    stages = [('preprocessor', preprocessor), ('regressor', regressor)]
    if model_name in ('linear_regression', 'ridge', 'lasso'):
        stages.insert(1, ('scaler', StandardScaler(with_mean=False)))
    pipeline = Pipeline(steps=stages)

    grid_search = GridSearchCV(
        pipeline,
        search_space,
        cv=n_splits,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search.fit(x, y)

    best_pipeline = grid_search.best_estimator_
    best_settings = grid_search.best_params_
    return best_pipeline, best_settings

def kfold_cross_validate(model, x, y, n_splits=5, random_state=42):
    """Run shuffled K-fold cross-validation and return per-fold metrics.

    A fresh clone of ``model`` is fitted on every fold, so the estimator passed
    in (for example an already fitted ``best_estimator_``) is not refitted on a
    fold subset as a side effect.

    :param model: scikit-learn estimator, typically a Pipeline whose final step
        is named 'regressor'
    :param x: feature table; rows are selected with ``.iloc``
    :param y: target values; selected with ``.iloc`` as well
    :param n_splits: number of folds
    :param random_state: seed for shuffling the folds
    :return: list with one dict per fold holding 'fold' (1-based), 'train_size',
        'test_size', 'mse', 'rmse', 'mae', 'r2' and 'model_params'
    """
    from sklearn.base import clone  # local import so this block brings its own dependency

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    results = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(x), start=1):
        x_train, x_test = x.iloc[train_idx], x.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        fold_model = clone(model)
        fold_model.fit(x_train, y_train)
        y_pred = fold_model.predict(x_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Report the hyper-parameters of the 'regressor' step; fall back to the
        # model itself when it is not a pipeline with such a step.
        regressor = fold_model.get_params().get('regressor', fold_model)

        results.append({
            'fold': fold,
            'train_size': len(train_idx),
            'test_size': len(test_idx),
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            # A per-fold snapshot of the constructor parameters. The regressor's
            # __dict__ would be a live reference that also exposes fitted and
            # private attributes.
            'model_params': dict(regressor.get_params(deep=False))
        })

    return results


In [None]:

def print_validation_results(results, model_name, encoding_method, best_params=None):
    """Print a cross-validation report: best parameters, per-fold metrics, and
    the mean ± standard deviation of every metric across the folds.

    :param results: list of per-fold dicts with the keys 'fold', 'train_size',
        'test_size', 'mse', 'rmse', 'mae', 'r2' and 'model_params'
    :param model_name: model key, printed upper-cased in the header
    :param encoding_method: encoding key, printed upper-cased in the header
    :param best_params: optional dict of tuned parameters; the 'regressor__'
        prefix is stripped from the names when printing
    """
    metrics = (("MSE", 'mse'), ("RMSE", 'rmse'), ("MAE", 'mae'), ("R²", 'r2'))

    print(f"\n=== {model_name.upper()} + {encoding_method.upper()} 验证结果 ===")

    if best_params:
        print("\n最佳超参数:")
        for full_name in best_params:
            short_name = full_name.replace('regressor__', '')
            print(f"  - {short_name}: {best_params[full_name]}")

    print("\n每折验证结果:")
    for entry in results:
        print(f"\n第 {entry['fold']} 折:")
        print(f"  训练集大小: {entry['train_size']}")
        print(f"  测试集大小: {entry['test_size']}")
        for label, key in metrics:
            print(f"  {label}: {entry[key]:.4f}")

        print("\n  当前模型参数:")
        for attribute, setting in entry['model_params'].items():
            # Skip fitted attributes (trailing underscore) and callables
            if attribute.endswith('_') or callable(setting):
                continue
            print(f"    - {attribute}: {setting}")

    # Gather every metric across the folds before printing the summary
    collected = {key: [entry[key] for entry in results] for _, key in metrics}

    print("\n总体性能:")
    for label, key in metrics:
        values = collected[key]
        print(f"  平均{label}: {np.mean(values):.4f} ± {np.std(values):.4f}")


# 管道使用和k折交叉验证的测试

In [None]:
from sklearn.datasets  import load_iris 
from sklearn.preprocessing  import StandardScaler, LabelEncoder 
from sklearn.model_selection  import KFold, cross_val_score 
from sklearn.pipeline  import Pipeline 
from sklearn.linear_model  import LogisticRegression 
from sklearn.compose  import ColumnTransformer 
 
# Load the iris dataset.
# NOTE: this rebinds the notebook-level name `data`, which earlier cells use
# for the pumpkin DataFrame.
data = load_iris()
X, y = data.data,  data.target  

# Build the preprocessing + model pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), [0, 1, 2, 3])  # standardize the four numeric columns
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=200))
])

# 5-fold cross-validation with shuffled, seeded folds
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='accuracy')

# Report the mean accuracy and its standard deviation across the folds
print(f"交叉验证平均准确率: {scores.mean():.2f}  ± {scores.std():.2f}") 

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 1. Build a synthetic dataset
# 1000 samples with numeric, nominal (unordered) and ordinal (ordered) features.
# The global NumPy seed is fixed, so the draws below are reproducible as long as
# their order is not changed.
np.random.seed(42)
n_samples = 1000

# Numeric features
age = np.random.normal(40, 10, n_samples)
income = np.random.normal(50000, 15000, n_samples)

# Nominal categorical features (treated as unordered)
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], size=n_samples)
city = np.random.choice(['New York', 'London', 'Tokyo', 'Paris'], size=n_samples)

# Ordinal categorical features (with an intrinsic order)
experience_level = np.random.choice(['Junior', 'Mid-Level', 'Senior', 'Executive'], 
                                   size=n_samples, p=[0.3, 0.4, 0.2, 0.1])
satisfaction = np.random.choice(['Very Low', 'Low', 'Medium', 'High', 'Very High'], 
                               size=n_samples)

# Binary target: 1 when the score
#   0.1 * age + [income > 55000] + [education == 'PhD'] + 2 * [experience_level == 'Executive']
# is greater than 2. `city` and `satisfaction` do not enter the target.
target = (age * 0.1 + (income > 55000).astype(int) + 
          np.where(education == 'PhD', 1, 0) + 
          np.where(experience_level == 'Executive', 2, 0) > 2).astype(int)

# Assemble the DataFrame (rebinds the notebook-level name `data`)
data = pd.DataFrame({
    'age': age,
    'income': income,
    'education': education,
    'city': city,
    'experience_level': experience_level,
    'satisfaction': satisfaction,
    'target': target
})


In [None]:
# 2. Split features and target
X = data.drop('target', axis=1)
y = data['target']

# Declare the type of every feature
numerical_features = ['age', 'income']  # numeric features
nominal_features = ['education', 'city']  # nominal (unordered) categorical features
ordinal_features = ['experience_level', 'satisfaction']  # ordinal categorical features

# 3. Category order of the ordinal features
# Passed to OrdinalEncoder so that the integer codes follow this order
experience_categories = ['Junior', 'Mid-Level', 'Senior', 'Executive']
satisfaction_categories = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# 4. Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric features: standardization
        ('num', StandardScaler(), numerical_features),
        
        # Nominal features: one-hot encoding; unknown categories are ignored
        ('nom', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), nominal_features),
        
        # Ordinal features: ordinal encoding; the categories list must be in
        # the same order as ordinal_features
        ('ord', OrdinalEncoder(
            categories=[experience_categories, satisfaction_categories]),
         ordinal_features)
    ],
    remainder='drop'  # drop any other column (there are none in this example)
)

# 5. Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # feature preprocessing
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))  # classifier
])

# 6. Evaluate with 5-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='accuracy')

print("交叉验证准确率:", scores)
print(f"平均准确率: {np.mean(scores):.4f} (±{np.std(scores):.4f})")

# 7. Inspect the feature names after preprocessing (optional)
# Fit once on the full data so the fitted encoders are available
pipeline.fit(X, y)

# Names of the one-hot encoded columns
onehot_columns = pipeline.named_steps['preprocessor'].named_transformers_['nom'].get_feature_names_out(nominal_features)

# All feature names, listed in the order the transformers are declared above
# (num, nom, ord)
all_features = (
    list(numerical_features) + 
    list(onehot_columns) + 
    ordinal_features
)

print("\n预处理后的特征名称:")
print(all_features)

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import joblib

# 1. Build the synthetic dataset (same generation code as the earlier
# synthetic-data cell; the fixed seed makes the draws reproducible)
np.random.seed(42)
n_samples = 1000

# Numeric features
age = np.random.normal(40, 10, n_samples)
income = np.random.normal(50000, 15000, n_samples)

# Nominal categorical features
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], size=n_samples)
city = np.random.choice(['New York', 'London', 'Tokyo', 'Paris'], size=n_samples)

# Ordinal categorical features
experience_level = np.random.choice(['Junior', 'Mid-Level', 'Senior', 'Executive'], 
                                   size=n_samples, p=[0.3, 0.4, 0.2, 0.1])
satisfaction = np.random.choice(['Very Low', 'Low', 'Medium', 'High', 'Very High'], 
                               size=n_samples)

# Binary target: 1 when
#   0.1 * age + [income > 55000] + [education == 'PhD'] + 2 * [experience_level == 'Executive']
# is greater than 2
target = (age * 0.1 + (income > 55000).astype(int) + 
          np.where(education == 'PhD', 1, 0) + 
          np.where(experience_level == 'Executive', 2, 0) > 2).astype(int)

# Assemble the DataFrame
data = pd.DataFrame({
    'age': age,
    'income': income,
    'education': education,
    'city': city,
    'experience_level': experience_level,
    'satisfaction': satisfaction,
    'target': target
})

# 2. Split features and target
X = data.drop('target', axis=1)
y = data['target']

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Declare the type of every feature
numerical_features = ['age', 'income']
nominal_features = ['education', 'city']  # nominal categorical features
ordinal_features = ['experience_level', 'satisfaction']  # ordinal categorical features

# 3. Category order of the ordinal features

experience_categories = ['Junior', 'Mid-Level', 'Senior', 'Executive']
satisfaction_categories = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
# Ordinal feature name -> its categories in ascending order
ordinal_features_zd={'experience_level':experience_categories, 'satisfaction':satisfaction_categories}

# 4. Container for the encoding mappings; it is filled in after the pipeline
# has been fitted, further down in this cell
encoding_mappings = {
    'nominal': {},
    'ordinal': {},
    'numerical': {}
}

# # 5. 创建列转换器
# preprocessor = ColumnTransformer(
#     transformers=[
#         # 数值特征：标准化
#         ('num', StandardScaler(), numerical_features),
        
#         # 名义分类特征：One-Hot编码
#         ('nom', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), nominal_features),
        
#         # 有序分类特征：Ordinal编码
#         ('ord', OrdinalEncoder(
#             categories=[ordinal_features_zd['experience_level'], ordinal_features_zd['satisfaction']]), 
#          ordinal_features)
#     ],
#     remainder='drop'
# )
def pipeline_preprocessor_ColumnTransformer(numerical_features, nominal_features, ordinal_features,ordinal_features_zd, data):
    """Build the (unfitted) ColumnTransformer for mixed feature types.

    Numeric columns are standardized, nominal columns are one-hot encoded
    (unknown categories are ignored) and ordinal columns are ordinal-encoded
    using the category order given in ``ordinal_features_zd``. Columns that are
    not listed are dropped.

    :param numerical_features: names of the numeric columns
    :param nominal_features: names of the unordered categorical columns
    :param ordinal_features: names of the ordered categorical columns
    :param ordinal_features_zd: dict mapping every ordinal feature name to its
        categories in ascending order
    :param data: not used; kept so existing calls keep working
    :return: the configured ColumnTransformer
    :raises KeyError: if an ordinal feature has no entry in ``ordinal_features_zd``
    """
    # Check the mapping up front so the error names every missing feature
    missing = [feature for feature in ordinal_features if feature not in ordinal_features_zd]
    if missing:
        raise KeyError(f"ordinal_features_zd has no category order for: {missing}")

    # The category lists must follow the order of ordinal_features
    ordered_categories = [list(ordinal_features_zd[feature]) for feature in ordinal_features]

    preprocessor = ColumnTransformer(
        transformers=[
            # Numeric features: standardization
            ('num', StandardScaler(), numerical_features),

            # Nominal features: one-hot encoding
            ('nom', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), nominal_features),

            # Ordinal features: ordinal encoding with an explicit category order
            ('ord', OrdinalEncoder(categories=ordered_categories), ordinal_features)
        ],
        remainder='drop'
    )
    return preprocessor
preprocessor=pipeline_preprocessor_ColumnTransformer(numerical_features, nominal_features, ordinal_features,ordinal_features_zd, data)

# 6. Full pipeline: preprocessing followed by the classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# 7. Fit the pipeline on the training split; the fitted encoders are read below
pipeline.fit(X_train, y_train)

# 8. Extract and store the mappings
# One-hot encoder: category -> position of the category within that feature's
# categories_ array
onehot_encoder = pipeline.named_steps['preprocessor'].named_transformers_['nom'] 
for i, feature in enumerate(nominal_features):
    categories = onehot_encoder.categories_[i]
    mapping = {category: idx for idx, category in enumerate(categories)}
    encoding_mappings['nominal'][feature] = mapping

# Ordinal encoder: category -> position within that feature's categories_ array
ordinal_encoder = pipeline.named_steps['preprocessor'].named_transformers_['ord']
for i, feature in enumerate(ordinal_features):
    categories = ordinal_encoder.categories_[i]
    mapping = {category: idx for idx, category in enumerate(categories)}
    encoding_mappings['ordinal'][feature] = mapping

# Standardization parameters of the numeric features
# ('std' holds the scaler's scale_ value for the column)
scaler = pipeline.named_steps['preprocessor'].named_transformers_['num']
for i, feature in enumerate(numerical_features):
    mapping = {
        'mean': scaler.mean_[i],
        'std': scaler.scale_[i]
    }
    encoding_mappings['numerical'][feature] = mapping

# 9. Print the mapping dictionary
print("="*50)
print("完整的编码映射字典:")
print("="*50)
print("名义特征 (One-Hot编码):")
for feature, mapping in encoding_mappings['nominal'].items():
    print(f"  {feature}:")
    for category, code in mapping.items():
        print(f"    {category} → {code}")

print("\n有序特征 (Ordinal编码):")
for feature, mapping in encoding_mappings['ordinal'].items():
    print(f"  {feature}:")
    for category, code in mapping.items():
        print(f"    {category} → {code}")

print("\n数值特征 (标准化参数):")
for feature, params in encoding_mappings['numerical'].items():
    print(f"  {feature}: 均值={params['mean']:.2f}, 标准差={params['std']:.2f}")

# 10. Evaluate with 5-fold cross-validation on the training split only
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')

print("\n" + "="*50)
print(f"交叉验证准确率: {np.mean(scores):.4f} (±{np.std(scores):.4f})")
print("="*50)

# 11. Score the pipeline fitted in step 7 on the held-out test split
test_score = pipeline.score(X_test, y_test)
print(f"测试集准确率: {test_score:.4f}")

# 12. Save the pipeline and the mapping dictionary (written to the current
# working directory)
joblib.dump(pipeline, 'model_pipeline.pkl')
joblib.dump(encoding_mappings, 'encoding_mappings.pkl')

print("\n管道和编码映射已保存为 'model_pipeline.pkl' 和 'encoding_mappings.pkl'")

完整的编码映射字典:
名义特征 (One-Hot编码):
  education:
    Bachelor → 0
    High School → 1
    Master → 2
    PhD → 3
  city:
    London → 0
    New York → 1
    Paris → 2
    Tokyo → 3

有序特征 (Ordinal编码):
  experience_level:
    Junior → 0
    Mid-Level → 1
    Senior → 2
    Executive → 3
  satisfaction:
    Very Low → 0
    Low → 1
    Medium → 2
    High → 3
    Very High → 4

数值特征 (标准化参数):
  age: 均值=40.25, 标准差=9.80
  income: 均值=51281.86, 标准差=14931.28

交叉验证准确率: 0.9900 (±0.0064)
测试集准确率: 1.0000

管道和编码映射已保存为 'model_pipeline.pkl' 和 'encoding_mappings.pkl'


In [None]:
# Predict with the pipeline and mapping dictionary saved by the previous cell.
# NOTE: joblib.load unpickles the file, so only load files you created yourself.
pipeline = joblib.load('model_pipeline.pkl')
encoding_mappings = joblib.load('encoding_mappings.pkl')

# 13. Predict for new data
# Example input: one row with the same raw columns the pipeline was trained on
new_data = pd.DataFrame({
    'age': [35],
    'income': [60000],
    'education': ['PhD'],
    'city': ['New York'],
    'experience_level': ['Executive'],
    'satisfaction': ['Very High']
})


# Manual encoding of the new data — WRONG, kept for reference only: the pipeline
# already performs the encoding, so the input must not be encoded by hand first.
# encoded_data = new_data.copy()
# for feature, mapping in encoding_mappings['nominal'].items():
#     encoded_data[feature] = encoded_data[feature].map(mapping)

# for feature, mapping in encoding_mappings['ordinal'].items():
#     encoded_data[feature] = encoded_data[feature].map(mapping)
# encoded_data[numerical_features] = encoded_data[numerical_features].apply(lambda x: (x - encoding_mappings['numerical'][x.name]['mean']) / encoding_mappings['numerical'][x.name]['std'])

# # Predicting on the hand-encoded frame — WRONG for the same reason
# prediction = pipeline.predict(encoded_data)
# print(f"预测结果: {prediction[0]}")
# The saved pipeline includes the preprocessing step, so the raw new_data is
# passed in directly.
prediction = pipeline.predict(new_data)
print(f"预测结果: {prediction[0]}")


In [1]:
!pip install xgboost --upgrade

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.5/124.9 MB 3.4 MB/s eta 0:00:38
    --------------------------------------- 2.4/124.9 MB 6.7 MB/s eta 0:00:19
   - -------------------------------------- 5.0/124.9 MB 9.2 MB/s eta 0:00:14
   -- ------------------------------------- 6.3/124.9 MB 8.4 MB/s eta 0:00:15
   -- ------------------------------------- 8.9/124.9 MB 9.4 MB/s eta 0:00:13
   --- ------------------------------------ 11.3/124.9 MB 9.8 MB/s eta 0:00:12
   ---- ----------------------------------- 13.6/124.9 MB 10.0 MB/s eta 0:00:12
   ----- ---------------------------------- 16.3/124.9 MB 10.3 MB/s eta 0:00:11
   ----- ---------------------------------- 18.6/124.9 MB 10.5 MB/s eta 0:00:11
 

In [3]:
!pip install graphviz



<function graphviz.backend.upstream_version.version() -> Tuple[int, ...]>