In [1]:
# -*- coding: utf-8 -*-
"""
二手车价格预测 - 高级特征工程与CatBoost建模
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import joblib
import datetime
import warnings
warnings.filterwarnings('ignore')

# 设置中文显示
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

def load_data():
    """
    加载原始数据
    """
    print("正在加载数据...")
    # 加载训练集
    train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
    # 加载测试集
    test_data = pd.read_csv('used_car_testB_20200421.csv', sep=' ')
    
    print(f"训练集形状: {train_data.shape}")
    print(f"测试集形状: {test_data.shape}")
    
    return train_data, test_data

def preprocess_data(train_data, test_data):
    """
    数据预处理
    """
    print("\n开始数据预处理...")
    
    # 合并训练集和测试集进行特征工程
    train_data['source'] = 'train'
    test_data['source'] = 'test'
    data = pd.concat([train_data, test_data], ignore_index=True)
    
    # 保存SaleID
    train_ids = train_data['SaleID']
    test_ids = test_data['SaleID']
    
    # 从训练集获取y值
    y = train_data['price']
    
    return data, y, train_ids, test_ids

# 加载数据
train_data, test_data = load_data()

# 预处理数据
data, y, train_ids, test_ids = preprocess_data(train_data, test_data)
data

正在加载数据...
训练集形状: (150000, 31)
测试集形状: (50000, 30)

开始数据预处理...


Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14,source
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762,train
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522,train
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.565330,-0.832687,-0.229963,train
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.110300,0.121964,0.033395,0.000000,-4.509599,1.285940,-0.501868,-2.438353,-0.478699,train
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.073205,0.091880,0.078819,0.121534,-1.896240,0.910783,0.931110,2.834518,1.923482,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,249995,111443,20041005,4.0,4,0.0,,1.0,150,15.0,...,0.000292,0.141804,0.076393,0.039272,2.072901,-2.531869,1.716978,-1.063437,0.326587,test
199996,249996,152834,20130409,65.0,1,0.0,0.0,0.0,179,4.0,...,0.000991,0.155868,0.108425,0.067841,1.358504,-3.290295,4.269809,0.140524,0.556221,test
199997,249997,132531,20041211,4.0,4,0.0,0.0,1.0,147,12.5,...,0.000318,0.141872,0.071968,0.042966,2.165658,-2.417885,1.370612,-1.073133,0.270602,test
199998,249998,143405,20020702,40.0,1,4.0,0.0,1.0,176,15.0,...,0.000023,0.067483,0.067526,0.009006,2.030114,-2.939244,0.569078,-1.718245,0.316379,test


In [2]:
def create_time_features(data):
    """
    创建时间特征
    """
    print("创建时间特征...")
    
    # 转换日期格式
    data['regDate'] = pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')
    data['creatDate'] = pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce')
    
    # 处理无效日期
    data.loc[data['regDate'].isnull(), 'regDate'] = pd.to_datetime('20160101', format='%Y%m%d')
    data.loc[data['creatDate'].isnull(), 'creatDate'] = pd.to_datetime('20160101', format='%Y%m%d')
    
    # 车辆年龄（天数）
    data['vehicle_age_days'] = (data['creatDate'] - data['regDate']).dt.days
    
    # 修复异常值
    data.loc[data['vehicle_age_days'] < 0, 'vehicle_age_days'] = 0
    
    # 车辆年龄（年）
    data['vehicle_age_years'] = data['vehicle_age_days'] / 365
    
    # 注册年份和月份
    data['reg_year'] = data['regDate'].dt.year
    data['reg_month'] = data['regDate'].dt.month
    data['reg_day'] = data['regDate'].dt.day
    
    # 创建年份和月份
    data['creat_year'] = data['creatDate'].dt.year
    data['creat_month'] = data['creatDate'].dt.month
    data['creat_day'] = data['creatDate'].dt.day
    
    # 是否为新车（使用年限<1年）
    data['is_new_car'] = (data['vehicle_age_years'] < 1).astype(int)
    
    # 季节特征
    data['reg_season'] = data['reg_month'].apply(lambda x: (x%12 + 3)//3)
    data['creat_season'] = data['creat_month'].apply(lambda x: (x%12 + 3)//3)
    
    # 每年行驶的公里数
    data['km_per_year'] = data['kilometer'] / (data['vehicle_age_years'] + 0.1)
    
    # 车龄分段
    data['age_segment'] = pd.cut(data['vehicle_age_years'], 
                                bins=[-0.01, 1, 3, 5, 10, 100], 
                                labels=['0-1年', '1-3年', '3-5年', '5-10年', '10年以上'])
    
    return data

# 创建时间特征
data = create_time_features(data)
data[['regDate', 'creatDate', 'reg_year', 'reg_month', 'reg_day', 'creat_year', 'creat_month', 'creat_day', 
      'vehicle_age_days', 'vehicle_age_years', 'is_new_car', 'reg_season', 'creat_season', 'km_per_year', 'age_segment']]

创建时间特征...


Unnamed: 0,regDate,creatDate,reg_year,reg_month,reg_day,creat_year,creat_month,creat_day,vehicle_age_days,vehicle_age_years,is_new_car,reg_season,creat_season,km_per_year,age_segment
0,2004-04-02,2016-04-04,2004,4,2,2016,4,4,4385,12.013699,0,2,2,1.031890,10年以上
1,2003-03-01,2016-03-09,2003,3,1,2016,3,9,4757,13.032877,0,2,2,1.142172,10年以上
2,2004-04-03,2016-04-02,2004,4,3,2016,4,2,4382,12.005479,0,2,2,1.032590,10年以上
3,1996-09-08,2016-03-12,1996,9,8,2016,3,12,7125,19.520548,0,4,2,0.764505,10年以上
4,2012-01-03,2016-03-13,2012,1,3,2016,3,13,1531,4.194521,0,1,2,1.164274,3-5年
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,2004-10-05,2016-03-09,2004,10,5,2016,3,9,4173,11.432877,0,4,2,1.300630,10年以上
199996,2013-04-09,2016-03-23,2013,4,9,2016,3,23,1079,2.956164,0,2,2,1.308830,1-3年
199997,2004-12-11,2016-03-16,2004,12,11,2016,3,16,4113,11.268493,0,1,2,1.099530,10年以上
199998,2002-07-02,2016-03-27,2002,7,2,2016,3,27,5017,13.745205,0,3,2,1.083408,10年以上


In [3]:
def create_car_features(data):
    """
    创建车辆特征
    """
    print("创建车辆特征...")
    
    # 缺失值处理
    numerical_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14']
    for feature in numerical_features:
        # 标记缺失值
        data[f'{feature}_missing'] = data[feature].isnull().astype(int)
        # 填充缺失值
        data[feature] = data[feature].fillna(data[feature].median())
    
    # 将model转换为数值型特征
    data['model_num'] = data['model'].astype('category').cat.codes
    #data['model_num'] = data['model'].astype('int') # 不能这么写，因为有一个为空缺值
    
    # 品牌与车型组合
    data['brand_model'] = data['brand'].astype(str) + '_' + data['model'].astype(str)
        
    # 相对年份特征
    current_year = datetime.datetime.now().year
    data['car_age_from_now'] = current_year - data['reg_year']
    
    # 处理异常值
    numerical_cols = ['power', 'kilometer', 'v_0']
    for col in numerical_cols:
        Q1 = data[col].quantile(0.05)
        Q3 = data[col].quantile(0.95)
        IQR = Q3 - Q1
        data[f'{col}_outlier'] = ((data[col] < (Q1 - 1.5 * IQR)) | (data[col] > (Q3 + 1.5 * IQR))).astype(int)
        data[col] = data[col].clip(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
    
    return data

# 创建车辆特征
data = create_car_features(data)
data

创建车辆特征...


Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_11_missing,v_12_missing,v_13_missing,v_14_missing,model_num,brand_model,car_age_from_now,power_outlier,kilometer_outlier,v_0_outlier
0,0,736,2004-04-02,30.0,6,1.0,0.0,0.0,60,12.5,...,0,0,0,0,30,6_30.0,21,0,0,0
1,1,2262,2003-03-01,40.0,1,2.0,0.0,0.0,0,15.0,...,0,0,0,0,40,1_40.0,22,0,0,0
2,2,14874,2004-04-03,115.0,15,1.0,0.0,0.0,163,12.5,...,0,0,0,0,115,15_115.0,21,0,0,0
3,3,71865,1996-09-08,109.0,10,0.0,0.0,1.0,193,15.0,...,0,0,0,0,109,10_109.0,29,0,0,0
4,4,111080,2012-01-03,110.0,5,1.0,0.0,0.0,68,5.0,...,0,0,0,0,110,5_110.0,13,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,249995,111443,2004-10-05,4.0,4,0.0,,1.0,150,15.0,...,0,0,0,0,4,4_4.0,21,0,0,0
199996,249996,152834,2013-04-09,65.0,1,0.0,0.0,0.0,179,4.0,...,0,0,0,0,65,1_65.0,12,0,0,0
199997,249997,132531,2004-12-11,4.0,4,0.0,0.0,1.0,147,12.5,...,0,0,0,0,4,4_4.0,21,0,0,0
199998,249998,143405,2002-07-02,40.0,1,4.0,0.0,1.0,176,15.0,...,0,0,0,0,40,1_40.0,23,0,0,0


In [4]:
def create_statistical_features(data, train_idx):
    """
    创建统计特征
    """
    print("创建统计特征...")
    
    # 仅使用训练集数据创建统计特征
    train_data = data.iloc[train_idx].reset_index(drop=True)
    
    # 品牌级别统计，使用训练集的price，因为测试集没有price
    brand_stats = train_data.groupby('brand').agg(
        brand_price_mean=('price', 'mean'),
        brand_price_median=('price', 'median'),
        brand_price_std=('price', 'std'),
        brand_price_count=('price', 'count')
    ).reset_index()
    

    # 合并统计特征
    data = data.merge(brand_stats, on='brand', how='left')
    
    # 相对价格特征（相对于平均价格）
    data['brand_price_ratio'] = data['brand_price_mean'] / data['brand_price_mean'].mean()
    
    return data

# 找回训练集的索引
train_idx = data[data['source'] == 'train'].index
test_idx = data[data['source'] == 'test'].index

# 创建统计特征
data = create_statistical_features(data, train_idx)
data

创建统计特征...


Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,brand_model,car_age_from_now,power_outlier,kilometer_outlier,v_0_outlier,brand_price_mean,brand_price_median,brand_price_std,brand_price_count,brand_price_ratio
0,0,736,2004-04-02,30.0,6,1.0,0.0,0.0,60,12.5,...,6_30.0,21,0,0,0,3611.840266,1800.0,4681.293524,10217,0.610000
1,1,2262,2003-03-01,40.0,1,2.0,0.0,0.0,0,15.0,...,1_40.0,22,0,0,0,9273.311947,6499.0,9369.631497,13794,1.566160
2,2,14874,2004-04-03,115.0,15,1.0,0.0,0.0,163,12.5,...,15_115.0,21,0,0,0,9858.582990,8500.0,5425.058140,1458,1.665006
3,3,71865,1996-09-08,109.0,10,0.0,0.0,1.0,193,15.0,...,10_109.0,29,0,0,0,8470.804197,5400.0,8988.307535,14249,1.430626
4,4,111080,2012-01-03,110.0,5,1.0,0.0,0.0,68,5.0,...,5_110.0,13,0,0,0,3306.349411,2300.0,3343.624586,4665,0.558406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,249995,111443,2004-10-05,4.0,4,0.0,,1.0,150,15.0,...,4_4.0,21,0,0,0,8538.788074,6000.0,8472.393191,16737,1.442107
199996,249996,152834,2013-04-09,65.0,1,0.0,0.0,0.0,179,4.0,...,1_65.0,12,0,0,0,9273.311947,6499.0,9369.631497,13794,1.566160
199997,249997,132531,2004-12-11,4.0,4,0.0,0.0,1.0,147,12.5,...,4_4.0,21,0,0,0,8538.788074,6000.0,8472.393191,16737,1.442107
199998,249998,143405,2002-07-02,40.0,1,4.0,0.0,1.0,176,15.0,...,1_40.0,23,0,0,0,9273.311947,6499.0,9369.631497,13794,1.566160


In [5]:
def encode_categorical_features(data):
    """
    编码分类特征
    """
    print("编码分类特征...")
    
    # 目标编码的替代方案 - 频率编码
    categorical_cols = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
    
    for col in categorical_cols:
        # 填充缺失值
        data[col] = data[col].fillna('未知')
        
        # 频率编码
        freq_encoding = data.groupby(col).size() / len(data)
        data[f'{col}_freq'] = data[col].map(freq_encoding)
    
    # 将分类变量转换为CatBoost可以识别的格式
    for col in categorical_cols:
        data[col] = data[col].astype('str')
    
    return data, categorical_cols

# 编码分类特征
data, categorical_cols = encode_categorical_features(data)
data

编码分类特征...


Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,brand_price_median,brand_price_std,brand_price_count,brand_price_ratio,model_freq,brand_freq,bodyType_freq,fuelType_freq,gearbox_freq,notRepairedDamage_freq
0,0,736,2004-04-02,30.0,6,1.0,0.0,0.0,60,12.5,...,1800.0,4681.293524,10217,0.610000,0.015675,0.068585,0.236160,0.610725,0.743770,0.742925
1,1,2262,2003-03-01,40.0,1,2.0,0.0,0.0,0,15.0,...,6499.0,9369.631497,13794,1.566160,0.030130,0.092105,0.201050,0.610725,0.743770,0.161965
2,2,14874,2004-04-03,115.0,15,1.0,0.0,0.0,163,12.5,...,8500.0,5425.058140,1458,1.665006,0.006210,0.009670,0.236160,0.610725,0.743770,0.742925
3,3,71865,1996-09-08,109.0,10,0.0,0.0,1.0,193,15.0,...,5400.0,8988.307535,14249,1.430626,0.002540,0.094810,0.275925,0.610725,0.216485,0.742925
4,4,111080,2012-01-03,110.0,5,1.0,0.0,0.0,68,5.0,...,2300.0,3343.624586,4665,0.558406,0.003610,0.030750,0.236160,0.610725,0.743770,0.742925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,249995,111443,2004-10-05,4.0,4,0.0,未知,1.0,150,15.0,...,6000.0,8472.393191,16737,1.442107,0.056175,0.111345,0.275925,0.058020,0.216485,0.161965
199996,249996,152834,2013-04-09,65.0,1,0.0,0.0,0.0,179,4.0,...,6499.0,9369.631497,13794,1.566160,0.018240,0.092105,0.275925,0.610725,0.743770,0.742925
199997,249997,132531,2004-12-11,4.0,4,0.0,0.0,1.0,147,12.5,...,6000.0,8472.393191,16737,1.442107,0.056175,0.111345,0.275925,0.610725,0.216485,0.742925
199998,249998,143405,2002-07-02,40.0,1,4.0,0.0,1.0,176,15.0,...,6499.0,9369.631497,13794,1.566160,0.030130,0.092105,0.064335,0.610725,0.216485,0.742925


In [6]:
categorical_cols

['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']

In [7]:
def feature_selection(data, categorical_cols):
    """
    特征选择和最终数据准备
    """
    print("特征选择和最终数据准备...")
    
    # 删除不再需要的列, 所有车offerType=0,seller只有1个为1，其他都为0
    drop_cols = ['regDate', 'creatDate', 'price', 'SaleID', 'name', 'offerType', 'seller', 'source']
    data = data.drop(drop_cols, axis=1, errors='ignore')
    
    # 确保所有分类特征都被正确标记
    # 添加age_segment到分类特征列表中
    if 'age_segment' not in categorical_cols and 'age_segment' in data.columns:
        categorical_cols.append('age_segment')
    
    # 确保brand_model也被标记为分类特征
    if 'brand_model' not in categorical_cols and 'brand_model' in data.columns:
        categorical_cols.append('brand_model')
    
    # 转换分类特征
    for col in categorical_cols:
        if col in data.columns:
            data[col] = data[col].astype('category')
    
    return data, categorical_cols

# 特征选择和最终准备
data, cat_features = feature_selection(data, categorical_cols)
data

特征选择和最终数据准备...


Unnamed: 0,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,v_0,...,brand_price_median,brand_price_std,brand_price_count,brand_price_ratio,model_freq,brand_freq,bodyType_freq,fuelType_freq,gearbox_freq,notRepairedDamage_freq
0,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,43.357796,...,1800.0,4681.293524,10217,0.610000,0.015675,0.068585,0.236160,0.610725,0.743770,0.742925
1,40.0,1,2.0,0.0,0.0,0,15.0,-,4366,45.305273,...,6499.0,9369.631497,13794,1.566160,0.030130,0.092105,0.201050,0.610725,0.743770,0.161965
2,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,45.978359,...,8500.0,5425.058140,1458,1.665006,0.006210,0.009670,0.236160,0.610725,0.743770,0.742925
3,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,45.687478,...,5400.0,8988.307535,14249,1.430626,0.002540,0.094810,0.275925,0.610725,0.216485,0.742925
4,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,44.383511,...,2300.0,3343.624586,4665,0.558406,0.003610,0.030750,0.236160,0.610725,0.743770,0.742925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,4.0,4,0.0,未知,1.0,150,15.0,-,5564,46.321013,...,6000.0,8472.393191,16737,1.442107,0.056175,0.111345,0.275925,0.058020,0.216485,0.161965
199996,65.0,1,0.0,0.0,0.0,179,4.0,0.0,5220,48.086547,...,6499.0,9369.631497,13794,1.566160,0.018240,0.092105,0.275925,0.610725,0.743770,0.742925
199997,4.0,4,0.0,0.0,1.0,147,12.5,0.0,3795,46.145279,...,6000.0,8472.393191,16737,1.442107,0.056175,0.111345,0.275925,0.610725,0.216485,0.742925
199998,40.0,1,4.0,0.0,1.0,176,15.0,0.0,61,45.507088,...,6499.0,9369.631497,13794,1.566160,0.030130,0.092105,0.064335,0.610725,0.216485,0.742925


In [11]:
#data[['offerType', 'seller']]
#data['offerType'].value_counts()
#data['seller'].value_counts()

seller
0    199999
1         1
Name: count, dtype: int64

In [8]:
# 分离训练集和测试集
X_train_full = data.iloc[train_idx].reset_index(drop=True)
X_test = data.iloc[test_idx].reset_index(drop=True)

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y, test_size=0.1, random_state=42
)

# 保存处理后的数据
joblib.dump(X_train, 'processed_data/fe_X_train.joblib')
joblib.dump(X_val, 'processed_data/fe_X_val.joblib')
joblib.dump(y_train, 'processed_data/fe_y_train.joblib')
joblib.dump(y_val, 'processed_data/fe_y_val.joblib')
joblib.dump(X_test, 'processed_data/fe_test_data.joblib')
joblib.dump(test_ids, 'processed_data/fe_sale_ids.joblib')
joblib.dump(cat_features, 'processed_data/fe_cat_features.joblib')

print("预处理后的数据已保存")

预处理后的数据已保存


In [9]:
def train_catboost_model(X_train, X_val, y_train, y_val, cat_features):
    """
    训练CatBoost模型
    """
    print("\n开始训练CatBoost模型...")
    
    # 创建数据池
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)
    
    # 设置模型参数
    params = {
        'iterations': 1000,
        'learning_rate': 0.05,
        'depth': 6,
        'l2_leaf_reg': 3,
        'bootstrap_type': 'Bayesian',
        'random_seed': 42,
        'od_type': 'Iter',
        'od_wait': 100,
        'verbose': 100,
        'loss_function': 'MAE',
        'eval_metric': 'MAE',
        'task_type': 'CPU',
        'thread_count': -1
    }
    
    # 创建模型
    model = CatBoostRegressor(**params)
    
    # 训练模型
    model.fit(
        train_pool,
        eval_set=val_pool,
        use_best_model=True,
        plot=True
    )
    
    # 保存模型
    model.save_model('processed_data/fe_catboost_model.cbm')
    print("模型已保存到 processed_data/fe_catboost_model.cbm")
    
    return model

# 训练CatBoost模型
model = train_catboost_model(X_train, X_val, y_train, y_val, cat_features)


开始训练CatBoost模型...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 4308.8010950	test: 4274.9761098	best: 4274.9761098 (0)	total: 335ms	remaining: 5m 34s
100:	learn: 954.7043219	test: 946.0559913	best: 946.0559913 (100)	total: 21s	remaining: 3m 7s
200:	learn: 797.4682816	test: 801.6376459	best: 801.6376459 (200)	total: 40.4s	remaining: 2m 40s
300:	learn: 739.8078409	test: 747.9047529	best: 747.9047529 (300)	total: 59.5s	remaining: 2m 18s
400:	learn: 700.3474033	test: 712.6415834	best: 712.6415834 (400)	total: 1m 18s	remaining: 1m 56s
500:	learn: 672.1327263	test: 687.3738583	best: 687.3738583 (500)	total: 1m 37s	remaining: 1m 36s
600:	learn: 647.4870590	test: 664.3432890	best: 664.3432890 (600)	total: 1m 55s	remaining: 1m 16s
700:	learn: 631.2893105	test: 649.8212841	best: 649.8212841 (700)	total: 2m 15s	remaining: 57.6s
800:	learn: 615.2598938	test: 634.8617243	best: 634.8617243 (800)	total: 2m 35s	remaining: 38.6s
900:	learn: 601.0233403	test: 622.3573753	best: 622.3573753 (900)	total: 2m 55s	remaining: 19.3s
999:	learn: 590.7303201	test: 6

In [10]:
def evaluate_model(model, X_val, y_val, cat_features):
    """
    评估模型性能
    """
    # 创建验证数据池
    val_pool = Pool(X_val, cat_features=cat_features)
    
    # 预测
    y_pred = model.predict(val_pool)
    
    # 计算评估指标
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    print("\n模型评估结果：")
    print(f"均方根误差 (RMSE): {rmse:.2f}")
    print(f"平均绝对误差 (MAE): {mae:.2f}")
    print(f"R2分数: {r2:.4f}")
    
    # 绘制预测值与实际值的对比图
    plt.figure(figsize=(10, 6))
    plt.scatter(y_val, y_pred, alpha=0.5)
    plt.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'r--', lw=2)
    plt.xlabel('实际价格')
    plt.ylabel('预测价格')
    plt.title('CatBoost预测价格 vs 实际价格')
    plt.tight_layout()
    plt.savefig('fe_catboost_prediction_vs_actual.png')
    plt.close()
    
    return rmse, mae, r2

# 评估模型
rmse, mae, r2 = evaluate_model(model, X_val, y_val, cat_features)


模型评估结果：
均方根误差 (RMSE): 1475.95
平均绝对误差 (MAE): 613.66
R2分数: 0.9598


In [11]:
def plot_feature_importance(model, X_train):
    """
    绘制特征重要性图
    """
    # 获取特征重要性
    feature_importance = model.get_feature_importance()
    feature_names = X_train.columns
    
    # 创建特征重要性DataFrame
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance
    })
    importance_df = importance_df.sort_values('importance', ascending=False)
    
    # 保存特征重要性到CSV
    importance_df.to_csv('fe_catboost_feature_importance.csv', index=False)
    
    # 绘制特征重要性图
    plt.figure(figsize=(14, 8))
    sns.barplot(x='importance', y='feature', data=importance_df.head(20))
    plt.title('CatBoost Top 20 特征重要性')
    plt.tight_layout()
    plt.savefig('fe_catboost_feature_importance.png')
    plt.close()
    
    return importance_df
    
# 绘制特征重要性
importance_df = plot_feature_importance(model, X_train)

In [12]:
def predict_test_data(model, X_test, test_ids, cat_features):
    """
    预测测试集数据
    """
    print("\n正在预测测试集...")
    
    # 创建测试数据池
    test_pool = Pool(X_test, cat_features=cat_features)
    
    # 预测
    predictions = model.predict(test_pool)
    
    # 创建提交文件
    submit_data = pd.DataFrame({
        'SaleID': test_ids,
        'price': predictions
    })
    
    # 保存预测结果
    submit_data.to_csv('fe_catboost_submit_result.csv', index=False)
    print("预测结果已保存到 fe_catboost_submit_result.csv")

# 预测测试集
predict_test_data(model, X_test, test_ids, cat_features)

print("\n模型训练、评估和预测完成！")
print(f"Top 10 重要特征:\n{importance_df.head(10)}")


正在预测测试集...
预测结果已保存到 fe_catboost_submit_result.csv

模型训练、评估和预测完成！
Top 10 重要特征:
        feature  importance
12          v_3   30.396318
9           v_0   19.265201
21         v_12   15.101707
5         power    5.019121
17          v_8    3.296565
18          v_9    3.107797
36  age_segment    2.891549
15          v_6    2.842439
19         v_10    2.609743
6     kilometer    2.175525


In [37]:

# X_test['brand_model'].isnull().sum()
# cat_features
# X_test['vehicle_age_years'].describe()
# #X_test['vehicle_age_years'].isnull().sum()
# data.loc[data['age_segment'].isnull(), 'vehicle_age_years']


155610    0.0
Name: vehicle_age_years, dtype: float64

In [12]:
# # 车龄分段
# data['age_segment'] = pd.cut(data['vehicle_age_years'], 
#                             bins=[-0.01, 1, 3, 5, 10, 100], 
#                             labels=['0-1年', '1-3年', '3-5年', '5-10年', '10年以上'])

In [13]:
cat_features

['model',
 'brand',
 'bodyType',
 'fuelType',
 'gearbox',
 'notRepairedDamage',
 'age_segment',
 'brand_model']