In [1]:
# Code that might generate warnings
# %%
import warnings
import datetime
import os
import math
import time
import pandas as pd
import numpy as np
import pickle
import progressbar
import joblib
from joblib import Parallel, delayed
import multiprocessing
from multiprocessing import Process, Pool, Manager

import itertools
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from scipy.stats import skew, kurtosis
from numpy.lib.stride_tricks import as_strided as stride
from geopy.distance import geodesic
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from math import sqrt
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression
from scipy.stats import mode
from statsmodels.tools.tools import add_constant
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor

warnings.filterwarnings("ignore")  # Ignore all warnings

# Trajectories differ in length. Inside each trajectory a sliding window of length L1
# (the decision window) moved with stride L2 cuts it into sub-trajectories, and features
# are extracted per sub-trajectory.
# Models are trained on sub-trajectories; post-processing can then be done per trajectory.
L1 = 5  # s; sliding-window length, provisionally based on the 5 s data supplied directly in the 2020 edition
L2 = 1  # sliding-window stride; assumes no change within 1 s (mainly because GPS data is recorded every 1 s)

# locs = ['Hand', 'Bag', 'Hips', 'Torso']

# loc = locs[0]  # 'Hand'
# dataset = 'train'

# Maps the integer class ids 1..8 to their label names (ids follow list order).
label_map = dict(enumerate(
    ['Still', 'Walking', 'Run', 'Bike', 'Car', 'Bus', 'Train', 'Subway'],
    start=1,
))

# Pickle paths of the SHL2023 data.
#   filenames['train' | 'valid'][position][sensor] -> path, plus a per-split 'Label' path
#   filenames['test'][sensor] -> path (no position level), plus 'Label'
# Per the original note, 'Location_new' is the data after label matching, with timestamps
# taken from the 1 Hz label data.
_SHL_ROOT = '/DATA2/lvxiaoling/limengyuan/SHL2023'
_ALL_SENSORS = ('Location', 'Location_new', 'Mag', 'Gyr', 'Acc', 'GPS', 'GPS_new')
# valid/Hand is the one entry listed without the '*_new' files.
_RAW_SENSORS = ('Location', 'Mag', 'Gyr', 'Acc', 'GPS')


def _sensor_paths(folder, sensors):
    """Return {name: '<folder>/<name>.pkl'} for the given names, keeping their order."""
    return {name: f'{folder}/{name}.pkl' for name in sensors}


def _build_filenames(root=_SHL_ROOT):
    """Assemble the nested split -> position -> sensor path table."""
    table = {}
    for split in ('train', 'valid'):
        per_split = {}
        for position in ('Hand', 'Bag', 'Hips', 'Torso'):
            reduced = (split, position) == ('valid', 'Hand')
            sensors = _RAW_SENSORS if reduced else _ALL_SENSORS
            per_split[position] = _sensor_paths(f'{root}/{split}/{position}', sensors)
        per_split['Label'] = f'{root}/{split}/Label.pkl'
        table[split] = per_split
    # The test split is flat: sensor files and the label file share one folder.
    table['test'] = _sensor_paths(f'{root}/test', _ALL_SENSORS + ('Label',))
    return table


filenames = _build_filenames()



# 数据读取

假设前面的特征工程部分已经处理完成，接下来是完成模型部分
前面已经得到了一个名为 results 的 DataFrame，现在需要进行 XGBoost 建模

In [2]:
# Show the kernel's current working directory.
os.getcwd()

'/DATA2/lvxiaoling/limengyuan/SHL2023/lmy/newwww'

In [3]:
def fill_NA(dataset):
    """Drop the bookkeeping columns and replace missing values with 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'idx', 'timestamp', 'trajectory_id',
        'label' and 'label_idx'; all five are removed from the result.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns with every missing value
        replaced by 0. The input frame is left untouched.

    Raises
    ------
    KeyError
        If any of the five columns listed above is absent.
    """
    # Drop first, so fillna only processes columns that are actually kept.
    kept = dataset.drop(columns=['idx', 'timestamp', 'trajectory_id', 'label', 'label_idx'])
    # Replace missing values with 0.
    return kept.fillna(0)


由于担心数据泄露的风险，所以这次不混合训练，看看对测试集的预测是否发生改变

In [4]:
def _load_position_frame(split, position, position_id):
    """Read the pickled feature frame of one split/position and tag it.

    Every row gets a 'new_label' column set to ``position_id``
    (Hand=0, Bag=1, Hips=2, Torso=3 in the calls below).
    """
    frame = pd.read_pickle(f'/DATA2/lvxiaoling/limengyuan/SHL2023/{split}/{position}/data.pkl')
    frame['new_label'] = position_id
    return frame


# Read the pickled per-position data and add the new_label column.
train_hand = _load_position_frame('train', 'Hand', 0)
valid_hand = _load_position_frame('valid', 'Hand', 0)
train_bag = _load_position_frame('train', 'Bag', 1)
valid_bag = _load_position_frame('valid', 'Bag', 1)
train_hips = _load_position_frame('train', 'Hips', 2)
valid_hips = _load_position_frame('valid', 'Hips', 2)
train_torso = _load_position_frame('train', 'Torso', 3)
valid_torso = _load_position_frame('valid', 'Torso', 3)


In [5]:
# Load the test set
test = pd.read_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/test/data.pkl')


由于valid和test数据来源都是user2和user3，所以这里仅用valid数据集来训练

In [6]:
# Clean the test frame with fill_NA (drops the bookkeeping columns, fills missing values with 0),
# then display it.
new_test = fill_NA(test)
new_test


Unnamed: 0,acc_x_mean,acc_y_mean,acc_z_mean,acc_total_mean,gyr_x_mean,gyr_y_mean,gyr_z_mean,gyr_total_mean,mag_x_mean,mag_y_mean,...,raliways_class,transport_class,traffic_class,landuse_class,roads_class,roads_code,reception_nan_ratio,reception_num,snr_mean,snr_max
0,2.305693,5.297033,7.167037,10.072129,0.062614,0.034294,-0.411274,1.533064,-14.714246,-32.917247,...,0,0,0,7,6,1,0.0,14.0,17.706374,25.0
1,1.806154,5.168713,7.244356,9.975430,0.080179,0.100112,-0.410911,1.485642,-9.960925,-31.251203,...,0,0,0,7,6,1,0.0,13.6,17.687912,25.0
2,1.332167,4.961737,7.439351,9.923214,-0.028255,-0.000603,-0.575047,1.316906,-6.205048,-25.854493,...,0,0,0,7,6,1,0.0,13.4,17.517582,25.0
3,0.360465,4.546942,8.489115,9.835957,-0.063320,-0.263411,-0.518863,0.998637,2.191515,-22.126724,...,0,0,0,7,6,1,0.0,13.2,17.298901,24.0
4,0.300997,4.487032,8.429255,9.746678,-0.013252,-0.203949,-0.427161,0.954324,8.890495,-20.887286,...,0,0,0,7,6,1,0.0,13.0,17.046154,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462265,-0.934177,2.883688,9.102895,9.835033,-0.027602,0.015833,0.079736,0.781714,10.180395,9.719222,...,0,0,2,0,3,2,0.0,13.8,29.886813,43.0
462266,-0.814674,2.672222,9.045871,9.710407,0.016240,0.013925,0.024624,0.764337,10.808655,8.886239,...,0,0,2,0,3,2,0.0,13.6,30.115385,43.0
462267,-0.634535,2.968848,9.012172,9.791114,-0.010741,-0.004730,0.028668,0.800318,9.688708,6.887571,...,0,0,2,0,3,2,0.0,13.4,30.370330,43.0
462268,-0.493971,3.080875,9.088617,9.901386,0.025148,-0.000146,-0.019927,0.844734,9.083010,5.583259,...,0,0,2,0,3,2,0.0,13.4,30.470330,43.0


In [7]:
# Clean every per-position frame with fill_NA, then stack the four positions of each
# split into one frame with a fresh 0..n-1 index.
new_train_hand, new_train_bag, new_train_hips, new_train_torso = [
    fill_NA(frame) for frame in (train_hand, train_bag, train_hips, train_torso)
]
new_valid_hand, new_valid_bag, new_valid_hips, new_valid_torso = [
    fill_NA(frame) for frame in (valid_hand, valid_bag, valid_hips, valid_torso)
]

new_train = pd.concat(
    [new_train_hand, new_train_bag, new_train_hips, new_train_torso],
    axis=0,
    ignore_index=True,
)
new_valid = pd.concat(
    [new_valid_hand, new_valid_bag, new_valid_hips, new_valid_torso],
    axis=0,
    ignore_index=True,
)

# Disabled: a single frame combining train and valid.
# new_result = pd.concat([new_train, new_valid], axis=0).reset_index(drop=True)
# new_result


In [34]:
# Disabled experiment kept as a bare string literal, so none of the code inside is executed.
# NOTE(review): the text inside drops a 'label' column, which fill_NA has already removed from
# new_valid; it would need updating before being re-enabled.
'''
# 划分训练集和临时集
train, valid = train_test_split(new_valid, test_size=0.2, random_state=42)

# 训练集和验证集
y_train = train['new_label']
X_train = train.drop(columns=['new_label', 'label'])
y_valid = valid['new_label']
X_valid = valid.drop(columns=['new_label', 'label'])

'''

In [25]:
# Display the combined training frame.
new_train

Unnamed: 0,acc_x_mean,acc_y_mean,acc_z_mean,acc_total_mean,gyr_x_mean,gyr_y_mean,gyr_z_mean,gyr_total_mean,mag_x_mean,mag_y_mean,...,transport_class,traffic_class,landuse_class,roads_class,roads_code,reception_nan_ratio,reception_num,snr_mean,snr_max,new_label
0,0.765344,-0.640161,4.132385,10.699227,-0.371733,0.358142,0.478508,1.658991,5.780782,-1.503930,...,0,0,1,2,2,0.0,8.0,29.426190,42.0,0
1,-0.902888,0.383509,1.819602,10.951662,-0.655154,0.038576,0.606337,2.379206,6.764845,-9.711275,...,0,0,1,2,2,0.0,9.0,27.926190,42.0,0
2,-2.662917,0.111713,1.309267,10.849929,-0.768043,0.424787,0.649773,3.015698,13.626552,-12.975913,...,0,0,1,2,2,0.0,10.4,26.169048,42.0,0
3,-4.389154,0.096886,-2.301307,11.171014,-0.691381,0.408220,0.769724,3.269162,14.341032,-16.027279,...,0,0,1,2,2,0.0,11.8,24.426190,42.0,0
4,-4.520223,0.758477,-4.972611,10.904985,-0.428414,-0.014670,0.477138,2.608470,9.525941,-16.206243,...,0,0,1,2,2,0.0,13.0,22.793223,42.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3922571,-5.113448,6.384056,-5.291904,9.781164,0.004808,0.001458,-0.000193,0.190184,36.703846,3.081658,...,0,0,1,2,2,0.0,16.0,23.312500,37.0,3
3922572,-5.113056,6.326708,-5.370577,9.786840,0.003192,-0.000978,-0.000454,0.189880,36.734729,2.896407,...,0,0,1,2,2,0.0,16.0,23.325000,37.0,3
3922573,-5.119358,6.299730,-5.409853,9.789359,-0.001941,-0.000461,0.000228,0.182239,36.633110,2.964067,...,0,0,1,2,2,0.0,16.0,23.300000,37.0,3
3922574,-5.120420,6.293404,-5.266023,9.715804,-0.007761,-0.003066,-0.003745,0.207311,36.867668,3.046823,...,0,0,1,2,2,0.0,15.8,23.439167,37.0,3


In [8]:
# Disabled: merge the training and validation sets into one training set.
# X_train = pd.concat([X_train, X_val], axis=0)
# y_train = pd.concat([y_train, y_val], axis=0)

# Convert to NumPy: 'new_label' is the target, every other column is a feature.
X_train = new_train.drop(columns='new_label').values
y_train = new_train['new_label'].values

X_valid = new_valid.drop(columns='new_label').values
y_valid = new_valid['new_label'].values
# X_test, y_test = X_test.to_numpy(), y_test.to_numpy()


# 建模

分别建立 XGBoost、LightGBM 和 CatBoost 模型。原本的想法是：由于机器学习的模型一般不需要使用验证集，所以将上面的训练集和验证集合并为训练集；但下面的代码实际上只用验证集（X_valid、y_valid）训练各模型，并在同一份数据上计算准确率。

## 建立 XGBoost 模型

建立 XGBoost 模型并进行预测

In [9]:
"""
多线程
"""
# Train an XGBoost multi-class model and report the wall-clock time.
# NOTE(review): the DMatrix below is built from X_valid / y_valid, so the model is fit on the
# validation split; X_train / y_train are not used in this cell.
num_classes = 4  # four-class problem
# Parameters, including the thread count
params = {
    'objective': 'multi:softmax',
    'num_class': num_classes,  # number of classes
    'nthread': 4,  # use 4 threads
    # other parameters...
}
start_time_XGB = time.time()
# Build the DMatrix
dtrain = xgb.DMatrix(X_valid, label=y_valid)

# Train the model
model = xgb.train(params, dtrain, num_boost_round=10)
end_time_XGB = time.time()
# Elapsed time covers DMatrix construction as well as training.
during_XGB = end_time_XGB - start_time_XGB
print(f"训练XGBoost模型用时:{during_XGB}秒")


训练XGBoost模型用时:303.9319860935211秒


In [10]:
# # Load the model, if one was saved locally
# loaded_model = xgboost.Booster()
# loaded_model.load_model('xgboost_model.model')

# Build the DMatrix
# NOTE(review): X_valid / y_valid are the same arrays the XGBoost training cell builds its
# DMatrix from, so the scores below are measured on the data the model was fit on.
dvalid = xgb.DMatrix(X_valid, label=y_valid)
# Predictions used for the accuracy computation
y_pred_XGB = model.predict(dvalid)
# Overall accuracy
accuracy_XGB = accuracy_score(y_valid, y_pred_XGB)
print("Accuracy:", accuracy_XGB)
# Per-class report
report_XGB = classification_report(y_valid, y_pred_XGB)
print(report_XGB)
# Macro-averaged F1
f1_XGB = f1_score(y_valid, y_pred_XGB, average='macro')
print("F1 score:", f1_XGB)


Accuracy: 0.9692116851486116
              precision    recall  f1-score   support

           0       0.97      0.95      0.96    143293
           1       0.98      0.95      0.97    143293
           2       0.97      0.98      0.98    143293
           3       0.95      0.99      0.97    143293

    accuracy                           0.97    573172
   macro avg       0.97      0.97      0.97    573172
weighted avg       0.97      0.97      0.97    573172

F1 score: 0.9691691908836995


In [11]:
# Build a DMatrix from the cleaned test features
dtest = xgb.DMatrix(new_test.to_numpy())
# Predict (rebinds y_pred_XGB, which until now held the predictions for X_valid)
y_pred_XGB = model.predict(dtest)
y_pred_XGB

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [16]:
# Share of test rows that XGBoost assigns to class 3.
(y_pred_XGB == 3).sum() / len(y_pred_XGB)

0.03650031366950051

## 建立 LightGBM 模型

建立 LightGBM 模型并进行预测

In [17]:
# Model parameters for LightGBM (rebinds `params` and `num_classes`).
num_classes = 4  # four-class problem

params = dict(
    objective='multiclass',   # multi-class classification
    num_class=num_classes,    # number of classes
    metric='multi_logloss',   # multi-class log loss
)


In [18]:
# Build and fit the LightGBM model, reporting the wall-clock time.
# NOTE(review): fit on X_valid / y_valid; X_train / y_train are not used here.
start_time_LGBM = time.time()
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(X_valid, y_valid)
end_time_LGBM = time.time()
during_LGBM = end_time_LGBM - start_time_LGBM
print(f"训练LightGBM模型用时:{during_LGBM}秒")


训练LightGBM模型用时:72.97076416015625秒


In [38]:
# 保存模型到文件
# joblib.dump(lgb_model, 'lgb_model.pkl')


['lgb_model.pkl']

In [19]:
# # Load the saved model
# lgb_model = joblib.load('lgb_model.pkl')

# Predictions used for the accuracy computation
# NOTE(review): X_valid is also what lgb_model was fit on, so these are scores on the fitting data.
y_pred_LGBM = lgb_model.predict(X_valid)
# Overall accuracy
accuracy_LGBM = accuracy_score(y_valid, y_pred_LGBM)
print("Accuracy:", accuracy_LGBM)
# Per-class report
report_LGBM = classification_report(y_valid, y_pred_LGBM)
print(report_LGBM)
# Macro-averaged F1
f1_LGBM = f1_score(y_valid, y_pred_LGBM, average='macro')
print("F1 score:", f1_LGBM)


Accuracy: 0.9996824687877286
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    143293
           1       1.00      1.00      1.00    143293
           2       1.00      1.00      1.00    143293
           3       1.00      1.00      1.00    143293

    accuracy                           1.00    573172
   macro avg       1.00      1.00      1.00    573172
weighted avg       1.00      1.00      1.00    573172

F1 score: 0.999682461887012


In [20]:
# # Load the saved model
# lgb_model = joblib.load('lgb_model.pkl')

# Predict on the test features (rebinds y_pred_LGBM).
# NOTE(review): new_test is passed as a DataFrame here, while lgb_model was fit on the NumPy
# array X_valid and the XGBoost cell uses new_test.to_numpy() — confirm this is intended.
y_pred_LGBM = lgb_model.predict(new_test)
y_pred_LGBM


array([0, 0, 0, ..., 0, 0, 0])

0.948783178661821

In [24]:
# Share of test rows that LightGBM assigns to class 3.
(y_pred_LGBM == 3).sum() / len(y_pred_LGBM)

0.007761697709131027

## 建立 CatBoost 模型

建立 CatBoost 模型并进行预测

In [25]:
# Build and fit the CatBoost model, reporting the wall-clock time.
# NOTE(review): fit on X_valid / y_valid; X_train / y_train are not used here.
start_time_cat = time.time()
cat_model = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, thread_count=4)
# Fit the model
cat_model.fit(X_valid, y_valid, verbose=False)

end_time_cat = time.time()
during_cat = end_time_cat - start_time_cat
print(f"训练CatBoost模型用时:{during_cat}秒")


训练CatBoost模型用时:88.01411867141724秒


In [81]:
# 保存模型到本地文件
# cat_model.save_model("catboost_model.bin")


In [26]:
# # Load the saved CatBoost model
# cat_model = CatBoostClassifier()
# cat_model.load_model("catboost_model.bin")

# Predictions used for the accuracy computation
# NOTE(review): X_valid is also what cat_model was fit on, so these are scores on the fitting data.
y_pred_cat = cat_model.predict(X_valid)
# Overall accuracy
accuracy_cat = accuracy_score(y_valid, y_pred_cat)
print("Accuracy:", accuracy_cat)
# Per-class report
report_cat = classification_report(y_valid, y_pred_cat)
print(report_cat)
# Macro-averaged F1
f1_cat = f1_score(y_valid, y_pred_cat, average='macro')
print("F1 score:", f1_cat)


Accuracy: 0.9760839678141989
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    143293
           1       0.98      0.97      0.98    143293
           2       0.98      0.99      0.98    143293
           3       0.97      0.99      0.98    143293

    accuracy                           0.98    573172
   macro avg       0.98      0.98      0.98    573172
weighted avg       0.98      0.98      0.98    573172

F1 score: 0.9760659206873246


In [27]:
# Load the saved CatBoost model
# cat_model = CatBoostClassifier()
# cat_model.load_model("catboost_model.bin")

# Predict on the test features (rebinds y_pred_cat).
# The recorded output below shows one prediction per row as a single-column 2-D array.
y_pred_cat = cat_model.predict(new_test)
y_pred_cat


array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [31]:
# Share of test rows that CatBoost assigns to class 3 (one value per prediction column).
(y_pred_cat == 3).sum(axis=0) / len(y_pred_cat)

array([0.01762823])

# 建立集成模型

用 XGBoost、LightGBM、CatBoost 三个模型进行集成学习

In [25]:
# Collect the predictions of every base model into one array, one column per model.
# y_pred_LGBM = lgb_model.predict(X_test)
# y_pred_cat = cat_model.predict(X_test)
# y_pred_XGB = xgboost_model.predict(X_test)

# Uses whatever y_pred_XGB / y_pred_LGBM / y_pred_cat currently hold in the kernel.
y_pred_ensemble = np.column_stack((y_pred_XGB, y_pred_LGBM, y_pred_cat))
print(y_pred_ensemble.shape)


(904688, 3)


In [26]:
# Majority vote: for every row take the most frequent class among the base-model predictions.
# When all voters disagree there is no majority; scipy.stats.mode then returns the smallest
# of the tied values.
modes, counts = mode(y_pred_ensemble, axis=1)
# Depending on the SciPy version `modes` has shape (n, 1) or (n,) (keepdims defaulting to
# False). Flattening yields one label per row in both cases, whereas `modes[:, 0]` raises
# IndexError on the 1-D result.
y_pred = np.asarray(modes).reshape(-1)
print(y_pred.shape)


(904688,)


In [27]:
# NOTE(review): `y_test` is not assigned in any visible cell (its only other mention is a
# commented-out line), so this cell would raise NameError on a fresh kernel unless it is
# defined elsewhere — confirm where the ground truth comes from.
# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Per-class report
report = classification_report(y_test, y_pred)
print(report)
# Macro-averaged F1
f1 = f1_score(y_test, y_pred, average='macro')
print("F1 score:", f1)


Accuracy: 0.9479754346249757
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    259519
           1       0.94      0.95      0.94    188545
           2       0.95      0.89      0.92    245237
           3       0.90      0.95      0.92    211387

    accuracy                           0.95    904688
   macro avg       0.95      0.95      0.95    904688
weighted avg       0.95      0.95      0.95    904688

F1 score: 0.9459751432061382
