In [2]:
# Code that might generate warnings
# %%
import warnings
import datetime
import os
import math
import time
import pandas as pd
import numpy as np
import pickle
import progressbar
import joblib
from joblib import Parallel, delayed
import multiprocessing
from multiprocessing import Process, Pool, Manager

import itertools
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from scipy.stats import skew, kurtosis
from numpy.lib.stride_tricks import as_strided as stride
from geopy.distance import geodesic
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from math import sqrt
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression
from scipy.stats import mode
from statsmodels.tools.tools import add_constant
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor

warnings.filterwarnings("ignore")  # Ignore all warnings

# 每一条轨迹长短不一，在轨迹内采用滑动窗口L1（即决定窗口）,步长为L2,划分为子轨迹，提取子轨迹的特征。
# 在子轨迹上训练模型，最后可以基于轨迹进行后处理
L1 = 5  # s 滑动窗口的长度，暂时依据是20年直接给的5s的数据
L2 = 1  # 滑动窗口的步长，假设1s内没有变化（主要原因是GPS数据是1s记录的）

locs = ['Hand', 'Bag', 'Hips', 'Torso']

loc = locs[0]  # 'Hand'
dataset = 'train'

label_map = {
    1: 'Still',
    2: 'Walking',
    3: 'Run',
    4: 'Bike',
    5: 'Car',
    6: 'Bus',
    7: 'Train',
    8: 'Subway'
}

filenames = {
    'train': {
        'Hand': {
            'Location': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hand/Location.pkl',
            'Location_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hand/Location_new.pkl',
            # 进行标签匹配后的数据，时间戳是1HZ的label数据
            'Mag': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hand/Mag.pkl',
            'Gyr': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hand/Gyr.pkl',
            'Acc': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hand/Acc.pkl',
            'GPS': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hand/GPS.pkl',
            'GPS_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hand/GPS_new.pkl',
        },
        'Bag': {
            'Location': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Bag/Location.pkl',
            'Location_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Bag/Location_new.pkl',
            'Mag': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Bag/Mag.pkl',
            'Gyr': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Bag/Gyr.pkl',
            'Acc': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Bag/Acc.pkl',
            'GPS': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Bag/GPS.pkl',
            'GPS_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Bag/GPS_new.pkl',
        },
        'Hips': {
            'Location': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hips/Location.pkl',
            'Location_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hips/Location_new.pkl',
            'Mag': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hips/Mag.pkl',
            'Gyr': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hips/Gyr.pkl',
            'Acc': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hips/Acc.pkl',
            'GPS': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hips/GPS.pkl',
            'GPS_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hips/GPS_new.pkl',
        },
        'Torso': {
            'Location': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Torso/Location.pkl',
            'Location_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Torso/Location_new.pkl',
            'Mag': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Torso/Mag.pkl',
            'Gyr': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Torso/Gyr.pkl',
            'Acc': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Torso/Acc.pkl',
            'GPS': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Torso/GPS.pkl',
            'GPS_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Torso/GPS_new.pkl',
        },
        'Label': '/DATA2/lvxiaoling/limengyuan/SHL2023/train/Label.pkl'
    },
    'valid': {
        'Hand': {
            'Location': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hand/Location.pkl',
            'Mag': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hand/Mag.pkl',
            'Gyr': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hand/Gyr.pkl',
            'Acc': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hand/Acc.pkl',
            'GPS': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hand/GPS.pkl',
        },
        'Bag': {
            'Location': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Bag/Location.pkl',
            'Location_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Bag/Location_new.pkl',
            'Mag': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Bag/Mag.pkl',
            'Gyr': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Bag/Gyr.pkl',
            'Acc': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Bag/Acc.pkl',
            'GPS': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Bag/GPS.pkl',
            'GPS_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Bag/GPS_new.pkl',
        },

        'Hips': {
            'Location': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hips/Location.pkl',
            'Location_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hips/Location_new.pkl',
            'Mag': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hips/Mag.pkl',
            'Gyr': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hips/Gyr.pkl',
            'Acc': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hips/Acc.pkl',
            'GPS': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hips/GPS.pkl',
            'GPS_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hips/GPS_new.pkl',
        },
        'Torso': {
            'Location': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Torso/Location.pkl',
            'Location_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Torso/Location_new.pkl',
            'Mag': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Torso/Mag.pkl',
            'Gyr': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Torso/Gyr.pkl',
            'Acc': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Torso/Acc.pkl',
            'GPS': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Torso/GPS.pkl',
            'GPS_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Torso/GPS_new.pkl',
        },
        'Label': '/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Label.pkl',
    },
    'test': {
        'Location': '/DATA2/lvxiaoling/limengyuan/SHL2023/test/Location.pkl',
        'Location_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/test/Location_new.pkl',
        'Mag': '/DATA2/lvxiaoling/limengyuan/SHL2023/test/Mag.pkl',
        'Gyr': '/DATA2/lvxiaoling/limengyuan/SHL2023/test/Gyr.pkl',
        'Acc': '/DATA2/lvxiaoling/limengyuan/SHL2023/test/Acc.pkl',
        'GPS': '/DATA2/lvxiaoling/limengyuan/SHL2023/test/GPS.pkl',
        'GPS_new': '/DATA2/lvxiaoling/limengyuan/SHL2023/test/GPS_new.pkl',
        'Label': '/DATA2/lvxiaoling/limengyuan/SHL2023/test/Label.pkl'
    }
}



# 数据读取

假设前面的特征工程部分已经处理完成，接下来是完成模型部分
前面已经得到了一个名为 results 的 DataFrame，现在需要进行 XGBoost 建模

In [3]:
# 读取数据
results1 = pd.read_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/train/Hand/data.pkl').drop_duplicates(keep='first')
results2 = pd.read_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/valid/Hand/data.pkl').drop_duplicates(keep='first')
test = pd.read_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/test/data.pkl').drop_duplicates(keep='first')


In [4]:
def fill_NA(dataset):
    # 计算每列的缺失值比例
    missing_ratio = dataset.isnull().mean()
    # 找到缺失值比例超过0.1的列
    columns_to_drop = missing_ratio[missing_ratio > 0.1].index
    # 删除缺失值比例超过0.1的列
    dataset = dataset.drop(columns=columns_to_drop)
    
    # 将空值替换为0
    new_dataset = dataset.fillna(0)
    
    # 在训练Boost模型时，要求标签类别必须是从0开始的连续整数
    label_mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}
    new_dataset['label'] = new_dataset['label'].map(label_mapping)
    #new_dataset = new_dataset.drop(columns=['idx', 'timestamp', 'trajectory_id'])
    # 删的有点早难以匹配和后处理
    return new_dataset

In [5]:
# 处理原始数据并将两个数据集进行拼接
new_results1 = fill_NA(results1)
new_results2 = fill_NA(results2)
new_test = fill_NA(test)

new_result = pd.concat([new_results1, new_results2,new_test], axis=0).reset_index(drop=True)
#test一起处理哑变量

# 处理哑变量
dummy_columns = ['raliways_class',
                 'transport_class',
                 'traffic_class',
                 'landuse_class',
                 'roads_class',
                 'roads_code',]
# pd.get_dummies 是只能传入一列
dummy_data = pd.concat([pd.get_dummies(new_result[dummy_columns_i]) for dummy_columns_i in dummy_columns], axis=1)
# 拼接并保留原始分类变量
new_result_with_dummies = pd.concat([new_result, dummy_data], axis=1)

In [6]:
new_result_with_dummies['idx']

0                524
1                624
2                724
3                824
4                924
             ...    
1581544    158833732
1581545    158833832
1581546    158833932
1581547    158834032
1581548    158834132
Name: idx, Length: 1581549, dtype: int64

In [7]:
# 读取 train_idx, valid_idx 原来的idx应该重新运行代码被覆盖掉了
train_idx = pd.read_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/train/trainidx.pkl')
valid_idx = pd.read_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/train/validx.pkl')

X = new_result_with_dummies

data_train = X[X['idx'].isin(results1['idx'])]
data_valid = X[X['idx'].isin(results2['idx'])]
data_test = X[X['idx'].isin(test['idx'])]

y_train = data_train['label'].values
y_valid = data_valid['label'].values

X_train = data_train.drop(columns=[ 'label', 'label_idx', 'idx', 'timestamp', 'trajectory_id']).values
X_valid = data_valid.drop(columns=[ 'label', 'label_idx', 'idx', 'timestamp', 'trajectory_id']).values

# 打印各个数据集的形状
print(f"训练集 X_train:{X_train.shape}  y_train:{y_train.shape}")
print(f"验证集 X_valid:{X_valid.shape}  y_valid:{y_valid.shape}")

训练集 X_train:(975986, 317)  y_train:(975986,)
验证集 X_valid:(143293, 317)  y_valid:(143293,)


In [8]:
data_train.shape

(975986, 322)

In [9]:
y_test = data_test['label']
X_test = data_test.drop(columns=['label', 'label_idx', 'idx', 'timestamp', 'trajectory_id']).values

In [10]:
for i in dummy_columns:
    if len(data_valid[~data_valid[i].isin(data_train[i])]):
        print(i,'val')
    if len(data_test[~data_test[i].isin(data_train[i])]):
        print(i,'test')


In [11]:
# # 合并训练集和验证集为训练集
# X_train = pd.concat([X_train, X_val], axis=0)
# y_train = pd.concat([y_train, y_val], axis=0)

# 转化为 numpy
if not isinstance(X_train, np.ndarray):
    X_train, y_train = X_train.to_numpy(), y_train.to_numpy()
if not isinstance(X_valid, np.ndarray):
    X_valid, y_valid = X_valid.to_numpy(), y_valid.to_numpy()
# X_test, y_test = X_test.to_numpy(), y_test.to_numpy()

In [12]:
def softmax_row_wise(x):
    # 计算指数
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    # 计算每行的指数和
    exp_sum = np.sum(exp_x, axis=1, keepdims=True)
    # 计算Softmax结果
    softmax_result = exp_x / exp_sum
    return softmax_result


In [13]:
def get_out(pred,y_probs,dataset):
    if dataset == 'train':
        res = data_train[['label', 'label_idx', 'idx', 'timestamp', 'trajectory_id']]
    elif dataset == 'val':
        res = data_valid[['label', 'label_idx', 'idx', 'timestamp', 'trajectory_id']]
    elif dataset == 'test':
        res = data_test[['label', 'label_idx', 'idx', 'timestamp', 'trajectory_id']]

    res[['out_{}'.format(i) for i in range(8)]] =  y_probs
    res['out_pred'] = pred
    return res

# 建模

分别建立XGBoost、LightGBM和catboost模型,但是由于机器学习的模型一般不需要使用验证集,所以将上面的训练集和验证集合并为训练集

## Boost 模型

### 建立 XGBoost 模型

建立 XGBoost 模型并进行预测

In [21]:
"""
多线程
"""
num_classes = 8  # 8分类问题
# 设置参数，包括线程数
params = {
    'objective': 'multi:softmax',
    'num_class': num_classes,  # 类别数目
    'nthread': 8,  # 设置线程数为8
    # 其他参数...
}

# 创建DMatrix对象
dtrain = xgb.DMatrix(X_train, label=y_train)

# 训练模型
xgboost_model = xgb.train(params, dtrain, num_boost_round=10)

In [22]:
# 保存模型到本地
xgboost_model.save_model('xgboost_model_Boost0622_raw.model')

In [17]:
# # 加载模型，如果本地有保存模型的话
# 验证集
xgboost_model = xgb.Booster()
xgboost_model.load_model('xgboost_model_Boost0622_raw.model')

# 创建DMatrix对象
dvalid = xgb.DMatrix(X_valid, label=y_valid)
# 计算acc值
y_pred_XGB = xgboost_model.predict(dvalid)
y_probs_XGB = softmax_row_wise(xgboost_model.predict(dvalid,output_margin=True))
# 整体的预测准确率
accuracy_XGB = accuracy_score(y_valid, y_pred_XGB)
print("Accuracy:", accuracy_XGB)
# 各个类别的预测准确率
report_XGB = classification_report(y_valid, y_pred_XGB)
print(report_XGB)
# 计算宏F1值
f1_XGB = f1_score(y_valid, y_pred_XGB, average='macro')
print("F1 score:", f1_XGB)
xg_valid = get_out(pred=y_pred_XGB,y_probs= y_probs_XGB,dataset='val')
xg_valid.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put2/val_ml1.pkl')

Accuracy: 0.8007509089767121
              precision    recall  f1-score   support

         0.0       0.83      0.94      0.88     29676
         1.0       0.91      0.84      0.87     25879
         2.0       0.98      0.77      0.86      2754
         3.0       0.94      0.21      0.34     12001
         4.0       0.88      0.58      0.70     20438
         5.0       0.34      0.84      0.48      9138
         6.0       0.91      0.93      0.92     21763
         7.0       0.91      0.96      0.94     21644

    accuracy                           0.80    143293
   macro avg       0.84      0.76      0.75    143293
weighted avg       0.86      0.80      0.80    143293

F1 score: 0.749640212886812


In [41]:
y_probs_XGB.max(1).mean()


0.7860838

In [33]:
a[0]

array([-0.9332351 ,  1.2616082 , -1.0502172 , -0.42409664, -0.66000843,
        0.4752401 , -0.5853704 , -0.233704  ], dtype=float32)

In [24]:
# 训练集
dtrain = xgb.DMatrix(X_train, label=y_train)
# 计算acc值
y_pred_XGB = xgboost_model.predict(dtrain)
y_probs_XGB = softmax_row_wise(xgboost_model.predict(dtrain,output_margin=True))
# 整体的预测准确率
accuracy_XGB = accuracy_score(y_train, y_pred_XGB)
print("Accuracy:", accuracy_XGB)
# 各个类别的预测准确率
report_XGB = classification_report(y_train, y_pred_XGB)
print(report_XGB)
# 计算宏F1值
f1_XGB = f1_score(y_train, y_pred_XGB, average='macro')
print("F1 score:", f1_XGB)

xg_train = get_out(pred=y_pred_XGB,y_probs= y_probs_XGB,dataset='train')
#xg_train.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put/train_ml1.pkl')

Accuracy: 0.942334213810444
              precision    recall  f1-score   support

         0.0       0.93      0.96      0.94    121421
         1.0       0.97      0.96      0.96    121730
         2.0       0.99      0.98      0.99     42051
         3.0       0.99      0.97      0.98    116713
         4.0       0.92      0.90      0.91    158280
         5.0       0.88      0.89      0.89    141178
         6.0       0.94      0.98      0.96    155870
         7.0       0.96      0.93      0.95    118743

    accuracy                           0.94    975986
   macro avg       0.95      0.95      0.95    975986
weighted avg       0.94      0.94      0.94    975986

F1 score: 0.9478051324856609


In [21]:
# 测试集
dtest = xgb.DMatrix(X_test, label=y_test.fillna(0))
# 计算acc值
y_pred_XGB = xgboost_model.predict(dtest)
y_probs_XGB = softmax_row_wise(xgboost_model.predict(dtest,output_margin=True))


xg_test = get_out(pred=y_pred_XGB,y_probs= y_probs_XGB,dataset='test')
# xg_test.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put/test_ml1.pkl')

### 建立 LightGBM 模型

建立 LightGBM 模型并进行预测

In [25]:
# 定义模型参数
num_classes = 8  # 8分类问题

params = {
    'objective': 'multiclass',  # 多类别分类问题
    'num_class': num_classes,  # 类别数量
    'metric': 'multi_logloss'  # 损失函数
}

In [26]:
# 建立 LightGBM 模型
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(X_train, y_train)

In [27]:
# 保存模型到文件
joblib.dump(lgb_model, 'lgb_model_Boost0622_raw.pkl')

['lgb_model_Boost0622_raw.pkl']

In [18]:
# # 加载保存的模型
# 验证集
lgb_model = joblib.load('lgb_model_Boost0622_raw.pkl')

# 计算acc值
y_pred_LGBM = lgb_model.predict(X_valid)
y_probs_LGBM = lgb_model.predict_proba(X_valid)
# 整体的预测准确率
accuracy_LGBM = accuracy_score(y_valid, y_pred_LGBM)
print("Accuracy:", accuracy_LGBM)
# 各个类别的预测准确率
report_LGBM = classification_report(y_valid, y_pred_LGBM)
print(report_LGBM)
# 计算宏F1值
f1_LGBM = f1_score(y_valid, y_pred_LGBM, average='macro')
print("F1 score:", f1_LGBM)
lgbm_val = get_out(pred=y_pred_LGBM,y_probs= y_probs_LGBM,dataset='val')
lgbm_val.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put2/val_ml2.pkl')

Accuracy: 0.8080087652572003
              precision    recall  f1-score   support

         0.0       0.87      0.94      0.91     29676
         1.0       0.88      0.87      0.87     25879
         2.0       0.94      0.76      0.84      2754
         3.0       0.97      0.24      0.39     12001
         4.0       0.80      0.54      0.64     20438
         5.0       0.38      0.86      0.52      9138
         6.0       0.91      0.93      0.92     21763
         7.0       0.90      0.98      0.94     21644

    accuracy                           0.81    143293
   macro avg       0.83      0.77      0.75    143293
weighted avg       0.85      0.81      0.80    143293

F1 score: 0.7544718019250383


In [27]:
# 训练集
y_pred_LGBM = lgb_model.predict(X_train)
y_probs_LGBM = lgb_model.predict_proba(X_train)

# 整体的预测准确率
accuracy_LGBM = accuracy_score(y_train, y_pred_LGBM)
print("Accuracy:", accuracy_LGBM)
# 各个类别的预测准确率
report_LGBM = classification_report(y_train, y_pred_LGBM)
print(report_LGBM)
# 计算宏F1值
f1_LGBM = f1_score(y_train, y_pred_LGBM, average='macro')
print("F1 score:", f1_LGBM)
lgbm_train = get_out(pred=y_pred_LGBM,y_probs= y_probs_LGBM,dataset='train')
# lgbm_train.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put/train_ml2.pkl')

Accuracy: 0.9920892271481407
              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99    120675
         1.0       1.00      1.00      1.00    117175
         2.0       0.97      1.00      0.99     35805
         3.0       1.00      1.00      1.00    110674
         4.0       0.99      0.99      0.99    146300
         5.0       0.99      0.99      0.99    121841
         6.0       0.99      1.00      0.99    147554
         7.0       0.99      0.99      0.99    116827

    accuracy                           0.99    916851
   macro avg       0.99      0.99      0.99    916851
weighted avg       0.99      0.99      0.99    916851

F1 score: 0.9915254422190407


In [28]:
# 测试集
y_pred_LGBM = lgb_model.predict(X_test)
y_probs_LGBM = lgb_model.predict_proba(X_test)



lgbm_test = get_out(pred=y_pred_LGBM,y_probs= y_probs_LGBM,dataset='test')
# lgbm_test.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put/test_ml2.pkl')

### 建立 CatBoost 模型

建立 CatBoost 模型并进行预测

In [29]:
# 建立 CatBoost 模型
cat_model = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, thread_count=8)
# 拟合模型
cat_model.fit(X_train, y_train, verbose=False)


<catboost.core.CatBoostClassifier at 0x7f981447c4c0>

In [30]:
# 保存模型到本地文件
cat_model.save_model("catboost_model_Boost0622_raw.bin")

In [19]:
# # 加载保存的CatBoost模型
# 验证集
cat_model = CatBoostClassifier()
cat_model.load_model("catboost_model_Boost0622_raw.bin")

# 计算acc值
y_pred_cat = cat_model.predict(X_valid)
y_probs_cat  = cat_model.predict_proba(X_valid)
# 整体的预测准确率
accuracy_cat = accuracy_score(y_valid, y_pred_cat)
print("Accuracy:", accuracy_cat)
# 各个类别的预测准确率
report_cat = classification_report(y_valid, y_pred_cat)
print(report_cat)
# 计算宏F1值
f1_cat = f1_score(y_valid, y_pred_cat, average='macro')
print("F1 score:", f1_cat)


cat_val = get_out(pred=y_pred_cat,y_probs= y_probs_cat,dataset='val')
cat_val.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put2/val_ml3.pkl')


Accuracy: 0.7969963640931518
              precision    recall  f1-score   support

         0.0       0.84      0.93      0.88     29676
         1.0       0.92      0.82      0.87     25879
         2.0       0.99      0.80      0.88      2754
         3.0       0.78      0.09      0.15     12001
         4.0       0.87      0.62      0.72     20438
         5.0       0.33      0.87      0.47      9138
         6.0       0.92      0.95      0.94     21763
         7.0       0.92      0.97      0.94     21644

    accuracy                           0.80    143293
   macro avg       0.82      0.75      0.73    143293
weighted avg       0.85      0.80      0.79    143293

F1 score: 0.7332095851321211


In [37]:
# 计算acc值
y_pred_cat = cat_model.predict(X_train)
y_probs_cat  = cat_model.predict_proba(X_train)

# 整体的预测准确率
accuracy_cat = accuracy_score(y_train, y_pred_cat)
print("Accuracy:", accuracy_cat)
# 各个类别的预测准确率
report_cat = classification_report(y_train, y_pred_cat)
print(report_cat)
# 计算宏F1值
f1_cat = f1_score(y_train, y_pred_cat, average='macro')
print("F1 score:", f1_cat)

cat_train = get_out(pred=y_pred_cat,y_probs= y_probs_cat,dataset='train')
cat_train.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put/train_ml3.pkl')


Accuracy: 0.9408060851763264
              precision    recall  f1-score   support

         0.0       0.92      0.95      0.94    120675
         1.0       0.94      0.94      0.94    117175
         2.0       0.98      0.96      0.97     35805
         3.0       0.98      0.97      0.97    110674
         4.0       0.94      0.92      0.93    146300
         5.0       0.91      0.90      0.90    121841
         6.0       0.94      0.97      0.96    147554
         7.0       0.95      0.92      0.94    116827

    accuracy                           0.94    916851
   macro avg       0.95      0.94      0.94    916851
weighted avg       0.94      0.94      0.94    916851

F1 score: 0.9437674909457809


In [38]:
# 计算acc值
y_pred_cat = cat_model.predict(X_test)
y_probs_cat  = cat_model.predict_proba(X_test)




cat_test = get_out(pred=y_pred_cat,y_probs= y_probs_cat,dataset='test')
cat_test.to_pickle('/DATA2/lvxiaoling/limengyuan/SHL2023/out_put/test_ml3.pkl')


综上，最后进入stacking的模型将会是CatBoost模型。

# 建立集成模型

用 XGBoost、LightGBM、CatBoost 三个模型进行集成学习

In [43]:
# # 首先导入上面已经训练好保存至本地的 model
# xgboost_model = xgb.XGBClassifier()
# xgboost_model.load_model('xgboost_model.model')

# lgb_model = joblib.load('lgb_model.pkl')

# cat_model = CatBoostClassifier()
# cat_model.load_model("catboost_model.bin")


In [44]:
# 将每个基模型的预测结果收集起来，形成一个预测结果的集合
# y_pred_LGBM = lgb_model.predict(X_test)
# y_pred_cat = cat_model.predict(X_test)
# y_pred_XGB = xgboost_model.predict(X_test)

y_pred_ensemble = np.column_stack((y_pred_XGB, y_pred_LGBM, y_pred_cat))
print(y_pred_ensemble.shape)


(202428, 3)


In [45]:
# 进行投票得出最后的预测结果
modes, counts = mode(y_pred_ensemble, axis=1)
y_pred = modes[:, 0]
print(y_pred.shape)


(202428,)


In [46]:
# 整体的预测准确率
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# 各个类别的预测准确率
report = classification_report(y_test, y_pred)
print(report)
# 计算宏F1值
f1 = f1_score(y_test, y_pred, average='macro')
print("F1 score:", f1)


NameError: name 'y_test' is not defined