In [1]:
#!/usr/bin/env python
# coding: utf-8


# 导入第三方包
import pandas as pd
import numpy as np

import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score
import warnings
warnings.filterwarnings('ignore')

In [2]:

# Load the datasets (see the operation manual for how to download them).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)

# Submission template that is filled in with predictions later on.
sample_submit = pd.read_csv('dataset/sample_submit.csv')

In [3]:
# Display the training frame; the notebook renders a truncated view (leading and trailing rows).
train

Unnamed: 0,id,asm_commands_add,asm_commands_call,asm_commands_cdq,asm_commands_cld,asm_commands_cli,asm_commands_cmc,asm_commands_cmp,asm_commands_cwd,asm_commands_daa,...,asm_commands_sti,asm_commands_stos,asm_commands_sub,asm_commands_test,asm_commands_wait,asm_commands_xchg,asm_commands_xor,line_count_asm,size_asm,label
0,0,459,236,1,1,0,0,183,0,13,...,2,2,841,6,0,5,15,7937,460288,7
1,1,696,1310,0,0,36,0,263,0,249,...,0,2,2755,281,0,9,252,37552,2177970,5
2,2,554,1423,0,0,61,0,763,0,85,...,20,0,2736,151,21,0,304,15245,884176,8
3,3,14,164,0,0,0,0,1,0,0,...,1,0,303,6,3,0,2,91393,5300736,6
4,4,931,3728,4,0,0,0,1641,0,40,...,0,197,8172,1429,42,0,970,9335,541421,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,9,162,0,0,0,0,1,0,0,...,4,0,145,16,0,0,2,77825,4513792,6
49996,49996,561,234,0,0,6,0,122,0,171,...,0,0,620,215,0,1,215,83457,4840448,8
49997,49997,168,134,7,7,2,3,48,3,51,...,4,8,189,35,6,28,66,12767,740447,5
49998,49998,103,80,0,3,5,0,34,0,42,...,0,7,292,12,6,13,22,11009,638464,3


In [4]:
# Display the submission template (`id` and `label` columns; truncated view).
sample_submit

Unnamed: 0,id,label
0,50000,0
1,50001,0
2,50002,0
3,50003,0
4,50004,0
...,...,...
7995,57995,0
7996,57996,0
7997,57997,0
7998,57998,0


In [5]:
# Prepare the training and test inputs: every column other than `id` and
# `label` is used as a feature, and `label` is the prediction target.
all_cols = [col for col in train.columns if col not in ('id', 'label')]

x_train, x_test = train[all_cols], test[all_cols]
y_train = train['label']

In [8]:


# This baseline trains only the classic **LightGBM** model; **XGBoost, CatBoost
# and NN (neural networks)** are alternatives worth trying.
def cv_model(clf, train_x, train_y, test_x, clf_name='lgb', num_class=9, folds=5, seed=2021):
    """Train a multiclass LightGBM model with K-fold CV and average the test predictions.

    Parameters
    ----------
    clf : module
        Object exposing ``Dataset`` and ``train`` (the notebook passes the
        ``lightgbm`` module).
    train_x : pandas.DataFrame
        Training features; rows are selected positionally via ``.iloc``.
    train_y : array-like
        One label per row of ``train_x``. Presumably integer class ids in
        ``[0, num_class)`` -- TODO confirm against the data.
    test_x : pandas.DataFrame
        Features to predict; must be accepted by ``model.predict``.
    clf_name : str
        Prefix used only in the printed score summary.
    num_class : int
        Number of target classes (width of the returned probability arrays).
    folds : int
        Number of CV splits.
    seed : int
        Seed used both for the fold shuffle and for LightGBM.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_pred, test_pred)``: out-of-fold predictions for the training
        rows, shape ``(len(train_x), num_class)``, and the fold-averaged
        predictions for the test rows, shape ``(len(test_x), num_class)``.
    """
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positional indices, so index labels by position. Using
    # ``train_y[idx]`` on a Series would be label-based and silently pick the
    # wrong rows (or raise) whenever the index is not a default RangeIndex.
    labels = np.asarray(train_y)

    # Named oof_pred/test_pred to avoid shadowing the notebook-level
    # ``train``/``test`` DataFrames.
    oof_pred = np.zeros((train_x.shape[0], num_class))
    test_pred = np.zeros((test_x.shape[0], num_class))

    cv_scores = []

    # Loop-invariant, so built once. ``n_jobs`` was dropped because it is an
    # alias of ``num_threads`` (which is kept), and ``silent`` was dropped
    # because it is not a training parameter.
    params = {
        'boosting_type': 'dart',
        'objective': 'multiclass',
        'num_class': num_class,
        'metric': 'multi_logloss',
        'min_child_weight': 5,
        'num_leaves': 30,
        'max_depth': 5,
        'lambda_l2': 10,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 4,
        'learning_rate': 0.01,
        'seed': seed,
        'num_threads': 10,
        'verbose': -1,
    }

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        # LightGBM >= 4.0 removed the ``verbose_eval``/``early_stopping_rounds``
        # keywords from ``train`` in favour of callbacks; fall back to the old
        # keywords when the callbacks are not available. Fresh callbacks are
        # created per fold so no state is shared between folds.
        if hasattr(clf, 'log_evaluation'):
            fit_kwargs = {'callbacks': [clf.log_evaluation(500), clf.early_stopping(200)]}
        else:
            fit_kwargs = {'verbose_eval': 500, 'early_stopping_rounds': 200}

        # NOTE(review): LightGBM does not apply early stopping in dart mode, so
        # with 'boosting_type': 'dart' all 20000 rounds are trained per fold.
        # Switch to 'gbdt' if early stopping is actually wanted.
        model = clf.train(dict(params), train_matrix, 20000,
                          valid_sets=[train_matrix, valid_matrix], **fit_kwargs)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        fold_test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        oof_pred[valid_index] = val_pred
        test_pred += fold_test_pred / kf.n_splits
        # One-vs-rest AUC on the held-out fold.
        cv_scores.append(roc_auc_score(val_y, val_pred, multi_class='ovr'))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred



In [9]:


# Cross-validated training: returns the out-of-fold predictions for the
# training rows and the fold-averaged class probabilities for the test rows.
lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test)

# Prediction result: take the most probable class per test row, store it in
# the submission template and write the submission file.
result = lgb_test.argmax(axis=1)
sample_submit['label'] = result
sample_submit.to_csv('soft_lgb_v1.csv', index=False)

************************************ 1 ************************************
[500]	training's multi_logloss: 0.226809	valid_1's multi_logloss: 0.228892
[1000]	training's multi_logloss: 0.0423224	valid_1's multi_logloss: 0.0439312
[1500]	training's multi_logloss: 0.0146258	valid_1's multi_logloss: 0.0158334
[2000]	training's multi_logloss: 0.00888149	valid_1's multi_logloss: 0.00990567
[2500]	training's multi_logloss: 0.00663896	valid_1's multi_logloss: 0.00757024
[3000]	training's multi_logloss: 0.00566668	valid_1's multi_logloss: 0.00656598
[3500]	training's multi_logloss: 0.00509033	valid_1's multi_logloss: 0.00598818
[4000]	training's multi_logloss: 0.00439868	valid_1's multi_logloss: 0.0052949
[4500]	training's multi_logloss: 0.00409245	valid_1's multi_logloss: 0.00499511
[5000]	training's multi_logloss: 0.00371091	valid_1's multi_logloss: 0.00460997
[5500]	training's multi_logloss: 0.00358914	valid_1's multi_logloss: 0.00453621
[6000]	training's multi_logloss: 0.0034225	valid_1's m

[8500]	training's multi_logloss: 0.00280881	valid_1's multi_logloss: 0.00612089
[9000]	training's multi_logloss: 0.00270334	valid_1's multi_logloss: 0.00603245
[9500]	training's multi_logloss: 0.00261646	valid_1's multi_logloss: 0.00596852
[10000]	training's multi_logloss: 0.00253708	valid_1's multi_logloss: 0.00590694
[10500]	training's multi_logloss: 0.00250766	valid_1's multi_logloss: 0.00588384
[11000]	training's multi_logloss: 0.00249644	valid_1's multi_logloss: 0.00589801
[11500]	training's multi_logloss: 0.00246425	valid_1's multi_logloss: 0.0058828
[12000]	training's multi_logloss: 0.00246783	valid_1's multi_logloss: 0.00591408
[12500]	training's multi_logloss: 0.0024096	valid_1's multi_logloss: 0.00587038
[13000]	training's multi_logloss: 0.00233264	valid_1's multi_logloss: 0.00580187
[13500]	training's multi_logloss: 0.00230837	valid_1's multi_logloss: 0.00579258
[14000]	training's multi_logloss: 0.00227435	valid_1's multi_logloss: 0.00574359
[14500]	training's multi_logloss:

[16000]	training's multi_logloss: 0.002162	valid_1's multi_logloss: 0.00351236
[16500]	training's multi_logloss: 0.00215995	valid_1's multi_logloss: 0.00350967
[17000]	training's multi_logloss: 0.00215551	valid_1's multi_logloss: 0.00350699
[17500]	training's multi_logloss: 0.00213544	valid_1's multi_logloss: 0.00348937
[18000]	training's multi_logloss: 0.00211201	valid_1's multi_logloss: 0.00345801
[18500]	training's multi_logloss: 0.00209342	valid_1's multi_logloss: 0.00343981
[19000]	training's multi_logloss: 0.00209194	valid_1's multi_logloss: 0.00344805
[19500]	training's multi_logloss: 0.00208446	valid_1's multi_logloss: 0.00344211
[20000]	training's multi_logloss: 0.00206297	valid_1's multi_logloss: 0.00341977
[0.9999996240159675, 0.999995046947507, 0.9999120615292126, 0.9999994411123309, 0.9999981335901514]
lgb_scotrainre_list: [0.9999996240159675, 0.999995046947507, 0.9999120615292126, 0.9999994411123309, 0.9999981335901514]
lgb_score_mean: 0.999980861439034
lgb_score_std: 3.4