# 关于LGB的分类问题

In [55]:
import numpy as np
import pandas as pd

## 模拟数据

多分类

In [56]:
# Synthetic multiclass dataset: uniform random features paired with integer
# labels that are drawn independently of those features.
num_cls = 15     # labels are drawn from 0 .. num_cls - 1
num_inst = 1000  # number of rows

# Seed the legacy global RNG; features are drawn first, then labels, so the
# sequence of draws is repeatable from run to run.
np.random.seed(666)
X = np.random.rand(num_inst, 10)
y = np.random.randint(0, num_cls, size=num_inst)

# Ten feature columns named f0..f9 plus the label column 'y'.
df = pd.DataFrame(X, columns=[f'f{i}' for i in range(10)]).assign(y=y)
df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,y
0,0.700437,0.844187,0.676514,0.727858,0.951458,0.012703,0.413588,0.048813,0.099929,0.508066,1
1,0.200248,0.744154,0.192892,0.700845,0.293228,0.774479,0.005109,0.112858,0.110954,0.247668,9
2,0.023236,0.727321,0.340035,0.197503,0.90918,0.978347,0.532803,0.259132,0.583813,0.325691,7
3,0.888899,0.626405,0.818874,0.547345,0.416712,0.743047,0.369596,0.075167,0.775193,0.219409,8
4,0.079342,0.486781,0.153674,0.828465,0.191369,0.270409,0.561034,0.90238,0.851788,0.418082,14


## 建立模型

In [57]:
from lightgbm import LGBMClassifier

# Build the classifier with every hyper-parameter spelled out, one per line,
# and a fixed random_state.
clf = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=0,
    random_state=666,
)

# Fit on the ten feature columns f0..f9 against the label column 'y'.
# The fit call is the cell's last expression, so its return value is displayed.
clf.fit(df[[f'f{i}' for i in range(10)]], df['y'])


In [58]:
# Per-feature importance scores exposed by the fitted classifier, one entry
# per training column.
# NOTE(review): presumably split counts, since importance_type is not
# overridden where clf is constructed — confirm against the LightGBM docs.
clf.feature_importances_

array([4577, 4652, 4536, 4645, 4352, 4645, 4435, 4359, 4406, 4210])

In [59]:
# Predicted class label for each row of the feature columns f0..f9
# (the same rows the classifier was fitted on).
clf.predict(df[[f'f{i}' for i in range(10)]])

array([ 1,  9,  7,  8, 14,  5, 13, 10, 14,  6,  2, 13,  9, 10,  7,  8,  4,
        6,  8,  3,  7,  8, 12,  8,  2,  4,  4,  1, 13, 13,  6,  9,  0, 10,
        0, 11,  0,  3,  5,  7,  3,  0,  4,  5, 12, 11,  7, 10,  8,  7, 11,
        1,  0, 11,  5,  8,  3,  3, 10,  5,  2, 14, 10,  2,  6, 13,  1, 10,
        0, 13,  9,  6, 14,  9,  9,  5,  1, 12,  1,  7, 13, 10, 10,  2,  1,
       11, 10,  5,  5, 10, 13,  6,  7,  8,  1,  7, 12,  9,  1,  2, 12, 14,
        0,  8,  9,  9,  0,  4, 10,  4,  4, 10,  0, 13, 13,  2,  6,  4,  4,
        6,  1,  7,  1,  5,  4,  6,  5,  2,  8, 10, 12,  3,  6, 11, 12,  9,
        1,  0,  9,  1,  0,  5, 13,  4, 12, 11,  3, 10, 13,  3,  3, 11, 10,
        6, 13, 13,  4, 13,  9, 13, 10,  0,  1, 14,  1, 13, 11, 11,  7,  8,
       14,  6,  1,  1,  7,  4,  7,  4,  5,  9, 12, 10,  5, 13, 12, 10,  5,
        9,  9,  0,  5,  3,  4,  6,  0, 13,  8,  6,  4,  3, 13,  5,  5, 11,
       12,  0,  9,  5,  9,  5,  2, 14,  2, 13, 12,  2, 14,  1,  2,  5,  1,
        4,  3,  3,  4,  7

In [60]:
# Class-probability output for each row of the feature columns f0..f9.
clf.predict_proba(df[[f'f{i}' for i in range(10)]])

array([[8.03886051e-05, 9.96554338e-01, 5.08745947e-04, ...,
        3.38038727e-05, 6.16421914e-05, 4.97210878e-04],
       [5.99768897e-05, 4.43902452e-04, 3.75818407e-04, ...,
        7.61940028e-04, 3.89218785e-04, 4.12658225e-05],
       [4.66651585e-06, 7.08405635e-04, 2.05285435e-04, ...,
        8.62695923e-06, 9.45234211e-05, 1.73212004e-05],
       ...,
       [2.05926131e-05, 6.46076299e-04, 5.41528886e-05, ...,
        1.40466602e-04, 1.01072645e-03, 9.96690492e-01],
       [4.76729636e-05, 1.70564814e-04, 1.57212814e-05, ...,
        1.33854355e-03, 4.29904987e-05, 1.57543859e-03],
       [9.60416301e-05, 2.91794891e-04, 9.13407518e-04, ...,
        6.87102535e-04, 9.92095302e-01, 4.58563216e-04]])

In [61]:
# Column index of the largest probability in each row, for comparison with
# the labels returned by clf.predict.
np.argmax(clf.predict_proba(df[[f'f{i}' for i in range(10)]]), axis=1)

array([ 1,  9,  7,  8, 14,  5, 13, 10, 14,  6,  2, 13,  9, 10,  7,  8,  4,
        6,  8,  3,  7,  8, 12,  8,  2,  4,  4,  1, 13, 13,  6,  9,  0, 10,
        0, 11,  0,  3,  5,  7,  3,  0,  4,  5, 12, 11,  7, 10,  8,  7, 11,
        1,  0, 11,  5,  8,  3,  3, 10,  5,  2, 14, 10,  2,  6, 13,  1, 10,
        0, 13,  9,  6, 14,  9,  9,  5,  1, 12,  1,  7, 13, 10, 10,  2,  1,
       11, 10,  5,  5, 10, 13,  6,  7,  8,  1,  7, 12,  9,  1,  2, 12, 14,
        0,  8,  9,  9,  0,  4, 10,  4,  4, 10,  0, 13, 13,  2,  6,  4,  4,
        6,  1,  7,  1,  5,  4,  6,  5,  2,  8, 10, 12,  3,  6, 11, 12,  9,
        1,  0,  9,  1,  0,  5, 13,  4, 12, 11,  3, 10, 13,  3,  3, 11, 10,
        6, 13, 13,  4, 13,  9, 13, 10,  0,  1, 14,  1, 13, 11, 11,  7,  8,
       14,  6,  1,  1,  7,  4,  7,  4,  5,  9, 12, 10,  5, 13, 12, 10,  5,
        9,  9,  0,  5,  3,  4,  6,  0, 13,  8,  6,  4,  3, 13,  5,  5, 11,
       12,  0,  9,  5,  9,  5,  2, 14,  2, 13, 12,  2, 14,  1,  2,  5,  1,
        4,  3,  3,  4,  7

In [62]:
# Score the classifier on the very rows it was fitted on, so this is a
# training-set figure rather than a held-out estimate.
acc = clf.score(df[[f'f{i}' for i in range(10)]], df['y'])
acc

1.0

In [63]:
from sklearn.metrics import classification_report

# classification_report takes (y_true, y_pred) in that order. Passing the
# predictions first swaps the precision and recall columns and makes the
# "support" column count predictions instead of ground-truth labels.
# The rows scored here are the same rows clf was fitted on, so these are
# training-set metrics.
print(classification_report(
    df['y'],
    clf.predict(df[['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']]),
))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        57
           1       1.00      1.00      1.00        68
           2       1.00      1.00      1.00        62
           3       1.00      1.00      1.00        60
           4       1.00      1.00      1.00        74
           5       1.00      1.00      1.00        75
           6       1.00      1.00      1.00        64
           7       1.00      1.00      1.00        70
           8       1.00      1.00      1.00        68
           9       1.00      1.00      1.00        62
          10       1.00      1.00      1.00        62
          11       1.00      1.00      1.00        69
          12       1.00      1.00      1.00        72
          13       1.00      1.00      1.00        71
          14       1.00      1.00      1.00        66

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00   