# GBDT + LR

在这个组合中，GBDT 完全被用作特征工程组件。每一条数据 $x$ 输入 GBDT 后，会在每棵树中落入（激活）恰好一个叶子节点，该树据此输出一个长度等于其叶子节点数的 one-hot 向量。例如，若某棵树有 4 个叶子节点，且最左边的叶子节点被激活，则该树输出向量 $[1, 0, 0, 0]$；同样地，GBDT 中的下一棵树可能是最右边的叶子节点被激活（假设这棵树仍然有 4 个叶子节点），那么这棵树输出向量 $[0, 0, 0, 1]$。输入数据这样经过 GBDT 处理后，可以得到一组向量，把这些向量拼接起来，作为 LR 的输入。

在这个特征组合方案中，GBDT 和 LR 模型是分开训练的，所以训练过程相对比较简单。唯一需要额外处理的，是获取经过 GBDT 处理后的特征。


In [1]:
import os
import numpy as np
from scipy.sparse import coo_matrix

BASEDIR = os.getcwd()


def _read_sparse_samples(path):
    """Parse a ``label field:feature:value ...`` text file into COO triplets.

    Each non-blank line describes one sample: the first whitespace-separated
    token is the integer label, and every following token must split on ``:``
    into exactly three parts (field, feature index, value).

    Args:
        path: Path of the text file to read.

    Returns:
        A ``(rows, cols, values, labels)`` tuple of lists. ``rows[k]``,
        ``cols[k]`` and ``values[k]`` describe the k-th non-zero entry, and
        ``labels[i]`` is the label of sample ``i``.

    Raises:
        ValueError: If a label, feature index or value cannot be parsed, or a
            token does not have exactly three ``:``-separated parts.
    """
    rows, cols, values, labels = [], [], [], []
    with open(path) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Skip blank lines instead of failing on int('').
                continue
            sample = len(labels)  # row index of the sample being parsed
            labels.append(int(tokens[0]))
            for token in tokens[1:]:
                # The field id is parsed but not used: only the feature index
                # and its value end up in the matrix.
                _field, feature, value = token.split(':')
                rows.append(sample)
                cols.append(int(feature))
                values.append(float(value))
    return rows, cols, values, labels


def _to_coo(rows, cols, values, n_rows, n_cols):
    """Build an ``n_rows x n_cols`` float ``coo_matrix`` from COO triplets."""
    # Explicit integer dtypes keep the index arrays valid even when empty.
    indices = (np.array(rows, dtype=int), np.array(cols, dtype=int))
    return coo_matrix((np.array(values, dtype=float), indices),
                      shape=(n_rows, n_cols), dtype=float)


train_rows, train_cols, train_values, train_labels = _read_sparse_samples(
    BASEDIR + '/assets/datasets/criteo_ctr/small_train.txt')
test_rows, test_cols, test_values, test_labels = _read_sparse_samples(
    BASEDIR + '/assets/datasets/criteo_ctr/small_test.txt')

# Largest feature index seen in EITHER file (-1 when there are no features).
# Both matrices must share this width: sizing X_train from the training file
# alone gives X_test more columns whenever the test file contains a higher
# feature index, and the fitted model then rejects X_test.
col_cnt = max([-1] + train_cols + test_cols)

X_train = _to_coo(train_rows, train_cols, train_values,
                  len(train_labels), col_cnt + 1)
y_train = np.array(train_labels)

X_test = _to_coo(test_rows, test_cols, test_values,
                 len(test_labels), col_cnt + 1)
y_test = np.array(test_labels)

In [2]:
# sklearn version
from sklearn.ensemble import GradientBoostingClassifier

# Number of boosted trees; reused later to reshape the leaf-index output.
n_estimators = 50

# Fit the GBDT on the raw features (fit() returns the estimator itself).
gbm = GradientBoostingClassifier(
    n_estimators=n_estimators,
    max_depth=5,
    min_samples_split=4,
    subsample=0.6,
    random_state=10,
).fit(X_train, y_train)

# From here on X_train / X_test hold, for every sample, the index of the leaf
# it falls into in each tree, replacing the raw feature matrices.
X_train, X_test = gbm.apply(X_train), gbm.apply(X_test)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Flatten the apply() output to 2-D with one leaf-index column per tree.
X_train, X_test = (
    leaves.reshape(-1, n_estimators) for leaves in (X_train, X_test)
)

# Learn the leaf-index categories from the training split only, then expand
# both splits into dense one-hot matrices with the same encoder.
ohe = OneHotEncoder()
ohe.fit(X_train)
X_train, X_test = (
    np.array(ohe.transform(leaves).toarray()) for leaves in (X_train, X_test)
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression on the one-hot leaf features produced by the GBDT
# (fit() returns the estimator itself).
lr = LogisticRegression().fit(X_train, y_train)

# score() is the mean accuracy on the held-out split.
accuracy = lr.score(X_test, y_test)
print('Accuracy of GBDT + LR: %f' % accuracy)