In [53]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## 构造数据

In [54]:
data = pd.read_csv('./train.txt')

In [55]:
columns = data.columns.tolist()
categorical_features = [col for col in columns if 'C' in col]
data[categorical_features] = data[categorical_features].fillna('-1')
for col in categorical_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    
numerical_features = [col for col in columns if col not in categorical_features]
numerical_features.remove('label')
data[numerical_features] = data[numerical_features].fillna(0)

data_X = data.drop('label', axis=1)
data_y = data['label']

X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(X_train.head())

(1599, 39) (400, 39) (1599,) (400,)
       I1   I2    I3    I4        I5    I6   I7    I8    I9  I10  ...  C17  \
832   2.0   23  10.0  10.0    1431.0  12.0  3.0  11.0  11.0  0.0  ...    3   
1871  1.0    0   0.0   1.0     696.0   3.0  1.0   3.0   3.0  1.0  ...    7   
1704  0.0  295  60.0   2.0   65462.0   0.0  0.0   5.0   2.0  0.0  ...    5   
1031  0.0    7  19.0   4.0  775584.0   0.0  0.0   4.0   5.0  0.0  ...    0   
1694  0.0   -1  47.0  29.0   10383.0  66.0  3.0  30.0  66.0  0.0  ...    0   

      C18  C19  C20   C21  C22  C23  C24  C25  C26  
832   391    0    0  1134    5    1  215    0    0  
1871   85    0    0   515    0    0   13    0    0  
1704  181   23    1  1012    3    1  411    2  452  
1031  387    0    0     2    5    8  163    0    0  
1694  490    6    3  1038    0    0  175   27  150  

[5 rows x 39 columns]


## lgb来训练GBDT模型

In [42]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [61]:
params = {
    'task':'train',
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':'binary_logloss',
    'num_leaves':64,
    'num_trees':100,
    'learning_rate':0.01,
    'feature_fraction':0.9,
    'bagging_fraction':0.8,
    'bagging_freq':5,
    'verbose':0
}
#上面的参数中树的个数是100，每个树的叶子节点数量是64
num_leaf = 64
print('Start training...')
gbm = lgb.train(params, lgb_train, categorical_feature=categorical_features, num_boost_round=100, valid_sets=lgb_train)
print('Start predicting')
#默认是预测概率值，这里我们需要预测出每个样本在100棵树所落在的叶子节点编号，设置pred_leaf为True
y_pred = gbm.predict(X_train, pred_leaf=True)

print(np.array(y_pred).shape)
print(y_pred[:2])

#将每个样本对应的每棵树的叶子节点做one hot编码 并把所有树的结果拼接，即为由GBDT得到的每个样本新的特征
print('Writing transformed training data')
transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)# 样本数 * 树的棵树 * 每棵树的叶子节点数量
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_training_matrix[i][temp] += 1

Start training...




[1]	training's binary_logloss: 0.50466
[2]	training's binary_logloss: 0.502486
[3]	training's binary_logloss: 0.500383
[4]	training's binary_logloss: 0.498316
[5]	training's binary_logloss: 0.496369
[6]	training's binary_logloss: 0.494417
[7]	training's binary_logloss: 0.492455
[8]	training's binary_logloss: 0.490586
[9]	training's binary_logloss: 0.488737
[10]	training's binary_logloss: 0.486812
[11]	training's binary_logloss: 0.484954
[12]	training's binary_logloss: 0.483124
[13]	training's binary_logloss: 0.481161
[14]	training's binary_logloss: 0.479263
[15]	training's binary_logloss: 0.477422
[16]	training's binary_logloss: 0.475692
[17]	training's binary_logloss: 0.474016
[18]	training's binary_logloss: 0.472489
[19]	training's binary_logloss: 0.470922
[20]	training's binary_logloss: 0.46924
[21]	training's binary_logloss: 0.467636
[22]	training's binary_logloss: 0.46604
[23]	training's binary_logloss: 0.464538
[24]	training's binary_logloss: 0.462986
[25]	training's binary_loglo

In [78]:
#用训练好的lgb模型得到测试集的叶子节点特征
y_pred = gbm.predict(X_test, pred_leaf=True)
print('Writing transformed testing data')
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_testing_matrix[i][temp] += 1
print(pd.DataFrame(transformed_testing_matrix[0:1,:]))

Writing transformed testing data
   0  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  \
0  0  0  0  0  0  0  0  0  1  0   0   0   0   0   0   0   0   0   0   0   0   

   21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  \
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   

   40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  \
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   

   59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  \
0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   

   78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  \
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   

   97  98  99  100  101  102  103  104  105  106  107  108  109  110  111  \
0   0   0   0    0    0    0    0    0    0    0    0    0    0    0    0   

 

## 逻辑回归

In [63]:
#将GBDT新构建的特征输入逻辑回归模型进行训练和预测
lm = LogisticRegression(penalty='l2', C=0.05)
lm.fit(transformed_training_matrix, y_train)
y_pred_test = lm.predict_proba(transformed_testing_matrix)
print(y_pred_test[0:2])

[[0.94787744 0.05212256]
 [0.97778004 0.02221996]]


In [64]:
NE = (-1) / len(y_pred_test) * sum(((1+y_test)/2 * np.log(y_pred_test[:,1]) +  (1-y_test)/2 * np.log(1 - y_pred_test[:,1])))
print("Normalized Cross Entropy " + str(NE))

Normalized Cross Entropy 1.3705246197327232


In [66]:
#用auc评估预测结果
y_test_pred1 = lm.predict(transformed_testing_matrix)
print(y_test_pred1[0:2])
print('auc', roc_auc_score(y_test, y_test_pred1))

[0 0]
auc 0.7331002331002331
