In [1]:
import pandas as pd 
import numpy as np
df = pd.read_csv('data/car.csv')
df_1 = df[df['target']==1]
df_0 = df[df['target']==0].sample(n=len(df_1))
df = pd.concat([df_0,df_1],axis=0)
msk = np.random.rand(len(df)) < 0.8
df_train = df[msk]
df_test = df[~msk]
NUMERIC_COLS = [
    "ps_reg_01", "ps_reg_02", "ps_reg_03",
    "ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
]
print(df_train.shape)
df_train.head(10)

(6999, 59)


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
5705,1121103,0,1,1,4,1,0,0,0,1,...,2,2,0,4,0,1,0,1,0,0
23952,930830,0,3,4,9,0,0,0,0,1,...,5,0,4,12,0,1,0,1,0,0
10345,123596,0,4,1,6,0,4,1,0,0,...,7,1,5,3,0,0,0,0,0,0
74920,72801,0,0,2,0,0,0,0,1,0,...,3,3,2,7,0,0,1,1,0,0
98696,854474,0,4,2,1,0,0,1,0,0,...,5,4,2,10,0,1,1,0,0,0
61797,1073965,0,2,2,2,1,0,0,0,0,...,3,1,0,4,1,1,0,0,1,0
56152,460902,0,5,1,4,0,0,0,0,0,...,4,2,1,6,0,1,1,0,0,1
60957,692878,0,1,3,3,0,0,0,0,0,...,3,2,6,8,0,1,0,0,0,0
66691,1282475,0,0,1,6,0,0,0,1,0,...,10,2,2,9,0,1,0,0,0,0
22508,1136075,0,1,1,2,0,0,0,0,0,...,3,2,2,12,0,1,0,1,1,0


In [2]:
y_train = df_train['target']  # training label
y_test = df_test['target']  # training label
X_train = df_train[NUMERIC_COLS]  # training dataset
X_test = df_test[NUMERIC_COLS]  # testing dataset

In [3]:
import lightgbm as lgb 
lgb_train = lgb.Dataset(X_train,y_train)
lgb_eval = lgb.Dataset(X_test,y_test,reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 64,
    "num_trees": 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

num_leaf = 64
print('start training...')
# train GBDT
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_train)

start training...
You can set `force_col_wise=true` to remove the overhead.
[1]	training's binary_logloss: 0.692401
[2]	training's binary_logloss: 0.691612
[3]	training's binary_logloss: 0.690827
[4]	training's binary_logloss: 0.69007
[5]	training's binary_logloss: 0.689337
[6]	training's binary_logloss: 0.688705
[7]	training's binary_logloss: 0.687967
[8]	training's binary_logloss: 0.687355
[9]	training's binary_logloss: 0.686629
[10]	training's binary_logloss: 0.68591
[11]	training's binary_logloss: 0.685201
[12]	training's binary_logloss: 0.684508
[13]	training's binary_logloss: 0.68382
[14]	training's binary_logloss: 0.683114
[15]	training's binary_logloss: 0.682437
[16]	training's binary_logloss: 0.681817
[17]	training's binary_logloss: 0.681197
[18]	training's binary_logloss: 0.680594
[19]	training's binary_logloss: 0.679979
[20]	training's binary_logloss: 0.679369
[21]	training's binary_logloss: 0.678734
[22]	training's binary_logloss: 0.678072
[23]	training's binary_logloss: 0.

In [4]:
print('Save model...')
#gbm.save_model('model.txt')

print('Start predicting...')
y_pred = gbm.predict(X_train,pred_leaf=True)
print(np.array(y_pred).shape)
print(y_pred[0])

Save model...
Start predicting...
(6999, 100)
[43 12 39 12 62 57 48 57 58 35 16 16 16 62 16 60 46 46 50 33  9 17  9 12
  6 17 57 56 40 33 37 15  1 15 35 44 22  7 22 43 13 51 25 40 62 14 11 54
 11 32 25 17 17 15 44 17 28 17 48 21  4 58 23 52 35 29 24 23 23 28 37 21
 17 17 13 30 30  3 30 33 16 27 16 62 15 63 55  6 56 40 12 40 44 44 14 23
 23 44 26 33]


In [5]:
print('Writing transformed training data')
transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
                                       dtype=np.int64)  # N * num_tress * num_leafs
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_training_matrix[i][temp] += 1

Writing transformed training data


In [6]:
# 对测试集进行tree的转变
y_pred = gbm.predict(X_test, pred_leaf=True)
print('Writing transformed testing data')
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
    transformed_testing_matrix[i][temp] += 1

Writing transformed testing data


In [7]:
from sklearn.linear_model import LogisticRegression
lm = LogisticRegression(penalty='l2',C=0.05) # logestic model construction
lm.fit(transformed_training_matrix,y_train)  # fitting the data
y_pred_test = lm.predict_proba(transformed_testing_matrix)   # Give the probabilty on each label

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_test[:,1]>0.5))
y_pred = gbm.predict(X_test,pred_leaf=False)
print(classification_report(y_test,y_pred>0.5))

              precision    recall  f1-score   support

           0       0.56      0.52      0.54       865
           1       0.54      0.57      0.55       840

    accuracy                           0.55      1705
   macro avg       0.55      0.55      0.55      1705
weighted avg       0.55      0.55      0.55      1705

              precision    recall  f1-score   support

           0       0.58      0.57      0.57       865
           1       0.56      0.57      0.56       840

    accuracy                           0.57      1705
   macro avg       0.57      0.57      0.57      1705
weighted avg       0.57      0.57      0.57      1705

