In [1]:
# CTR prediction on a small Criteo sample using a GBDT + LR model

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import lightgbm as lgb
from math import log

In [3]:
# Load the small Criteo sample; the path is relative, so it resolves against
# the notebook's current working directory.
data = pd.read_csv("../jupyter_files/criteo.tiny.csv")

In [4]:
# Preview the raw frame.
data.head() # I1..I13 are numerical features, C1..C26 are categorical features

Unnamed: 0,Id,Label,I1,I2,I3,I4,I5,I6,I7,I8,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,10000000,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,10000001,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,10000002,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,10000003,0,,893,,,4392.0,,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,10000004,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [5]:
# Compare mean and median of I1 as a quick skewness check.
print(data['I1'].mean())                   # numerical feature is right-skewed (mean well above median)
print(data['I1'].median())

3.6963963963963966
1.0


In [6]:
# Column names of the numerical (I1..I13) and categorical (C1..C26) features
numerical_features = ['I{}'.format(idx) for idx in range(1, 14)]
categorical_features = ['C{}'.format(idx) for idx in range(1, 27)]

In [7]:
# Label column
target = 'Label'

# Class balance of the label (the last expression is displayed as the cell output)
data[target].value_counts()

0    1581
1     418
Name: Label, dtype: int64

In [8]:
# Fill missing numerical values with the median of their own column.
# DataFrame.median() yields one median per column, and fillna() with that
# Series fills each column with its matching entry.
data[numerical_features] = data[numerical_features].fillna(
    data[numerical_features].median()
)

In [9]:
# Fill missing categorical values with the sentinel string "-1".
# The sentinel is a string (not the integer -1) so each categorical column
# keeps a single value type; the one-hot column it later produces is still
# named "<col>_-1".
data[categorical_features] = data[categorical_features].fillna('-1')

In [10]:
# One-hot encode every categorical feature in a single call.
# get_dummies(columns=...) removes the listed columns and appends their
# indicator columns (named "<col>_<value>") in list order, which is the same
# layout as dropping and concatenating column by column, but without
# re-copying the ever-wider frame on each iteration.
data = pd.get_dummies(data, columns=categorical_features)

In [11]:
# Preview the frame after imputation and one-hot encoding.
data.head()

Unnamed: 0,Id,Label,I1,I2,I3,I4,I5,I6,I7,I8,...,C26_fb7edec8,C26_fbe10aa8,C26_fcd456fa,C26_fcd5a3f4,C26_fd6ccd1e,C26_fdd86175,C26_fe7d4d4a,C26_ff2cdc2b,C26_ff86d5e0,C26_ffc123e9
0,10000000,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,...,0,0,0,0,0,0,0,0,0,0
1,10000001,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,10000002,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,...,0,0,0,0,0,0,0,0,0,0
3,10000003,0,1.0,893,6.0,4.0,4392.0,43.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,10000004,0,3.0,-1,6.0,0.0,2.0,0.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Split into training and test sets.
# Features and label are selected by column name rather than by position, so
# the split stays correct if the column order of `data` ever changes.
# 'Id' is a row identifier, not a feature, and is excluded together with the label.
X = data.drop(columns=['Id', target])
y = data[target]

# Fixed random_state: later cells reuse the same seed and test_size so that
# every model is evaluated on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2020)

# 构建LGBM模型（gbdt的一种实现）

In [13]:
# Gradient-boosted tree classifier: 30 shallow trees with a small learning
# rate, light L1/L2 regularisation and 90% column sampling per tree.
model = lgb.LGBMClassifier(
    n_estimators=30,
    max_depth=6,
    learning_rate=0.01,
    colsample_bytree=0.9,
    reg_alpha=0.01,
    reg_lambda=0.01,
)

In [14]:
# Fit the GBDT, tracking log loss on both splits and stopping early when the
# 'validation' loss has not improved for 5 consecutive rounds.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of fit() was removed in LightGBM 4.0,
# while the callback is accepted by older releases as well.
# NOTE(review): the 'validation' set here is the held-out test split, so the
# reported validation loss is not an untouched test estimate.
model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_names=['train', 'validation'],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

[1]	train's binary_logloss: 0.506816	validation's binary_logloss: 0.523611
Training until validation scores don't improve for 5 rounds.
[2]	train's binary_logloss: 0.504888	validation's binary_logloss: 0.522464
[3]	train's binary_logloss: 0.503232	validation's binary_logloss: 0.521606
[4]	train's binary_logloss: 0.501502	validation's binary_logloss: 0.520421
[5]	train's binary_logloss: 0.499698	validation's binary_logloss: 0.519289
[6]	train's binary_logloss: 0.498049	validation's binary_logloss: 0.51824
[7]	train's binary_logloss: 0.496315	validation's binary_logloss: 0.517258
[8]	train's binary_logloss: 0.494605	validation's binary_logloss: 0.516273
[9]	train's binary_logloss: 0.493009	validation's binary_logloss: 0.515382
[10]	train's binary_logloss: 0.491428	validation's binary_logloss: 0.51448
[11]	train's binary_logloss: 0.489693	validation's binary_logloss: 0.513589
[12]	train's binary_logloss: 0.488142	validation's binary_logloss: 0.512803
[13]	train's binary_logloss: 0.486642	

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
               importance_type='split', learning_rate=0.01, max_depth=6,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=30, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.01, reg_lambda=0.01, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# 构建gbdt+lr模型1（只使用gbdt生成的特征）

In [15]:
# Build new features from the leaf each sample falls into: with
# pred_leaf=True the model returns leaf indices instead of probabilities
# (one column per tree, as the shape check in the next cell shows).
gbdt_feats = model.predict(X, pred_leaf=True)

In [16]:
# Sanity check: (number of samples, number of trees)
gbdt_feats.shape

(1999, 30)

In [17]:
# One column per tree, named gbdt_tree_<tree index>
gbdt_feats_name = ['gbdt_tree_{}'.format(tree_idx) for tree_idx in range(gbdt_feats.shape[1])]

# Wrap the leaf-index matrix in a DataFrame so it can be one-hot encoded
data1 = pd.DataFrame(data=gbdt_feats, columns=gbdt_feats_name)

In [18]:
# Preview the leaf index assigned to each sample by each tree.
data1.head()

Unnamed: 0,gbdt_tree_0,gbdt_tree_1,gbdt_tree_2,gbdt_tree_3,gbdt_tree_4,gbdt_tree_5,gbdt_tree_6,gbdt_tree_7,gbdt_tree_8,gbdt_tree_9,...,gbdt_tree_20,gbdt_tree_21,gbdt_tree_22,gbdt_tree_23,gbdt_tree_24,gbdt_tree_25,gbdt_tree_26,gbdt_tree_27,gbdt_tree_28,gbdt_tree_29
0,12,12,0,15,12,12,12,11,17,11,...,10,18,18,18,16,17,14,14,1,18
1,23,23,19,15,22,18,23,23,23,20,...,19,22,12,26,21,12,11,4,0,20
2,9,11,14,21,11,10,2,9,9,9,...,15,11,2,25,2,9,12,3,3,2
3,1,17,0,5,21,15,17,17,16,17,...,16,15,11,1,13,14,21,2,19,12
4,0,0,1,26,0,0,0,0,0,0,...,0,0,0,0,0,0,0,22,0,0


In [19]:
# One-hot encode the leaf indices of every tree in a single call.
# Listing the columns explicitly makes get_dummies encode them even though
# they hold integers; each indicator is named "gbdt_tree_<tree>_<leaf>" and
# the columns come out in tree order, without re-copying the frame per tree.
data1 = pd.get_dummies(data1, columns=gbdt_feats_name)

In [20]:
# Preview the one-hot encoded leaf features.
data1.head()

Unnamed: 0,gbdt_tree_0_0,gbdt_tree_0_1,gbdt_tree_0_2,gbdt_tree_0_3,gbdt_tree_0_4,gbdt_tree_0_5,gbdt_tree_0_6,gbdt_tree_0_7,gbdt_tree_0_8,gbdt_tree_0_9,...,gbdt_tree_29_13,gbdt_tree_29_14,gbdt_tree_29_15,gbdt_tree_29_16,gbdt_tree_29_17,gbdt_tree_29_18,gbdt_tree_29_19,gbdt_tree_29_20,gbdt_tree_29_21,gbdt_tree_29_22
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Model 1 uses only the GBDT leaf features; the label is taken from `data`
# by name rather than by column position.
X1 = data1
y1 = data[target]

# Same test_size and random_state as the base split, so the same rows end up
# in the test set.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.25, random_state=2020)

In [22]:
# Strongly regularised logistic regression (small C = strong L2 penalty).
# The solver is pinned to 'liblinear': the recorded run shows solver='warn',
# i.e. the old library default, which scikit-learn documents as liblinear
# before the default changed to lbfgs in 0.22. Pinning it keeps the result
# independent of the installed scikit-learn version.
lr = LogisticRegression(C=0.008, solver='liblinear')

In [23]:
# Train the LR on the one-hot leaf features of the training rows.
lr.fit(X_train1, y_train1)



LogisticRegression(C=0.008, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
# Log loss on the held-out rows; column 1 of predict_proba is the
# probability of the second class (label 1 here).
valid_loss = log_loss(y_test1, lr.predict_proba(X_test1)[:, 1])

In [25]:
# NOTE(review): 0.5003 is presumably the GBDT-only validation log loss; its
# final value is cut off in the recorded training log.
print(valid_loss)       # loss drops from 0.5003 to 0.4833

0.483308318963361


# 构建gbdt+lr模型2（数值特征+类别特征+gbdt生成的特征）

In [26]:
# Numerical feature preprocessing: damp large values with a natural log,
# then rescale each column to the [0, 1] range.
# NOTE(review): the damping is not monotonic (2 stays 2 while 3 becomes
# log(3) ~= 1.10), and the scaler is fitted on all rows before the
# train/test split below -- confirm both are intended.
scaler = MinMaxScaler()
for col in numerical_features:
    data[col] = scaler.fit_transform(
        data[col]
        .map(lambda value: log(value) if value > 2 else value)  # log only for values above 2
        .values.reshape(-1, 1)  # the scaler expects a 2-D (n_samples, 1) array
    )

In [27]:
# Preview the frame after transforming and rescaling the numerical features.
data.head()

Unnamed: 0,Id,Label,I1,I2,I3,I4,I5,I6,I7,I8,...,C26_fb7edec8,C26_fbe10aa8,C26_fcd456fa,C26_fcd5a3f4,C26_fd6ccd1e,C26_fdd86175,C26_fe7d4d4a,C26_ff2cdc2b,C26_ff86d5e0,C26_ffc123e9
0,10000000,0,0.219593,0.273472,0.177981,0.0,0.522847,0.164213,0.365293,0.317236,...,0,0,0,0,0,0,0,0,0,0
1,10000001,0,0.439186,0.182315,0.418478,0.223919,0.334401,0.24632,0.269783,0.317236,...,0,0,0,0,0,0,0,0,0,0
2,10000002,0,0.439186,0.182315,0.110586,0.590934,0.480274,0.531701,0.186999,0.317236,...,0,0,0,0,0,0,0,0,0,0
3,10000003,0,0.219593,0.801691,0.198143,0.310417,0.606448,0.445532,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,10000004,0,0.241248,0.091157,0.198143,0.0,0.144607,0.0,0.148193,0.0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Append the GBDT-generated leaf features as extra columns.
# concat(axis=1) aligns rows on the index, so `data` and `data1` must share
# the same row index.
data2 = pd.concat([data, data1], axis=1)

In [29]:
# Preview the combined frame (original features + GBDT leaf features).
data2.head()

Unnamed: 0,Id,Label,I1,I2,I3,I4,I5,I6,I7,I8,...,gbdt_tree_29_13,gbdt_tree_29_14,gbdt_tree_29_15,gbdt_tree_29_16,gbdt_tree_29_17,gbdt_tree_29_18,gbdt_tree_29_19,gbdt_tree_29_20,gbdt_tree_29_21,gbdt_tree_29_22
0,10000000,0,0.219593,0.273472,0.177981,0.0,0.522847,0.164213,0.365293,0.317236,...,0,0,0,0,0,1,0,0,0,0
1,10000001,0,0.439186,0.182315,0.418478,0.223919,0.334401,0.24632,0.269783,0.317236,...,0,0,0,0,0,0,0,1,0,0
2,10000002,0,0.439186,0.182315,0.110586,0.590934,0.480274,0.531701,0.186999,0.317236,...,0,0,0,0,0,0,0,0,0,0
3,10000003,0,0.219593,0.801691,0.198143,0.310417,0.606448,0.445532,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,10000004,0,0.241248,0.091157,0.198143,0.0,0.144607,0.0,0.148193,0.0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Model 2 uses numerical + categorical + GBDT leaf features.
# Features and label are selected by column name rather than by position;
# 'Id' is a row identifier and is excluded together with the label.
X2 = data2.drop(columns=['Id', target])
y2 = data2[target]

# Same test_size and random_state as the earlier splits, so the same rows
# end up in the test set.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.25, random_state=2020)

In [31]:
# Strongly regularised logistic regression for the combined feature set.
# The solver is pinned to 'liblinear': the recorded run shows solver='warn',
# i.e. the old library default, which scikit-learn documents as liblinear
# before the default changed to lbfgs in 0.22. Pinning it keeps the result
# independent of the installed scikit-learn version.
lr = LogisticRegression(C=0.007, solver='liblinear')

In [32]:
# Train the LR on the combined features of the training rows.
lr.fit(X_train2, y_train2)



LogisticRegression(C=0.007, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
# Log loss on the held-out rows; column 1 of predict_proba is the
# probability of the second class (label 1 here).
valid_loss = log_loss(y_test2, lr.predict_proba(X_test2)[:, 1])

In [34]:
print(valid_loss)     # loss drops from 0.4833 to 0.4783

0.47836701846031676
