In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score, log_loss
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [15]:
## 读取特征矩阵
with open('../data/temp.pkl', 'rb') as file:
    data = pickle.load(file)

### 分成不同的特征部分
is_feature = ['creative_is_download', 'creative_is_jump', 'creative_has_deeplink']
rate_feature = [i for i in data.columns if '_rate' in i]
cate_feature = ['f_channel',  'adid', 'inner_slot_id', 'os', 'advert_industry_inner', 'clear_model', 'devtype', 'clear_osv', \
                'creative_height', 'creative_type', 'sim_ip', 'advert_name', 'orderid', 'carrier', 'app_id', 'creative_id', \
                'creative_tp_dnf','app_cate_id', 'creative_width', 'clear_make', 'province', 'nnt', 'city', 'campaign_id'] + is_feature
ignore_feature = ['instance_id', 'click', 'period']
num_feature = [i for i in data.columns if '_num' in i] + ['hour'] + rate_feature

### 为了测量结果，使用Logloss衡量，且将训练集划分为：训练集+测试集+验证集(按照时间划分)
### period小于33的为训练集，period=33随即划分为验证集+测试集

In [16]:
total_train = data[data.click != -1]
train = total_train[total_train.period <= 32][num_feature + cate_feature+ignore_feature]
train_y = total_train[total_train.period <= 32]['click'] ##标签

val_and_test = total_train[total_train.period == 33][num_feature + cate_feature+ignore_feature]
val_and_test_y = total_train[total_train.period == 33]['click']
val, test, val_y, test_y = train_test_split(val_and_test, val_and_test_y, test_size=0.5, random_state=1024)

### 用gbdt训练类别型变量，得到叶子节点拼接类别型，最后使用LR模型

In [4]:
#用gbdt训练类别型变量，得到叶子节点拼接类别型，最后使用LR模型
# 模型部分
lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=48, max_depth=-1, learning_rate=0.05, n_estimators=350,
                           max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
                           min_child_weight=5, min_child_samples=150, subsample=0.8, subsample_freq=1,
                           colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, n_jobs=10, silent=True)

train_csr = train[num_feature]
val_csr = val[num_feature]
test_csr = test[num_feature]
train_y = train_y.values

#只提取最后100维数据
lgb_clf.fit(train_csr, train_y)
new_feature_train = lgb_clf.apply(train_csr)[:,-100:]   
new_feature_val = lgb_clf.apply(val_csr)[:,-100:]
new_feature_test = lgb_clf.apply(test_csr)[:,-100:]

In [17]:
### 重命名GBDT的结果
for i in range(new_feature_train.shape[1]):
    train['gbdt_'+str(i)] = new_feature_train[:, i]  
    val['gbdt_'+str(i)] = new_feature_val[:, i]  
    test['gbdt_'+str(i)] = new_feature_test[:, i]  

### 下面的视情况而定是否需要，也就是存储的全为离散值的特征矩阵（将数值特征转为类别特征）

In [20]:
# ###将结果拼接起来，并用pickle存储
# # train['click'], val['click'], test['click'] = train_y, val_y.values, test_y.values
# data = pd.concat((pd.concat((train, val), axis = 0), test), axis = 0)

# import pickle
# ##存储中间特征矩阵便于再次访问
# with open('../data/GBDT_LR_feature.pkl', 'wb') as file:
#     pickle.dump(data, file)

In [6]:
###拼接GBDT的结果的新的类别变量
cate_feature = cate_feature + [i for i in data.columns if 'gbdt_' in i]

### CTR预估常用方法，转换为One-hot高维稀疏数据，为了节省内存，使用CSR矩阵存储

In [7]:
total_data = pd.concat((train, val, test), axis = 0)
base_train_csr = sparse.csr_matrix((len(train), 0))
base_val_csr = sparse.csr_matrix((len(val), 0))
base_test_csr = sparse.csr_matrix((len(test), 0))

enc = OneHotEncoder()
for feature in cate_feature:
    enc.fit(total_data[feature].values.reshape(-1, 1))
    base_train_csr = sparse.hstack((base_train_csr, enc.transform(train[feature].values.reshape(-1, 1))), 'csr', 'bool')
    base_val_csr = sparse.hstack((base_val_csr, enc.transform(val[feature].values.reshape(-1, 1))),'csr', 'bool')
    base_test_csr = sparse.hstack((base_test_csr, enc.transform(test[feature].values.reshape(-1, 1))),'csr', 'bool')
print('one-hot prepared !')

one-hot prepared !


### LR调参过程

In [10]:
#调参C
from sklearn.linear_model import LogisticRegression

print('训练集shape', base_train_csr.shape, '验证集shape', base_val_csr.shape, '测试集shape', base_test_csr.shape)
#使用验证集调参
for c in [0.05, 0.1, 0.001, 0.01, 0.2, 0.005]:
    print(c)
    model = LogisticRegression(C = c, verbose = 10)#C = 5
    model.fit(base_train_csr, train_y)
    
    train_pred= model.predict_proba(base_val_csr)[:, 1]
    print('得到epcoh参数的过程loss', log_loss(val_y, train_pred))
    print('\n')

训练集shape (844439, 6751) 验证集shape (78605, 6751) 测试集shape (78606, 6751)
0.05
[LibLinear]得到epcoh参数的过程loss 0.42878968705618953


0.1
[LibLinear]得到epcoh参数的过程loss 0.4287613127930187


0.001
[LibLinear]得到epcoh参数的过程loss 0.4332894429289333


0.01
[LibLinear]得到epcoh参数的过程loss 0.4292464932493843


0.2
[LibLinear]得到epcoh参数的过程loss 0.4287804222512716


0.005
[LibLinear]得到epcoh参数的过程loss 0.42979633676820933




In [11]:
### 用上面最好的参数c=0.1来训练模型并做验证
model = LogisticRegression(C = 0.1, verbose = 10)#C = 5
model.fit(base_train_csr, train_y)

train_pred= model.predict_proba(base_test_csr)[:, 1]
print('测试集Logloss ', log_loss(test_y, train_pred))

[LibLinear]测试集Logloss  0.4254597101770659
