In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
import cPickle
import patsy

Estimators support arguments to control the fitting behaviour -- these arguments are often called hyperparameters. Among the most important ones for GBRT are:

* number of regression trees (n_estimators)
* depth of each individual tree (max_depth)
* loss function (loss)
* learning rate (learning_rate)

For example, if you want to fit a regression model with 100 trees of depth 3 using least-squares: `GradientBoostingRegressor(n_estimators=100, max_depth=3, loss='ls')`.

### [GBDT（MART）概念简介](http://www.cnblogs.com/downtjs/p/3286165.html)

![](http://img.blog.csdn.net/20140502093849718)

目前GBDT有两个不同的描述版本，两者各有支持者，读文献时要注意区分。残差版本把GBDT说成一个残差迭代树，认为每一棵回归树都在学习前N-1棵树的残差，之前我写的GBDT入门教程主要在描述这一版本，ELF开源软件实现中用的也是这一版本。Gradient版本把GBDT说成一个梯度迭代树，使用梯度下降法求解，认为每一棵回归树在学习前N-1棵树的梯度下降值，之前leftnoteasy的博客中介绍的为此版本，umass的源码实现中用的则是这一版本（准确的说是LambdaMART中的MART为这一版本，MART实现则是前一版本）。

 

对GBDT无基础的朋友可以先分别看一下前面两篇博文教程。总的来说两者相同之处在于，都是迭代回归树，都是累加每棵树结果作为最终结果（Multiple Additive Regression Tree），每棵树都在学习前N-1棵树尚存的不足，从总体流程和输入输出上两者是没有区别的；两者的不同主要在于每步迭代时，是否使用Gradient作为求解方法。前者不用Gradient而是用残差——残差是全局最优值，Gradient是局部最优方向*步长，即前者每一步都在试图让结果变成最好，后者则每步试图让结果更好一点。

 

两者优缺点。看起来前者更科学一点--有绝对最优方向不学，为什么舍近求远去估计一个局部最优方向呢？原因在于灵活性。前者最大问题是，由于它依赖残差，cost function一般固定为反映残差的均方差，因此很难处理纯回归问题之外的问题。而后者求解方法为梯度下降，只要可求导的cost function都可以使用，所以用于排序的LambdaMART就是用的后者。

## Hyperparameter tuning

网格搜索太耗时间了（12 小时以上），所以下面的代码已被注释掉。

In [2]:
# Disabled grid search over GradientBoostingClassifier hyperparameters
# (learning_rate, max_depth, min_samples_leaf) with 500 trees per candidate.
# The notebook text just before this cell reports that it took 12h+ to run.
# NOTE(review): `train_data` is not defined anywhere in the visible notebook
# (its assignment is itself commented out in the training cell) -- restore it
# before re-enabling this cell.
# X = train_data[0:, 1:]
# y = train_data[0:, 0]

# param_grid = {
#     'learning_rate': [0.1, 0.05],
#     'max_depth': [4, 6],
#     'min_samples_leaf': [3, 9, 17],
#     #'max_features': [1.0, 0.3, 0.1]
# }
# est = GradientBoostingClassifier(n_estimators=500)
# gs_cv = GridSearchCV(est, param_grid).fit(X, y)
# # best hyperparameter setting
# gs_cv.best_params_

## 训练数据

In [None]:
# Load the training table; the training cell builds its patsy design matrices
# from this frame (and deletes it afterwards).
train_set = pd.read_csv('data/train_set/1212train_set.csv')

## 训练

In [None]:
# Model formula in patsy syntax: `buy` is the response; the predictors are the
# f3_*, f1_* and f2_1 columns plus the ratio f3_4 / (f2_1 + 0.01) (the 0.01
# offset presumably guards against division by zero).
# A further candidate term, np.true_divide(f2_1, f2_2), is currently left out.
formula = 'buy ~ f3_1 + f3_2 + f3_3 + f3_4 + \
                 f3_5 + f3_6 + f3_7 + f3_8 + \
                 f1_1_3 + f1_1_7 + f1_2_3 + f1_2_7 + f1_3_3 + f1_3_7 + \
                 f2_1 + np.true_divide(f3_4, f2_1+0.01)'

# Let patsy split the frame into a response frame and a design-matrix frame.
y, x = patsy.dmatrices(formula, data=train_set, return_type='dataframe')

# The raw frame is not needed past this point. Note that this makes the cell
# non-rerunnable on its own: re-run the loading cell first.
del train_set

# Convert to plain ndarrays: skip the first design-matrix column (patsy puts
# its Intercept term there by default) and take the single response column.
x = x.values[:, 1:]
y = y.values[:, 0]

# Optional warm start from a previously pickled 200-tree model (disabled).
# See http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# with open("models/f16_gbrt 1212[{'n_estimators': 200, 'loss': \
# 'exponential', 'learning_rate': 0.02, 'max_depth': 10, 'min_samples_leaf': 4}].pickle") as f:
#     gbrt_200 = cPickle.load(f)
#
# models = {
#     gbrt_200: 200,
# }

# Tuning rule of thumb for GBRT: fix the number of trees first, then sweep the
# remaining parameters in steps of roughly 3x -- a good setting is usually easy
# to find that way -- and finally raise the number of trees according to the
# results observed online.
# Hyperparameters for this run.
params = {
    'init': None, #gbrt_200,
    #'loss': 'exponential',
    'n_estimators': 700,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'learning_rate': 0.1,
}

# Only the four tuned values are forwarded; `init` and `loss` are deliberately
# left at the estimator's defaults for this run.
est = GradientBoostingClassifier(
    learning_rate    = params['learning_rate'],
    min_samples_leaf = params['min_samples_leaf'],
    max_depth        = params['max_depth'],
    n_estimators     = params['n_estimators']
)
# Last expression of the cell, so the fitted estimator's repr is displayed.
est.fit(x, y)

In [None]:
# Persist the fitted estimator. The file name embeds the repr of the
# hyperparameter dict so that different runs do not overwrite each other.
mparams = dict(params)
# `init` is dropped from the name. When warm-starting from a saved model, the
# base model's tree count would be added to n_estimators instead:
#mparams['n_estimators'] += models[mparams.pop('init')]
mparams.pop('init')
# Open in binary mode: in text mode Windows rewrites newline bytes, which makes
# the pickle unreadable when it is later opened in binary mode or on another
# platform. The default pickle protocol is kept, so existing loaders still work.
with open('models/f16_gbrt 1212[%s].pickle'%repr(mparams), 'wb') as f:
    cPickle.dump(est, f)

## 预测

In [None]:
# Load the feature table to score.
test_set = pd.read_csv('data/test_set/1212test_set.csv')
# The formula names `buy` as its response, so the frame needs such a column
# for patsy to evaluate it; fill it with a placeholder of zeros.
test_set['buy'] = 0

# Same formula text as the training cell, so the design-matrix columns line up
# with what the estimator was fitted on.
formula = 'buy ~ f3_1 + f3_2 + f3_3 + f3_4 + \
                 f3_5 + f3_6 + f3_7 + f3_8 + \
                 f1_1_3 + f1_1_7 + f1_2_3 + f1_2_7 + f1_3_3 + f1_3_7 + \
                 f2_1 + np.true_divide(f3_4, f2_1+0.01)'

# Build response and design-matrix frames from the test table.
y_test, x_test = patsy.dmatrices(formula, data=test_set, return_type='dataframe')

# Plain ndarrays: skip the first design-matrix column (patsy's default
# Intercept term) and take the single (placeholder) response column.
x_test = x_test.values[0:, 1:]
y_test = y_test.values[0:, 0]

# Per-class probabilities from the fitted estimator, one row per test row.
output_prob = est.predict_proba(x_test)

In [None]:
# Bare expression: the notebook shows the estimator's repr as this cell's output.
est

In [None]:
# Key table the scores are attached to.
# NOTE(review): presumably holds the user_id/item_id pairs in the same row
# order as the test feature table -- confirm, since the scores are attached
# purely by position.
predict_set = pd.read_csv("data/test_set/1212ui_set.csv")

print('predicting ...')
# Second probability column, i.e. the score of the estimator's second class
# (presumably buy == 1 -- verify against est.classes_).
predict_set['buy'] = output_prob[:, 1]
print('predict done')

# Output file name records the tree count and the tree depth of this run.
predict_set.to_csv(
    "data/output/gbrt/f16_predict_set 1212 %d %d.csv" % (
        params['n_estimators'],
        params['max_depth']
    ),
    index=False
)

## 过滤

In [None]:
# Disabled post-filtering step: it would reload a saved prediction file, sort
# it by the `buy` score in descending order, keep the (user_id, item_id)
# columns of the first 421 rows and write them out as the submission CSV.
# NOTE(review): the file read here ('predict_set.csv') is not the name the
# export cell writes ('f16_predict_set 1212 <n> <d>.csv') -- fix the path
# before re-enabling.
# NOTE(review): DataFrame.sort(columns=...) exists only in old pandas releases;
# newer ones spell it sort_values(by=...).
# predict_set = pd.read_csv('data/output/gbrt/predict_set.csv')

# recomm_set = predict_set.sort(columns=['buy'], ascending=False)[:421][['user_id','item_id']]
# recomm_set.to_csv("data/output/gbrt/tianchi_mobile_recommendation_predict.csv", index=False)

In [None]:
# Completion marker: printed once every queued cell above has finished.
print('ok')

In [None]:
# Completion marker: printed once every queued cell above has finished.
print('ok')

In [11]:
# Completion marker: printed once every queued cell above has finished.
print('ok')

ok
