In [4]:
#coding:utf-8
#导入warnings包，利用过滤器来实现忽略警告语句。
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno


In [34]:
train_data = pd.read_csv('train_tree.csv', sep = ' ')
test_data = pd.read_csv('test_tree.csv', sep = ' ')

In [31]:
test_data = test_data.drop('price', axis = 1)

In [35]:
# copy from https://tianchi.aliyun.com/notebook-ai/detail?spm=5176.12586969.1002.9.1cd81b43uSLnlN&postId=95460
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum()
    print('内存占用{:.2f} MB'.format(start_mem/1024/1024))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum()
    print('优化后内存为: {:.2f} MB'.format(end_mem/1024/1024))
    print('内存使用减少 {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [36]:
train_data = reduce_mem_usage(train_data)

内存占用97.27 MB
优化后内存为: 25.03 MB
内存使用减少 74.3%


In [38]:
train_label = train_data['price']

In [None]:
# 网格调参跟随机调参都比较慢，所以我这里直接采用贝叶斯调参 + LGBM
# 主要代码借鉴https://github.com/datawhalechina/team-learning-data-mining/blob/master/HeartbeatClassification/Task4%20%E6%A8%A1%E5%9E%8B%E8%B0%83%E5%8F%82.md

### 贝叶斯调参
贝叶斯调参的主要思想是：给定优化的目标函数(广义的函数，只需指定输入和输出即可，无需知道内部结构以及数学性质)，通过不断地添加样本点来更新目标函数的后验分布(高斯过程,直到后验分布基本贴合于真实分布）。简单的说，就是考虑了上一次参数的信息，从而更好的调整当前的参数。

贝叶斯调参的步骤如下：

- 定义优化函数(rf_cv）
- 建立模型
- 定义待优化的参数
- 得到优化结果，并返回要优化的分数指标

In [85]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer,mean_absolute_error
import lightgbm as lgb
"""定义优化函数"""
def rf_cv_lgb(num_leaves, max_depth, bagging_fraction, feature_fraction, bagging_freq, min_data_in_leaf, 
              min_child_weight, min_split_gain, reg_lambda, reg_alpha):
    # 建立模型
    val = cross_val_score(
          lgb.LGBMRegressor(boosting_type='gbdt', objective='regression_l1',
                                   learning_rate=0.1,  num_leaves=int(num_leaves), max_depth=int(max_depth), 
                                   bagging_fraction=round(bagging_fraction, 2), feature_fraction=round(feature_fraction, 2),
                                   bagging_freq=int(bagging_freq), min_data_in_leaf=int(min_data_in_leaf),
                                   min_child_weight=min_child_weight, min_split_gain=min_split_gain,
                                   reg_lambda=reg_lambda, reg_alpha=reg_alpha,bagging_seed = 11, 
                                   n_jobs= 8
                                  ),
        X=train_data, y=train_label, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)
    ).mean()
    return 1 - val

In [86]:
from bayes_opt import BayesianOptimization
"""定义优化参数"""
bayes_lgb = BayesianOptimization(
    rf_cv_lgb, 
    {
        'num_leaves':(2, 100),
        'max_depth':(3, 100),
        'bagging_fraction':(0.5, 1.0),
        'feature_fraction':(0.5, 1.0),
        'bagging_freq':(0, 100),
        'min_data_in_leaf':(20,60),
        'min_child_weight':(0, 10),
        'min_split_gain':(0.0, 1.0),
        'reg_alpha':(0.0, 10),
        'reg_lambda':(0.0, 10),
    }
)

"""开始优化"""
bayes_lgb.maximize(n_iter=10)

|   iter    |  target   | baggin... | baggin... | featur... | max_depth | min_ch... | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9853  [0m | [0m 0.9622  [0m | [0m 65.56   [0m | [0m 0.9405  [0m | [0m 37.93   [0m | [0m 4.847   [0m | [0m 45.78   [0m | [0m 0.2895  [0m | [0m 40.23   [0m | [0m 1.277   [0m | [0m 5.271   [0m |
| [0m 2       [0m | [0m 0.958   [0m | [0m 0.6819  [0m | [0m 10.33   [0m | [0m 0.5451  [0m | [0m 21.46   [0m | [0m 8.499   [0m | [0m 24.59   [0m | [0m 0.5718  [0m | [0m 21.58   [0m | [0m 9.4     [0m | [0m 2.549   [0m |
| [95m 3       [0m | [95m 0.9854  [0m | [95m 0.5534  [0m | [95m 40.64   [0m | [95m 0.9375  [0m | [95m 70.61   [0m | [95m 9.219   [0m | [95m 49.21   [0m | [95m 0.5621  [0m | [95m 40.69   [0m | [95m 6.489 

| [95m 4       [0m | [95m 0.9858  [0m | [95m 0.8266  [0m | [95m 57.83   [0m | [95m 0.9751  [0m | [95m 58.46   [0m | [95m 0.9494  [0m | [95m 25.79   [0m | [95m 0.5946  [0m | [95m 14.34   [0m | [95m 3.074   [0m | [95m 4.445   [0m |
| [0m 5       [0m | [0m 0.9782  [0m | [0m 0.5504  [0m | [0m 18.72   [0m | [0m 0.7621  [0m | [0m 24.09   [0m | [0m 6.262   [0m | [0m 25.21   [0m | [0m 0.4093  [0m | [0m 58.12   [0m | [0m 5.536   [0m | [0m 2.582   [0m |
| [0m 6       [0m | [0m 0.9823  [0m | [0m 0.7411  [0m | [0m 57.18   [0m | [0m 0.9539  [0m | [0m 57.79   [0m | [0m 5.224   [0m | [0m 26.54   [0m | [0m 0.4759  [0m | [0m 16.71   [0m | [0m 3.48    [0m | [0m 2.158   [0m |


| [95m 7       [0m | [95m 0.9899  [0m | [95m 0.8615  [0m | [95m 64.28   [0m | [95m 1.0     [0m | [95m 66.05   [0m | [95m 0.0     [0m | [95m 53.67   [0m | [95m 0.6314  [0m | [95m 22.63   [0m | [95m 0.0     [0m | [95m 10.0    [0m |
| [0m 8       [0m | [0m 0.9485  [0m | [0m 1.0     [0m | [0m 83.58   [0m | [0m 1.0     [0m | [0m 73.39   [0m | [0m 0.0     [0m | [0m 36.72   [0m | [0m 1.0     [0m | [0m 2.0     [0m | [0m 0.0     [0m | [0m 10.0    [0m |
| [95m 9       [0m | [95m 0.9917  [0m | [95m 1.0     [0m | [95m 51.25   [0m | [95m 1.0     [0m | [95m 55.83   [0m | [95m 0.0     [0m | [95m 48.13   [0m | [95m 0.9089  [0m | [95m 30.68   [0m | [95m 0.0     [0m | [95m 10.0    [0m |


| [95m 10      [0m | [95m 0.9931  [0m | [95m 1.0     [0m | [95m 63.66   [0m | [95m 1.0     [0m | [95m 64.31   [0m | [95m 0.0     [0m | [95m 60.0    [0m | [95m 0.6568  [0m | [95m 46.9    [0m | [95m 0.0     [0m | [95m 10.0    [0m |
| [95m 11      [0m | [95m 0.9935  [0m | [95m 1.0     [0m | [95m 63.53   [0m | [95m 1.0     [0m | [95m 68.3    [0m | [95m 0.0     [0m | [95m 33.22   [0m | [95m 0.6043  [0m | [95m 56.29   [0m | [95m 0.0     [0m | [95m 10.0    [0m |
| [0m 12      [0m | [0m 0.9772  [0m | [0m 0.6094  [0m | [0m 66.83   [0m | [0m 0.7872  [0m | [0m 38.13   [0m | [0m 4.097   [0m | [0m 44.63   [0m | [0m 0.54    [0m | [0m 43.48   [0m | [0m 2.183   [0m | [0m 5.484   [0m |


| [0m 13      [0m | [0m 0.9929  [0m | [0m 1.0     [0m | [0m 60.73   [0m | [0m 1.0     [0m | [0m 69.31   [0m | [0m 0.0     [0m | [0m 44.03   [0m | [0m 0.0     [0m | [0m 41.42   [0m | [0m 0.0     [0m | [0m 10.0    [0m |
| [0m 14      [0m | [0m 0.9934  [0m | [0m 0.8841  [0m | [0m 92.5    [0m | [0m 0.9891  [0m | [0m 52.63   [0m | [0m 3.382   [0m | [0m 22.05   [0m | [0m 0.1096  [0m | [0m 53.94   [0m | [0m 5.696   [0m | [0m 6.935   [0m |
| [0m 15      [0m | [0m 0.9681  [0m | [0m 0.6903  [0m | [0m 41.95   [0m | [0m 0.6182  [0m | [0m 32.85   [0m | [0m 7.84    [0m | [0m 39.56   [0m | [0m 0.7869  [0m | [0m 44.78   [0m | [0m 1.413   [0m | [0m 5.171   [0m |


In [89]:
# 显示优化结果
bayes_lgb.max

{'target': 0.9935102854729602,
 'params': {'bagging_fraction': 1.0,
  'bagging_freq': 63.53242647658099,
  'feature_fraction': 1.0,
  'max_depth': 68.2959949349337,
  'min_child_weight': 0.0,
  'min_data_in_leaf': 33.218176076352854,
  'min_split_gain': 0.6042987976751101,
  'num_leaves': 56.28861196101464,
  'reg_alpha': 0.0,
  'reg_lambda': 10.0}}

In [93]:
1 - bayes_lgb.max['target']

0.9935102854729602

In [61]:
def mae_score_vali(preds, data_vali):
    labels = data_vali.get_label()
    score_vali = mean_absolute_error(y_true=labels, y_pred=preds)
    return 'mae_score', score_vali, True

In [90]:
# 调整一个较小的学习率，并通过cv函数确定当前最优的迭代次数
train_matrix = lgb.Dataset(train_data, label=train_label)
base_params_lgb = {
      'boosting_type': 'gbdt',
      'objective':'regression_l1',
      'learning_rate':0.01,
      'bagging_fraction': 1,
      'bagging_freq': 63,
      'feature_fraction':1,
      'max_depth': 68,
      'min_child_weight': 0,
      'min_data_in_leaf': 33,
      'num_leaves': 56,
      'reg_lambda': 10,
      'bagging_seed':11,
      'nthread': 10
    
}

cv_result_lgb = lgb.cv(
    train_set=train_matrix,
    early_stopping_rounds=1000, 
    num_boost_round=2000,
    nfold=5,
    shuffle=True,
    params=base_params_lgb,
    feval=mae_score_vali,
    stratified=False,
    seed=2018
)
print('迭代次数{}'.format(len(cv_result_lgb['mae_score-mean'])))
print('最终模型的mae_score为{}'.format(max(cv_result_lgb['mae_score-mean'])))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18009
[LightGBM] [Info] Number of data points in the train set: 119996, number of used features: 85
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18009
[LightGBM] [Info] Number of data points in the train set: 119996, number of used features: 85
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18009
[LightGBM] [Info] Number of data points in the train set: 119996, number of used features: 85
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18009
[LightGBM] [Info] Number of data points in the train set: 119996, number of used features: 85
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18009
[LightGBM] [Info] Number of data points in the train set: 119996, number of used features: 85
[LightGBM] [Info] Start training from score 8.085938
[LightGBM] [



迭代次数1
最终模型的mae_score为0.9837746211332045
