## 实例1

In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import matplotlib.pyplot as plt

# sklearn数据预处理
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV,ParameterGrid
from sklearn.metrics import roc_curve,auc,accuracy_score

In [3]:
# Parameter settings shared by the native `lgb.train` call and by the
# scikit-learn style estimator built in the grid-search cell (which reads
# individual keys from this dict).
#
# Three alias keys were removed because they contradicted the primary key that
# is set in this same dict:
#   min_child_samples = 20    (alias of min_data_in_leaf = 50)
#   min_child_weight  = 0.001 (alias of min_sum_hessian_in_leaf = 5)
#   colsample_bytree  = 1.0   (alias of feature_fraction = 0.8)
# LightGBM keeps the primary name and ignores the alias (with a warning) when
# both are given, so the effective configuration is unchanged.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list (not a set) so the metric order is deterministic across runs.
    'metric': ['auc', 'binary_logloss'],
    'metric_freq': 1,
    'is_training_metric': True,
    'max_bin': 255,
    'learning_rate': 0.1,
    'num_leaves': 63,
    'tree_learner': 'serial',
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_data_in_leaf': 50,
    'min_sum_hessian_in_leaf': 5,
    'is_enable_sparse': True,
    'use_two_round_loading': False,
    'is_save_binary_file': False,
    'output_model': 'LightGBM_model.txt',
    'num_machines': 1,
    'local_listen_port': 12400,
    'machine_list_file': 'mlist.txt',
    'verbose': 0,
    'subsample_for_bin': 200000,
    'min_split_gain': 0.0,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0
}

In [5]:
# Read the tab-separated, header-less train and test files (train first),
# then print both shapes and preview the first training rows.
print("Loading data...")
df_train, df_test = (
    pd.read_csv(data_path, header=None, sep='\t')
    for data_path in (
        'files/data/python83/regression.train.txt',
        'files/data/python83/regression.test.txt',
    )
)
print(df_train.shape, df_test.shape, sep='\n')
df_train.head()

Loading data...
(7000, 29)
(500, 29)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1,0.869,-0.635,0.226,0.327,-0.69,0.754,-0.249,-1.092,0.0,...,-0.01,-0.046,3.102,1.354,0.98,0.978,0.92,0.722,0.989,0.877
1,1,0.908,0.329,0.359,1.498,-0.313,1.096,-0.558,-1.588,2.173,...,-1.139,-0.001,0.0,0.302,0.833,0.986,0.978,0.78,0.992,0.798
2,1,0.799,1.471,-1.636,0.454,0.426,1.105,1.282,1.382,0.0,...,1.129,0.9,0.0,0.91,1.108,0.986,0.951,0.803,0.866,0.78
3,0,1.344,-0.877,0.936,1.992,0.882,1.786,-1.647,-0.942,0.0,...,-0.678,-1.36,0.0,0.947,1.029,0.999,0.728,0.869,1.027,0.958
4,1,1.105,0.321,1.522,0.883,-1.205,0.681,-1.07,-0.922,0.0,...,-0.374,0.113,0.0,0.756,1.361,0.987,0.838,1.133,0.872,0.808


In [8]:
# Column 0 is used as the label; every other column is used as a feature.
X_train = df_train.drop(columns=0).values
y_train = df_train[0].values
X_test = df_test.drop(columns=0).values
y_test = df_test[0].values

# Wrap the arrays for LightGBM; the evaluation set is created with the
# training Dataset passed as its `reference`.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [10]:
# Train a booster with the shared `params`, evaluating on `lgb_eval` while
# training. The number of boosting rounds is left at the library default.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_eval],
)

[1]	valid_0's auc: 0.718121	valid_0's binary_logloss: 0.671716
[2]	valid_0's auc: 0.734415	valid_0's binary_logloss: 0.658775
[3]	valid_0's auc: 0.767592	valid_0's binary_logloss: 0.644696
[4]	valid_0's auc: 0.774332	valid_0's binary_logloss: 0.63497
[5]	valid_0's auc: 0.780597	valid_0's binary_logloss: 0.622594
[6]	valid_0's auc: 0.782298	valid_0's binary_logloss: 0.613867
[7]	valid_0's auc: 0.785942	valid_0's binary_logloss: 0.606095
[8]	valid_0's auc: 0.787587	valid_0's binary_logloss: 0.598635
[9]	valid_0's auc: 0.790361	valid_0's binary_logloss: 0.59144
[10]	valid_0's auc: 0.794521	valid_0's binary_logloss: 0.58343
[11]	valid_0's auc: 0.798842	valid_0's binary_logloss: 0.576479
[12]	valid_0's auc: 0.798681	valid_0's binary_logloss: 0.572351
[13]	valid_0's auc: 0.798826	valid_0's binary_logloss: 0.567695
[14]	valid_0's auc: 0.80089	valid_0's binary_logloss: 0.564266
[15]	valid_0's auc: 0.803196	valid_0's binary_logloss: 0.559937
[16]	valid_0's auc: 0.804373	valid_0's binary_logloss

In [13]:

# Grid of hyper-parameters to search. Each list currently holds a single
# value, so the "search" evaluates exactly one candidate under 5-fold CV.
gridParams = {
    'learning_rate': [0.1],
    'num_leaves': [63],
    'boosting_type': ['gbdt'],
    'objective': ['binary']
}

# Base estimator: the settings that are not searched are copied from `params`.
mdl = lgb.LGBMClassifier(
    task = params['task'],
    # `params['metric']` may be a set, which is unordered. Pass a sorted list so
    # the metric order is deterministic and the estimator receives a plain list.
    metric = sorted(params['metric']),
    metric_freq = params['metric_freq'],
    is_training_metric = params['is_training_metric'],
    max_bin = params['max_bin'],
    tree_learner = params['tree_learner'],
    feature_fraction = params['feature_fraction'],
    bagging_fraction = params['bagging_fraction'],
    bagging_freq = params['bagging_freq'],
    min_data_in_leaf = params['min_data_in_leaf'],
    min_sum_hessian_in_leaf = params['min_sum_hessian_in_leaf'],
    is_enable_sparse = params['is_enable_sparse'],
    use_two_round_loading = params['use_two_round_loading'],
    is_save_binary_file = params['is_save_binary_file'],
    # NOTE(review): GridSearchCV below also uses n_jobs=-1; nested parallelism
    # may oversubscribe the CPU -- consider setting one of the two to 1.
    n_jobs = -1
)

# Named scorer; `refit='AUC'` below selects the best candidate by this entry.
scoring = {'AUC': 'roc_auc'}

# Create the grid
grid = GridSearchCV(mdl, gridParams, verbose=2, cv=5, scoring=scoring, n_jobs=-1, refit='AUC')
# Run the grid
grid.fit(X_train, y_train)

print('Best parameters found by grid search are:', grid.best_params_)
print('Best score found by grid search is:', grid.best_score_)


## 实例2

In [14]:
import pandas as pd
import lightgbm as lgb
# `sklearn.grid_search` was removed in scikit-learn 0.20; GridSearchCV is
# provided by `sklearn.model_selection`.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# Load the second example's training table and preview its first three rows.
train_data=pd.read_csv('files/data/python87/train.csv')
train_data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,2,1,4,1,1,1,2,2,4,3,...,0.199728,-1.0,-1.0,-0.333333,-0.65,-0.621622,-0.444444,-1.0,-0.176471,0
1,1,1,4,1,4,0,3,3,4,3,...,0.060483,-1.0,-1.0,-0.333333,0.65,-0.72973,-0.555556,-0.866667,-0.529412,0
2,1,1,3,1,1,1,1,2,2,3,...,0.900553,1.0,-1.0,-1.0,-0.55,-0.513514,-0.222222,-1.0,-0.294118,1


In [16]:
# Pop the label column out of the training frame to use as the target;
# '30' is the name of the label column.
# NOTE: `pop` removes the column from `train_data` in place, so re-running this
# cell without reloading the data raises KeyError.
y = train_data.pop('30').values

In [17]:
# All columns still present in `train_data` are used as features.
col = train_data.columns
x = train_data.loc[:, col].values

# Split into training and validation sets (validation fraction 0.333, fixed seed).
train_x, valid_x, train_y, valid_y = train_test_split(
    x,
    y,
    test_size=0.333,
    random_state=0,
)

# LightGBM Dataset wrappers; the validation set references the training set.
train = lgb.Dataset(data=train_x, label=train_y)
valid = lgb.Dataset(data=valid_x, label=valid_y, reference=train)

In [26]:
# Hyper-parameters to tune: each key maps to the list of candidate values.
parameters = dict(
    max_depth=[15, 20, 25, 30, 35],
    learning_rate=[0.01, 0.02, 0.05, 0.1, 0.15],
    feature_fraction=[0.6, 0.7, 0.8, 0.9, 0.95],
    # Further candidates, currently disabled:
    # bagging_fraction=[0.6, 0.7, 0.8, 0.9, 0.95],
    # bagging_freq=[2, 4, 5, 6, 8],
    # lambda_l1=[0, 0.1, 0.4, 0.5, 0.6],
    # lambda_l2=[0, 10, 15, 35, 40],
    # cat_smooth=[1, 10, 15, 20, 35],
)


In [None]:
# Base estimator for the search. Any setting that also appears as a key in
# `parameters` is replaced by the candidate value for each grid point.
gbm = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    verbose=0,
    learning_rate=0.01,
    num_leaves=35,
    feature_fraction=0.8,
    bagging_fraction=0.9,
    bagging_freq=8,
    lambda_l1=0.6,
    lambda_l2=0,
)

# With GridSearchCV we do not call `gbm.fit` ourselves: fitting the search
# object trains the estimator for every candidate (3-fold CV, scored by accuracy).
gsearch = GridSearchCV(estimator=gbm, param_grid=parameters, scoring='accuracy', cv=3)
gsearch.fit(train_x, train_y)

# Report the best CV score and the winning value of each searched parameter.
print("Best score: %0.3f" % gsearch.best_score_)
print("Best parameters set:")
best_parameters = gsearch.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


https://juejin.im/post/5b76437ae51d45666b5d9b05