In [1]:
import gc
import pandas as pd

# 读取数据
小样本（搭建模型）

In [2]:
# 加载小样本
user_log = pd.read_csv('../L13/data_format1_small/sample_user_log.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('../L13/data_format1_small/sample_user_info.csv')
train_data1 = pd.read_csv('../L13/data_format1_small/train.csv')
submission = pd.read_csv('../L13/data_format1_small/test.csv')
train_data = pd.read_csv('./data_format2/train_format2.csv')

全量样本（跑分用）

In [36]:
# 加载全量样本
user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
user_info = pd.read_csv('./data_format1/user_info_format1.csv')
train_data1 = pd.read_csv('./data_format1/train_format1.csv')
submission = pd.read_csv('./data_format1/test_format1.csv')
train_data = pd.read_csv('./data_format2/train_format2.csv')

In [37]:
train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([train_data1, submission], ignore_index=True, sort=False)

# 数据预处理
## 数据格式处理
对matrix用left join去关联user_info的数据

In [38]:
matrix.drop(['prob'], axis=1, inplace=True)
# 连接user_info表，通过user_id关联
matrix = matrix.merge(user_info, on='user_id', how='left')
# 使用merchant_id（原列名seller_id）
user_log.rename(columns={'seller_id':'merchant_id'}, inplace=True)

对user_log数据格式处理：填补空值，缩小体量

In [39]:
# 格式化
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')

## 离散特征处理
### 填补空值  
对离散特征进行填补空值以及修改数据格式，并在处理完之后释放内存，垃圾回收

In [40]:
# 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown
matrix['age_range'].fillna(0, inplace=True)
# 0:female, 1:male, 2:unknown
matrix['gender'].fillna(2, inplace=True)
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
del user_info, train_data1
gc.collect()

724

In [41]:
print(matrix)

        user_id  merchant_id label origin  age_range  gender
0         34176         3906   0.0  train          6       0
1         34176          121   0.0  train          6       0
2         34176         4356   1.0  train          6       0
3         34176         2217   0.0  train          6       0
4        230784         4818   0.0  train          0       0
...         ...          ...   ...    ...        ...     ...
522336   228479         3111   nan   test          6       0
522337    97919         2341   nan   test          8       1
522338    97919         3971   nan   test          8       1
522339    32639         3536   nan   test          0       0
522340    32639         3319   nan   test          0       0

[522341 rows x 6 columns]


### 用户画像  
- 'u1':用户被交互数量
- 'u2':用户购买唯一item数量
- 'u3':用户购买不重复的categories数量
- 'u4':用户购买不重复的商家数量
- 'u5':用户购买不重复的品牌数量

In [42]:
# User特征处理
groups = user_log.groupby(['user_id'])
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
# 使用agg 基于列的聚合操作，统计唯一值的个数 item_id, cat_id, merchant_id, brand_id
#temp = groups['item_id', 'cat_id', 'merchant_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'u2', 'cat_id':'u3', 'merchant_id':'u4', 'brand_id':'u5'})
# 对于每个user_id 不重复的item_id的数量 => u2
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
# 对于每个user_id 不重复的cat_id的数量 => u3
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

- 'u6':用户的时间差
- 'u7 - u10':用户的action_type情况

In [43]:
# 时间间隔特征 u6 按照小时
# 对于每个user_id 计算time_stamp的最小时间 => F_time, 最大时间max => L_time
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 统计操作类型为0，1，2，3的个数
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')
#print(matrix)

### 商家特征
- 'm1':商家被交互数量
- 'm2':商家被交互的不重复user_id数量
- 'm3':商家被交互的不重复item_id数量
- 'm4':商家被交互的不重复cat_id数量
- 'm5':商家被交互的不重复brand_id数量
- 'm6 - m9':商家被交互的action_type情况
- 'm10':按照商家id随机负采样个数

In [44]:
# 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 按照merchant_id 统计随机负采样的个数
temp = train_data[train_data['label']==1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
#print(matrix)

  temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})


### 商家画像和用户画像交叉
- 'um1':统计两个画像交互的情况下的数量
- 'um2':统计两个画像交互的情况下不重复item_id的数量
- 'um3':统计两个画像交互的情况下不重复cat_id的数量
- 'um4':统计两个画像交互的情况下不重复brand_id数量
- 'um5 - m8':统计两个画像交互的情况下交互的不同action_type情况
- 'um9':统计两个画像交互的情况下的时间差

In [45]:
# 按照user_id, merchant_id分组
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'}) #统计行为个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})#统计不同action_type唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left') #统计时间间隔
#print(matrix)

  temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数


### 特征之间的一些统计数值
- 'r1':用户购买点击比
- 'r2':商家购买点击比
- 'r3':不同用户不同商家购买点击比  

对age和gender做one-hot处理

In [46]:
#用户购买点击比
matrix['r1'] = matrix['u9']/matrix['u7'] 
#商家购买点击比
matrix['r2'] = matrix['m8']/matrix['m6'] 
#不同用户不同商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5']
matrix.fillna(0, inplace=True)
# # 修改age_range字段名称为 age_0, age_1, age_2... age_8
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)
# print(matrix)

## 数据集分割

In [47]:
# 分割训练数据和测试数据
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']
del temp, matrix
gc.collect()

20

# 建立模型

In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import lightgbm as lgb
import xgboost as xgb

In [48]:
# 将训练集进行切分，20%用于验证
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=.2)

## XGBoost  
2021.2.26最佳参数

```python
model = xgb.XGBClassifier(
    max_depth=10,
    n_estimators=1000,
    min_child_weight=290, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.1,    
    seed=42)  
early_stopping_rounds=10
```

auc = 0.68017

In [19]:
# 使用XGBoost
model_xgb = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=500,
    min_child_weight=290, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.1,    
    seed=42    
)

max_depth调参结束3，接下来是subsample调参

In [18]:
params_test1={
    'subsample':[0.7,0.8,0.9,1]
}

In [20]:
gsearch1 = GridSearchCV(estimator=model_xgb, param_grid=params_test1, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
gsearch1.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:   27.3s finished




GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, eta=0.1, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=3, min_child_weight=290,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=42, subsample=0.8, tree_m

In [21]:
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([4.86663132, 3.88971047, 4.19095802, 5.32379699]),
  'std_fit_time': array([0.42215228, 0.22381896, 0.09027709, 0.30208092]),
  'mean_score_time': array([0.02354546, 0.02453432, 0.01874804, 0.0188683 ]),
  'std_score_time': array([0.00617491, 0.00319117, 0.00364535, 0.00347876]),
  'param_subsample': masked_array(data=[0.7, 0.8, 0.9, 1],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'subsample': 0.7},
   {'subsample': 0.8},
   {'subsample': 0.9},
   {'subsample': 1}],
  'split0_test_score': array([0.62254175, 0.628114  , 0.6256525 , 0.63473436]),
  'split1_test_score': array([0.62476588, 0.62913285, 0.63772371, 0.64146962]),
  'split2_test_score': array([0.62933662, 0.63269883, 0.63140901, 0.63030453]),
  'split3_test_score': array([0.61128192, 0.61416347, 0.61940696, 0.62570368]),
  'split4_test_score': array([0.61388605, 0.62112801, 0.61703157, 0.62680209]),
  'mean_test_score': array([0.

min_child_weight调参

In [22]:
# 使用XGBoost
model_xgb = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=500,
    min_child_weight=290, 
    colsample_bytree=0.8, 
    subsample=1, 
    eta=0.1,    
    seed=42    
)

In [26]:
params_test2={
    'min_child_weight':range(100, 200, 10)
}

In [27]:
gsearch2 = GridSearchCV(estimator=model_xgb, param_grid=params_test2, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
gsearch2.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  1.3min finished




GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, eta=0.1, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=3, min_child_weight=290,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=42, subsample=1, tree_met

In [28]:
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

({'mean_fit_time': array([6.29562535, 6.3831605 , 6.30154533, 6.16031704, 6.20105157,
         6.11161714, 7.01683083, 6.19802251, 5.67244697, 4.29013429]),
  'std_fit_time': array([0.01927221, 0.15747119, 0.06370674, 0.03590925, 0.02956762,
         0.04733454, 0.75121876, 0.55756497, 0.03381765, 1.19850434]),
  'mean_score_time': array([0.03370099, 0.03227286, 0.02875891, 0.03073854, 0.03168125,
         0.02964921, 0.02313452, 0.0191565 , 0.01875739, 0.0197701 ]),
  'std_score_time': array([0.00640344, 0.00603962, 0.00701094, 0.00531106, 0.00544394,
         0.00792225, 0.00204617, 0.00116194, 0.00031725, 0.00392234]),
  'param_min_child_weight': masked_array(data=[100, 110, 120, 130, 140, 150, 160, 170, 180, 190],
               mask=[False, False, False, False, False, False, False, False,
                     False, False],
         fill_value='?',
              dtype=object),
  'params': [{'min_child_weight': 100},
   {'min_child_weight': 110},
   {'min_child_weight': 120},
   {'

调参colsample_bytree

In [29]:
# 使用XGBoost
model_xgb = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=500,
    min_child_weight=120, 
    colsample_bytree=0.8, 
    subsample=1, 
    eta=0.1,    
    seed=42    
)

In [30]:
params_test3={
    'colsample_tytree':[0.5,0.6,0.7,0.8,0.9,1.0]
}

In [31]:
gsearch3 = GridSearchCV(estimator=model_xgb, param_grid=params_test3, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
gsearch3.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   50.3s finished


Parameters: { colsample_tytree } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, eta=0.1, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=3, min_child_weight=120,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=42, subsample=1, tree_met

In [32]:
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

({'mean_fit_time': array([6.59035459, 7.04970956, 6.99553509, 6.62949824, 6.27803755,
         5.04175916]),
  'std_fit_time': array([0.23513875, 0.0667935 , 0.03396871, 0.19329766, 0.0425037 ,
         1.46339138]),
  'mean_score_time': array([0.03748903, 0.02709174, 0.02926259, 0.02585964, 0.02932558,
         0.02508755]),
  'std_score_time': array([0.01007138, 0.00157979, 0.00620316, 0.00373957, 0.00383695,
         0.00322089]),
  'param_colsample_tytree': masked_array(data=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
               mask=[False, False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'colsample_tytree': 0.5},
   {'colsample_tytree': 0.6},
   {'colsample_tytree': 0.7},
   {'colsample_tytree': 0.8},
   {'colsample_tytree': 0.9},
   {'colsample_tytree': 1.0}],
  'split0_test_score': array([0.65306392, 0.65306392, 0.65306392, 0.65306392, 0.65306392,
         0.65306392]),
  'split1_test_score': array([0.66533782, 0.66533782, 0.665337

缩小eta

In [49]:
# 使用XGBoost
model_xgb = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=500,
    min_child_weight=120, 
    colsample_bytree=0.5, 
    subsample=1, 
    eta=0.05,    
    seed=42    
)

In [50]:
model_xgb.fit(
    X_train, y_train,
    eval_metric='auc', 
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    verbose=True,
    #早停法，如果auc在10epoch没有进步就stop
    early_stopping_rounds=10 
)



[0]	validation_0-auc:0.62777	validation_1-auc:0.62326
[1]	validation_0-auc:0.63643	validation_1-auc:0.63050
[2]	validation_0-auc:0.63692	validation_1-auc:0.63130
[3]	validation_0-auc:0.65414	validation_1-auc:0.65068
[4]	validation_0-auc:0.65502	validation_1-auc:0.65152
[5]	validation_0-auc:0.65521	validation_1-auc:0.65200
[6]	validation_0-auc:0.65514	validation_1-auc:0.65180
[7]	validation_0-auc:0.65606	validation_1-auc:0.65310
[8]	validation_0-auc:0.65974	validation_1-auc:0.65725
[9]	validation_0-auc:0.65905	validation_1-auc:0.65653
[10]	validation_0-auc:0.66161	validation_1-auc:0.65711
[11]	validation_0-auc:0.66238	validation_1-auc:0.65763
[12]	validation_0-auc:0.66294	validation_1-auc:0.65858
[13]	validation_0-auc:0.66289	validation_1-auc:0.65861
[14]	validation_0-auc:0.66285	validation_1-auc:0.65849
[15]	validation_0-auc:0.66246	validation_1-auc:0.65860
[16]	validation_0-auc:0.66299	validation_1-auc:0.65889
[17]	validation_0-auc:0.66372	validation_1-auc:0.65873
[18]	validation_0-au

[149]	validation_0-auc:0.71003	validation_1-auc:0.70015
[150]	validation_0-auc:0.71016	validation_1-auc:0.70014
[151]	validation_0-auc:0.71028	validation_1-auc:0.70017
[152]	validation_0-auc:0.71037	validation_1-auc:0.70025
[153]	validation_0-auc:0.71050	validation_1-auc:0.70041
[154]	validation_0-auc:0.71069	validation_1-auc:0.70064
[155]	validation_0-auc:0.71083	validation_1-auc:0.70075
[156]	validation_0-auc:0.71098	validation_1-auc:0.70083
[157]	validation_0-auc:0.71118	validation_1-auc:0.70087
[158]	validation_0-auc:0.71140	validation_1-auc:0.70115
[159]	validation_0-auc:0.71151	validation_1-auc:0.70119
[160]	validation_0-auc:0.71168	validation_1-auc:0.70144
[161]	validation_0-auc:0.71182	validation_1-auc:0.70153
[162]	validation_0-auc:0.71194	validation_1-auc:0.70161
[163]	validation_0-auc:0.71201	validation_1-auc:0.70167
[164]	validation_0-auc:0.71222	validation_1-auc:0.70187
[165]	validation_0-auc:0.71237	validation_1-auc:0.70199
[166]	validation_0-auc:0.71248	validation_1-auc:

[296]	validation_0-auc:0.72080	validation_1-auc:0.70740
[297]	validation_0-auc:0.72082	validation_1-auc:0.70740
[298]	validation_0-auc:0.72085	validation_1-auc:0.70741
[299]	validation_0-auc:0.72092	validation_1-auc:0.70743
[300]	validation_0-auc:0.72100	validation_1-auc:0.70744
[301]	validation_0-auc:0.72105	validation_1-auc:0.70748
[302]	validation_0-auc:0.72109	validation_1-auc:0.70751
[303]	validation_0-auc:0.72113	validation_1-auc:0.70752
[304]	validation_0-auc:0.72115	validation_1-auc:0.70752
[305]	validation_0-auc:0.72119	validation_1-auc:0.70752
[306]	validation_0-auc:0.72125	validation_1-auc:0.70759
[307]	validation_0-auc:0.72130	validation_1-auc:0.70761
[308]	validation_0-auc:0.72133	validation_1-auc:0.70763
[309]	validation_0-auc:0.72136	validation_1-auc:0.70763
[310]	validation_0-auc:0.72141	validation_1-auc:0.70767
[311]	validation_0-auc:0.72145	validation_1-auc:0.70767
[312]	validation_0-auc:0.72152	validation_1-auc:0.70766
[313]	validation_0-auc:0.72155	validation_1-auc:

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, eta=0.05, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.0500000007, max_delta_step=0, max_depth=3,
              min_child_weight=120, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=42, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [53]:
prob = model_xgb.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:,1])
submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('./submission/prediction_xbg_2.csv', index=False)

### LightGBM
2021.02.27最优模型  
auc=0.6809485


CV寻找最优参数

In [54]:
model_lgb = lgb.LGBMClassifier(
    n_estimators=200,
    objective='binary',
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=3,
    num_leaves=50,
    learning_rate=0.1,
    reg_lambda=1,
    metric=['auc']
)

寻找树深度，叶子数量最优参数

In [38]:
params_test1={
    'max_depth': range(3,8,2),
    'num_leaves':range(50,170,30)
}

In [None]:
gsearch1 = GridSearchCV(estimator=model_lgb, param_grid=params_test1, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
gsearch1.fit(X_train, y_train)

In [44]:
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([ 6.98497987,  7.79617214,  7.28170156,  7.87663469,  9.04575028,
         11.14345508, 10.62704248, 11.15114255, 12.76195068, 15.23219094,
         12.38303103, 10.62938423]),
  'std_fit_time': array([0.08446479, 0.92138191, 0.51473822, 0.70975208, 0.96692583,
         1.53159304, 1.91972056, 0.77423959, 0.7950251 , 1.77401645,
         0.94057841, 0.89351958]),
  'mean_score_time': array([0.74787292, 0.8372664 , 0.8604497 , 0.84051318, 1.42310262,
         1.88381686, 1.57841301, 1.26859951, 1.95278959, 1.75649066,
         1.76403556, 1.50091591]),
  'std_score_time': array([0.10775955, 0.11207667, 0.11950106, 0.09754095, 0.14780292,
         0.26637882, 0.3226497 , 0.13314999, 0.52081703, 0.3145436 ,
         0.14303275, 0.23899355]),
  'param_max_depth': masked_array(data=[3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7],
               mask=[False, False, False, False, False, False, False, False,
                     False, False, False, False],
         fill_value='?

寻找min_data_in_leaf和min_sum_hessian_in_leaf

In [45]:
model_lgb = lgb.LGBMClassifier(
    n_estimators=200,
    objective='binary',
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=3,
    num_leaves=50,
    learning_rate=0.1,
    reg_lambda=1,
    metric=['auc']
)

In [46]:
params_test2={
    'min_child_samples': [18, 19, 20, 21, 22],
    'min_child_weight':[0.001, 0.002]
}

In [47]:
gsearch2 = GridSearchCV(estimator=model_lgb, param_grid=params_test2, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
gsearch2.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  2.2min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=0.8,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=3,
                                      metric=['auc'], min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=200,
                                      n_jobs=-1, num_leaves=50,
                                      objective='binary', random_state=None,
                                      reg_alpha=0.0, reg_lambda=1, silent=True,
                                      subsample=0.8, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=4,
             param_grid={'min_child_samples': [18, 19, 20, 21, 22],
            

In [48]:
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

({'mean_fit_time': array([ 8.21884499,  7.57595859,  8.60315795,  7.59467487,  7.70812058,
          9.09389596,  9.43928676, 10.08624711,  9.87756248,  9.89501419]),
  'std_fit_time': array([1.20218928, 0.27725212, 0.38644261, 1.29053823, 0.70941346,
         0.95342399, 1.32012684, 1.16610105, 1.03203584, 1.17402278]),
  'mean_score_time': array([0.86973443, 0.84423232, 0.94981685, 0.91652493, 0.92765255,
         0.97472544, 1.21839457, 1.20160966, 1.00439162, 0.82824764]),
  'std_score_time': array([0.14401967, 0.10396152, 0.37354748, 0.15358409, 0.19868709,
         0.11983977, 0.39155247, 0.31258567, 0.16018168, 0.15423524]),
  'param_min_child_samples': masked_array(data=[18, 18, 19, 19, 20, 20, 21, 21, 22, 22],
               mask=[False, False, False, False, False, False, False, False,
                     False, False],
         fill_value='?',
              dtype=object),
  'param_min_child_weight': masked_array(data=[0.001, 0.002, 0.001, 0.002, 0.001, 0.002, 0.001, 0.002,
 

调参feature_fraction 和 bagging_fraction

In [55]:
model_lgb = lgb.LGBMClassifier(
    n_estimators=200,
    objective='binary',
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=3,
    num_leaves=50,
    learning_rate=0.1,
    reg_lambda=1,
    min_child_samples=22,
    min_child_weight=0.001,
    metric=['auc']
)

In [56]:
params_test3={
    'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0]
}

In [57]:
gsearch3 = GridSearchCV(estimator=model_lgb, param_grid=params_test3, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
gsearch3.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 125 out of 125 | elapsed:  4.0min finished




GridSearchCV(cv=5, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=0.8,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=3,
                                      metric=['auc'], min_child_samples=22,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=200,
                                      n_jobs=-1, num_leaves=50,
                                      objective='binary', random_state=None,
                                      reg_alpha=0.0, reg_lambda=1, silent=True,
                                      subsample=0.8, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=4,
             param_grid={'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
        

In [58]:
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

({'mean_fit_time': array([5.85695009, 6.52399597, 7.2279984 , 7.10151925, 6.60933805,
         6.48704333, 7.30923719, 6.2798038 , 6.64424219, 6.78925624,
         6.25306435, 6.10347614, 6.25466132, 6.62826657, 6.67783837,
         6.15752354, 6.45532274, 5.98261566, 6.70941834, 7.81140223,
         6.17378373, 6.15774379, 6.28532381, 6.52970524, 6.45135145]),
  'std_fit_time': array([0.44665851, 0.64648217, 0.38118176, 1.18855535, 0.26189835,
         0.67001487, 0.52238676, 0.59741227, 0.55530625, 0.56295736,
         0.81280363, 0.56531268, 0.60254261, 0.53260704, 0.51848963,
         0.47998597, 0.76958257, 0.06664629, 0.51172053, 0.6135943 ,
         0.56016823, 0.64025394, 0.50744703, 0.57249333, 0.53641347]),
  'mean_score_time': array([0.90377951, 0.7762794 , 0.82886872, 0.78083401, 0.65162168,
         0.81522059, 0.82578664, 0.77693515, 0.75555863, 0.71564717,
         0.72626719, 0.76121721, 0.77970581, 0.73443494, 0.69775758,
         0.77611866, 0.79547257, 0.68534894, 0.

正则化调参：  
reg_alpha和reg_lambda，之前调参最优的是0和1，则不再进行这步

In [60]:
model_lgb = lgb.LGBMClassifier(
    n_estimators=400,
    objective='binary',
    subsample=0.6,
    colsample_bytree=0.7,
    max_depth=3,
    num_leaves=50,
    learning_rate=0.05,
    reg_lambda=1,
    min_child_samples=22,
    min_child_weight=0.001,
    metric=['auc']
)

In [61]:
model_lgb.fit(
    X_train, y_train,
    eval_metric='auc', 
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    verbose=True,
    #早停法，如果auc在10epoch没有进步就stop
    early_stopping_rounds=30 
)

[1]	training's auc: 0.605148	valid_1's auc: 0.595876
Training until validation scores don't improve for 30 rounds
[2]	training's auc: 0.649792	valid_1's auc: 0.641222
[3]	training's auc: 0.659214	valid_1's auc: 0.653886
[4]	training's auc: 0.66043	valid_1's auc: 0.654789
[5]	training's auc: 0.66075	valid_1's auc: 0.656114
[6]	training's auc: 0.660444	valid_1's auc: 0.65527
[7]	training's auc: 0.659937	valid_1's auc: 0.655097
[8]	training's auc: 0.662827	valid_1's auc: 0.658063
[9]	training's auc: 0.662445	valid_1's auc: 0.657269
[10]	training's auc: 0.664521	valid_1's auc: 0.659494
[11]	training's auc: 0.665378	valid_1's auc: 0.660421
[12]	training's auc: 0.665884	valid_1's auc: 0.661563
[13]	training's auc: 0.672019	valid_1's auc: 0.666508
[14]	training's auc: 0.671736	valid_1's auc: 0.665959
[15]	training's auc: 0.674166	valid_1's auc: 0.668021
[16]	training's auc: 0.674101	valid_1's auc: 0.668248
[17]	training's auc: 0.676142	valid_1's auc: 0.670123
[18]	training's auc: 0.676308	val

[152]	training's auc: 0.715736	valid_1's auc: 0.704162
[153]	training's auc: 0.715871	valid_1's auc: 0.704265
[154]	training's auc: 0.715928	valid_1's auc: 0.704217
[155]	training's auc: 0.716025	valid_1's auc: 0.704317
[156]	training's auc: 0.716165	valid_1's auc: 0.704383
[157]	training's auc: 0.716195	valid_1's auc: 0.704348
[158]	training's auc: 0.716306	valid_1's auc: 0.704431
[159]	training's auc: 0.716376	valid_1's auc: 0.704455
[160]	training's auc: 0.716478	valid_1's auc: 0.704466
[161]	training's auc: 0.716581	valid_1's auc: 0.704529
[162]	training's auc: 0.716582	valid_1's auc: 0.704495
[163]	training's auc: 0.716668	valid_1's auc: 0.704558
[164]	training's auc: 0.716758	valid_1's auc: 0.704638
[165]	training's auc: 0.716853	valid_1's auc: 0.704676
[166]	training's auc: 0.71687	valid_1's auc: 0.704753
[167]	training's auc: 0.716931	valid_1's auc: 0.704742
[168]	training's auc: 0.717001	valid_1's auc: 0.704763
[169]	training's auc: 0.717069	valid_1's auc: 0.704789
[170]	train

[304]	training's auc: 0.724715	valid_1's auc: 0.707468
[305]	training's auc: 0.724783	valid_1's auc: 0.707447
[306]	training's auc: 0.724843	valid_1's auc: 0.707436
[307]	training's auc: 0.724896	valid_1's auc: 0.707414
[308]	training's auc: 0.724935	valid_1's auc: 0.707429
[309]	training's auc: 0.724985	valid_1's auc: 0.707464
[310]	training's auc: 0.724997	valid_1's auc: 0.707508
[311]	training's auc: 0.725024	valid_1's auc: 0.707514
[312]	training's auc: 0.725061	valid_1's auc: 0.70751
[313]	training's auc: 0.725131	valid_1's auc: 0.707535
[314]	training's auc: 0.725178	valid_1's auc: 0.707548
[315]	training's auc: 0.725211	valid_1's auc: 0.707554
[316]	training's auc: 0.725265	valid_1's auc: 0.707534
[317]	training's auc: 0.725312	valid_1's auc: 0.707533
[318]	training's auc: 0.725375	valid_1's auc: 0.707555
[319]	training's auc: 0.725434	valid_1's auc: 0.707572
[320]	training's auc: 0.725487	valid_1's auc: 0.707587
[321]	training's auc: 0.725526	valid_1's auc: 0.707591
[322]	train

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.05, max_depth=3,
               metric=['auc'], min_child_samples=22, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=400, n_jobs=-1, num_leaves=50,
               objective='binary', random_state=None, reg_alpha=0.0,
               reg_lambda=1, silent=True, subsample=0.6,
               subsample_for_bin=200000, subsample_freq=0)

In [62]:
prob = model_lgb.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:,1])
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('./submission/prediction_lgb_1.csv', index=False)