## Load Module

In [206]:
import gc
import pandas as pd 
import numpy as np

#导入分析库
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
import lightgbm as lgb
import xgboost as xgb
import catboost as cat

from sklearn.metrics import roc_auc_score, roc_curve, auc
# LOCAL_QUICK = True
LOCAL_QUICK = False
sample_percent = 0.1

MORE_FE = False
# MORE_FE = True
FE_V1 = False if MORE_FE else True


In [207]:
%%time
# 加载数据
# 用户行为，使用format1进行加载
user_log = pd.read_csv('./user_log_format1.csv',nrows=100000, dtype={'time_stamp':'str'})

user_info = pd.read_csv('./user_info_format1.csv')

train_data1 = pd.read_csv('./train_format1.csv')

sub_data = pd.read_csv('./test_format1.csv')
data_train = pd.read_csv('./train_format2.csv',nrows=100000) 


CPU times: user 306 ms, sys: 12.1 ms, total: 318 ms
Wall time: 316 ms


In [208]:
%%time
# 采样测试
if LOCAL_QUICK:
    print('Local quick test: {}, rate is {}'.format(  
        LOCAL_QUICK, sample_percent))
    data = user_log.sample(int(len(user_log) * sample_percent))
    data1 = user_info.sample(int(len(user_info) * sample_percent))
    data2 = train_data1.sample(int(len(train_data1) * sample_percent))
    # submission = sub_data.sample(int(len(sub_data) * sample_percent))
    submission = sub_data.copy()
    
else:
    print('All sample train')
    data = user_log.copy()
    data1 = user_info.copy()
    data2 = train_data1.copy()
    submission = sub_data.copy() 
    del user_log, user_info, train_data1, sub_data
print('---data shape---')     
for df in [data, data1, data2, submission, data_train]:
    print(df.shape)  

All sample train
---data shape---
(100000, 7)
(424170, 3)
(260864, 3)
(261477, 3)
(100000, 6)
CPU times: user 14.3 ms, sys: 103 µs, total: 14.4 ms
Wall time: 12.5 ms


In [209]:
data2['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([data2, submission], ignore_index=True, sort=False)
matrix.drop(['prob'], axis=1, inplace=True)
# 连接user_info表，通过user_id关联
matrix = matrix.merge(data1, on='user_id', how='left')
# 使用merchant_id（原列名seller_id）
data.rename(columns={'seller_id':'merchant_id'}, inplace=True)

In [210]:
%%time
# 格式化
data['user_id'] = data['user_id'].astype('int32')
data['merchant_id'] = data['merchant_id'].astype('int32')
data['item_id'] = data['item_id'].astype('int32')
data['cat_id'] = data['cat_id'].astype('int32')
data['brand_id'].fillna(0, inplace=True)
data['brand_id'] = data['brand_id'].astype('int32')
data['time_stamp'] = pd.to_datetime(data['time_stamp'], format='%H%M')
# 缺失值填充
matrix['age_range'].fillna(0, inplace=True)
matrix['gender'].fillna(2, inplace=True)

# # gender用众数填充 表现更差
# matrix['gender'].fillna(matrix['gender'].mode()[0],inplace=True)
# # 年龄用中位数填充
# matrix['age_range'].fillna(matrix['age_range'].median(),inplace=True)

matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')

del data1, data2
gc.collect()

CPU times: user 487 ms, sys: 11.9 ms, total: 498 ms
Wall time: 496 ms


0

In [211]:
%%time

##### 特征处理
##### User特征处理
groups = data.groupby(['user_id'])
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
# 使用agg 基于列的聚合操作，统计唯一值个数 item_id, cat_id, merchant_id, brand_id
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
# 时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 统计操作类型为0，1，2，3的个数
temp = groups['action_type'].value_counts().unstack().reset_index().rename(  
    columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')

del temp
gc.collect()

CPU times: user 1.07 s, sys: 16 ms, total: 1.08 s
Wall time: 1.08 s


18

In [212]:
%%time

##### 商家特征处理
groups = data.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
#temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={
#    'user_id':'m2',
#    'item_id':'m3', 
#    'cat_id':'m4', 
#    'brand_id':'m5'})
temp = groups.agg({
    'user_id': 'nunique',
    'item_id': 'nunique',
    'cat_id': 'nunique',
    'brand_id': 'nunique'
}).reset_index().rename(columns={
    'user_id':'m2',
    'item_id':'m3',
    'cat_id':'m4',
    'brand_id':'m5'
})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(  
    columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

del temp
gc.collect()

CPU times: user 449 ms, sys: 91.9 ms, total: 541 ms
Wall time: 539 ms


0

In [213]:
# 按照merchant_id 统计随机负采样的个数
temp = data_train[data_train['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

In [214]:
%%time
# 用户+商户特征
groups = data.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
# 统计用户+商户交互的item_id, cat_id, brand_id 唯一值
temp = groups[['item_id', 'cat_id', 'brand_id']].nunique().reset_index().rename(columns={
    'item_id':'um2',
    'cat_id':'um3',
    'brand_id':'um4'
})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={
    0:'um5',
    1:'um6',
    2:'um7',
    3:'um8'
})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 用户+商户交互时间范围（最早和最晚）以及时间差（小时） um9
temp = groups['time_stamp'].agg([('frist', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['frist']).dt.seconds/3600
temp.drop(['frist', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

del temp
gc.collect()



CPU times: user 966 ms, sys: 312 ms, total: 1.28 s
Wall time: 1.28 s


18

In [215]:

matrix['r1'] = matrix['u9']/matrix['u7'] # 用户购买点击比
matrix['r2'] = matrix['m8']/matrix['m6'] # 商家购买点击比
matrix['r3'] = matrix['um7']/matrix['um5'] #不同用户不同商家购买点击比

In [216]:
matrix.fillna(0, inplace=True)

In [217]:
%%time
# # 修改age_range字段名称为 age_0, age_1, age_2... age_8
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

del temp
gc.collect()

CPU times: user 553 ms, sys: 119 ms, total: 672 ms
Wall time: 669 ms


0

In [218]:
# train、test-setdata
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)

if not LOCAL_QUICK:
    if FE_V1:
        train_data.to_csv('train_data.csv', index=False)
        test_data.to_csv('test_data.csv', index=False)
    if MORE_FE:
        train_data.to_csv('train_data_moreFE.csv', index=False)
        test_data.to_csv('test_data_moreFE.csv', index=False)

del matrix
gc.collect()

0

---

-----

## Load FeatureData

In [219]:
# get data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')  

# FeatureSelect_QUICK = True # Feature Select
FeatureSelect_QUICK = False 
if FeatureSelect_QUICK: # 使用部分样本进行快速特征选择
    train_data = train_data.sample(int(len(train_data) * sample_percent))

# train_data = train_data[train_col]
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

del train_data  #delete

X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=.2, random_state=42) # test_size=.3

### XGB Model

In [220]:
import pandas as pd
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv') 

#get data
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']
del train_data
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=.2, random_state=42) # test_size=.3

In [222]:
%%time
def xgb_train(X_train, y_train, X_valid, y_valid, verbose=True):
    model_xgb = xgb.XGBClassifier(
        max_depth=10, # raw8
        n_estimators=1000,
        min_child_weight=300, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42        
    )

    model_xgb.fit(
        X_train, 
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=verbose,
        early_stopping_rounds=10 # 早停法，如果auc在10epoch没有进步就stop
    )
    print(model_xgb.best_score)
    return model_xgb

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10.7 µs


In [223]:
from xgboost import XGBClassifier
#model_xgb = xgb_train(X_train, y_train, X_valid, y_valid, verbose=False)

# 用参数字典来设置
params = {
    'eval_metric': 'auc',  # 替换为你的评估指标
    'early_stopping_rounds': 10,  # 设置早停轮数
}

# 创建模型
model_xgb = XGBClassifier(**params)

# 训练模型
model_xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)


In [224]:
%%time
prob = model_xgb.predict_proba(test_data)

submission['prob'] = pd.Series(prob[:,1])
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_xgb.csv', index=False)

CPU times: user 3.46 s, sys: 465 ms, total: 3.92 s
Wall time: 1.36 s


### LGB Model

In [226]:
############DEF:lgb_train################
import lightgbm
from lightgbm import log_evaluation, early_stopping
callbacks = [log_evaluation(period=100), early_stopping(stopping_rounds=30)]

def lgb_train(X_train, y_train, X_valid, y_valid, verbose=True):
    model_lgb = lgb.LGBMClassifier(
        max_depth=10, # 8
        n_estimators=1000,
        min_child_weight=200, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42,   
    )
    model_lgb.fit(
        X_train, 
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        callbacks=callbacks
    )

    print(model_lgb.best_score_['valid_1']['auc'])
    return model_lgb

In [228]:
model_lgb = lgb_train(X_train, y_train, X_valid, y_valid, verbose=False)

[LightGBM] [Info] Number of positive: 12805, number of negative: 195886
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020851 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2230
[LightGBM] [Info] Number of data points in the train set: 208691, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061359 -> initscore=-2.727697
[LightGBM] [Info] Start training from score -2.727697
Training until validation scores don't improve for 30 rounds
[100]	training's auc: 0.674603	training's binary_logloss: 0.219783	valid_1's auc: 0.639178	valid_1's binary_logloss: 0.220016


[200]	training's auc: 0.691256	training's binary_logloss: 0.217616	valid_1's auc: 0.642301	valid_1's binary_logloss: 0.219693
Early stopping, best iteration is:
[221]	training's auc: 0.69402	training's binary_logloss: 0.217219	valid_1's auc: 0.642902	valid_1's binary_logloss: 0.219652
0.6429017625596379


In [229]:
%%time
prob = model_lgb.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:,1])
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_lgb.csv', index=False)

CPU times: user 5.11 s, sys: 136 ms, total: 5.24 s
Wall time: 1.18 s


### Cat Model

In [230]:
def cat_train(X_train, y_train, X_valid, y_valid, verbose=True):
    model_cat = cat.CatBoostClassifier(learning_rate=0.02, iterations=5000, eval_metric='AUC', od_wait=50,
                                od_type='Iter', random_state=10, thread_count=8, l2_leaf_reg=1, verbose=verbose)
    model_cat.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50,
            use_best_model=True)

    print(model_cat.best_score_['validation']['AUC'])
    return model_cat

In [231]:
model_cat = cat_train(X_train, y_train, X_valid, y_valid, verbose=False)

0.6403296560176217


In [232]:
%%time
prob = model_cat.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:,1])
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_cat.csv', index=False)

CPU times: user 1.04 s, sys: 44 ms, total: 1.08 s
Wall time: 478 ms


## StratifiedKFold

In [233]:
# 构造训练集和测试集
def get_train_testDF(train_df,label_df):
    skv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    trainX = []
    trainY = []
    testX = []
    testY = []
    for train_index, test_index in skv.split(X=train_df, y=label_df):
        train_x, train_y, test_x, test_y = train_df.iloc[train_index, :], label_df.iloc[train_index], \
                                            train_df.iloc[test_index, :], label_df.iloc[test_index]

        trainX.append(train_x)
        trainY.append(train_y)
        testX.append(test_x)
        testY.append(test_y)
    return trainX, testX, trainY, testY

### lightgbm

In [234]:
# get data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')  

train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

del train_data

# Split Train&Valid Data
X_train, X_valid, y_train, y_valid = get_train_testDF(train_X, train_y)

In [237]:
# 将训练数据集划分分别训练5个lgbm,xgboost和catboost 模型
# lightgbm模型
from lightgbm import log_evaluation, early_stopping
callbacks = [log_evaluation(period=100), early_stopping(stopping_rounds=30)]

pred_lgbms = []
for i in range(5):
    print('\n============================LGB training use Data {}/5============================\n'.format(i+1))
    model_lgb = lgb.LGBMClassifier(
        max_depth=10, # 8
        n_estimators=1000,
        min_child_weight=200, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42
    )

    model_lgb.fit(
        X_train[i], 
        y_train[i],
        eval_metric='auc',
        eval_set=[(X_train[i], y_train[i]), (X_valid[i], y_valid[i])],
        #verbose=False,
        callbacks=callbacks
    )

    print(model_lgb.best_score_['valid_1']['auc'])

    pred = model_lgb.predict_proba(test_data)
    pred = pd.DataFrame(pred[:,1])
    pred_lgbms.append(pred)
pred_lgbms = pd.concat(pred_lgbms, axis=1)
print(pred_lgbms)

submission['prob'] = pred_lgbms.mean(axis=1)
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_KFold_lgb.csv', index=False)

####0.6784



[LightGBM] [Info] Number of positive: 12762, number of negative: 195929
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2203
[LightGBM] [Info] Number of data points in the train set: 208691, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061153 -> initscore=-2.731280
[LightGBM] [Info] Start training from score -2.731280
Training until validation scores don't improve for 30 rounds
[100]	training's auc: 0.671775	training's binary_logloss: 0.21949	valid_1's auc: 0.637473	valid_1's binary_logloss: 0.222416


Early stopping, best iteration is:
[145]	training's auc: 0.680886	training's binary_logloss: 0.218308	valid_1's auc: 0.639898	valid_1's binary_logloss: 0.222152
0.6398983154350076


[LightGBM] [Info] Number of positive: 12762, number of negative: 195929
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011783 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2223
[LightGBM] [Info] Number of data points in the train set: 208691, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061153 -> initscore=-2.731280
[LightGBM] [Info] Start training from score -2.731280
Training until validation scores don't improve for 30 rounds
[100]	training's auc: 0.672737	training's binary_logloss: 0.21934	valid_1's auc: 0.637959	valid_1's binary_logloss: 0.222781


[200]	training's auc: 0.689669	training's binary_logloss: 0.217118	valid_1's auc: 0.642275	valid_1's binary_logloss: 0.222341
Early stopping, best iteration is:
[226]	training's auc: 0.692883	training's binary_logloss: 0.21664	valid_1's auc: 0.642607	valid_1's binary_logloss: 0.222296
0.6426067114193607


[LightGBM] [Info] Number of positive: 12761, number of negative: 195930
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2216
[LightGBM] [Info] Number of data points in the train set: 208691, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061148 -> initscore=-2.731364
[LightGBM] [Info] Start training from score -2.731364
Training until validation scores don't improve for 30 rounds


[100]	training's auc: 0.674327	training's binary_logloss: 0.219178	valid_1's auc: 0.632673	valid_1's binary_logloss: 0.223018


[200]	training's auc: 0.691698	training's binary_logloss: 0.216917	valid_1's auc: 0.635811	valid_1's binary_logloss: 0.222795
Early stopping, best iteration is:
[191]	training's auc: 0.690725	training's binary_logloss: 0.217045	valid_1's auc: 0.636148	valid_1's binary_logloss: 0.222766
0.6361478044602011


[LightGBM] [Info] Number of positive: 12761, number of negative: 195930
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013596 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2236
[LightGBM] [Info] Number of data points in the train set: 208691, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061148 -> initscore=-2.731364
[LightGBM] [Info] Start training from score -2.731364
Training until validation scores don't improve for 30 rounds
[100]	training's auc: 0.675315	training's binary_logloss: 0.219097	vali

[200]	training's auc: 0.692962	training's binary_logloss: 0.216717	valid_1's auc: 0.633409	valid_1's binary_logloss: 0.222765
Early stopping, best iteration is:
[187]	training's auc: 0.691048	training's binary_logloss: 0.21699	valid_1's auc: 0.633495	valid_1's binary_logloss: 0.222725
0.6334947695532307


[LightGBM] [Info] Number of positive: 12762, number of negative: 195930
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2237
[LightGBM] [Info] Number of data points in the train set: 208692, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061152 -> initscore=-2.731285
[LightGBM] [Info] Start training from score -2.731285
Training until validation scores don't improve for 30 rounds


[100]	training's auc: 0.673837	training's binary_logloss: 0.219255	valid_1's auc: 0.63489	valid_1's binary_logloss: 0.222562
[200]	training's auc: 0.689233	training's binary_logloss: 0.217199	valid_1's auc: 0.637898	valid_1's binary_logloss: 0.222316


Early stopping, best iteration is:
[247]	training's auc: 0.695252	training's binary_logloss: 0.216366	valid_1's auc: 0.639692	valid_1's binary_logloss: 0.2222
0.6396922726011948
               0         0         0         0         0
0       0.069143  0.053747  0.052295  0.066155  0.069341
1       0.045803  0.037051  0.048623  0.061303  0.051517
2       0.114392  0.067959  0.090216  0.077855  0.095525
3       0.087822  0.090112  0.085717  0.090536  0.088279
4       0.025816  0.023472  0.024355  0.031275  0.019125
...          ...       ...       ...       ...       ...
261472  0.070687  0.068668  0.084018  0.096291  0.061178
261473  0.030407  0.022423  0.031843  0.028308  0.022014
261474  0.053129  0.047989  0.046330  0.036327  0.031782
261475  0.047586  0.046768  0.054266  0.043785  0.072719
261476  0.105139  0.098082  0.094315  0.085894  0.076469

[261477 rows x 5 columns]


### catgbm

In [238]:
# get data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')  

train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

del train_data

# Split Train&Valid Data
X_train, X_valid, y_train, y_valid = get_train_testDF(train_X, train_y)

In [239]:
# 将训练数据集划分分别训练5个lgbm,xgboost和catboost 模型
# catgbm模型

pred_cats = []
for i in range(5):
    print('\n============================CAT training use Data {}/5============================\n'.format(i+1))
    model_cat = cat.CatBoostClassifier(learning_rate=0.02, iterations=5000, eval_metric='AUC', od_wait=50,
                                od_type='Iter', random_state=10, thread_count=8, l2_leaf_reg=1, verbose=False)
    model_cat.fit(X_train[i], y_train[i], eval_set=[(X_valid[i], y_valid[i])], early_stopping_rounds=50,
            use_best_model=True)
    # print(model_cat.evals_result_)
    print(model_cat.best_score_['validation']['AUC'])

    pred = model_cat.predict_proba(test_data)
    pred = pd.DataFrame(pred[:,1])
    pred_cats.append(pred)
pred_cats = pd.concat(pred_cats, axis=1)

submission['prob'] = pred_cats.mean(axis=1)
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_KFold_cat.csv', index=False)


#### 0.68001



0.6383870784419673


0.6323552243862738


0.6340716831735821


0.6312299201462874


0.6372684086240368


### xgboost

In [240]:
# get data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')  

train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

del train_data

# Split Train&Valid Data
X_train, X_valid, y_train, y_valid = get_train_testDF(train_X, train_y)

In [241]:
# 将训练数据集划分分别训练5个lgbm,xgboost和catboost 模型
# xgboost模型

pred_xgbs = []
for i in range(5):
    print('\n============================XGB training use Data {}/5============================\n'.format(i+1))
    model_xgb = xgb.XGBClassifier(
        max_depth=10, # raw8
        n_estimators=1000,
        min_child_weight=300, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42        
    )

    model_xgb.fit(
        X_train[i], 
        y_train[i],
        eval_metric='auc',
        eval_set=[(X_train[i], y_train[i]), (X_valid[i], y_valid[i])],
        verbose=False,
        early_stopping_rounds=10 # 早停法，如果auc在10epoch没有进步就stop
    )    

    print(model_xgb.best_score)

    pred = model_xgb.predict_proba(test_data)
    pred = pd.DataFrame(pred[:,1])
    pred_xgbs.append(pred)
pred_xgbs = pd.concat(pred_xgbs, axis=1)

# make submission
submission['prob'] = pred_xgbs.mean(axis=1)
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_KFold_xgb.csv', index=False)

#### 0.6803







0.6346373641114181






0.6411107858608998






0.6347505695432526






0.6326984339414343






0.6295258132697713


In [242]:
"""
xgb:0.689278, ##KFold## 0.6784
lgb:0.688217, ##KFold## 0.6800
cat:0.688843, ##KFold## 0.6803
"""

'\nxgb:0.689278, ##KFold## 0.6784\nlgb:0.688217, ##KFold## 0.6800\ncat:0.688843, ##KFold## 0.6803\n'

Blending

In [243]:
lgb6812 = pd.read_csv("submission_lgb0.6812968.csv")
xgb6787 = pd.read_csv("submission_xgb0.6787.csv")
cat6777 = pd.read_csv("submission_cat-val0.6827785215-onling0.6777246.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'submission_lgb0.6812968.csv'

In [None]:
# 先构造一个矩阵
df = np.array([lgb6812.prob, xgb6787.prob, cat6777.prob])
# 计算协方差矩阵
np.corrcoef(df)

In [None]:
sub = lgb6812.copy()

sub.prob = 0.6*lgb6812.prob + 0.4*cat6777.prob # Online test score:0.6830807
sub.to_csv('./sub_blended11.csv', index=False)
####################################0.6833209################################
sub.prob = 0.5*lgb6812.prob + 0.3*cat6777.prob + 0.2*xgb6787.prob# Online test 0.6833209
sub.to_csv('./sub_blended12.csv', index=False)

sub.prob = 0.45*lgb6812.prob + 0.3*cat6777.prob + 0.25*xgb6787.prob# Online test 0.6832934
sub.to_csv('./sub_blended13.csv', index=False)
####################################0.6833171################################
sub.prob = 0.45*lgb6812.prob + 0.35*cat6777.prob + 0.2*xgb6787.prob# Online test 0.6833171
sub.to_csv('./sub_blended14.csv', index=False)