In [23]:
import pandas as pd
from tqdm import tqdm
import warnings
import gc
import os
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import time
from itertools import combinations

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

In [24]:
def reduce_mem(df):
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           100*(start_mem-end_mem)/start_mem,
                                                                                                           (time.time()-starttime)/60))
    return df

In [25]:
seed = 4096
train = pd.read_csv('/home/mw/work/sub_lgbm/lgbm_seed_1024_train_0.9028028905199429.csv')
train.shape, train.head()

((684283, 3),
                                           carid  y1_is_purchase  probability
 0  WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=             0.0     0.000019
 1  DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=             1.0     0.886475
 2  waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=             1.0     0.840072
 3  nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=             1.0     0.373198
 4  LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=             1.0     0.995718)

In [26]:
train_label = pd.DataFrame()
train_label['probability'] = train['y1_is_purchase']
train_label = reduce_mem(train_label)
# train_label.to_csv('train_label.csv',index = False)
train_label.shape, train_label.head(5)

-- Mem. usage decreased to  1.31 Mb (75.0% reduction),time spend:0.00 min


((684283, 1),
    probability
 0          0.0
 1          1.0
 2          1.0
 3          1.0
 4          1.0)

In [27]:
train_model1 = pd.read_csv('/home/mw/work/sub_lgbm/lgbm_seed_1024_train_0.9028028905199429.csv')
train_model2 = pd.read_csv('/home/mw/work/sub_lgbm/lgbm_seed_1024_train_0.9032879253690116.csv')
train_model3 = pd.read_csv('/home/mw/work/sub_catboost/cat_seed_+2048_train_pred_0.9021236821311975.csv')
train_model4 = pd.read_csv('/home/mw/work/sub_catboost/cat_seed_+2048_train_pred_0.9025256728444977.csv')
'''
以此这样把所有模型的训练预测值给添加进去
'''
train_model1.head(), train_model2.head(), train_model3.head(), train_model4.head()

(                                          carid  y1_is_purchase  probability
 0  WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=             0.0     0.000019
 1  DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=             1.0     0.886475
 2  waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=             1.0     0.840072
 3  nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=             1.0     0.373198
 4  LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=             1.0     0.995718,
                                           carid  y1_is_purchase  probability
 0  WY4N+MOjfIx8wJ3j6GhlA4qEfL71brEUkqbB0SSdqkI=             0.0     0.000018
 1  DXMuODygH0ddFea7SIoAOhF4134Bx4TPvkkPi6WCdzU=             1.0     0.890169
 2  waWCEYZJqj9PYxFdVeVLkpCNf/n0BdXPFi1iHlk0WWk=             1.0     0.836925
 3  nyRm/VviYGDpy2errRWE206SaYkVuqeclusAtXEU9v8=             1.0     0.480884
 4  LacSDMaoqD0AJRqCeYaGUu343r4NQiVuiFc9hyjLcMI=             1.0     0.996619,
                                           carid  y1_is_purcha

In [28]:
test_model1 = pd.read_csv('/home/mw/work/sub_lgbm/lgbm_seed_1024_test_0.9028028905199429.csv')
test_model2 = pd.read_csv('/home/mw/work/sub_lgbm/lgbm_seed_1024_test_0.9032879253690116.csv')
test_model3 = pd.read_csv('/home/mw/work/sub_catboost/cat_seed_+2048_test_0.9021236821311975.csv')
test_model4 = pd.read_csv('/home/mw/work/sub_catboost/cat_seed_+2048_test_0.9025256728444977.csv')
'''
以此这样把所有模型的测试预测值给添加进去
'''
test_model1.head(), test_model2.head(), test_model3.head(), test_model4.head()

(                                          carid     label
 0  FbOikOdqe5f3mRYDAgnBH2PwI5I+egmzWyNwjmgAuWs=  0.000022
 1  WTO/cku1nHO592k9j56on2UzMmx8OLhw8peccj1m13I=  0.600571
 2  ow79MMeuFgFY92UOVjaECsaNPl5cRXAi3M5ZsB4Rt/s=  0.302118
 3  nuO8DDjdXKFMt5Of70LlXMlFoLDX0OMSSBYnNYnqTyQ=  0.900200
 4  j4gIDul5h/7IBEYq4y8oAr2+tSWj/NdsIFbGzDtpTsk=  0.538454,
                                           carid     label
 0  FbOikOdqe5f3mRYDAgnBH2PwI5I+egmzWyNwjmgAuWs=  0.000022
 1  WTO/cku1nHO592k9j56on2UzMmx8OLhw8peccj1m13I=  0.596818
 2  ow79MMeuFgFY92UOVjaECsaNPl5cRXAi3M5ZsB4Rt/s=  0.314337
 3  nuO8DDjdXKFMt5Of70LlXMlFoLDX0OMSSBYnNYnqTyQ=  0.895044
 4  j4gIDul5h/7IBEYq4y8oAr2+tSWj/NdsIFbGzDtpTsk=  0.494623,
                                           carid     label
 0  FbOikOdqe5f3mRYDAgnBH2PwI5I+egmzWyNwjmgAuWs=  0.000144
 1  WTO/cku1nHO592k9j56on2UzMmx8OLhw8peccj1m13I=  0.525896
 2  ow79MMeuFgFY92UOVjaECsaNPl5cRXAi3M5ZsB4Rt/s=  0.303149
 3  nuO8DDjdXKFMt5Of70LlXMlFoLDX0OMSSBYnNYnqTyQ=  0.90

In [29]:
train_df = pd.concat([train_model1['probability'],train_model2['probability'],train_model3['probability'],train_model4['probability']],axis=1)
test_df = pd.concat([test_model1['label'],test_model2['label'],test_model3['label'],test_model4['label']],axis=1)
# train_df = pd.concat([train_model1['probability'],train_model2['probability']],axis=1)
# test_df = pd.concat([test_model1['label'],test_model2['label']],axis=1)
train_df.columns = ['m1', 'm2', 'm3', 'm4']
test_df.columns = ['m1', 'm2', 'm3', 'm4']
print(train_df.shape, test_df.shape)

(684283, 4) (80110, 4)


In [30]:
train_df.head()

Unnamed: 0,m1,m2,m3,m4
0,1.9e-05,1.8e-05,0.000121,4.3e-05
1,0.886475,0.890169,0.894851,0.882734
2,0.840072,0.836925,0.808837,0.794469
3,0.373198,0.480884,0.450337,0.559115
4,0.995718,0.996619,0.995317,0.995761


In [31]:
test_df.head()

Unnamed: 0,m1,m2,m3,m4
0,2.2e-05,2.2e-05,0.000144,9.3e-05
1,0.600571,0.596818,0.525896,0.494446
2,0.302118,0.314337,0.303149,0.297659
3,0.9002,0.895044,0.903002,0.888109
4,0.538454,0.494623,0.517146,0.545792


In [32]:
target = train_label['probability']
target.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: probability, dtype: float16

# cbt stacking

In [33]:
#   五折交叉
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
# folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
# oof =  np.zeros(len(train_df)) 
# predictions =np.zeros(len(test_df))
# y_train = train_df['label']
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
stack_test = np.zeros((len(test_df),1))
oof =  np.zeros(len(train_df)) 

for fold_, (trn, val) in enumerate(folds.split(train_df, target)):
    print("fold n°{}".format(fold_))
    trn_x = train_df.iloc[trn]
    trn_y = target.iloc[trn]
    val_x = train_df.iloc[val]
    val_y = target.iloc[val]

    clf = CatBoostClassifier(iterations=10000, depth=6,learning_rate=0.08, l2_leaf_reg=50, loss_function='Logloss'
                        ,verbose=True,eval_metric='AUC',counter_calc_method='Full',task_type='CPU',devices='0-3',metric_period=50)
    clf.fit(
    trn_x, trn_y.astype('int32'),
    eval_set=[(val_x,val_y.astype('int32'))],
    early_stopping_rounds=50,
    verbose=True,
    use_best_model=True)
    
    oof[val] = clf.predict_proba(train_df.iloc[val])[:, 1]
    stack_test += clf.predict_proba(test_df)[:, 1].reshape(-1,1)/10

fold n°0




0:	test: 0.8945275	best: 0.8945275 (0)	total: 75.6ms	remaining: 12m 35s
50:	test: 0.9043437	best: 0.9043455 (46)	total: 5.57s	remaining: 18m 7s
100:	test: 0.9043811	best: 0.9043811 (100)	total: 11.2s	remaining: 18m 22s
150:	test: 0.9043986	best: 0.9044024 (126)	total: 16.8s	remaining: 18m 18s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9044023949
bestIteration = 126

Shrink model to first 127 iterations.
fold n°1




0:	test: 0.8941574	best: 0.8941574 (0)	total: 88.9ms	remaining: 14m 48s
50:	test: 0.9038922	best: 0.9038922 (50)	total: 5.51s	remaining: 17m 54s
100:	test: 0.9039556	best: 0.9039556 (100)	total: 11.1s	remaining: 18m 6s
150:	test: 0.9039632	best: 0.9039758 (130)	total: 16.7s	remaining: 18m 7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9039757789
bestIteration = 130

Shrink model to first 131 iterations.
fold n°2




0:	test: 0.8949353	best: 0.8949353 (0)	total: 89.5ms	remaining: 14m 55s
50:	test: 0.9042971	best: 0.9042971 (50)	total: 5.51s	remaining: 17m 55s
100:	test: 0.9043388	best: 0.9043388 (100)	total: 11s	remaining: 18m
150:	test: 0.9043585	best: 0.9043629 (138)	total: 16.6s	remaining: 18m 4s
200:	test: 0.9043656	best: 0.9043679 (189)	total: 22.3s	remaining: 18m 7s
250:	test: 0.9043670	best: 0.9043739 (233)	total: 27.9s	remaining: 18m 3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9043739128
bestIteration = 233

Shrink model to first 234 iterations.
fold n°3




0:	test: 0.8940223	best: 0.8940223 (0)	total: 109ms	remaining: 18m 7s
50:	test: 0.9039366	best: 0.9039366 (50)	total: 5.63s	remaining: 18m 19s
100:	test: 0.9039834	best: 0.9039834 (100)	total: 11.3s	remaining: 18m 29s
150:	test: 0.9039962	best: 0.9039997 (143)	total: 16.8s	remaining: 18m 16s
200:	test: 0.9040021	best: 0.9040038 (197)	total: 22.6s	remaining: 18m 21s
250:	test: 0.9039940	best: 0.9040122 (222)	total: 28.4s	remaining: 18m 23s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9040122001
bestIteration = 222

Shrink model to first 223 iterations.
fold n°4




0:	test: 0.8935654	best: 0.8935654 (0)	total: 101ms	remaining: 16m 52s
50:	test: 0.9031924	best: 0.9031924 (50)	total: 5.52s	remaining: 17m 57s
100:	test: 0.9032074	best: 0.9032123 (73)	total: 11.1s	remaining: 18m 9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9032122674
bestIteration = 73

Shrink model to first 74 iterations.
fold n°5




0:	test: 0.8950810	best: 0.8950810 (0)	total: 118ms	remaining: 19m 35s
50:	test: 0.9043591	best: 0.9043609 (49)	total: 5.55s	remaining: 18m 2s
100:	test: 0.9044048	best: 0.9044048 (100)	total: 11.1s	remaining: 18m 11s
150:	test: 0.9044220	best: 0.9044228 (149)	total: 16.7s	remaining: 18m 12s
200:	test: 0.9044032	best: 0.9044229 (167)	total: 22.5s	remaining: 18m 18s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9044229009
bestIteration = 167

Shrink model to first 168 iterations.
fold n°6




0:	test: 0.8935645	best: 0.8935645 (0)	total: 75.7ms	remaining: 12m 37s
50:	test: 0.9032716	best: 0.9032716 (50)	total: 5.65s	remaining: 18m 22s
100:	test: 0.9033553	best: 0.9033553 (100)	total: 11.2s	remaining: 18m 14s
150:	test: 0.9033655	best: 0.9033722 (134)	total: 16.9s	remaining: 18m 19s
200:	test: 0.9033792	best: 0.9033904 (181)	total: 22.6s	remaining: 18m 23s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9033904475
bestIteration = 181

Shrink model to first 182 iterations.
fold n°7




0:	test: 0.8935308	best: 0.8935308 (0)	total: 86.5ms	remaining: 14m 24s
50:	test: 0.9029743	best: 0.9029743 (50)	total: 5.68s	remaining: 18m 27s
100:	test: 0.9030023	best: 0.9030045 (96)	total: 11.3s	remaining: 18m 24s
150:	test: 0.9030026	best: 0.9030070 (108)	total: 16.8s	remaining: 18m 14s
200:	test: 0.9029893	best: 0.9030073 (157)	total: 22.5s	remaining: 18m 15s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9030073324
bestIteration = 157

Shrink model to first 158 iterations.
fold n°8




0:	test: 0.8944626	best: 0.8944626 (0)	total: 83ms	remaining: 13m 49s
50:	test: 0.9037846	best: 0.9037894 (40)	total: 5.51s	remaining: 17m 55s
100:	test: 0.9037887	best: 0.9037907 (52)	total: 11.1s	remaining: 18m 6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9037907418
bestIteration = 52

Shrink model to first 53 iterations.
fold n°9




0:	test: 0.8944056	best: 0.8944056 (0)	total: 87.2ms	remaining: 14m 31s
50:	test: 0.9037159	best: 0.9037159 (50)	total: 5.5s	remaining: 17m 53s
100:	test: 0.9037311	best: 0.9037351 (93)	total: 11.1s	remaining: 18m 6s
150:	test: 0.9037286	best: 0.9037453 (123)	total: 16.8s	remaining: 18m 14s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9037453028
bestIteration = 123

Shrink model to first 124 iterations.


In [34]:
auc_score =roc_auc_score(target, oof)
print("AUC Score (Valid): %f" % auc_score)
print('开始储存')
res = pd.DataFrame()
res['carid'] = test_model1.carid
res['label'] = stack_test
res.to_csv('cbt_stacking_seed_{}_{}.csv'.format(seed, auc_score),index = False)
res.head(5)

AUC Score (Valid): 0.903814
开始储存


Unnamed: 0,carid,label
0,FbOikOdqe5f3mRYDAgnBH2PwI5I+egmzWyNwjmgAuWs=,0.000741
1,WTO/cku1nHO592k9j56on2UzMmx8OLhw8peccj1m13I=,0.552092
2,ow79MMeuFgFY92UOVjaECsaNPl5cRXAi3M5ZsB4Rt/s=,0.297355
3,nuO8DDjdXKFMt5Of70LlXMlFoLDX0OMSSBYnNYnqTyQ=,0.900674
4,j4gIDul5h/7IBEYq4y8oAr2+tSWj/NdsIFbGzDtpTsk=,0.502277


In [35]:
res=reduce_mem(res)
res.to_csv('submission.csv'.format(auc_score),index = False)

-- Mem. usage decreased to  0.76 Mb (37.5% reduction),time spend:0.00 min


In [39]:
# !wget -nv -O heywhale_submit https://cdn.kesci.com/submit_tool/v4/heywhale_submit&&chmod +x heywhale_submit
# !./heywhale_submit -token ff040f9de88e681b -file /home/mw/work/cbt_stacking_seed_1234_0.9038400755756564.csv

2021-06-02 06:38:02 URL:https://cdn.kesci.com/submit_tool/v4/heywhale_submit [7357446/7357446] -> "heywhale_submit" [1]
Heywhale Submit Tool 4.0.0

> 已验证Token
> 提交文件 /home/mw/work/cbt_stacking_seed_1234_0.9038400755756564.csv (5051.50 KiB), Target Qiniu
> 已上传 100 %
> 文件已上传        
> 服务器响应: 200 提交成功，请等待评审完成
> 提交完成


In [37]:
target[:3], target[-3:]

(0    0.0
 1    1.0
 2    1.0
 Name: probability, dtype: float16,
 684280    0.0
 684281    0.0
 684282    1.0
 Name: probability, dtype: float16)

In [38]:
oof

array([2.57909848e-04, 9.06028937e-01, 8.17644872e-01, ...,
       3.95553485e-01, 6.25067913e-01, 8.63788174e-01])