In [1]:
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from matplotlib import pyplot as plt
from tqdm import tqdm
import time
import gc
import numpy as np
from scipy.stats import entropy
from gensim.models import Word2Vec
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [2]:
def reduce_mem(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype holding their range.

    The frame is modified in place (columns are reassigned) and also returned.
    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched, as are columns whose
    min or max is null (e.g. all-NaN columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits are
        stored as float16, which keeps only about 3 significant decimal digits.
        Pass False to stop at float32 when that loss of precision matters
        (columns that are already float16 are never widened by this flag).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Side effects
    ------------
    Prints the resulting memory usage, the percentage saved and the time spent.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to a dtype's min/max still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Half precision is only offered when allowed, or when the column
            # already is float16 (so the flag never makes a column larger).
            if use_float16 or str(col_type) == 'float16':
                float_candidates = (np.float16, np.float32)
            else:
                float_candidates = (np.float32,)
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range exceeds float32 (this includes +/-inf): keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a frame that reports zero bytes.
    reduction = 100*(start_mem-end_mem)/start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           reduction,
                                                                                                           (time.time()-starttime)/60))
    return df

In [3]:
# Build a 1:2 (label 1 : label 0) down-sampled training set.
# Every random draw below is seeded from MY_SEED (the final shuffle uses MY_SEED + 1).
MY_SEED = 2007
N_POS_SAMPLES = 1972420  # rows drawn from label == 1; twice as many are drawn from label == 0

train_df = pd.read_pickle('../../data/train_data.pkl')

train_label_0 = (
    train_df[train_df['label'] == 0]
    .sample(n=N_POS_SAMPLES * 2, random_state=MY_SEED)
    .reset_index(drop=True)
)
train_label_1 = (
    train_df[train_df['label'] == 1]
    .sample(n=N_POS_SAMPLES, random_state=MY_SEED)
    .reset_index(drop=True)
)
print(train_label_0.shape, train_label_1.shape)

# Stack negatives on top of positives, then shuffle the rows.
train = (
    pd.concat([train_label_0, train_label_1])
    .reset_index(drop=True)
    .sample(frac=1, random_state=MY_SEED + 1)
    .reset_index(drop=True)
)

(3944840, 36) (1972420, 36)


In [4]:
# Single-column frame holding the sampled labels; the column is named
# 'probability' even though it stores train.label.
train_label = reduce_mem(pd.DataFrame({'probability': train.label}))
# Optional persistence / preview (disabled):
#   train_label.to_csv('train_label.csv', index=False)
#   train_label.head(5)

-- Mem. usage decreased to  5.64 Mb (0.0% reduction),time spend:0.00 min


In [5]:
# Sanity check: (rows, columns) of the label frame built above.
train_label.shape

(5917260, 1)

In [6]:
# Training-set predictions written by each base model, one CSV per model.
train_model1, train_model2, train_model3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './3-cat_seed_+2007_train_pred__0.8183666881489253.csv',
        './cat_seed_+2007_train_pred__0.8159074903737941.csv',
        './cat_seed_+2007_train_pred__0.8180868298927768.csv',
    )
)
# The bare string below is echoed as the cell result; it reads
# "add the training predictions of every further model in this same way".
'''
以此这样把所有模型的训练预测值给添加进去
'''

'\n以此这样把所有模型的训练预测值给添加进去\n'

In [7]:
# Test-set predictions written by each base model, one CSV per model.
test_model1, test_model2, test_model3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './3-cat_seed_+2007_test_pred_0.8183666881489253.csv',
        './cat_seed_+2007_test_pred_0.8159074903737941.csv',
        './cat_seed_+2007_test_pred_0.8180868298927768.csv',
    )
)
# The bare string below is echoed as the cell result; it reads
# "add the test predictions of every further model in this same way".
'''
以此这样把所有模型的测试预测值给添加进去
'''

'\n以此这样把所有模型的测试预测值给添加进去\n'

In [10]:
# Meta-features for stacking: one column per base model (m1..m3).
# Note that train_df is rebound here; it no longer refers to the raw training frame.
train_parts = [train_model1, train_model2, train_model3]
test_parts = [model['probability'] for model in (test_model1, test_model2, test_model3)]

train_df = pd.concat(train_parts, axis=1)
train_df.columns = ['m1', 'm2', 'm3']

test_df = pd.concat(test_parts, axis=1)
test_df.columns = ['m1', 'm2', 'm3']

print(train_df.shape, test_df.shape)

(5917260, 3) (2000000, 3)


In [11]:
# Preview the first rows of the stacked training features (columns m1, m2, m3).
train_df.head()

Unnamed: 0,m1,m2,m3
0,0.1146,0.10126,0.089653
1,0.1294,0.1512,0.181146
2,0.4897,0.4539,0.495355
3,0.4436,0.453,0.488323
4,0.879,0.855,0.876855


In [12]:
# Preview the first rows of the stacked test features (columns m1, m2, m3).
test_df.head()

Unnamed: 0,m1,m2,m3
0,0.05188,0.05658,0.057624
1,0.3816,0.356,0.383842
2,0.268,0.2498,0.338297
3,0.0765,0.10034,0.066307
4,0.557,0.63,0.539774


In [13]:
# Stacking target: the 'probability' column of train_label, which was filled from train.label.
target = train_label['probability']

# CatBoost (cbt) stacking

In [14]:
# Five-fold stratified cross-validation for the CatBoost meta-learner.
#   oof        - out-of-fold prediction for every training row
#   stack_test - test prediction averaged over the folds, kept as an (n_test, 1) column
N_SPLITS = 5  # single source of truth: used for the splitter and for the test average
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=15)
stack_test = np.zeros((len(test_df), 1))
oof = np.zeros(len(train_df))

for fold_, (trn, val) in enumerate(folds.split(train_df, target)):
    print("fold n°{}".format(fold_))
    trn_x, trn_y = train_df.iloc[trn], target.iloc[trn]
    val_x, val_y = train_df.iloc[val], target.iloc[val]

    clf = CatBoostClassifier(iterations=10000, depth=6, learning_rate=0.01, l2_leaf_reg=50, loss_function='Logloss',
                             verbose=True, eval_metric='AUC', counter_calc_method='Full', task_type='GPU', devices='0-3',
                             metric_period=50)
    # Early stopping on the validation fold; use_best_model keeps the best iteration found.
    clf.fit(
        trn_x, trn_y.astype('int32'),
        eval_set=[(val_x, val_y.astype('int32'))],
        early_stopping_rounds=50,
        verbose=True,
        use_best_model=True)

    # Column 1 of predict_proba is taken as the score for label 1.
    oof[val] = clf.predict_proba(val_x)[:, 1]
    # Each fold contributes an equal share to the averaged test prediction.
    stack_test += clf.predict_proba(test_df)[:, 1].reshape(-1, 1) / N_SPLITS

fold n°0
0:	learn: 0.8112265	test: 0.8105845	best: 0.8105845 (0)	total: 36.1ms	remaining: 6m
50:	learn: 0.8189832	test: 0.8185407	best: 0.8185407 (50)	total: 1.01s	remaining: 3m 17s
100:	learn: 0.8194079	test: 0.8189740	best: 0.8189744 (99)	total: 1.97s	remaining: 3m 13s
150:	learn: 0.8194913	test: 0.8190612	best: 0.8190612 (150)	total: 2.94s	remaining: 3m 11s
200:	learn: 0.8195128	test: 0.8190846	best: 0.8190846 (200)	total: 3.87s	remaining: 3m 8s
250:	learn: 0.8195294	test: 0.8191004	best: 0.8191006 (249)	total: 4.83s	remaining: 3m 7s
300:	learn: 0.8195406	test: 0.8191082	best: 0.8191082 (300)	total: 5.78s	remaining: 3m 6s
350:	learn: 0.8195504	test: 0.8191168	best: 0.8191168 (350)	total: 6.72s	remaining: 3m 4s
400:	learn: 0.8195587	test: 0.8191229	best: 0.8191229 (400)	total: 7.59s	remaining: 3m 1s
450:	learn: 0.8195670	test: 0.8191290	best: 0.8191290 (450)	total: 8.47s	remaining: 2m 59s
500:	learn: 0.8195757	test: 0.8191352	best: 0.8191352 (500)	total: 9.34s	remaining: 2m 57s
550:	

2150:	learn: 0.8196306	test: 0.8194630	best: 0.8194631 (2139)	total: 37.4s	remaining: 2m 16s
2200:	learn: 0.8196326	test: 0.8194632	best: 0.8194632 (2192)	total: 38.3s	remaining: 2m 15s
2250:	learn: 0.8196349	test: 0.8194634	best: 0.8194634 (2250)	total: 39.1s	remaining: 2m 14s
2300:	learn: 0.8196373	test: 0.8194633	best: 0.8194635 (2285)	total: 40s	remaining: 2m 13s
2350:	learn: 0.8196393	test: 0.8194636	best: 0.8194636 (2337)	total: 40.9s	remaining: 2m 12s
2400:	learn: 0.8196416	test: 0.8194641	best: 0.8194641 (2397)	total: 41.7s	remaining: 2m 11s
2450:	learn: 0.8196437	test: 0.8194645	best: 0.8194645 (2448)	total: 42.5s	remaining: 2m 11s
2500:	learn: 0.8196458	test: 0.8194645	best: 0.8194647 (2480)	total: 43.4s	remaining: 2m 10s
2550:	learn: 0.8196482	test: 0.8194647	best: 0.8194648 (2546)	total: 44.3s	remaining: 2m 9s
2600:	learn: 0.8196502	test: 0.8194650	best: 0.8194650 (2577)	total: 45.2s	remaining: 2m 8s
bestTest = 0.8194650114
bestIteration = 2577
Shrink model to first 2578 it

1600:	learn: 0.8196464	test: 0.8192925	best: 0.8192928 (1573)	total: 27.8s	remaining: 2m 25s
bestTest = 0.8192928135
bestIteration = 1573
Shrink model to first 1574 iterations.
fold n°4
0:	learn: 0.8108555	test: 0.8110279	best: 0.8110279 (0)	total: 32.3ms	remaining: 5m 23s
50:	learn: 0.8188542	test: 0.8192144	best: 0.8192144 (50)	total: 906ms	remaining: 2m 56s
100:	learn: 0.8192651	test: 0.8196292	best: 0.8196292 (100)	total: 1.78s	remaining: 2m 55s
150:	learn: 0.8193276	test: 0.8196915	best: 0.8196915 (150)	total: 2.66s	remaining: 2m 53s
200:	learn: 0.8193562	test: 0.8197163	best: 0.8197165 (198)	total: 3.54s	remaining: 2m 52s
250:	learn: 0.8193729	test: 0.8197318	best: 0.8197318 (249)	total: 4.41s	remaining: 2m 51s
300:	learn: 0.8193841	test: 0.8197418	best: 0.8197418 (300)	total: 5.28s	remaining: 2m 50s
350:	learn: 0.8193930	test: 0.8197490	best: 0.8197490 (350)	total: 6.15s	remaining: 2m 49s
400:	learn: 0.8194019	test: 0.8197547	best: 0.8197547 (399)	total: 7.03s	remaining: 2m 48s


In [15]:
# Score the out-of-fold predictions, then save the fold-averaged test predictions.
auc_score = roc_auc_score(target, oof)
print("AUC Score (Valid): %f" % auc_score)
print('开始储存')  # runtime message: "start saving"

n_test = len(test_df)
res = pd.DataFrame({'id': list(range(1, n_test + 1))})  # ids are 1..n_test in row order
res['probability'] = stack_test

# The file name records both the sampling seed and the validation AUC.
stacking_csv = 'cbt_stacking_seed_{}_{}.csv'.format(MY_SEED, auc_score)
res.to_csv(stacking_csv, index=False)
res.head(5)

AUC Score (Valid): 0.819499
开始储存


Unnamed: 0,id,probability
0,1,0.055068
1,2,0.385407
2,3,0.294157
3,4,0.071076
4,5,0.571922


In [15]:
# Shrink the result frame, then write the submission file.
# NOTE(review): reduce_mem stores float columns as float16 when their range fits,
# so 'probability' is written with roughly 3 significant digits - confirm that is
# acceptable for a ranking metric such as AUC.
res = reduce_mem(res)
# The file name is fixed; the previous .format(auc_score) had no placeholder to fill.
res.to_csv('submission.csv', index=False)

-- Mem. usage decreased to 11.44 Mb (62.5% reduction),time spend:0.00 min
