In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm,tqdm_notebook 
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from catboost import CatBoostClassifier
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from scipy import sparse
import warnings
import time
import sys
import os
import gc
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',100)

## 内存优化

In [2]:
%%time
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

new_transactions = pd.read_csv('../../data/new_merchant_transactions.csv', parse_dates=['purchase_date'])
historical_transactions = pd.read_csv('../../data/historical_transactions.csv', parse_dates=['purchase_date'])
for col in ['authorized_flag', 'category_1']:
    historical_transactions[col] = historical_transactions[col].map({'Y':1, 'N':0})
    new_transactions[col]        = new_transactions[col].map({'Y':1, 'N':0})

CPU times: user 48.4 s, sys: 11.6 s, total: 60 s
Wall time: 1min 5s


## 预处理

In [3]:
%%time
## 加载训练集，测试集，基本处理
train = pd.read_csv('../../data/train.csv')
test = pd.read_csv('../../data/test.csv')

target = train['target']
for df in [train, test]:    
    df['year']  = df['first_active_month'].fillna('0-0').apply(lambda x:int(str(x).split('-')[0]))
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['elapsed_time'] = (datetime.date(2018,3, 1) - df['first_active_month'].dt.date).dt.days
    
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['weekofyear'] = df['first_active_month'].dt.weekofyear
    df['dayofyear'] = df['first_active_month'].dt.dayofyear
    df['month'] = df['first_active_month'].dt.month
    
## 交易表合并train test
train_test = pd.concat([train[['card_id','first_active_month']], test[['card_id','first_active_month']] ], axis=0, ignore_index=True)
historical_transactions   = historical_transactions.merge(train_test[['card_id','first_active_month']], on=['card_id'], how='left')
new_transactions = new_transactions.merge(train_test[['card_id','first_active_month']], on=['card_id'], how='left')


CPU times: user 10.8 s, sys: 5.78 s, total: 16.6 s
Wall time: 17.4 s


In [4]:
%%time
def month_trans(x): 
    return x // 30

def week_trans(x): 
    return x // 7

## 交易表预处理
def get_expand_common(df_):
    df = df_.copy()
    
    df['category_2'].fillna(1.0,inplace=True)
    df['category_3'].fillna('A',inplace=True)
    df['category_3'] = df['category_3'].map({'A':0, 'B':1, 'C':2})
    df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
    df['installments'].replace(-1, np.nan,inplace=True)
    df['installments'].replace(999, np.nan,inplace=True)
    
    df['purchase_amount'] = np.round(df['purchase_amount'] / 0.00150265118 + 497.06,8)
    df['purchase_amount'] = df.purchase_amount.apply(lambda x: np.round(x))
    
    df['purchase_date']          =  pd.to_datetime(df['purchase_date']) 
    df['first_active_month']     =  pd.to_datetime(df['first_active_month']) 
    df['purchase_hour']          =  df['purchase_date'].dt.hour
    df['year']                   = df['purchase_date'].dt.year
    df['month']                  =  df['purchase_date'].dt.month
    df['day']                    = df['purchase_date'].dt.day
    df['hour']                   = df['purchase_date'].dt.hour
    df['weekofyear'] = df['purchase_date'].dt.weekofyear
    df['dayofweek']              =  df['purchase_date'].dt.dayofweek
    df['weekend']                =  (df.purchase_date.dt.weekday >=5).astype(int) 
    df                           =  df.sort_values(['card_id','purchase_date']) 
    df['purchase_date_floorday'] =  df['purchase_date'].dt.floor('d')  #删除小于day的时间
    
    # 距离激活时间的相对时间,0, 1,2,3,...,max-act
    df['purchase_day_since_active_day']   = df['purchase_date_floorday'] - df['first_active_month']  #ht_card_id_gp['purchase_date_floorday'].transform('min')
    df['purchase_day_since_active_day']   = df['purchase_day_since_active_day'].dt.days  #.astype('timedelta64[D]') 
    df['purchase_month_since_active_day'] = df['purchase_day_since_active_day'].agg(month_trans).values
    df['purchase_week_since_active_day']  = df['purchase_day_since_active_day'].agg(week_trans).values
    
    # 距离最后一天时间的相对时间,0,1,2,3,...,max-act
    ht_card_id_gp = df.groupby('card_id')
    df['purchase_day_since_reference_day']   =  ht_card_id_gp['purchase_date_floorday'].transform('max') - df['purchase_date_floorday']
    df['purchase_day_since_reference_day']   =  df['purchase_day_since_reference_day'].dt.days
    # 一个粗粒度的特征(距离最近购买过去了几周，几月)
    df['purchase_week_since_reference_day']  = df['purchase_day_since_reference_day'].agg(week_trans).values
    df['purchase_month_since_reference_day'] = df['purchase_day_since_reference_day'].agg(month_trans).values
    
    df['purchase_day_diff']   =  df['purchase_date_floorday'].shift()
    df['purchase_day_diff']   =  df['purchase_date_floorday'].values - df['purchase_day_diff'].values
    df['purchase_day_diff']   =  df['purchase_day_diff'].dt.days
    df['purchase_week_diff']  =  df['purchase_day_diff'].agg(week_trans).values
    df['purchase_month_diff'] =  df['purchase_day_diff'].agg(month_trans).values 
    
    df['purchase_amount_ddgd_98']  = df['purchase_amount'].values * df['purchase_day_since_reference_day'].apply(lambda x:0.98**x).values
    df['purchase_amount_ddgd_99']  = df['purchase_amount'].values * df['purchase_day_since_reference_day'].apply(lambda x:0.99**x).values    
    df['purchase_amount_wdgd_96']  = df['purchase_amount'].values * df['purchase_week_since_reference_day'].apply(lambda x:0.96**x).values 
    df['purchase_amount_wdgd_97']  = df['purchase_amount'].values * df['purchase_week_since_reference_day'].apply(lambda x:0.97**x).values 
    df['purchase_amount_mdgd_90']  = df['purchase_amount'].values * df['purchase_month_since_reference_day'].apply(lambda x:0.9**x).values
    df['purchase_amount_mdgd_80']  = df['purchase_amount'].values * df['purchase_month_since_reference_day'].apply(lambda x:0.8**x).values 
    
    df = reduce_mem_usage(df)
    
    return df

historical_transactions = get_expand_common(historical_transactions)
new_transactions        = get_expand_common(new_transactions)

Mem. usage decreased to 3192.83 Mb (64.1% reduction)
Mem. usage decreased to 211.55 Mb (64.7% reduction)
CPU times: user 4min 42s, sys: 32.8 s, total: 5min 15s
Wall time: 5min 20s


## 特征优化部分

In [5]:
%%time
## 构造基本统计特征
def aggregate_transactions(df_, prefix): 
    
    df = df_.copy()
    
    df['month_diff'] = ((datetime.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] = df['month_diff'].astype(int)
    df['month_diff'] += df['month_lag']
    
    df['price'] = df['purchase_amount'] / df['installments']
    df['duration'] = df['purchase_amount'] * df['month_diff']
    df['amount_month_ratio'] = df['purchase_amount'] / df['month_diff']
    
    df.loc[:, 'purchase_date'] = pd.DatetimeIndex(df['purchase_date']).\
                                      astype(np.int64) * 1e-9
    
    agg_func = {
        'category_1':      ['mean'],
        'category_2':      ['mean'],
        'category_3':      ['mean'],
        'installments':    ['mean', 'max', 'min', 'std'],
        'month_lag':       ['nunique', 'mean', 'max', 'min', 'std'],
        'month':           ['nunique', 'mean', 'max', 'min', 'std'],
        'hour':            ['nunique', 'mean', 'max', 'min', 'std'],
        'weekofyear':      ['nunique', 'mean', 'max', 'min', 'std'],
        'dayofweek':       ['nunique', 'mean'],
        'weekend':         ['mean'],
        'year':            ['nunique'],
        'card_id':         ['size','count'],
        'purchase_date':   ['max', 'min'],
        ###
        'price':             ['mean','max','min','std'],
        'duration':          ['mean','min','max','std','skew'],
        'amount_month_ratio':['mean','min','max','std','skew'],
        } 
    
    for col in ['category_2','category_3']:
        df[col+'_mean'] = df.groupby([col])['purchase_amount'].transform('mean')
        agg_func[col+'_mean'] = ['mean']
    
    agg_df = df.groupby(['card_id']).agg(agg_func)
    agg_df.columns = [prefix + '_'.join(col).strip() for col in agg_df.columns.values]
    agg_df.reset_index(drop=False, inplace=True)
  
    return agg_df
print('generate statistics features...')
auth_base_stat = aggregate_transactions(historical_transactions[historical_transactions['authorized_flag']==1], prefix='auth_')
print('generate statistics features...')
hist_base_stat = aggregate_transactions(historical_transactions[historical_transactions['authorized_flag']==0], prefix='hist_')
print('generate statistics features...')
new_base_stat  = aggregate_transactions(new_transactions, prefix='new_')

generate statistics features...
generate statistics features...
generate statistics features...
CPU times: user 2min 42s, sys: 16.2 s, total: 2min 58s
Wall time: 3min 1s


In [6]:
%%time
def get_quantile(x, percentiles = [0.1, 0.25, 0.75, 0.9]):
    x_len = len(x)
    x = np.sort(x)
    sts_feas = []  
    for per_ in percentiles:
        if per_ == 1:
            sts_feas.append(x[x_len - 1]) 
        else:
            sts_feas.append(x[int(x_len * per_)]) 
    return sts_feas 

def get_cardf_tran(df_, month = 3, prefix = '_'):
    
    df = df_.copy() 
    if prefix == 'hist_cardf_':
        df['month_to_now']  =  (datetime.date(2018, month, 1) - df['purchase_date_floorday'].dt.date).dt.days
    
    df['month_diff'] = ((datetime.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] = df['month_diff'].astype(int)
    df['month_diff'] += df['month_lag']
    
    print('*'*30,'Part1, whole data','*'*30)
    cardid_features = pd.DataFrame()
    cardid_features['card_id'] = df['card_id'].unique()   
    print( '*' * 30, 'Traditional Features', '*' * 30)
    ht_card_id_gp = df.groupby('card_id') 
    cardid_features['card_id_cnt'] = ht_card_id_gp['authorized_flag'].count().values
    
    if  prefix == 'hist_cardf_':
        cardid_features['card_id_isau_mean'] = ht_card_id_gp['authorized_flag'].mean().values
        cardid_features['card_id_isau_sum'] = ht_card_id_gp['authorized_flag'].sum().values 
    
    cardid_features['month_diff_mean']   = ht_card_id_gp['month_diff'].mean().values
    cardid_features['month_diff_median'] = ht_card_id_gp['month_diff'].median().values
    
    if prefix == 'hist_cardf_':
        cardid_features['reference_day']           =  ht_card_id_gp['purchase_date_floorday'].max().values
        cardid_features['first_day']               =  ht_card_id_gp['purchase_date_floorday'].min().values 
        cardid_features['activation_day']          =  ht_card_id_gp['first_active_month'].max().values
       
        # first to activation day
        cardid_features['first_to_activation_day']  =  (cardid_features['first_day'] - cardid_features['activation_day']).dt.days
        # activation to reference day 
        cardid_features['activation_to_reference_day']  =  (cardid_features['reference_day'] - cardid_features['activation_day']).dt.days
        # first to last day 
        cardid_features['first_to_reference_day']  =  (cardid_features['reference_day'] - cardid_features['first_day']).dt.days
        # reference day to now  
        cardid_features['reference_day_to_now']  =  (datetime.date(2018, month, 1) - cardid_features['reference_day'].dt.date).dt.days 
        # first day to now
        cardid_features['first_day_to_now']  =  (datetime.date(2018, month, 1) - cardid_features['first_day'].dt.date).dt.days 
        
        print('card_id(month_lag, min to reference day):min')
        cardid_features['card_id_month_lag_min'] = ht_card_id_gp['month_lag'].agg('min').values   
        # is_purchase_before_activation,first_to_reference_day_divide_activation_to_reference_day
        cardid_features['is_purchase_before_activation'] = cardid_features['first_to_activation_day'] < 0 
        cardid_features['is_purchase_before_activation'] = cardid_features['is_purchase_before_activation'].astype(int)
        cardid_features['first_to_reference_day_divide_activation_to_reference_day'] = cardid_features['first_to_reference_day']  / (cardid_features['activation_to_reference_day']  + 0.01)
        cardid_features['days_per_count'] = cardid_features['first_to_reference_day'].values / cardid_features['card_id_cnt'].values
   
    if prefix == 'new_cardf_':
        print(' Eight time features, ') 
        cardid_features['reference_day']           =  ht_card_id_gp['reference_day'].last().values
        cardid_features['first_day']               =  ht_card_id_gp['purchase_date_floorday'].min().values 
        cardid_features['last_day']                =  ht_card_id_gp['purchase_date_floorday'].max().values
        cardid_features['activation_day']          =  ht_card_id_gp['first_active_month'].max().values
        # reference to first day
        cardid_features['reference_day_to_first_day']  =  (cardid_features['first_day'] - cardid_features['reference_day']).dt.days
        # reference to last day
        cardid_features['reference_day_to_last_day']  =  (cardid_features['last_day'] - cardid_features['reference_day']).dt.days  
        # first to last day 
        cardid_features['first_to_last_day']  =  (cardid_features['last_day'] - cardid_features['first_day']).dt.days
        # activation to first day 
        cardid_features['activation_to_first_day']  =  (cardid_features['first_day'] - cardid_features['activation_day']).dt.days
        # activation to first day 
        cardid_features['activation_to_last_day']  =  (cardid_features['last_day'] - cardid_features['activation_day']).dt.days
        # last day to now  
        cardid_features['reference_day_to_now']  =  (datetime.date(2018, month, 1) - cardid_features['reference_day'].dt.date).dt.days 
        # first day to now
        cardid_features['first_day_to_now']  =  (datetime.date(2018, month, 1) - cardid_features['first_day'].dt.date).dt.days 
        
        print('card_id(month_lag, min to reference day):min')
        cardid_features['card_id_month_lag_max'] = ht_card_id_gp['month_lag'].agg('max').values  
        cardid_features['first_to_last_day_divide_reference_to_last_day'] = cardid_features['first_to_last_day']  / (cardid_features['reference_day_to_last_day']  + 0.01)
        cardid_features['days_per_count'] = cardid_features['first_to_last_day'].values / cardid_features['card_id_cnt'].values
    
    for f in ['reference_day', 'first_day', 'last_day', 'activation_day']:
        try:
            del cardid_features[f]
        except:
            print(f, '不存在！！！')

    print('card id(city_id,installments,merchant_category_id,.......):nunique, cnt/nunique') 
    for col in tqdm_notebook(['category_1','category_2','category_3','state_id','city_id','installments','merchant_id', 'merchant_category_id','subsector_id','month_lag','purchase_date_floorday']):
        cardid_features['card_id_%s_nunique'%col]            =  ht_card_id_gp[col].nunique().values
        cardid_features['card_id_cnt_divide_%s_nunique'%col] =  cardid_features['card_id_cnt'].values / cardid_features['card_id_%s_nunique'%col].values
         
    print('card_id(purchase_amount & degrade version ):mean,sum,std,median,quantile(10,25,75,90)') 
    for col in tqdm_notebook(['installments','purchase_amount','purchase_amount_ddgd_98','purchase_amount_ddgd_99','purchase_amount_wdgd_96','purchase_amount_wdgd_97','purchase_amount_mdgd_90','purchase_amount_mdgd_80']):
        if col =='purchase_amount':
            for opt in ['sum','mean','std','median','max','min']:
                cardid_features['card_id_' +col+ '_' + opt] = ht_card_id_gp[col].agg(opt).values
            
            cardid_features['card_id_' +col+ '_range'] =  cardid_features['card_id_' +col+ '_max'].values - cardid_features['card_id_' +col+ '_min'].values
            percentiles = ht_card_id_gp[col].apply(lambda x:get_quantile(x,percentiles = [0.025, 0.25, 0.75, 0.975])) 

            cardid_features[col + '_2.5_quantile']  = percentiles.map(lambda x:x[0]).values
            cardid_features[col + '_25_quantile'] = percentiles.map(lambda x:x[1]).values
            cardid_features[col + '_75_quantile'] = percentiles.map(lambda x:x[2]).values
            cardid_features[col + '_97.5_quantile'] = percentiles.map(lambda x:x[3]).values
            cardid_features['card_id_' +col+ '_range2'] =  cardid_features[col+ '_97.5_quantile'].values - cardid_features[col+ '_2.5_quantile'].values
            del cardid_features[col + '_2.5_quantile'],cardid_features[col + '_97.5_quantile']
            gc.collect()
        else:
            for opt in ['sum']:
                cardid_features['card_id_' +col+ '_' + opt] = ht_card_id_gp[col].agg(opt).values          
    
    print( '*' * 30, 'Pivot Features', '*' * 30)
    print('Count  Pivot') #purchase_month_since_reference_day(可能和month_lag重复),百分比降分,暂时忽略 (dayofweek,merchant_cate,state_id)作用不大installments
    for pivot_col in tqdm_notebook(['category_1','category_2','category_3','month_lag','subsector_id','weekend']): #'city_id',,
    
        tmp     = df.groupby(['card_id',pivot_col])['merchant_id'].count().to_frame(pivot_col + '_count')
        tmp.reset_index(inplace =True)  
         
        tmp_pivot = pd.pivot_table(data=tmp,index = 'card_id',columns=pivot_col,values=pivot_col + '_count',fill_value=0)
        tmp_pivot.columns = [tmp_pivot.columns.names[0] + '_cnt_pivot_'+ str(col) for col in tmp_pivot.columns]
        tmp_pivot.reset_index(inplace = True)
        cardid_features = cardid_features.merge(tmp_pivot, on = 'card_id', how='left')
      
        if  pivot_col!='weekend' and  pivot_col!='installments':
            tmp            = df.groupby(['card_id',pivot_col])['purchase_date_floorday'].nunique().to_frame(pivot_col + '_purchase_date_floorday_nunique') 
            tmp1           = df.groupby(['card_id'])['purchase_date_floorday'].nunique().to_frame('purchase_date_floorday_nunique') 
            tmp.reset_index(inplace =True)  
            tmp1.reset_index(inplace =True)   
            tmp  = tmp.merge(tmp1, on ='card_id', how='left')
            tmp[pivot_col + '_day_nunique_pct'] = tmp[pivot_col + '_purchase_date_floorday_nunique'].values / tmp['purchase_date_floorday_nunique'].values
         
            tmp_pivot = pd.pivot_table(data=tmp,index = 'card_id',columns=pivot_col,values=pivot_col + '_day_nunique_pct',fill_value=0)
            tmp_pivot.columns = [tmp_pivot.columns.names[0] + '_day_nunique_pct_'+ str(col) for col in tmp_pivot.columns]
            tmp_pivot.reset_index(inplace = True)
            cardid_features = cardid_features.merge(tmp_pivot, on = 'card_id', how='left')
    
    if prefix == 'new_cardf_':
    ######## 在卡未激活之前就有过消费的记录  ##############   
        print('*'*30,'Part2， data with time less than activation day','*'*30)
        df_part = df.loc[df.purchase_date < df.first_active_month]

        cardid_features_part = pd.DataFrame()
        cardid_features_part['card_id'] = df_part['card_id'].unique()   
        ht_card_id_part_gp = df_part.groupby('card_id')
        cardid_features_part['card_id_part_cnt'] = ht_card_id_part_gp['authorized_flag'].count().values

        print('card_id(purchase_amount): sum') 
        for col in tqdm_notebook(['purchase_amount']): 
            for opt in ['sum','mean']:
                cardid_features_part['card_id_part_' +col+ '_' + opt] = ht_card_id_part_gp[col].agg(opt).values

        cardid_features = cardid_features.merge(cardid_features_part, on ='card_id', how='left')
        cardid_features['card_id_part_purchase_amount_sum_percent'] = cardid_features['card_id_part_purchase_amount_sum'] / (cardid_features['card_id_purchase_amount_sum'] + 0.01)

    cardid_features = reduce_mem_usage(cardid_features)
    
    new_col_names = []
    for col in cardid_features.columns:
        if col == 'card_id':
            new_col_names.append(col)
        else:
            new_col_names.append(prefix + col)
    cardid_features.columns = new_col_names
    
    return cardid_features
print('auth...')
authorized_transactions = historical_transactions.loc[historical_transactions['authorized_flag'] == 1]
auth_cardf_tran = get_cardf_tran(authorized_transactions, 3, prefix='auth_cardf_')
print('hist...')
hist_cardf_tran = get_cardf_tran(historical_transactions, 3, prefix='hist_cardf_')
print('new...')
reference_days = historical_transactions.groupby('card_id')['purchase_date'].last().to_frame('reference_day')
reference_days.reset_index(inplace = True)
new_transactions = new_transactions.merge(reference_days, on ='card_id', how='left')
new_cardf_tran  = get_cardf_tran(new_transactions, 5, prefix='new_cardf_')

auth...
****************************** Part1, whole data ******************************
****************************** Traditional Features ******************************
reference_day 不存在！！！
first_day 不存在！！！
last_day 不存在！！！
activation_day 不存在！！！
card id(city_id,installments,merchant_category_id,.......):nunique, cnt/nunique


  0%|          | 0/11 [00:00<?, ?it/s]

card_id(purchase_amount & degrade version ):mean,sum,std,median,quantile(10,25,75,90)


  0%|          | 0/8 [00:00<?, ?it/s]

****************************** Pivot Features ******************************
Count  Pivot


  0%|          | 0/6 [00:00<?, ?it/s]

Mem. usage decreased to 112.08 Mb (73.1% reduction)
hist...
****************************** Part1, whole data ******************************
****************************** Traditional Features ******************************
card_id(month_lag, min to reference day):min
last_day 不存在！！！
card id(city_id,installments,merchant_category_id,.......):nunique, cnt/nunique


  0%|          | 0/11 [00:00<?, ?it/s]

card_id(purchase_amount & degrade version ):mean,sum,std,median,quantile(10,25,75,90)


  0%|          | 0/8 [00:00<?, ?it/s]

****************************** Pivot Features ******************************
Count  Pivot


  0%|          | 0/6 [00:00<?, ?it/s]

Mem. usage decreased to 121.08 Mb (72.6% reduction)
new...
****************************** Part1, whole data ******************************
****************************** Traditional Features ******************************
 Eight time features, 
card_id(month_lag, min to reference day):min
card id(city_id,installments,merchant_category_id,.......):nunique, cnt/nunique


  0%|          | 0/11 [00:00<?, ?it/s]

card_id(purchase_amount & degrade version ):mean,sum,std,median,quantile(10,25,75,90)


  0%|          | 0/8 [00:00<?, ?it/s]

****************************** Pivot Features ******************************
Count  Pivot


  0%|          | 0/6 [00:00<?, ?it/s]

****************************** Part2， data with time less than activation day ******************************
card_id(purchase_amount): sum


  0%|          | 0/1 [00:00<?, ?it/s]

Mem. usage decreased to 92.65 Mb (73.0% reduction)
CPU times: user 6min 42s, sys: 58.5 s, total: 7min 40s
Wall time: 7min 49s


In [7]:
%%time
def get_cardf_tran_last2(df_, month = 3, prefix = 'last2_'): 
    
    df = df_.loc[df_.month_lag >= -2].copy()
    print('*'*30,'Part1, whole data','*'*30)
    cardid_features = pd.DataFrame()
    cardid_features['card_id'] = df['card_id'].unique()   
    
    df['month_diff'] = ((datetime.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] = df['month_diff'].astype(int)
    df['month_diff'] += df['month_lag']
    
    print( '*' * 30, 'Traditional Features', '*' * 30)
    ht_card_id_gp = df.groupby('card_id')
    print(' card id : count')
    cardid_features['card_id_cnt'] = ht_card_id_gp['authorized_flag'].count().values
    
    cardid_features['card_id_isau_mean'] = ht_card_id_gp['authorized_flag'].mean().values 
    cardid_features['card_id_isau_sum']  = ht_card_id_gp['authorized_flag'].sum().values
    
    cardid_features['month_diff_mean']   = ht_card_id_gp['month_diff'].mean().values

    print('card id(city_id,installments,merchant_category_id,.......):nunique, cnt/nunique') 
    for col in tqdm_notebook(['state_id','city_id','installments','merchant_id', 'merchant_category_id','purchase_date_floorday']):
        cardid_features['card_id_%s_nunique'%col] = ht_card_id_gp[col].nunique().values
        cardid_features['card_id_cnt_divide_%s_nunique'%col] = cardid_features['card_id_cnt'].values / cardid_features['card_id_%s_nunique'%col].values
         
    for col in tqdm_notebook(['purchase_amount','purchase_amount_ddgd_98','purchase_amount_wdgd_96','purchase_amount_mdgd_90','purchase_amount_mdgd_80']): #,'purchase_amount_ddgd_98','purchase_amount_ddgd_99','purchase_amount_wdgd_96','purchase_amount_wdgd_97','purchase_amount_mdgd_90','purchase_amount_mdgd_80']):
        if col =='purchase_amount':
            for opt in ['sum','mean','std','median']:
                cardid_features['card_id_' +col+ '_' + opt] = ht_card_id_gp[col].agg(opt).values  
        else:
            for opt in ['sum']:
                cardid_features['card_id_' +col+ '_' + opt] = ht_card_id_gp[col].agg(opt).values 
    
    print( '*' * 30, 'Pivot Features', '*' * 30)
    print('Count  Pivot') #purchase_month_since_reference_day(可能和month_lag重复),百分比降分,暂时忽略 (dayofweek,merchant_cate,state_id)作用不大
    
    for pivot_col in tqdm_notebook(['category_1','category_2','category_3','month_lag','subsector_id','weekend']): #'city_id', 
    
        tmp     = df.groupby(['card_id',pivot_col])['merchant_id'].count().to_frame(pivot_col + '_count')
        tmp.reset_index(inplace =True)  
         
        tmp_pivot = pd.pivot_table(data=tmp,index = 'card_id',columns=pivot_col,values=pivot_col + '_count',fill_value=0)
        tmp_pivot.columns = [tmp_pivot.columns.names[0] + '_cnt_pivot_'+ str(col) for col in tmp_pivot.columns]
        tmp_pivot.reset_index(inplace = True)
        cardid_features = cardid_features.merge(tmp_pivot, on = 'card_id', how='left')
      
        if  pivot_col!='weekend' and  pivot_col!='installments':
            tmp            = df.groupby(['card_id',pivot_col])['purchase_date_floorday'].nunique().to_frame(pivot_col + '_purchase_date_floorday_nunique') 
            tmp1           = df.groupby(['card_id'])['purchase_date_floorday'].nunique().to_frame('purchase_date_floorday_nunique') 
            tmp.reset_index(inplace =True)  
            tmp1.reset_index(inplace =True)   
            tmp  = tmp.merge(tmp1, on ='card_id', how='left')
            tmp[pivot_col + '_day_nunique_pct'] = tmp[pivot_col + '_purchase_date_floorday_nunique'].values / tmp['purchase_date_floorday_nunique'].values
         
            tmp_pivot = pd.pivot_table(data=tmp,index = 'card_id',columns=pivot_col,values=pivot_col + '_day_nunique_pct',fill_value=0)
            tmp_pivot.columns = [tmp_pivot.columns.names[0] + '_day_nunique_pct_'+ str(col) for col in tmp_pivot.columns]
            tmp_pivot.reset_index(inplace = True)
            cardid_features = cardid_features.merge(tmp_pivot, on = 'card_id', how='left')
     
    cardid_features = reduce_mem_usage(cardid_features)
    
    new_col_names = []
    for col in cardid_features.columns:
        if col == 'card_id':
            new_col_names.append(col)
        else:
            new_col_names.append(prefix + col)
    cardid_features.columns = new_col_names
    
    return cardid_features  

hist_cardf_tran_last2 = get_cardf_tran_last2(historical_transactions, month = 3, prefix = 'hist_last2_')

****************************** Part1, whole data ******************************
****************************** Traditional Features ******************************
 card id : count
card id(city_id,installments,merchant_category_id,.......):nunique, cnt/nunique


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

****************************** Pivot Features ******************************
Count  Pivot


  0%|          | 0/6 [00:00<?, ?it/s]

Mem. usage decreased to 84.45 Mb (74.0% reduction)
CPU times: user 1min 2s, sys: 4.31 s, total: 1min 6s
Wall time: 1min 7s


In [8]:
%%time
def successive_aggregates(df_, prefix = 'levelAB_'):
    df = df_.copy()
    cardid_features = pd.DataFrame()
    cardid_features['card_id'] = df['card_id'].unique()    
     
    level12_nunique = [('month_lag','state_id'),('month_lag','city_id'),('month_lag','subsector_id'),('month_lag','merchant_category_id'),('month_lag','merchant_id'),('month_lag','purchase_date_floorday'),\
                       ('subsector_id','merchant_category_id'),('subsector_id','merchant_id'),('subsector_id','purchase_date_floorday'),('subsector_id','month_lag'),\
                       ('merchant_category_id', 'merchant_id'),('merchant_category_id','purchase_date_floorday'),('merchant_category_id','month_lag'),\
                       ('purchase_date_floorday', 'merchant_id'),('purchase_date_floorday','merchant_category_id'),('purchase_date_floorday','subsector_id')]    
    for col_level1,col_level2 in tqdm_notebook(level12_nunique):  
        
        level1  = df.groupby(['card_id',col_level1])[col_level2].nunique().to_frame(col_level2 + '_nunique')
        level1.reset_index(inplace =True)  
         
        level2 = level1.groupby('card_id')[col_level2 + '_nunique'].agg(['mean', 'max', 'std'])
        level2 = pd.DataFrame(level2)
        level2.columns = [col_level1 + '_' + col_level2 + '_nunique_' + col for col in level2.columns.values]
        level2.reset_index(inplace = True)
        
        cardid_features = cardid_features.merge(level2, on='card_id', how='left') 
    
    level12_count = ['month_lag','state_id','city_id','subsector_id','merchant_category_id','merchant_id','purchase_date_floorday']
    for col_level in tqdm_notebook(level12_count): 
    
        level1  = df.groupby(['card_id',col_level])['merchant_id'].count().to_frame(col_level + '_count')
        level1.reset_index(inplace =True)  
         
        level2 = level1.groupby('card_id')[col_level + '_count'].agg(['mean', 'max', 'std'])
        level2 = pd.DataFrame(level2)
        level2.columns = [col_level + '_count_' + col for col in level2.columns.values]
        level2.reset_index(inplace = True)
        
        cardid_features = cardid_features.merge(level2, on='card_id', how='left') 
    
    level12_meansum = [('month_lag','purchase_amount'),('state_id','purchase_amount'),('city_id','purchase_amount'),('subsector_id','purchase_amount'),\
                       ('merchant_category_id','purchase_amount'),('merchant_id','purchase_amount'),('purchase_date_floorday','purchase_amount')]
    for col_level1,col_level2 in tqdm_notebook(level12_meansum): 
    
        level1  = df.groupby(['card_id',col_level1])[col_level2].sum().to_frame(col_level2 + '_sum')
        level1.reset_index(inplace =True)  
         
        level2 = level1.groupby('card_id')[col_level2 + '_sum'].agg(['mean', 'max', 'std'])
        level2 = pd.DataFrame(level2)
        level2.columns = [col_level1 + '_' + col_level2 + '_sum_' + col for col in level2.columns.values]
        level2.reset_index(inplace = True)

        cardid_features = cardid_features.merge(level2, on='card_id', how='left')           
    
    cardid_features = reduce_mem_usage(cardid_features)
    
    new_col_names = []
    for col in cardid_features.columns:
        if col == 'card_id':
            new_col_names.append(col)
        else:
            new_col_names.append(prefix + col)
    cardid_features.columns = new_col_names
    
    return cardid_features  

print('hist...')
hist_levelAB = successive_aggregates(historical_transactions, prefix = 'hist_levelAB_')

hist...


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Mem. usage decreased to 70.16 Mb (66.8% reduction)
CPU times: user 4min 2s, sys: 40.3 s, total: 4min 42s
Wall time: 4min 45s


In [9]:
%%time
print(train.shape)
print(test.shape)
## 合并到训练集和测试集
print('#_____基础统计特征')
train = pd.merge(train, auth_base_stat, on='card_id', how='left')
test  = pd.merge(test,  auth_base_stat, on='card_id', how='left')
train = pd.merge(train, hist_base_stat, on='card_id', how='left')
test  = pd.merge(test,  hist_base_stat, on='card_id', how='left')
train = pd.merge(train, new_base_stat , on='card_id', how='left')
test  = pd.merge(test,  new_base_stat , on='card_id', how='left')
print(train.shape)
print(test.shape)
print('#_____全局cardid特征')
train = pd.merge(train, auth_cardf_tran, on='card_id', how='left')
test  = pd.merge(test,  auth_cardf_tran, on='card_id', how='left')
train = pd.merge(train, hist_cardf_tran, on='card_id', how='left')
test  = pd.merge(test,  hist_cardf_tran, on='card_id', how='left')
train = pd.merge(train, new_cardf_tran , on='card_id', how='left')
test  = pd.merge(test,  new_cardf_tran , on='card_id', how='left')
print(train.shape)
print(test.shape)
print('#_____最近两月cardid特征')
train = pd.merge(train, hist_cardf_tran_last2, on='card_id', how='left')
test  = pd.merge(test,  hist_cardf_tran_last2, on='card_id', how='left')
print(train.shape)
print(test.shape)
print('#_____补充二阶特征')
train = pd.merge(train, hist_levelAB, on='card_id', how='left')
test  = pd.merge(test,  hist_levelAB, on='card_id', how='left')
print(train.shape)
print(test.shape)

(201917, 11)
(123623, 10)
#_____基础统计特征
(201917, 164)
(123623, 163)
#_____全局cardid特征
(201917, 687)
(123623, 686)
#_____最近两月cardid特征
(201917, 821)
(123623, 820)
#_____补充二阶特征
(201917, 911)
(123623, 910)
CPU times: user 21.8 s, sys: 1.45 s, total: 23.3 s
Wall time: 24.5 s


In [10]:
%%time
train['outliers'] = 0
train.loc[train['target'] < -30, 'outliers'] = 1
train['outliers'].value_counts()
for f in ['feature_1','feature_2','feature_3']:
    colname = f+'_outliers_mean'
    order_label = train.groupby([f])['outliers'].mean()
    for df in [train, test]:
        df[colname] = df[f].map(order_label)

for df in [train, test]:
    
    df['days_feature1'] = df['elapsed_time'] * df['feature_1']
    df['days_feature2'] = df['elapsed_time'] * df['feature_2']
    df['days_feature3'] = df['elapsed_time'] * df['feature_3']

    df['days_feature1_ratio'] = df['feature_1'] / df['elapsed_time']
    df['days_feature2_ratio'] = df['feature_2'] / df['elapsed_time']
    df['days_feature3_ratio'] = df['feature_3'] / df['elapsed_time']

    df['feature_sum'] = df['feature_1'] + df['feature_2'] + df['feature_3']
    df['feature_mean'] = df['feature_sum']/3
    df['feature_max'] = df[['feature_1', 'feature_2', 'feature_3']].max(axis=1)
    df['feature_min'] = df[['feature_1', 'feature_2', 'feature_3']].min(axis=1)
    df['feature_var'] = df[['feature_1', 'feature_2', 'feature_3']].std(axis=1)
    
    df['card_id_total'] = df['hist_card_id_size']+df['new_card_id_size']
    df['card_id_cnt_total'] = df['hist_card_id_count']+df['new_card_id_count']
    df['card_id_cnt_ratio'] = df['new_card_id_count']/df['hist_card_id_count']
    df['purchase_amount_total'] = df['hist_cardf_card_id_purchase_amount_sum']+df['new_cardf_card_id_purchase_amount_sum']
    df['purchase_amount_ratio'] = df['new_cardf_card_id_purchase_amount_sum']/df['hist_cardf_card_id_purchase_amount_sum']
    df['month_diff_ratio'] = df['new_cardf_month_diff_mean']/df['hist_cardf_month_diff_mean']
    df['installments_total'] = df['new_cardf_card_id_installments_sum']+df['auth_cardf_card_id_installments_sum']
    df['installments_ratio'] = df['new_cardf_card_id_installments_sum']/df['auth_cardf_card_id_installments_sum']
    df['price_total'] = df['purchase_amount_total']/df['installments_total']
    df['new_CLV'] = df['new_card_id_count'] * df['new_cardf_card_id_purchase_amount_sum'] / df['new_cardf_month_diff_mean']
    df['hist_CLV'] = df['hist_card_id_count'] * df['hist_cardf_card_id_purchase_amount_sum'] / df['hist_cardf_month_diff_mean']
    df['CLV_ratio'] = df['new_CLV'] / df['hist_CLV']

CPU times: user 405 ms, sys: 683 ms, total: 1.09 s
Wall time: 1.25 s


## 特征基本过滤

In [11]:
%%time
del_cols = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'new_cardf': 
        del_cols.append(col)
del_cols1 = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'hist_last2_' in col:
        del_cols1.append(col)
del_cols2 = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'auth_cardf' in col:
        del_cols2.append(col)
del_cols3 = []
for col in train.columns:
    if 'merchant_category_id_month_lag_nunique_' in col and '_pivot_supp' in col:
        del_cols3.append(col)
    if 'city_id' in col and '_pivot_supp' in col:
        del_cols3.append(col)
    if 'month_diff' in col and 'hist_last2_' in col:
        del_cols3.append(col)
    if 'month_diff_std' in col or 'month_diff_gap' in col:
        del_cols3.append(col) 
fea_cols = [col for col in train.columns if train[col].dtypes!='object' and train[col].dtypes != '<M8[ns]' and col!='target' not in col and col!='min_num'\
            and col not in del_cols and col not in del_cols1 and col not in del_cols2 and col!='target1' and col!='card_id_cnt_ht_pivot_supp'  and col not in del_cols3]   
print('运用杰哥方法...')
print('删除前:',train.shape[1])
print('删除后:',len(fea_cols))

train = train[fea_cols+['target']]
fea_cols.remove('outliers')
test = test[fea_cols]

train.to_csv('../../data/all_train_features.csv',index=False)
test.to_csv('../../data/all_test_features.csv',index=False)

运用杰哥方法...
删除前: 938
删除后: 770
CPU times: user 2min 11s, sys: 3.81 s, total: 2min 15s
Wall time: 2min 16s


In [12]:
%%time
## load all features
train = pd.read_csv('../../data/all_train_features.csv')
test  = pd.read_csv('../../data/all_test_features.csv')

# ## load sparse
# train_tags = sparse.load_npz('train_tags.npz')
# test_tags  = sparse.load_npz('test_tags.npz')

## 获取非异常值的index
normal_index = train[train['outliers']==0].index.tolist()
## without outliers
ntrain = train[train['outliers'] == 0]

target        = train['target'].values
ntarget       = ntrain['target'].values
target_binary = train['outliers'].values
###
y_train        = target
y_ntrain       = ntarget
y_train_binary = target_binary

print('train:',train.shape)
print('ntrain:',ntrain.shape)

train: (201917, 771)
ntrain: (199710, 771)
CPU times: user 19.8 s, sys: 4.92 s, total: 24.8 s
Wall time: 27.1 s


## 模型训练部分

In [13]:
def train_model(X, X_test, y, params, folds, model_type='lgb', eval_type='regression'):
    oof = np.zeros(X.shape[0])
    predictions = np.zeros(X_test.shape[0])
    scores = []
    for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        
        if model_type == 'lgb':
            trn_data = lgb.Dataset(X[trn_idx], y[trn_idx])
            val_data = lgb.Dataset(X[val_idx], y[val_idx])
            clf = lgb.train(params, trn_data, num_boost_round=20000, 
                            valid_sets=[trn_data, val_data], 
                            verbose_eval=100, early_stopping_rounds=300)
            oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
        
        if model_type == 'xgb':
            trn_data = xgb.DMatrix(X[trn_idx], y[trn_idx])
            val_data = xgb.DMatrix(X[val_idx], y[val_idx])
            watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
            clf = xgb.train(dtrain=trn_data, num_boost_round=20000, 
                            evals=watchlist, early_stopping_rounds=200, 
                            verbose_eval=100, params=params)
            oof[val_idx] = clf.predict(xgb.DMatrix(X[val_idx]), ntree_limit=clf.best_ntree_limit)
            predictions += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits
        
        if (model_type == 'cat') and (eval_type == 'regression'):
            clf = CatBoostRegressor(iterations=20000, eval_metric='RMSE', **params)
            clf.fit(X[trn_idx], y[trn_idx], 
                    eval_set=(X[val_idx], y[val_idx]),
                    cat_features=[], use_best_model=True, verbose=100)
            oof[val_idx] = clf.predict(X[val_idx])
            predictions += clf.predict(X_test) / folds.n_splits
            
        if (model_type == 'cat') and (eval_type == 'binary'):
            clf = CatBoostClassifier(iterations=20000, eval_metric='Logloss', **params)
            clf.fit(X[trn_idx], y[trn_idx], 
                    eval_set=(X[val_idx], y[val_idx]),
                    cat_features=[], use_best_model=True, verbose=100)
            oof[val_idx] = clf.predict_proba(X[val_idx])[:,1]
            predictions += clf.predict_proba(X_test)[:,1] / folds.n_splits
        print(predictions)
        if eval_type == 'regression':
            scores.append(mean_squared_error(oof[val_idx], y[val_idx])**0.5)
        if eval_type == 'binary':
            scores.append(log_loss(y[val_idx], oof[val_idx]))
        
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    return oof, predictions, scores

In [14]:
#### lgb
lgb_params = {'num_leaves': 63,
             'min_data_in_leaf': 32, 
             'objective':'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 20,
             "boosting": "gbdt",
             "feature_fraction": 0.9,
             "bagging_freq": 1,
             "bagging_fraction": 0.9 ,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1}
folds = KFold(n_splits=5, shuffle=True, random_state=4096)
X_ntrain = ntrain[fea_cols].values
X_train  = train[fea_cols].values
X_test   = test[fea_cols].values
print('='*10,'回归模型','='*10)
oof_lgb , predictions_lgb , scores_lgb  = train_model(X_train , X_test, y_train, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')

Fold 0 started at Sun Mar 20 17:09:05 2022
Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 3.53966	valid_1's rmse: 3.78742
[200]	training's rmse: 3.39621	valid_1's rmse: 3.74856
[300]	training's rmse: 3.30425	valid_1's rmse: 3.73544
[400]	training's rmse: 3.23473	valid_1's rmse: 3.72852
[500]	training's rmse: 3.1792	valid_1's rmse: 3.72531
[600]	training's rmse: 3.13148	valid_1's rmse: 3.72344
[700]	training's rmse: 3.0876	valid_1's rmse: 3.72284
[800]	training's rmse: 3.04712	valid_1's rmse: 3.72308
[900]	training's rmse: 3.00918	valid_1's rmse: 3.72353
Early stopping, best iteration is:
[667]	training's rmse: 3.10189	valid_1's rmse: 3.72259
[-0.3714682  -0.08847166 -0.13364805 ...  0.13672201 -0.65193587
  0.02118456]
Fold 1 started at Sun Mar 20 17:12:20 2022
Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 3.57064	valid_1's rmse: 3.65723
[200]	training's rmse: 3.42429	valid_1's rmse: 3.62102
[300]	training's r

In [15]:
print('='*10,'without outliers 回归模型','='*10)
oof_nlgb, predictions_nlgb, scores_nlgb = train_model(X_ntrain, X_test, y_ntrain, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')

Fold 0 started at Sun Mar 20 18:54:16 2022
Training until validation scores don't improve for 300 rounds
[100]	training's rmse: 1.58504	valid_1's rmse: 1.5733
[200]	training's rmse: 1.54442	valid_1's rmse: 1.54798
[300]	training's rmse: 1.52309	valid_1's rmse: 1.54028
[400]	training's rmse: 1.50746	valid_1's rmse: 1.53688
[500]	training's rmse: 1.49407	valid_1's rmse: 1.53506
[600]	training's rmse: 1.482	valid_1's rmse: 1.53411
[700]	training's rmse: 1.4712	valid_1's rmse: 1.53369
[800]	training's rmse: 1.46106	valid_1's rmse: 1.53352
[900]	training's rmse: 1.45147	valid_1's rmse: 1.53335
[1000]	training's rmse: 1.44226	valid_1's rmse: 1.53318
[1100]	training's rmse: 1.4334	valid_1's rmse: 1.5332
[1200]	training's rmse: 1.42479	valid_1's rmse: 1.53322
[1300]	training's rmse: 1.41631	valid_1's rmse: 1.53322
[1400]	training's rmse: 1.40783	valid_1's rmse: 1.53322
[1500]	training's rmse: 1.39966	valid_1's rmse: 1.53321
Early stopping, best iteration is:
[1261]	training's rmse: 1.41949	val

In [16]:
print('='*10,'分类模型','='*10)
lgb_params['objective'] = 'binary'
lgb_params['metric']    = 'binary_logloss'
oof_blgb, predictions_blgb, scores_blgb = train_model(X_train , X_test, y_train_binary, params=lgb_params, folds=folds, model_type='lgb', eval_type='binary')

Fold 0 started at Sun Mar 20 19:29:42 2022
Training until validation scores don't improve for 300 rounds
[100]	training's binary_logloss: 0.0342044	valid_1's binary_logloss: 0.0493479
[200]	training's binary_logloss: 0.0268654	valid_1's binary_logloss: 0.0473549
[300]	training's binary_logloss: 0.022339	valid_1's binary_logloss: 0.0469053
[400]	training's binary_logloss: 0.019187	valid_1's binary_logloss: 0.0468784
[500]	training's binary_logloss: 0.0167221	valid_1's binary_logloss: 0.0470381
[600]	training's binary_logloss: 0.0146748	valid_1's binary_logloss: 0.047282
Early stopping, best iteration is:
[347]	training's binary_logloss: 0.0207395	valid_1's binary_logloss: 0.0468522
[0.00936449 0.00024249 0.0017062  ... 0.00134461 0.01087633 0.00060872]
Fold 1 started at Sun Mar 20 19:31:54 2022
Training until validation scores don't improve for 300 rounds
[100]	training's binary_logloss: 0.0350399	valid_1's binary_logloss: 0.0457612
[200]	training's binary_logloss: 0.0275558	valid_1's b

In [17]:
sub_df = pd.read_csv('../../data/sample_submission.csv')
sub_df["target"] = predictions_lgb
sub_df.to_csv('predictions_lgb.csv', index=False)

In [18]:
oof_lgb  = pd.DataFrame(oof_lgb)
oof_nlgb = pd.DataFrame(oof_nlgb)
oof_blgb = pd.DataFrame(oof_blgb)

predictions_lgb  = pd.DataFrame(predictions_lgb)
predictions_nlgb = pd.DataFrame(predictions_nlgb)
predictions_blgb = pd.DataFrame(predictions_blgb)

oof_lgb.to_csv('../../result/oof_lgb.csv',header=None,index=False)
oof_blgb.to_csv('../../result/oof_blgb.csv',header=None,index=False)
oof_nlgb.to_csv('../../result/oof_nlgb.csv',header=None,index=False)

predictions_lgb.to_csv('../../result/predictions_lgb.csv',header=None,index=False)
predictions_nlgb.to_csv('../../result/predictions_nlgb.csv',header=None,index=False)
predictions_blgb.to_csv('../../result/predictions_blgb.csv',header=None,index=False)

In [19]:
#### xgb
xgb_params = {'eta':0.05, 'max_leaves':47, 'max_depth':10, 'subsample':0.8, 'colsample_bytree':0.8,
              'min_child_weight':40, 'max_bin':128, 'reg_alpha':2.0, 'reg_lambda':2.0, 
              'objective':'reg:linear', 'eval_metric':'rmse', 'silent': True, 'nthread':4}
folds = KFold(n_splits=10, shuffle=True, random_state=2018)
print('='*10,'回归模型','='*10)
oof_xgb , predictions_xgb , scores_xgb  = train_model(X_train , X_test, y_train , params=xgb_params, folds=folds, model_type='xgb', eval_type='regression')
print('='*10,'without outliers 回归模型','='*10)
oof_nxgb, predictions_nxgb, scores_nxgb = train_model(X_ntrain, X_test, y_ntrain, params=xgb_params, folds=folds, model_type='xgb', eval_type='regression')
print('='*10,'分类模型','='*10)
xgb_params['objective'] = 'binary:logistic'
xgb_params['metric']    = 'binary_logloss'
oof_bxgb, predictions_bxgb, scores_bxgb = train_model(X_train , X_test, y_train_binary, params=xgb_params, folds=folds, model_type='xgb', eval_type='binary')

Fold 0 started at Sun Mar 20 19:57:45 2022
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:3.90088	valid_data-rmse:4.07156
[100]	train-rmse:3.11743	valid_data-rmse:3.79847
[200]	train-rmse:2.96223	valid_data-rmse:3.79919
[286]	train-rmse:2.84050	valid_data-rmse:3.80300
[-0.19234598 -0.02435767 -0.15576263 ...  0.06688782 -0.16866477
  0.00856968]
Fold 1 started at Sun Mar 20 20:19:53 2022
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:3.93080	valid_data-rmse:3.79335
[100]	train-rmse:3.14149	valid

In [20]:
sub_df = pd.read_csv('../../data/sample_submission.csv')
sub_df["target"] = predictions_xgb
sub_df.to_csv('../../predictions_xgb.csv', index=False)

oof_xgb  = pd.DataFrame(oof_xgb)
oof_nxgb = pd.DataFrame(oof_nxgb)
oof_bxgb = pd.DataFrame(oof_bxgb)

predictions_xgb  = pd.DataFrame(predictions_xgb)
predictions_nxgb = pd.DataFrame(predictions_nxgb)
predictions_bxgb = pd.DataFrame(predictions_bxgb)

oof_xgb.to_csv('../../result/oof_xgb.csv',header=None,index=False)
oof_bxgb.to_csv('../../result/oof_bxgb.csv',header=None,index=False)
oof_nxgb.to_csv('../../result/oof_nxgb.csv',header=None,index=False)

predictions_xgb.to_csv('../../result/predictions_xgb.csv',header=None,index=False)
predictions_nxgb.to_csv('../../result/predictions_nxgb.csv',header=None,index=False)
predictions_bxgb.to_csv('../../result/predictions_bxgb.csv',header=None,index=False)

In [21]:
#### cat
cat_params = {'learning_rate': 0.05, 'depth': 9, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
              'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
folds = KFold(n_splits=10, shuffle=True, random_state=18)
print('='*10,'回归模型','='*10)
oof_cat , predictions_cat , scores_cat  = train_model(X_train , X_test, y_train , params=cat_params, folds=folds, model_type='cat', eval_type='regression')
print('='*10,'without outliers 回归模型','='*10)
oof_ncat, predictions_ncat, scores_ncat = train_model(X_ntrain, X_test, y_ntrain, params=cat_params, folds=folds, model_type='cat', eval_type='regression')
print('='*10,'分类模型','='*10)
oof_bcat, predictions_bcat, scores_bcat = train_model(X_train , X_test, y_train_binary, params=cat_params, folds=folds, model_type='cat', eval_type='binary')

Fold 0 started at Mon Mar 21 22:12:00 2022
0:	learn: 3.8322414	test: 3.8757204	best: 3.8757204 (0)	total: 518ms	remaining: 2h 52m 47s
100:	learn: 3.5854293	test: 3.6786430	best: 3.6786430 (100)	total: 15.1s	remaining: 49m 25s
200:	learn: 3.5366978	test: 3.6671911	best: 3.6671911 (200)	total: 29.6s	remaining: 48m 35s
300:	learn: 3.5036971	test: 3.6633983	best: 3.6633417 (293)	total: 44.1s	remaining: 48m 4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 3.66227846
bestIteration = 344

Shrink model to first 345 iterations.
[-0.336061   -0.03190832 -0.09629481 ...  0.09041586 -0.31964174
  0.01571148]
Fold 1 started at Mon Mar 21 22:13:47 2022
0:	learn: 3.8423150	test: 3.7859420	best: 3.7859420 (0)	total: 433ms	remaining: 2h 24m 23s
100:	learn: 3.5946278	test: 3.6029010	best: 3.6029010 (100)	total: 16.1s	remaining: 52m 49s
200:	learn: 3.5457570	test: 3.5949553	best: 3.5949553 (200)	total: 29.9s	remaining: 49m 5s
300:	learn: 3.5111864	test: 3.5904780	best: 3.5904780 (300)

In [22]:
sub_df = pd.read_csv('../../data/sample_submission.csv')
sub_df["target"] = predictions_cat
sub_df.to_csv('../../predictions_cat.csv', index=False)

oof_cat  = pd.DataFrame(oof_cat)
oof_ncat = pd.DataFrame(oof_ncat)
oof_bcat = pd.DataFrame(oof_bcat)

predictions_cat  = pd.DataFrame(predictions_cat)
predictions_ncat = pd.DataFrame(predictions_ncat)
predictions_bcat = pd.DataFrame(predictions_bcat)

oof_cat.to_csv('../../result/oof_cat.csv',header=None,index=False)
oof_bcat.to_csv('../../result/oof_bcat.csv',header=None,index=False)
oof_ncat.to_csv('../../result/oof_ncat.csv',header=None,index=False)

predictions_cat.to_csv('../../result/predictions_cat.csv',header=None,index=False)
predictions_ncat.to_csv('../../result/predictions_ncat.csv',header=None,index=False)
predictions_bcat.to_csv('../../result/predictions_bcat.csv',header=None,index=False)

## 融合阶段

In [None]:
# #### lgb
# oof_lgb  = pd.read_csv('./result/oof_lgb.csv',header=None)
# oof_nlgb = pd.read_csv('./result/oof_nlgb.csv',header=None)
# oof_blgb = pd.read_csv('./result/oof_blgb.csv',header=None)

# predictions_lgb  = pd.read_csv('./result/predictions_lgb.csv',header=None)
# predictions_nlgb = pd.read_csv('./result/predictions_nlgb.csv',header=None)
# predictions_blgb = pd.read_csv('./result/predictions_blgb.csv',header=None)

# #### xgb
# oof_xgb  = pd.read_csv('./result/oof_xgb.csv',header=None)
# oof_nxgb = pd.read_csv('./result/oof_nxgb.csv',header=None)
# oof_bxgb = pd.read_csv('./result/oof_bxgb.csv',header=None)

# predictions_xgb  = pd.read_csv('./result/predictions_xgb.csv',header=None)
# predictions_nxgb = pd.read_csv('./result/predictions_nxgb.csv',header=None)
# predictions_bxgb = pd.read_csv('./result/predictions_bxgb.csv',header=None)

# #### cat
# oof_cat  = pd.read_csv('./result/oof_cat.csv',header=None)
# oof_ncat = pd.read_csv('./result/oof_ncat.csv',header=None)
# oof_bcat = pd.read_csv('./result/oof_bcat.csv',header=None)

# predictions_cat  = pd.read_csv('./result/predictions_cat.csv',header=None)
# predictions_ncat = pd.read_csv('./result/predictions_ncat.csv',header=None)
# predictions_bcat = pd.read_csv('./result/predictions_bcat.csv',header=None)

### 加权融合

In [23]:
sub_df = pd.read_csv('../../data/sample_submission.csv')
sub_df["target"] = (predictions_lgb + predictions_xgb + predictions_cat) / 3
sub_df.to_csv('../../predictions_wei_average.csv', index=False)

### Stacking融合

In [24]:
#### stack 回归模型  without-outliers回归模型 分类模型
def stack_model(oof_1, oof_2, oof_3, predictions_1, predictions_2, predictions_3, y, eval_type='regression'):
   
    train_stack = np.vstack([oof_1, oof_2]).transpose()
    test_stack = np.vstack([predictions_1, predictions_2]).transpose()
    from sklearn.model_selection import RepeatedKFold
    folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2020)
    oof = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, y)):
        print("fold n°{}".format(fold_+1))
        trn_data, trn_y = train_stack[trn_idx], y[trn_idx]
        val_data, val_y = train_stack[val_idx], y[val_idx]
        print("-" * 10 + "Stacking " + str(fold_) + "-" * 10)
        clf = BayesianRidge()
        clf.fit(trn_data, trn_y)

        oof[val_idx] = clf.predict(val_data)
        predictions += clf.predict(test_stack) / (5 * 2)
    if eval_type == 'regression':
        print('mean: ',np.sqrt(mean_squared_error(y, oof)))
    if eval_type == 'binary':
        print('mean: ',log_loss(y, oof))
    
    return oof, predictions
print('='*30)
oof_stack , predictions_stack  = stack_model(oof_lgb[0] , oof_xgb[0] , oof_cat[0] , predictions_lgb[0] , predictions_xgb[0] , predictions_cat[0] , target)
print('='*30)
oof_nstack, predictions_nstack = stack_model(oof_nlgb[0], oof_nxgb[0], oof_ncat[0], predictions_nlgb[0], predictions_nxgb[0], predictions_ncat[0], ntarget)
print('='*30)
oof_bstack, predictions_bstack = stack_model(oof_blgb[0], oof_bxgb[0], oof_bcat[0], predictions_blgb[0], predictions_bxgb[0], predictions_bcat[0], target_binary, eval_type='binary')

fold n°1
----------Stacking 0----------
fold n°2
----------Stacking 1----------
fold n°3
----------Stacking 2----------
fold n°4
----------Stacking 3----------
fold n°5
----------Stacking 4----------
fold n°6
----------Stacking 5----------
fold n°7
----------Stacking 6----------
fold n°8
----------Stacking 7----------
fold n°9
----------Stacking 8----------
fold n°10
----------Stacking 9----------
mean:  3.639551215295588
fold n°1
----------Stacking 0----------
fold n°2
----------Stacking 1----------
fold n°3
----------Stacking 2----------
fold n°4
----------Stacking 3----------
fold n°5
----------Stacking 4----------
fold n°6
----------Stacking 5----------
fold n°7
----------Stacking 6----------
fold n°8
----------Stacking 7----------
fold n°9
----------Stacking 8----------
fold n°10
----------Stacking 9----------
mean:  1.548240201689886
fold n°1
----------Stacking 0----------
fold n°2
----------Stacking 1----------
fold n°3
----------Stacking 2----------
fold n°4
----------Stacking 

In [25]:
sub_df = pd.read_csv('../../data/sample_submission.csv')
sub_df["target"] = predictions_stack
sub_df.to_csv('../../predictions_stack.csv', index=False)

## Trick融合

In [26]:
sub_df = pd.read_csv('../../data/sample_submission.csv')
sub_df["target"] = predictions_bstack*-33.219281 + (1-predictions_bstack)*predictions_nstack
sub_df.to_csv('../../predictions_trick.csv', index=False)

In [27]:
sub_df = pd.read_csv('../../data/sample_submission.csv')
sub_df["target"] = (predictions_bstack*-33.219281 + (1-predictions_bstack)*predictions_nstack)*0.5 + predictions_stack*0.5
sub_df.to_csv('../../predictions_trick&stacking.csv', index=False)