In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline 

In [3]:
# Load the training frame from a pickle (presumably precomputed features, judging by the file name).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df_train = pd.read_pickle('../input/feats/df_trn_feat0.pkl')

In [5]:
# log(1 + price), computed by calling the ufunc on the whole column at once.
df_train['price_log1'] = np.log1p(df_train['price'])

In [7]:
# Show the columns available after adding price_log1.
df_train.columns

Index(['uidx', 'iidx', 'iid', 'region_city_label', 'tit_len', 'desc_len',
       'activation_date', 'month', 'day', 'weekday', 'param_1', 'param_2',
       'param_3', 'user_type', 'parent_category_name', 'price',
       'category_name', 'image_top_1', 'ads_cnt_by_uid', 'ads_cnt_by_iid',
       'deal_probability', 'price_log1'],
      dtype='object')

In [8]:
# Boolean masks for the two extreme target values, and the matching row subsets.
mask0 = df_train['deal_probability'].eq(0)
mask1 = df_train['deal_probability'].eq(1)
X_trn_deal_0, X_trn_deal_1 = df_train.loc[mask0], df_train.loc[mask1]

In [9]:
# Number of rows (and columns) with deal_probability == 0.
X_trn_deal_0.shape

(974618, 22)

In [10]:
# Number of rows (and columns) with deal_probability == 1.
X_trn_deal_1.shape

(10076, 22)

`deal_probability` = 0 --> 974,618 of ~1.5 million training rows (roughly 65%, i.e. well over half)

In [11]:
import lightgbm as lgb

In [12]:
# Binary indicator column: 1 where mask0 is True (deal_probability == 0), else 0.
df_train['y_zero'] = mask0.astype('uint8')

In [20]:
# Columns kept out of the feature matrix: both targets, the date, the raw id
# and the raw price (its log1p version, price_log1, stays in).
drop_cols = ['deal_probability','activation_date','iid','y_zero','price']
selcols = [col for col in df_train.columns if col not in drop_cols]

# Independent copy of the features, plus the binary "is zero" target named y_zero.
X_train = df_train[selcols].copy()
y_train = mask0.astype('uint8').rename('y_zero')

In [14]:
# Feature names, in the column order of X_train.
predictors = list(X_train.columns)

In [15]:
# Display the feature list for a visual check.
predictors

['uidx',
 'iidx',
 'region_city_label',
 'tit_len',
 'desc_len',
 'month',
 'day',
 'weekday',
 'param_1',
 'param_2',
 'param_3',
 'user_type',
 'parent_category_name',
 'category_name',
 'image_top_1',
 'ads_cnt_by_uid',
 'ads_cnt_by_iid',
 'price_log1']

Cross-validation for parameter tuning ...

In [38]:
# LightGBM settings for the binary "deal_probability == 0" classifier, scored by AUC.
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',    
    'max_depth': -1,
    'num_leaves': 33,
    'feature_fraction': 0.7,
    # NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
    # effect when bagging_freq > 0; with bagging_freq commented out below this
    # setting is presumably inert - confirm and either enable or remove it.
    'bagging_fraction': 0.8,
    # 'bagging_freq': 5,
    'learning_rate': 0.1
} 
# Same value as the earlier `predictors` assignment; repeated so this cell also
# works when run on its own after X_train exists.
predictors = list(X_train.columns)

# Columns handed to LightGBM as categorical features.
# NOTE(review): assumes these columns are already integer/label encoded - confirm upstream.
categorical = ['month','day','weekday',
               'param_1','param_2','param_3',
               'category_name','parent_category_name',
               'region_city_label',
               'user_type']

In [33]:
# Wrap the full training data once; feature names and categorical columns come
# from the notebook-level `predictors` and `categorical` lists.
dtrain = lgb.Dataset(X_train, label=y_train, feature_name=predictors, categorical_feature=categorical)
# 4-fold, non-stratified CV capped at 10000 rounds, with early_stopping_rounds=10
# and the aggregated metric logged every 100 rounds.
# NOTE(review): cv_results is not used later in the visible cells; the chosen
# number of rounds is only read off the printed log.
cv_results = lgb.cv(lgbm_params, dtrain, 
                    num_boost_round=10000, 
                    nfold=4,
                    stratified=False,
                    early_stopping_rounds=10,
                    verbose_eval= 100)
print('done')



[100]	cv_agg's auc: 0.806964 + 0.000552226
[200]	cv_agg's auc: 0.810522 + 0.000576355
[300]	cv_agg's auc: 0.812095 + 0.00058868
[400]	cv_agg's auc: 0.813173 + 0.000602578
[500]	cv_agg's auc: 0.813967 + 0.000567106
[600]	cv_agg's auc: 0.814497 + 0.000529122
[700]	cv_agg's auc: 0.814898 + 0.00054234
[800]	cv_agg's auc: 0.815214 + 0.000522802
[900]	cv_agg's auc: 0.815458 + 0.000469385
[1000]	cv_agg's auc: 0.815703 + 0.000461226
[1100]	cv_agg's auc: 0.815895 + 0.000449416
[1200]	cv_agg's auc: 0.816041 + 0.000431564
[1300]	cv_agg's auc: 0.816204 + 0.000456995
[1400]	cv_agg's auc: 0.816334 + 0.000470158
[1500]	cv_agg's auc: 0.81641 + 0.000463659
done


Use a model trained on the other k-1 folds to predict the current fold; the resulting out-of-fold predictions form the `meta_y_zero` feature.

In [22]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

In [57]:
def oof(X_train, y_train,kf, params):
    """Build out-of-fold (OOF) predictions with LightGBM.

    For every split produced by ``kf`` a booster is trained on the training
    part and used to predict the held-out part, so each row receives a
    prediction from a model that was not fitted on it.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; rows are selected positionally via ``.iloc`` and the
        column names are used as LightGBM feature names.
    y_train : pandas.Series
        Binary target aligned row-for-row with ``X_train``.
    kf : object with a ``split(X)`` method
        Yields ``(train_idx, valid_idx)`` positional index arrays, e.g. a
        ``sklearn.model_selection.KFold`` instance.
    params : dict
        LightGBM parameters forwarded to ``lgb.train``.

    Returns
    -------
    tuple
        ``(y_pred_all, lgb_clf)``: the OOF prediction array (one value per
        row of ``y_train``) and the booster trained on the LAST fold only.

    Notes
    -----
    * The categorical feature list is read from the notebook-level
      ``categorical`` variable at call time.
    * The held-out fold is also the early-stopping set, so the printed AUCs
      can be slightly optimistic.
    """
    # Use the columns of the frame actually passed in, not a notebook global,
    # so the names always match the data.
    feature_names = list(X_train.columns)
    y_pred_all = np.zeros(y_train.shape[0])
    lgb_clf = None

    for nfold, (trn_idx, val_idx) in enumerate(kf.split(X_train)):

        x_trn, y_trn = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
        x_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        dtrain = lgb.Dataset(x_trn, y_trn, feature_name=feature_names, categorical_feature=categorical)
        dval   = lgb.Dataset(x_val, y_val, feature_name=feature_names, categorical_feature=categorical)

        lgb_clf = lgb.train(params, num_boost_round=10000,
                            train_set=dtrain,
                            valid_sets=[dval],
                            early_stopping_rounds=10,
                            verbose_eval=0)

        # Pin prediction to the early-stopped best iteration; some LightGBM
        # versions otherwise score with every tree, including those built
        # after the validation metric stopped improving.
        y_pred = lgb_clf.predict(x_val, num_iteration=lgb_clf.best_iteration)
        y_pred_all[val_idx] = y_pred
        auc = roc_auc_score(y_val, y_pred)
        print('fold:{}\t:{:.4f}'.format(nfold,auc))

    # Overall AUC across the stitched-together OOF predictions.
    auc = roc_auc_score(y_train,y_pred_all)
    print('--'*20)
    print('all auc:\t{:.4f}'.format(auc))
    return y_pred_all, lgb_clf


In [52]:
# Sanity check: a constant score carries no ranking information, so its ROC AUC is 0.5.
# Uses y_train because y_val is only assigned inside oof() in the cells shown,
# so referencing it here raises NameError on a fresh kernel.
baseline_pred = np.zeros(y_train.shape)
roc_auc_score(y_train, baseline_pred)

0.5

In [63]:
# Parameters for the out-of-fold run.
# NOTE(review): the CV cell used num_leaves=33; confirm that 32 here is intentional.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction needs
# bagging_freq > 0 to have any effect - confirm whether bagging is wanted.
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': -1,
    'num_leaves': 32,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,
    'learning_rate': 0.1
}
# Fixed seed so the shuffled fold assignment, and therefore the saved
# meta-feature, is reproducible across kernel restarts.
kf = KFold(4, shuffle=True, random_state=42)
# oof() returns (oof_predictions, last_fold_booster); unpack it so y_pred is
# the prediction array that the following cells call .astype() on and save.
y_pred, lgb_clf = oof(X_train, y_train, kf, lgbm_params)



fold:0	:0.8160
fold:1	:0.8165
fold:2	:0.8159
fold:3	:0.8162
----------------------------------------
all auc:	0.8162


In [79]:
# Store the OOF scores as float32, then line them up with the binary label and
# the original target to inspect their pairwise correlations.
y_pred = y_pred.astype('float32')
df_y_meta = pd.DataFrame(
    {
        'y_zero': y_train,
        'yhat_zero': y_pred,
        'deal_probability': df_train['deal_probability'],
    }
)
df_y_meta.corr()

Unnamed: 0,deal_probability,y_zero,yhat_zero
deal_probability,1.0,-0.726252,-0.393548
y_zero,-0.726252,1.0,0.539506
yhat_zero,-0.393548,0.539506,1.0


In [84]:
# Persist the OOF scores as a meta-feature; np.save appends the ".npy" extension,
# which is why the next cell loads 'meta_zero_deal.npy'.
np.save('../input/feats/meta_zero_deal',y_pred)

In [85]:
# Read the file back to confirm the array was written.
np.load('../input/feats/meta_zero_deal.npy')

array([ 0.77144444,  0.78732169,  0.69957447, ...,  0.42257649,
        0.82077223,  0.7648322 ], dtype=float32)