In [1]:
import pandas as pd 
import numpy as np 
import lightgbm as lgb

In [2]:
%%time 
df_train = pd.read_pickle('../input/feats/df_trn_feat0.pkl')
df_test = pd.read_pickle('../input/feats/df_test_feat0.pkl')

Wall time: 157 ms


In [3]:
df_train['price_log1'] = df_train.price.apply(np.log1p).astype('float16').fillna(-999)
df_test['price_log1'] = df_test.price.apply(np.log1p).astype('float16').fillna(-999)

In [4]:
%%time 
text_feat = 'txt_svd_feat2.h5'
with pd.HDFStore('../input/feats/{}'.format(text_feat)) as store:
    print(store.keys())
    df_trn_text_feat  = store['df_trn_text_feats']
    df_test_text_feat = store['df_test_text_feats']

['/df_test_text_feats', '/df_trn_text_feats']
Wall time: 168 ms


In [5]:
%%time 

with pd.HDFStore('../input/feats/txt_td_tfidf_hash5.h5') as store:
    print(store.keys())
    df_trn_text_feat1  = store['df_trn_text_feat']
    df_test_text_feat1 = store['df_test_text_feat']

['/df_test_text_feat', '/df_trn_text_feat']
Wall time: 123 ms


In [5]:
# with pd.HDFStore('../input/feats/df_mean_target_cv2.h5') as store:
#     print(store.keys())
#     df_trn_mean_target = store['df_trn_encoded']
#     df_test_mean_target = store['df_test_encoded']    

['/df_test_encoded', '/df_trn_encoded']


In [6]:
df_train = pd.concat([df_train,  df_trn_text_feat1],axis=1)
df_test = pd.concat([df_test, df_test_text_feat1],axis=1)

In [7]:
df_train.columns

Index(['uidx', 'iidx', 'iid', 'region_city_label', 'tit_len', 'desc_len',
       'activation_date', 'month', 'day', 'weekday', 'param_1', 'param_2',
       'param_3', 'user_type', 'parent_category_name', 'price',
       'category_name', 'image_top_1', 'ads_cnt_by_uid', 'ads_cnt_by_iid',
       'deal_probability', 'price_log1', 'title_tfidf_0', 'title_tfidf_1',
       'title_tfidf_2', 'title_tfidf_3', 'title_tfidf_4', 'title_hash_0',
       'title_hash_1', 'title_hash_2', 'title_hash_3', 'title_hash_4',
       'desc_tfidf_0', 'desc_tfidf_1', 'desc_tfidf_2', 'desc_tfidf_3',
       'desc_tfidf_4', 'desc_hash_0', 'desc_hash_1', 'desc_hash_2',
       'desc_hash_3', 'desc_hash_4'],
      dtype='object')

train/ test split for a known answers datasets

In [8]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_error

In [15]:
num_bucket=100
seed = 99 ## decide random seed for cutting te/tr split
y_train = df_train.deal_probability
nfolds = 10 ## 1/nfolds  for testing used

percentiles = np.percentile(y_train, np.linspace(0, 100, num_bucket)) ## create percentiles 
falls_into = np.searchsorted(percentiles,y_train) ## fall into bucket

skf = StratifiedKFold(nfolds, random_state=seed) ## random state
skf.get_n_splits(df_train, falls_into)

trn_idx, te_idx = next(skf.split(df_train,falls_into))

df_te = df_train.iloc[te_idx].copy()
df_tr = df_train.iloc[trn_idx].copy()


model and validate on `df_tr`

- train test split (simple hold out)

In [16]:
selcols = [
    'uidx','iidx','tit_len','desc_len','price_log1', ## basic 
    'region_city_label',
    'day','weekday',
    'param_1','param_2','param_3',
    'user_type',
    'parent_category_name','category_name','image_top_1',
    'ads_cnt_by_uid','ads_cnt_by_iid'
#     'mean_region_city_label','mean_param_1','mean_param_2','mean_param_3', ## mean
#     'mean_user_type','mean_parent_category_name','mean_category_name','mean_image_top_1' ### mean    
]

text_cols = []
# text_cols = ['text_feat_{}'.format(e) for e in range(5)]
### text tfidf/
for x in ['title','desc']:
    for y in ['tfidf','hash']:
        for i in range(5):
            text_cols.append('{}_{}_{}'.format(x,y,i))
selcols += text_cols


categorical = [
    'region_city_label',
    'user_type',
    'image_top_1',
    'param_1','param_2','param_3',
    'category_name','parent_category_name'
]
y_train = df_tr.deal_probability
# df_tr.drop('deal_probability',inplace=True, axis=1)


X_train = df_tr[selcols]

In [17]:
X_train.shape

(1353066, 37)

In [22]:
X_trn, X_val, y_trn, y_val = train_test_split(X_train,y_train,test_size=0.2, random_state=1)

lgtrain = lgb.Dataset(X_trn, y_trn,categorical_feature = categorical)
lgvalid = lgb.Dataset(X_val, y_val,categorical_feature = categorical)

lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',     
    'max_depth': 5,
    'num_leaves': 33,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,    
    'learning_rate': 0.1,
    'num_boost_round': 10000
#     'categorical':categorical
}  

# Go Go Go
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,    
    valid_sets = lgvalid,
    valid_names= 'valid',
    early_stopping_rounds=10,
    verbose_eval=100
)
print("Model Evaluation Stage")
print('RMSE:', np.sqrt(mean_squared_error(y_val, lgb_clf.predict(X_val))))



Training until validation scores don't improve for 10 rounds.
[100]	valid's rmse: 0.227237
[200]	valid's rmse: 0.225732
[300]	valid's rmse: 0.224931
[400]	valid's rmse: 0.224442
[500]	valid's rmse: 0.224134
[600]	valid's rmse: 0.223896
Early stopping, best iteration is:
[666]	valid's rmse: 0.223813
Model Evaluation Stage
RMSE: 0.223812775375


In [24]:
y_te = df_te.deal_probability
print('RMSE in hold out test:{:.5f}'.format(np.sqrt(mean_squared_error(y_te,lgb_clf.predict(df_te[selcols])))))

RMSE in hold out test:0.22387


In [25]:
df_test2 = pd.read_pickle('../input/test.pkl')
itemid = df_test2.item_id 

In [26]:
y_pred0 = pd.DataFrame()
y_pred0['item_id'] = itemid
y_pred0['deal_probability'] = lgb_clf.predict(df_test[selcols]).clip(0,1)

In [27]:
y_pred0.head()

Unnamed: 0,item_id,deal_probability
0,6544e41a8817,0.057642
1,65b9484d670f,0.230857
2,8bab230b2ecd,0.151402
3,8e348601fefc,0.191676
4,8bd2fe400b89,0.222705


In [29]:
fileno = 14
y_pred0.to_csv('../output/sub/sub_{}.csv'.format(fileno),index=False)

In [28]:
feat_imp = pd.DataFrame({'f':lgb_clf.feature_name(),
                         'split':lgb_clf.feature_importance(importance_type='split'),
                         'gain':lgb_clf.feature_importance(importance_type='gain')}).sort_values(by='split',ascending=False)
feat_imp.head(10)

Unnamed: 0,f,gain,split
5,region_city_label,9534.229616,3304
14,image_top_1,20682.303359,3012
8,param_1,30660.174146,2085
4,price_log1,5746.280673,1242
13,category_name,28219.747931,796
10,param_3,3160.106969,624
9,param_2,6796.610172,571
16,ads_cnt_by_iid,590.868669,419
28,desc_tfidf_1,1324.068148,394
3,desc_len,823.900116,393


___

train on all datasets for submission

In [25]:
X_train = df_train[selcols]
y_train = df_train.deal_probability 
X_val   = df_tr[selcols]
y_val   = df_tr.deal_probability

lgtrain = lgb.Dataset(X_train, y_train,
                    categorical_feature = categorical)
lgvalid = lgb.Dataset(X_val, y_val,                
                categorical_feature = categorical)

lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,    
    valid_sets = [lgtrain,lgvalid],
    valid_names= ['train','valid'],
    early_stopping_rounds=10,
    verbose_eval=100
)





Training until validation scores don't improve for 10 rounds.
[100]	train's rmse: 0.22178	valid's rmse: 0.221735
Did not meet early stopping. Best iteration is:
[100]	train's rmse: 0.22178	valid's rmse: 0.221735


# submit

In [16]:
df_test2 = pd.read_pickle('../input/test.pkl')
itemid = df_test2.item_id 

In [17]:
y_pred = pd.DataFrame()
y_pred['item_id'] = itemid
y_pred['deal_probability'] = lgb_clf.predict(df_test[selcols]).clip(0,1)

In [18]:
y_pred.head()

Unnamed: 0,item_id,deal_probability
0,6544e41a8817,0.061893
1,65b9484d670f,0.294296
2,8bab230b2ecd,0.214183
3,8e348601fefc,0.143842
4,8bd2fe400b89,0.246829


In [18]:
feat_imp = pd.DataFrame({'f':lgb_clf.feature_name(),
                         'split':lgb_clf.feature_importance(importance_type='split'),
                         'gain':lgb_clf.feature_importance(importance_type='gain')}).sort_values(by='split',ascending=False)
feat_imp.head(10)

Unnamed: 0,f,gain,split
14,image_top_1,23474.255153,1130
5,region_city_label,5308.658703,529
8,param_1,42645.431335,419
4,price_log1,5812.561424,230
9,param_2,10154.363106,159
13,category_name,35308.752316,141
10,param_3,2422.239346,106
3,desc_len,1195.364289,55
15,ads_cnt_by_uid,8172.49613,50
33,desc_tfidf_1,1720.456072,47


History 

- 13 local  :0.22368 (lb:0.2290)
- 14 local : 0.22376 (lb:0.2290)
- 14 local : 0.22387 (lb:0.2281)