In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
from sklearn.model_selection import TimeSeriesSplit

import math
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/LightGBM warnings that may point at real problems.
warnings.filterwarnings('ignore')

# Show all columns in head()
pd.set_option('display.max_columns', None)

In [2]:
# Load the train and test tables from CSV files in the working directory.
train_df = pd.read_csv('wns_train_df.csv')
test_df = pd.read_csv('wns_test_df.csv')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
train_df.head()

Unnamed: 0,impression_id,user_id,app_code,is_4G,is_click,items_count,sessions_count,ptypes_count,item_price_mean,c1_count,c2_count,c3_count,year,month,dow,hour,minute,seconds,day,intermediate,latest,old,app_code_207,app_code_386,app_code_190,app_code_127,app_code_371,app_code_3,app_code_296,app_code_129,app_code_385,app_code_32,app_code_5,app_code_249,app_code_469,app_count,device_type
0,c4ca4238a0b923820dcc509a6f75849b,87862,422,0,0,1,1,1,2350.0,1,1,1,2018,11,3,0,0,1,15,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1
1,45c48cce2e2d7fbdea1afc51c7c6ad26,63410,467,1,1,12,7,2,4452.833333,1,1,1,2018,11,3,0,0,1,15,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52,1
2,70efdf2ec9b086079795c442636b55fb,71748,259,1,0,2,2,1,1598.5,1,1,1,2018,11,3,0,0,1,15,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,1
3,8e296a067a37563370ded05f5a3bf3ec,69209,244,1,0,18,7,3,9963.388889,1,1,1,2018,11,3,0,0,1,15,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1
4,182be0c5cdcd5072bb1864cdee4d3d6e,62873,473,0,0,45,24,5,8796.4,9,4,6,2018,11,3,0,0,1,15,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,1


In [4]:
#### Seeder
def seed_everything(seed=0):
    """Seed the RNGs this notebook relies on so repeated runs give the same numbers.

    Seeds Python's ``random`` module and NumPy's global generator, and exports
    ``PYTHONHASHSEED`` into the process environment.

    :param seed: integer used for every seeding call (default 0)

    NOTE(review): PYTHONHASHSEED is presumably only read at interpreter start-up,
    so exporting it here may affect child processes only -- confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

########################### Vars
# Single seed shared by every stochastic step in the notebook.
SEED = 42
seed_everything(SEED)


In [5]:

# Run configuration.
LOCAL_TEST = False
TARGET = 'is_click'

# Columns that must never be used as model features.
rm_cols = ['impression_id', TARGET]

# Give the test frame a constant placeholder target so train and test share a schema.
test_df[TARGET] = 0

In [6]:

########################### Features elimination
# ks_2samp is a two-sided test for the null hypothesis that 2 independent
# samples are drawn from the same continuous distribution.
from scipy.stats import ks_2samp

# Keep the DataFrame's own column order: iterating a set gives an arbitrary,
# per-process order, which made the ordering of tied p-values non-reproducible.
columns_to_check = [col for col in train_df.columns if col not in rm_cols]

# p-value of the train-vs-test KS test for every candidate feature,
# smallest (most different distributions) first.
features_check = pd.Series(
    [ks_2samp(test_df[col], train_df[col])[1] for col in columns_to_check],
    index=columns_to_check,
).sort_values(kind='mergesort')  # stable sort keeps ties in column order

# Features whose p-value is exactly 0, i.e. train/test distributions clearly differ.
features_discard = list(features_check[features_check == 0].index)
print(features_discard)

# We will reset this list for now (use local test drop),
# with better checking
features_discard = []

# Final features list
excluded_columns = set(rm_cols + features_discard)
features_columns = [col for col in train_df.columns if col not in excluded_columns]

print(features_columns)

['app_count']
['user_id', 'app_code', 'is_4G', 'items_count', 'sessions_count', 'ptypes_count', 'item_price_mean', 'c1_count', 'c2_count', 'c3_count', 'year', 'month', 'dow', 'hour', 'minute', 'seconds', 'day', 'intermediate', 'latest', 'old', 'app_code_207', 'app_code_386', 'app_code_190', 'app_code_127', 'app_code_371', 'app_code_3', 'app_code_296', 'app_code_129', 'app_code_385', 'app_code_32', 'app_code_5', 'app_code_249', 'app_code_469', 'app_count', 'device_type']


In [7]:
# Number of cross-validation splits.
# Something in the range 5-10 is typical:
# 5 is a common number of splits,
# 10+ is too many (we will not have enough diversity in data).
# Lower it (e.g. to 3) for faster training if needed.
N_SPLITS = 5

In [8]:
########################### Model
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=2):
    """Train one LightGBM model per stratified fold and average their test predictions.

    Parameters
    ----------
    tr_df : pandas.DataFrame
        Training rows; must contain ``features_columns`` and ``target``.
    tt_df : pandas.DataFrame
        Rows to score; must contain ``features_columns``, ``target`` and
        ``'impression_id'``.
    features_columns : list of str
        Columns fed to the model.
    target : str
        Name of the label column.
    lgb_params : dict
        Parameters passed unchanged to ``lgb.train``.
    NFOLDS : int, default 2
        Number of stratified folds. It is used both to build the splitter and to
        average the per-fold predictions (previously this argument was ignored in
        favour of the module-level ``N_SPLITS``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``tt_df[['impression_id', target]]`` with an added
        ``'prediction'`` column holding the mean prediction over all folds.
    """
    # CV: stratified k-fold, shuffled with the notebook-wide seed.
    folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    X, y = tr_df[features_columns], tr_df[target]
    P, P_y = tt_df[features_columns], tt_df[target]

    # Explicit copy so adding the 'prediction' column below never writes into
    # (a view of) the caller's frame.
    tt_df = tt_df[['impression_id', target]].copy()
    predictions = np.zeros(len(tt_df))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold:', fold_)
        # The splitter yields positional indices, so index labels positionally too;
        # label-based y[trn_idx] is only correct for a default RangeIndex.
        tr_x, tr_y = X.iloc[trn_idx, :], y.iloc[trn_idx]
        vl_x, vl_y = X.iloc[val_idx, :], y.iloc[val_idx]

        print(len(tr_x), len(vl_x))
        tr_data = lgb.Dataset(tr_x, label=tr_y)

        # In local-test mode the scored frame itself is the validation set;
        # otherwise the held-out fold is.
        if LOCAL_TEST:
            vl_data = lgb.Dataset(P, label=P_y)
        else:
            vl_data = lgb.Dataset(vl_x, label=vl_y)

        # NOTE(review): `verbose_eval` appears to have been removed from lgb.train
        # in LightGBM 4.x -- confirm before upgrading and switch to
        # callbacks=[lgb.log_evaluation(200)] if so.
        estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets = [tr_data, vl_data],
            verbose_eval = 200,
        )

        # Equal-weight average of the per-fold predictions.
        pp_p = estimator.predict(P)
        predictions += pp_p / NFOLDS

        if LOCAL_TEST:
            feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(), X.columns)), columns=['Value','Feature'])
            print(feature_imp)

        # Free the per-fold objects before the next iteration.
        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()

    tt_df['prediction'] = predictions

    return tt_df
## -------------------

In [9]:
########################### Model params
# Base LightGBM configuration. learning_rate, n_estimators and
# early_stopping_rounds are reassigned right before training.
lgb_params = dict(
    # Task definition
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    # Tree shape / learning schedule
    learning_rate=0.01,
    num_leaves=2**8,
    max_depth=-1,
    tree_learner='serial',
    # Column and row subsampling
    colsample_bytree=0.7,
    subsample_freq=1,
    subsample=0.7,
    # Training length and binning
    n_estimators=800,
    max_bin=255,
    verbose=-1,
    seed=SEED,
    early_stopping_rounds=30,
)


In [10]:
########################### Model Train
if LOCAL_TEST:
    # Local validation needs real labels. test_df only carries the constant
    # placeholder target (all zeros), for which ROC AUC is undefined, so score
    # against a stratified holdout carved out of train_df instead.
    lgb_params['learning_rate'] = 0.01
    lgb_params['n_estimators'] = 20000
    lgb_params['early_stopping_rounds'] = 100
    local_train_df, local_holdout_df = train_test_split(
        train_df,
        test_size=0.2,
        stratify=train_df[TARGET],
        random_state=SEED,
    )
    # Fresh 0..n-1 index so positional fold indices line up with row labels.
    local_train_df = local_train_df.reset_index(drop=True)
    local_holdout_df = local_holdout_df.reset_index(drop=True)
    test_predictions = make_predictions(
        local_train_df, local_holdout_df, features_columns, TARGET, lgb_params,
        NFOLDS=N_SPLITS,
    )
    # NOTE(review): the holdout also drives early stopping inside make_predictions,
    # so this AUC is presumably somewhat optimistic.
    print(metrics.roc_auc_score(test_predictions[TARGET], test_predictions['prediction']))
else:
    # Submission run: train on all of train_df and score the real test set.
    lgb_params['learning_rate'] = 0.01
    lgb_params['n_estimators'] = 500
    lgb_params['early_stopping_rounds'] = 30
    test_predictions = make_predictions(
        train_df, test_df, features_columns, TARGET, lgb_params,
        NFOLDS=N_SPLITS,
    )

Fold: 0
190086 47523
Training until validation scores don't improve for 30 rounds.
[200]	training's auc: 0.850898	valid_1's auc: 0.74058
[400]	training's auc: 0.899021	valid_1's auc: 0.743408
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.913872	valid_1's auc: 0.743899
Fold: 1
190086 47523
Training until validation scores don't improve for 30 rounds.
[200]	training's auc: 0.8497	valid_1's auc: 0.745485
[400]	training's auc: 0.896884	valid_1's auc: 0.750536
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.912398	valid_1's auc: 0.751111
Fold: 2
190088 47521
Training until validation scores don't improve for 30 rounds.
[200]	training's auc: 0.852002	valid_1's auc: 0.735311
[400]	training's auc: 0.898426	valid_1's auc: 0.739993
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.913468	valid_1's auc: 0.740867
Fold: 3
190088 47521
Training until validation scores don't improve for 30 rounds.
[200]	training's auc: 0.851432	v

In [11]:
if not LOCAL_TEST:
    # Replace the placeholder target with the fold-averaged prediction and
    # write the two submission columns to disk.
    test_predictions['is_click'] = test_predictions['prediction']
    submission_columns = ['impression_id', 'is_click']
    test_predictions[submission_columns].to_csv('lgb_submission_wns.csv', index=False)

    # Report the sizes of the frames the submission was built from.
    for frame in (train_df, test_df):
        print(frame.shape)

(237609, 37)
(90675, 37)
