In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import pickle

# Seed NumPy's global RNG for reproducibility. The CV splitter and XGBoost are
# seeded separately in this notebook (random_state=777 / 'seed': 777).
np.random.seed(777)



## 01 Load Data

In [31]:
print('Load target')
# NOTE: pickle.load can execute arbitrary code; only load target files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open('../input/target.pkl', 'rb') as target_file:
    y = pd.DataFrame(pickle.load(target_file), columns=['target'])
trn = pd.read_csv('../input/train_append_lb_lag_fe_v3.csv')
rem_targets = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 21, 22, 23]  # 18 classes

# trim: keep only the rows whose target is one of the retained classes
keep_rows = y['target'].isin(rem_targets)
trn = trn[keep_rows]
# Encode the 1-D target column (not the one-column frame) so scikit-learn does
# not emit the column_or_1d DataConversionWarning; the encoded labels are the same.
y = LabelEncoder().fit_transform(y.loc[keep_rows, 'target'])

print('{} {}'.format(trn.shape, y.shape))

Load target
(45595, 379) (45595,)


  y = column_or_1d(y, warn=True)


## 02 Run Model

In [None]:
# Cross-validate on the features that survive the correlation filter.
# `main` and `drop_cols` are defined in cells further down this notebook and
# must have been executed before this cell.
bst, best_iter = main(
    np.asarray(trn.drop(drop_cols, axis=1)),
    np.asarray(y),
)

CV..
Iter 1 / 2


In [27]:
# -*- coding:utf8 -*-
"""
@author: Kweonwoo Jung
@brief:
		- collect preprocessed features
		- fit model and evaluate mlogloss
"""


def _predict_at_best_iteration(booster, dmatrix):
    """Predict with the trees up to the early-stopping optimum, not the full booster."""
    # Older xgboost releases expose `best_ntree_limit` after early stopping.
    ntree_limit = getattr(booster, 'best_ntree_limit', None)
    if ntree_limit:
        return booster.predict(dmatrix, ntree_limit=ntree_limit)
    # Newer releases dropped `ntree_limit` in favour of `iteration_range`.
    best_iteration = getattr(booster, 'best_iteration', None)
    if best_iteration is not None:
        return booster.predict(dmatrix, iteration_range=(0, best_iteration + 1))
    return booster.predict(dmatrix)


def main(x, y, n_splits=2, test_size=0.05, random_state=777):
    """Cross-validate the XGBoost configuration from `get_model` and report mlogloss.

    Parameters
    ----------
    x : array indexable by integer index arrays
        Feature matrix, one row per sample.
    y : array indexable by integer index arrays
        Integer-encoded class label per row of `x`.
    n_splits : int, optional
        Number of stratified shuffle splits to evaluate (default 2).
    test_size : float, optional
        Fraction of rows held out for validation in each split (default 0.05).
    random_state : int, optional
        Seed for the splitter (default 777).

    Returns
    -------
    bst : xgboost.Booster
        The booster trained on the LAST split only. It still contains every
        tree that was grown, not just those up to the best iteration.
    best_iter : float
        Mean of the early-stopping best iteration over all splits.
    """
    # get model
    params, num_round, early_stop = get_model()

    print('CV..')
    # split to 95:05 by default (sss/random split)
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                 random_state=random_state)
    trn_scores, vld_scores, best_iter = [], [], []
    for i, (t, v) in enumerate(sss.split(x, y)):
        print('Iter {} / {}'.format(i + 1, n_splits))
        x_trn, x_vld = x[t], x[v]
        y_trn, y_vld = y[t], y[v]

        dtrn = xgb.DMatrix(x_trn, label=y_trn)
        dvld = xgb.DMatrix(x_vld, label=y_vld)
        # early stopping monitors the last entry of the watch list ('eval')
        watch_list = [(dtrn, 'train'), (dvld, 'eval')]

        # fit xgb
        bst = xgb.train(params, dtrn, num_round, watch_list,
                        early_stopping_rounds=early_stop, verbose_eval=False)

        # Score at the best iteration so the reported losses correspond to the
        # reported BEST ITER rather than to the over-trained final booster.
        trn_preds = _predict_at_best_iteration(bst, dtrn)
        # Name the classes explicitly: a fold that happens to miss a class
        # would otherwise make log_loss fail or mis-align the columns.
        class_labels = np.arange(trn_preds.shape[1])
        trn_scores.append(log_loss(y_trn, trn_preds, labels=class_labels))

        vld_preds = _predict_at_best_iteration(bst, dvld)
        vld_scores.append(log_loss(y_vld, vld_preds, labels=class_labels))

        # iter
        best_iter.append(bst.best_iteration)

    # eval
    print('=' * 50)
    print('TRN SCORE :  {}'.format(np.mean(trn_scores)))
    print('TRN SCOREs :  {}'.format(trn_scores))
    print('VLD SCORE :  {}'.format(np.mean(vld_scores)))
    print('VLD SCOREs :  {}'.format(vld_scores))
    print('BEST ITER :  {}'.format(np.mean(best_iter)))
    print('BEST ITERs :  {}'.format(best_iter))

    return bst, np.mean(best_iter)

In [28]:
def get_model():
    """Return the XGBoost training configuration used throughout the notebook.

    Returns
    -------
    xgb_params : dict
        Booster parameters (a fresh dict on every call).
    num_round : int
        Number of boosting rounds requested from xgb.train.
    early_stop : int
        Value passed as early_stopping_rounds.
    """
    # An earlier tuning variant (gamma=3, min_child_weight=5,
    # colsample_bytree=0.8, subsample=0.8) used to be assigned here and was
    # immediately overwritten by the values below; only the effective
    # configuration is kept.
    num_round = 5000
    early_stop = 50
    xgb_params = {
        'booster': 'gbtree',
        'gamma': 1,
        'learning_rate': 0.1,
        'max_depth': 4,
        'min_child_weight': 3,
        'nthread': 12,
        # must equal the number of retained target classes (18 in this notebook)
        'num_class': 18,
        'objective': 'multi:softprob',
        'silent': 1,
        'eval_metric': 'mlogloss',
        'seed': 777,
        }
    return xgb_params, num_round, early_stop

## Submit Result

In [None]:
# full target cols (all 24 products, in their original label order)
all_target_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
                   'ind_cder_fin_ult1', 'ind_cno_fin_ult1',  'ind_ctju_fin_ult1',
                   'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
                   'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
                   'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
                   'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
                   'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
                   'ind_nomina_ult1',   'ind_nom_pens_ult1', 'ind_recibo_ult1']

# Products removed from the target set. These are the positions (0, 1, 3, 14,
# 16, 20) that are absent from `rem_targets` in the data-loading cell.
excluded_target_cols = {'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cder_fin_ult1',
                        'ind_hip_fin_ult1',  'ind_pres_fin_ult1', 'ind_viv_fin_ult1'}

# trimmed target cols: position k is the product for encoded class k, so the
# original ordering must be preserved. Previously the full list was assigned to
# `target_cols` and then silently overwritten by a hand-copied trimmed list.
target_cols = [col for col in all_target_cols if col not in excluded_target_cols]

In [None]:
from datetime import datetime
import os

print('loading data..')
tst = pd.read_csv('../input/test_append_lb_lag_fe_v3.csv')

# NOTE(review): the CV call in this notebook trains on trn.drop(drop_cols, axis=1),
# whereas the final model below is fitted on every column of trn -- confirm
# that this difference is intended.
dtrn = xgb.DMatrix(trn, label=y)
dtst = xgb.DMatrix(tst)

xgb_params = get_model()[0]
# No hold-out set here, so the number of rounds is fixed instead of early-stopped.
# Presumably 171 is the best iteration found by CV, scaled up by 1/0.95 because
# CV fits on 95% of the rows -- TODO confirm.
num_round = 171

print('fitting a model..')
bst = xgb.train(xgb_params, dtrn, num_boost_round=int(num_round / 0.95), verbose_eval=False)

print('making predictions..')
preds = bst.predict(dtst)
# per row: class indices ordered from most to least probable
preds = np.fliplr(np.argsort(preds, axis=1))

print('making submission..')
top_k = 7  # number of recommended products written per customer
final_preds = [' '.join(target_cols[product] for product in pred[:top_k])
               for pred in preds]

test_id = pd.read_csv('../input/test_ver2.csv', usecols=['ncodpers'])['ncodpers']
if len(test_id) != len(final_preds):
    raise ValueError('test ids ({}) and predictions ({}) differ in length'
                     .format(len(test_id), len(final_preds)))
# Fix the column order explicitly; building a frame from a dict alone leaves
# the order dependent on the Python / pandas version.
out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds},
                      columns=['ncodpers', 'added_products'])

# Make sure the target directory exists so the write cannot fail after training.
out_dir = '../output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
out_df.to_csv(os.path.join(out_dir, file_name), index=False)
print('submission file created!')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# feature importance plot: xgboost's built-in importance chart for the booster
# currently bound to `bst`
xgb.plot_importance(bst)

In [None]:
# feature importances, keyed by column name
import operator

fscore = bst.get_fscore()

# A booster trained on a bare array (as `main` does) reports features as
# 'f<position>'; one trained on a DataFrame reports the column names directly.
# The old code assumed the positional form and raised on real column names.
keys_are_positional = bool(fscore) and all(
    key[:1] == 'f' and key[1:].isdigit() for key in fscore)

scores = dict()
if keys_are_positional:
    # The array passed to `main` in this notebook is trn.drop(drop_cols, axis=1),
    # so positions refer to that frame's columns, not to trn.columns.
    model_columns = trn.drop(drop_cols, axis=1).columns
    for k, v in fscore.items():
        scores[model_columns[int(k[1:])]] = v
else:
    scores.update(fscore)

scores_sorted = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
scores_sorted

In [None]:
# feature importances: (feature, score) pairs from the booster, highest score first
import operator

fscore = bst.get_fscore()

scores_sorted = list(fscore.items())
scores_sorted.sort(key=operator.itemgetter(1), reverse=True)
scores_sorted

In [None]:
# select top k features: names of the first `top` entries of the ranked list
top = 290

cols = [name for name, score in scores_sorted[:top]]
len(cols)

### F-score of each feature in the original XGBoost model (lag-5)
[('age', 8211),
 ('renta', 7539),
 ('fecha_alta', 6954),
 ('antiguedad', 6124),
 ('cod_prov', 3387),
 ('nomprov', 2931),
 ('canal_entrada', 2743),
 ('fecha_alta_lag_fiv', 1974),
 ('age_lag_fiv', 1932),
 ('renta_lag_fiv', 1672),
 ('ind_cco_fin_ult1_lag_one', 1473),
 ('ind_nom_pens_ult1_lag_one', 1461),
 ('ind_recibo_ult1_lag_one', 1346),
 ('ind_cno_fin_ult1_lag_one', 1310),
 ('segmento', 1269),
 ('cod_prov_lag_fiv', 1245),
 ('age_lag_fou', 1130),
 ('antiguedad_lag_fiv', 1128),
 ('sexo', 1100),
 ('age_lag_one', 1078),
 ('tiprel_1mes', 1055),
 ('ind_nomina_ult1_lag_one', 1001),
 ('ind_cno_fin_ult1_lag_two', 941),
 ('ind_ecue_fin_ult1_lag_one', 935),
 ('ind_dela_fin_ult1_lag_one', 924),
 ('fecha_alta_lag_one', 918),
 ('ind_nom_pens_ult1_lag_two', 906),
 ('renta_lag_fou', 882),
 ('fecha_alta_lag_fou', 880),
 ('ind_recibo_ult1_lag_two', 854),
 ('fecha_alta_lag_thr', 790),
 ('ind_tjcr_fin_ult1_lag_one', 784),
 ('canal_entrada_lag_fiv', 774),
 ('ind_tjcr_fin_ult1_lag_two', 774),
 ('ind_recibo_ult1_lag_thr', 737),
 ('renta_lag_thr', 693),
 ('antiguedad_lag_fou', 661),
 ('age_lag_thr', 657),
 ('ind_recibo_ult1_lag_fiv', 641),
 ('renta_lag_one', 629),
 ('ind_nomina_ult1_lag_two', 622),
 ('ind_tjcr_fin_ult1_lag_thr', 609),
 ('nomprov_lag_fiv', 607),
 ('ind_valo_fin_ult1_lag_one', 579),
 ('ind_reca_fin_ult1_lag_one', 578),
 ('indext', 573),
 ('ind_recibo_ult1_lag_fou', 558),
 ('ind_nom_pens_ult1_lag_thr', 549),
 ('ind_tjcr_fin_ult1_lag_fiv', 549),
 ('age_lag_two', 538),
 ('ind_cco_fin_ult1_lag_two', 532),
 ('fecha_alta_lag_two', 527),
 ('ind_ctop_fin_ult1_lag_one', 505),
 ('ind_actividad_cliente', 498),
 ('ind_nom_pens_ult1_lag_fiv', 469),
 ('ind_ctpp_fin_ult1_lag_one', 457),
 ('ind_cco_fin_ult1_lag_thr', 452),
 ('cod_prov_lag_fou', 448),
 ('ind_tjcr_fin_ult1_lag_fou', 436),
 ('ind_fond_fin_ult1_lag_one', 432),
 ('ind_dela_fin_ult1_lag_two', 410),
 ('renta_lag_two', 405),
 ('ind_cno_fin_ult1_lag_thr', 403),
 ('nomprov_lag_fou', 402),
 ('ind_ctma_fin_ult1_lag_two', 397),
 ('ind_valo_fin_ult1_lag_fiv', 397),
 ('ind_dela_fin_ult1_lag_fiv', 395),
 ('ind_cco_fin_ult1_lag_fiv', 392),
 ('antiguedad_lag_thr', 392),
 ('ind_ctop_fin_ult1_lag_two', 391),
 ('ind_cno_fin_ult1_lag_fiv', 389),
 ('ind_cco_fin_ult1_lag_fou', 379),
 ('ind_nom_pens_ult1_lag_fou', 372),
 ('ind_deco_fin_ult1_lag_one', 364),
 ('ind_ctma_fin_ult1_lag_one', 362),
 ('ind_ecue_fin_ult1_lag_fiv', 362),
 ('canal_entrada_lag_fou', 358),
 ('cod_prov_lag_thr', 353),
 ('ind_cno_fin_ult1_lag_fou', 352),
 ('segmento_lag_fiv', 319),
 ('ind_ecue_fin_ult1_lag_two', 314),
 ('ind_nomina_ult1_lag_fiv', 313),
 ('ind_ecue_fin_ult1_lag_thr', 308),
 ('ind_nomina_ult1_lag_fou', 305),
 ('cod_prov_lag_one', 302),
 ('ind_valo_fin_ult1_lag_fou', 299),
 ('nomprov_lag_one', 295),
 ('ind_ecue_fin_ult1_lag_fou', 290),
 ('ind_dela_fin_ult1_lag_thr', 290),
 ('ind_ctma_fin_ult1_lag_thr', 278),
 ('ind_deco_fin_ult1_lag_thr', 268),
 ('ind_nomina_ult1_lag_thr', 267),
 ('ind_ctju_fin_ult1_lag_one', 264),
 ('canal_entrada_lag_thr', 264),
 ('ind_fond_fin_ult1_lag_fiv', 260),
 ('ind_ctpp_fin_ult1_lag_two', 258),
 ('nomprov_lag_two', 243),
 ('antiguedad_lag_one', 237),
 ('ind_fond_fin_ult1_lag_fou', 236),
 ('ind_ctop_fin_ult1_lag_fiv', 236),
 ('nomprov_lag_thr', 227),
 ('sexo_lag_fiv', 223),
 ('ind_ctpp_fin_ult1_lag_fiv', 219),
 ('ind_dela_fin_ult1_lag_fou', 219),
 ('ind_deme_fin_ult1_lag_one', 214),
 ('cod_prov_lag_two', 206),
 ('antiguedad_lag_two', 204),
 ('ind_deme_fin_ult1_lag_fiv', 199),
 ('ind_ctpp_fin_ult1_lag_thr', 198),
 ('ind_ctop_fin_ult1_lag_thr', 196),
 ('ind_valo_fin_ult1_lag_thr', 193),
 ('ind_deco_fin_ult1_lag_two', 189),
 ('ind_reca_fin_ult1_lag_fiv', 186),
 ('ind_hip_fin_ult1_lag_one', 182),
 ('ind_valo_fin_ult1_lag_two', 176),
 ('ind_actividad_cliente_lag_thr', 172),
 ('tiprel_1mes_lag_one', 171),
 ('ind_reca_fin_ult1_lag_two', 168),
 ('ult_fec_cli_1t_lag_one', 168),
 ('canal_entrada_lag_two', 154),
 ('ind_ctma_fin_ult1_lag_fiv', 153),
 ('ind_actividad_cliente_lag_fiv', 145),
 ('ind_deco_fin_ult1_lag_fou', 142),
 ('ind_ctop_fin_ult1_lag_fou', 139),
 ('ind_fond_fin_ult1_lag_two', 135),
 ('canal_entrada_lag_one', 131),
 ('ind_reca_fin_ult1_lag_thr', 129),
 ('segmento_lag_fou', 128),
 ('sexo_lag_one', 126),
 ('ind_ctma_fin_ult1_lag_fou', 125),
 ('segmento_lag_one', 125),
 ('indext_lag_fiv', 113),
 ('ind_fond_fin_ult1_lag_thr', 112),
 ('tiprel_1mes_lag_fou', 106),
 ('segmento_lag_thr', 103),
 ('ind_plan_fin_ult1_lag_fou', 96),
 ('ind_plan_fin_ult1_lag_one', 88),
 ('ind_pres_fin_ult1_lag_one', 78),
 ('ind_viv_fin_ult1_lag_thr', 73),
 ('ind_ctju_fin_ult1_lag_fou', 71),
 ('ind_plan_fin_ult1_lag_thr', 71),
 ('ind_viv_fin_ult1_lag_one', 70),
 ('sexo_lag_thr', 69),
 ('segmento_lag_two', 67),
 ('ult_fec_cli_1t_lag_thr', 66),
 ('ind_ctpp_fin_ult1_lag_fou', 61),
 ('sexo_lag_fou', 60),
 ('ind_actividad_cliente_lag_one', 56),
 ('pais_residencia', 52),
 ('ind_ctju_fin_ult1_lag_fiv', 47),
 ('ind_hip_fin_ult1_lag_fiv', 47),
 ('tiprel_1mes_lag_thr', 45),
 ('indext_lag_one', 43),
 ('ind_deme_fin_ult1_lag_thr', 40),
 ('ind_plan_fin_ult1_lag_fiv', 40),
 ('ind_hip_fin_ult1_lag_two', 37),
 ('pais_residencia_lag_fiv', 37),
 ('ind_actividad_cliente_lag_fou', 36),
 ('ind_pres_fin_ult1_lag_fiv', 35),
 ('ind_ctju_fin_ult1_lag_two', 31),
 ('indext_lag_two', 30),
 ('sexo_lag_two', 30),
 ('tiprel_1mes_lag_fiv', 29),
 ('indext_lag_fou', 27),
 ('ind_actividad_cliente_lag_two', 26),
 ('ind_pres_fin_ult1_lag_two', 26),
 ('ind_deme_fin_ult1_lag_fou', 26),
 ('ind_cder_fin_ult1_lag_thr', 24),
 ('ind_cder_fin_ult1_lag_two', 24),
 ('ult_fec_cli_1t_lag_two', 24),
 ('ind_viv_fin_ult1_lag_two', 24),
 ('ind_hip_fin_ult1_lag_fou', 23),
 ('ind_cder_fin_ult1_lag_one', 18),
 ('tiprel_1mes_lag_two', 17),
 ('indfall', 16),
 ('ind_reca_fin_ult1_lag_fou', 15),
 ('ind_deme_fin_ult1_lag_two', 15),
 ('ind_empleado', 14),
 ('ind_plan_fin_ult1_lag_two', 13),
 ('ind_ctju_fin_ult1_lag_thr', 13),
 ('ind_cder_fin_ult1_lag_fou', 12),
 ('ult_fec_cli_1t_lag_fou', 12),
 ('indext_lag_thr', 12),
 ('ind_deco_fin_ult1_lag_fiv', 11),
 ('ind_cder_fin_ult1_lag_fiv', 10),
 ('indresi', 9),
 ('ult_fec_cli_1t', 7),
 ('ind_pres_fin_ult1_lag_fou', 6),
 ('ind_viv_fin_ult1_lag_fiv', 5),
 ('indrel', 5),
 ('ind_empleado_lag_fiv', 4),
 ('ind_viv_fin_ult1_lag_fou', 4),
 ('ind_pres_fin_ult1_lag_thr', 3),
 ('indrel_lag_fou', 3),
 ('pais_residencia_lag_one', 3),
 ('ind_empleado_lag_one', 2),
 ('ind_empleado_lag_fou', 1),
 ('ind_hip_fin_ult1_lag_thr', 1)]

In [25]:
corr_mat = trn.corr()
# Take the names from the correlation matrix itself: it is indexed by position
# below, and would be misaligned with trn.columns if corr() left any column out.
cols = corr_mat.columns.values.tolist()
corr_values = corr_mat.values
threshold = 0.9

# NOTE(review): only positive correlations above the threshold are considered;
# strongly negative pairs are kept -- confirm that this is intended.
with np.errstate(invalid='ignore'):
    # NaN entries compare as False and therefore never trigger a drop
    too_similar = corr_values > threshold

drop_cols = []
already_dropped = set()  # O(1) membership instead of scanning drop_cols
for i in range(1, len(cols)):
    # For every earlier column j < i that is too similar to column i,
    # drop the earlier one (each column is recorded at most once).
    for j in np.flatnonzero(too_similar[i, :i]):
        j_col = cols[j]
        if j_col not in already_dropped:
            already_dropped.add(j_col)
            drop_cols.append(j_col)

print(len(drop_cols))