In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

np.random.seed(2018)

In [None]:
def apk(actual, predicted, k=7, default=0.0):
    # MAP@7 이므로, 최대 7개만 사용한다
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # 점수를 부여하는 조건은 다음과 같다 :
        # 예측값이 정답에 있고 (‘p in actual’)
        # 예측값이 중복이 아니면 (‘p not in predicted[:i]’) 
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    # 정답값이 공백일 경우, 무조건 1.0점을 반환한다
    if not actual:
        return default

    # 정답의 개수(len(actual))로 average precision을 구한다
    return score / min(len(actual), k)

def mapk(actual, predicted, k=7, default=0.0):
    # list of list인 정답값(actual)과 예측값(predicted)에서 고객별 Average Precision을 구하고, np.mean()을 통해 평균을 계산한다
    return np.mean([apk(a, p, k, default) for a, p in zip(actual, predicted)])

In [None]:
trn = pd.read_csv('data/train_ver2.csv')
tst = pd.read_csv('data/test_ver2.csv')

prods = trn.columns[24:].tolist()

# fillna for target values
trn[prods] = trn[prods].fillna(0.0).astype(np.int8)

# remove no posessions
no_product = trn[prods].sum(axis=1) == 0
trn = trn[~no_product]

# merge train and test
for col in trn.columns[24:]:
    tst[col] = 0
df = pd.concat([trn, tst], axis=0)

# features to use in learning
features = []

# preprocessing
# categorical
categorical_cols = ['ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 
                'canal_entrada', 'indfall', 'tipodom', 'nomprov', 'segmento']
for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols

# numerical 
df['age'].replace(' NA', -99, inplace=True)
df['age'] = df['age'].astype(np.int8)

df['antiguedad'].replace('     NA', -99, inplace=True)
df['antiguedad'] = df['antiguedad'].astype(np.int8)

df['renta'].replace('         NA', -99, inplace=True)
df['renta'].fillna(-99, inplace=True)
df['renta'] = df['renta'].astype(float).astype(np.int8)

df['indrel_1mes'].replace('P', 5, inplace=True)
df['indrel_1mes'].fillna(-99, inplace=True)
df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)

features += ['age','antiguedad','renta','ind_nuevo','indrel','indrel_1mes','ind_actividad_cliente']

## feature engineering - date variables
df['fecha_alta_month'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['fecha_alta_year'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['fecha_alta_month', 'fecha_alta_year']

df['ult_fec_cli_1t_month'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['ult_fec_cli_1t_year'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['ult_fec_cli_1t_month', 'ult_fec_cli_1t_year']

df.fillna(-99, inplace=True)

## feature engineering - lag data
def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split("-")] # "2016-05-28"
    int_date = (int(Y) - 2015) * 12 + int(M)
    return int_date

df['int_date'] = df['fecha_dato'].map(date_to_int).astype(np.int8)
df_lag = df.copy()
df_lag.columns = [col + '_prev' if col not in ['ncodpers', 'int_date'] else col for col in df.columns ]
df_lag['int_date'] += 1
df_trn = df.merge(df_lag, on=['ncodpers','int_date'], how='left')
del df, df_lag

# fillna for first lag products NaN into 0
for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)
    
df_trn.fillna(-99, inplace=True)
features += [feature + '_prev' for feature in features]
features += [prod + '_prev' for prod in prods]

###
### more feature engineering here later
###

# prepare for model training
trn = df_trn[~df_trn['fecha_dato'].isin(['2016-05-28', '2016-06-28'])]
tst = df_trn[df_trn['fecha_dato'] == '2016-05-28']
submit = df_trn[df_trn['fecha_dato'] == '2016-06-28']

del df_trn

# get purchase only
X = []
Y = []
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = trn[(trn[prod] == 1) & (trn[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)
XY = pd.concat(X)
Y = np.hstack(Y)
XY['y'] = Y

# train - valid split
mask = np.random.rand(len(XY)) < 0.8
XY_trn = XY[mask]
XY_vld = XY[~mask]

# preparing for evaluation/submission
ncodpers_tst = tst.as_matrix(columns=['ncodpers'])
ncodpers_submit = submit.as_matrix(columns=['ncodpers'])

for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    tst[padd] = tst[prod] - tst[prev]
    
add_tst = tst.as_matrix(columns=[prod + '_add' for prod in prods])
add_tst_list = [list() for i in range(len(ncodpers_tst))]
count_tst = 0
for ncodper in range(len(ncodpers_tst)):
    for prod in range(len(prods)):
        if add_tst[ncodper, prod] > 0:
            add_tst_list[ncodper].append(prod)
            count_tst += 1

# best score possible in test set (0.042663)
print(mapk(add_tst_list, add_tst_list, 7, 0.0))


# model training
import xgboost as xgb

param = {
    'booster': 'gbtree',
    'max_depth': 8,
    'nthread': 24,
    'num_class': len(prods),
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2018,
    }

X_trn = XY_trn.as_matrix(columns=features)
Y_trn = XY_trn.as_matrix(columns=['y'])
dtrn = xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)

X_vld = XY_vld.as_matrix(columns=features)
Y_vld = XY_vld.as_matrix(columns=['y'])
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)

# TRAIN! - hold out validation
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)

import pickle
pickle.dump(model, open("model/baseline.trn-vld.pkl", "wb"))
best_ntree_limit = model.best_ntree_limit

# train full model for submission
X = XY.as_matrix(columns=features)
Y = XY.as_matrix(columns=['y'])
dX = xgb.DMatrix(X, label=Y, feature_names=features)

best_ntree_limit = int(best_ntree_limit * (len(XY_trn) + len(XY_vld)) / len(XY_trn))
evallist = [(dX, 'trnvld')]
# TRAIN with all data (train + valid)
model = xgb.train(param, dX, best_ntree_limit, evals=evallist)

print("Feature importance:")
for kv in sorted([(k,v) for k,v in model.get_fscore().items()], key=lambda kv: kv[1], reverse=True):
    print(kv)
    
# evaluation on test set
X_test = tst.as_matrix(columns=features)
dtest = xgb.DMatrix(X_test, feature_names=features)
preds = model.predict(dtest, ntree_limit=best_ntree_limit)

# subtract prev1 because possessed item cannot be bought
preds = preds - tst.as_matrix(columns=[prod + '_prev' for prod in prods])

result = []
for ncodper, pred in zip(ncodpers_tst, preds):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    result.append([ip for y,p,ip in y_prods])
    
# actual score in test set (0.042663)
print(mapk(add_tst_list, result, 7, 0.0))


# Submission
X_submit = submit.as_matrix(columns=features)
dsubmit = xgb.DMatrix(X_submit, feature_names=features)
preds_submit = model.predict(dsubmit, ntree_limit=best_ntree_limit)

# subtract prev1 because possessed item cannot be bought
preds_submit = preds_submit - submit.as_matrix(columns=[prod + '_prev' for prod in prods])

# write submission file
submit_file = open('output/baseline.xgb', 'w')
submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_submit, preds_submit):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y,p,ip in y_prods]
    submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))

In [None]:
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)

[0]	train-mlogloss:2.7633	eval-mlogloss:2.76435
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:2.56815	eval-mlogloss:2.56944
[2]	train-mlogloss:2.42815	eval-mlogloss:2.42978
[3]	train-mlogloss:2.31961	eval-mlogloss:2.32158
[4]	train-mlogloss:2.22782	eval-mlogloss:2.23006
[5]	train-mlogloss:2.15092	eval-mlogloss:2.15344
[6]	train-mlogloss:2.08519	eval-mlogloss:2.08796
[7]	train-mlogloss:2.02871	eval-mlogloss:2.03168
[8]	train-mlogloss:1.97987	eval-mlogloss:1.98308
[9]	train-mlogloss:1.93581	eval-mlogloss:1.93921
[10]	train-mlogloss:1.89617	eval-mlogloss:1.89976
[11]	train-mlogloss:1.86105	eval-mlogloss:1.86484
[12]	train-mlogloss:1.82875	eval-mlogloss:1.83278
[13]	train-mlogloss:1.80003	eval-mlogloss:1.80428
[14]	train-mlogloss:1.7743	eval-mlogloss:1.77871
[15]	train-mlogloss:1.75042	eval-mlogloss:1.75503
[16]	train-mlogloss:1.72864	eval-mlogloss:1.73345
[17]	train-

[161]	train-mlogloss:1.42749	eval-mlogloss:1.45415
[162]	train-mlogloss:1.42727	eval-mlogloss:1.45409
[163]	train-mlogloss:1.42711	eval-mlogloss:1.45407
[164]	train-mlogloss:1.42689	eval-mlogloss:1.45404
[165]	train-mlogloss:1.42663	eval-mlogloss:1.45398
[166]	train-mlogloss:1.42641	eval-mlogloss:1.45392
[167]	train-mlogloss:1.42623	eval-mlogloss:1.45389
[168]	train-mlogloss:1.42601	eval-mlogloss:1.45382
[169]	train-mlogloss:1.42577	eval-mlogloss:1.45376
[170]	train-mlogloss:1.42554	eval-mlogloss:1.45371
[171]	train-mlogloss:1.42538	eval-mlogloss:1.45367
[172]	train-mlogloss:1.42519	eval-mlogloss:1.45364
[173]	train-mlogloss:1.42501	eval-mlogloss:1.45362
[174]	train-mlogloss:1.42477	eval-mlogloss:1.45357
[175]	train-mlogloss:1.4246	eval-mlogloss:1.45355
[176]	train-mlogloss:1.42443	eval-mlogloss:1.45351
[177]	train-mlogloss:1.42424	eval-mlogloss:1.45348
[178]	train-mlogloss:1.42404	eval-mlogloss:1.45346
[179]	train-mlogloss:1.42383	eval-mlogloss:1.45341
[180]	train-mlogloss:1.42363	eva

In [None]:
import pickle
pickle.dump(model, open("next_multi.pickle", "wb"))

In [None]:
best_ntree_limit = model.best_ntree_limit

In [None]:
best_ntree_limit

In [None]:
X = XY.as_matrix(columns=features)
Y = XY.as_matrix(columns=['y'])
dX = xgb.DMatrix(X, label=Y, feature_names=features)

In [None]:
best_ntree_limit = int(best_ntree_limit * (len(XY_trn) + len(XY_vld)) / len(XY_trn))

In [None]:
evallist = [(dX, 'trnvld')]
model = xgb.train(param, dX, best_ntree_limit, evals=evallist)

In [None]:
print("Feature importance:")
for kv in sorted([(k,v) for k,v in model.get_fscore().items()], key=lambda kv: kv[1], reverse=True):
    print(kv)

In [None]:
X_test = tst.as_matrix(columns=features)
dtest = xgb.DMatrix(X_test, feature_names=features)

preds = model.predict(dtest, ntree_limit=best_ntree_limit)

In [None]:
preds

In [None]:
# subtract prev1 because possessed item cannot be bought
preds = preds - tst.as_matrix(columns=[prod + '_prev' for prod in prods])

In [None]:
ncodpers_tst.shape

In [None]:
result = []

for ncodper, pred in zip(ncodpers_tst, preds):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    result.append([ip for y,p,ip in y_prods])

In [None]:
mapk(add_tst_list, result, 7, 0.0)

In [None]:
X_submit = submit.as_matrix(columns=features)
dsubmit = xgb.DMatrix(X_submit, feature_names=features)

preds_submit = model.predict(dsubmit, ntree_limit=best_ntree_limit)

In [None]:
# subtract prev1 because possessed item cannot be bought
preds_submit = preds_submit - submit.as_matrix(columns=[prod + '_prev' for prod in prods])

In [None]:
ncodpers_submit[929561]

In [None]:
# write submission file
submit_file = open('output/baseline.xgb', 'w')
submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_submit, preds_submit):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y,p,ip in y_prods]
    submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))

- private leaderboard 0.025018
- rank : 1001