In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import fmin, tpe, hp, Trials

from metrics import average_precision_score, norm_disc_cum_gain_score

In [None]:
def cosine_similarity(x, z):
    """Row-wise cosine similarity between two 2-D arrays.

    Parameters
    ----------
    x, z : array-like of shape (n, d)
        Row ``i`` of ``x`` is compared with row ``i`` of ``z``.

    Returns
    -------
    np.ndarray of shape (n,)
        ``sum(x[i] * z[i]) / ||x[i]|| / ||z[i]||`` for every row.  Rows in
        which either vector has zero norm get 0.0 instead of the NaN/inf
        (and RuntimeWarning) that a plain division would produce.
    """
    x = np.asarray(x)
    z = np.asarray(z)
    x_norm = np.linalg.norm(x, axis=1)
    z_norm = np.linalg.norm(z, axis=1)
    dots = np.sum(x * z, axis=1)

    # `!= 0` (rather than `> 0`) keeps NaN norms in the "divide" set, so NaN
    # inputs still propagate as NaN instead of being silently turned into 0.
    valid = (x_norm != 0) & (z_norm != 0)
    sim = np.zeros(dots.shape, dtype=np.result_type(dots, x_norm, z_norm))
    # Divide in two steps, as the plain expression would, and only where safe.
    np.divide(dots, x_norm, out=sim, where=valid)
    np.divide(sim, z_norm, out=sim, where=valid)
    return sim

def create_features(x):
    """Append pairwise interaction features to concatenated embedding pairs.

    Every row of ``x`` is split at the midpoint of its last axis into two
    halves (presumably a query embedding followed by a passage embedding,
    going by the original local names -- confirm against the data files).

    Returns a 2-D array made of, in column order: the original ``x``, the
    element-wise absolute difference of the halves, their element-wise
    product, and one column holding their cosine similarity.
    """
    half = x.shape[-1] // 2
    first_half = x[:, :half]
    second_half = x[:, half:]

    similarity_column = cosine_similarity(first_half, second_half).reshape(-1, 1)
    interaction_blocks = (
        np.abs(first_half - second_half),
        first_half * second_half,
        similarity_column,
    )
    return np.hstack((x,) + interaction_blocks)

def get_groups(qids):
    """Return the size of each run of consecutive identical query ids.

    Rows belonging to one query must be contiguous (sorted qids satisfy
    this).  Sizes are returned in row order, which is the order a ranking
    DMatrix expects its group sizes in; for ascending-sorted qids the result
    is the same as counting occurrences per id.

    Parameters
    ----------
    qids : array-like of shape (n,)
        Query id of every row; values are cast to ``int``.

    Returns
    -------
    np.ndarray of int
        One entry per query; entries sum to ``n``.  Empty input gives an
        empty array.
    """
    qids = np.asarray(qids).astype(int)
    if qids.size == 0:
        return np.zeros(0, dtype=int)
    # Positions where the id differs from the previous row start a new group.
    starts = np.flatnonzero(qids[1:] != qids[:-1]) + 1
    edges = np.concatenate(([0], starts, [qids.size]))
    return np.diff(edges)

def get_dmatrix(file_name):
    """Load a ranking dataset from an ``.npz`` file into a grouped DMatrix.

    The archive's ``'arr_0'`` array is read as a 2-D table in which column 0
    holds the query id, column 2 the label and columns 3 onward the raw
    features (column 1 is not used here).  Features are expanded with
    ``create_features`` and rows are grouped per query via ``get_groups``,
    which expects the rows of each query to be contiguous.

    Parameters
    ----------
    file_name : str or path-like
        Path of the ``.npz`` archive.

    Returns
    -------
    xgb.DMatrix
        Feature matrix with integer labels and query groups set.
    """
    # np.load on an .npz returns an archive object holding an open file;
    # the context manager makes sure it is closed once the array is read.
    with np.load(file_name) as archive:
        data = archive['arr_0']

    # Read ids and labels from the stored values *before* the float32
    # downcast: float32 cannot represent every integer above 2**24, so
    # casting first could merge neighbouring query ids.
    qids = data[:, 0].astype(int)
    y = data[:, 2].astype(int)
    x = data[:, 3:].astype(np.float32)
    del data  # drop the raw table early to keep peak memory down

    x = create_features(x)
    groups = get_groups(qids)
    del qids

    dmatrix = xgb.DMatrix(x, label=y)
    del x, y
    dmatrix.set_group(groups)
    return dmatrix

In [None]:
# Build the training and validation matrices used by the search below.
train_path = './data/train_data.npz'
val_path = './data/val_data_small.npz'
dtrain = get_dmatrix(train_path)
dval = get_dmatrix(val_path)

In [None]:
# Settings that stay fixed for every trial.
params = {
    'tree_method': 'gpu_hist', # gpu_hist if gpu else hist
    'objective': 'rank:ndcg',
    'eval_metric': 'ndcg@100',
}
# Tunable hyper-parameters start out as None placeholders and are filled in
# later; the key order matches the order in which they are listed here.
params.update(dict.fromkeys((
    'learning_rate',
    'gamma',
    'colsample_bytree',
    'subsample',
    'reg_alpha',
    'reg_lambda',
    'min_child_weight',
    'max_depth',
)))
# Parameters whose values are converted with int() before use.
int_params = {'min_child_weight', 'max_depth'}
def objective(space):
    """Hyperopt objective: train on ``dtrain`` and score on ``dval``.

    Parameters
    ----------
    space : dict
        Sampled hyper-parameters keyed by XGBoost parameter name.  Keys
        listed in ``int_params`` are converted with ``int()``.

    Returns
    -------
    float
        The negated best ``ndcg@100`` recorded on the validation set (the
        value is negated because ``fmin`` minimises).
    """
    # Work on a per-trial copy so sampled values never leak into the
    # module-level ``params`` dict between trials.
    trial_params = dict(params)
    for key, value in space.items():
        trial_params[key] = int(value) if key in int_params else value

    results_dict = dict()
    evallist = [(dval, 'eval')]
    xgb.train(trial_params, dtrain,
              evals=evallist,
              evals_result=results_dict,
              verbose_eval=False,
              num_boost_round=1000,
              early_stopping_rounds=10,
              )
    # When early stopping triggers, the last recorded score comes from a
    # round that failed to improve, so report the best score in the history
    # rather than the final entry.
    return -max(results_dict['eval']['ndcg@100'])

# Continuous hyper-parameters: name -> (low, high) for hp.uniform.
uniform_bounds = {
    'learning_rate': (0.001, 0.1),
    'gamma': (1, 3),
    'colsample_bytree': (0.5, 1),
    'subsample': (0.5, 1),
    'reg_alpha': (0, 1),
    'reg_lambda': (1, 4.5),
}
# Stepped hyper-parameters: name -> (low, high) for hp.quniform with step 1.
stepped_bounds = {
    'min_child_weight': (0, 10),
    'max_depth': (2, 20),
}

space = {
    name: hp.uniform(name, low, high)
    for name, (low, high) in uniform_bounds.items()
}
space.update({
    name: hp.quniform(name, low, high, 1)
    for name, (low, high) in stepped_bounds.items()
})

# Run 300 TPE evaluations of `objective`, keeping the history in `trials`.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=300,
    trials=trials,
)

# Copy the winning values into `params`, casting the integer-valued ones.
params.update({
    name: int(found) if name in int_params else found
    for name, found in best.items()
})

print()
print('Best Parameters:')
print(params)
# params = {
#     'tree_method': 'gpu_hist',
#     'objective': 'rank:ndcg',
#     'eval_metric': 'ndcg@100',
#     'learning_rate': 0.08234410288432056,
#     'gamma': 1.2658850952270655,
#     'colsample_bytree': 0.8664018282251827,
#     'subsample': 0.7068097095668904,
#     'reg_alpha': 0.9980762017154667,
#     'reg_lambda': 2.0879446922830236,
#     'min_child_weight': 9,
#     'max_depth': 12
# }

In [None]:
# Rebuild the training matrix for the final fit, rebinding `dtrain`.
# NOTE(review): this reads from '../input/irdm-data/' whereas the earlier
# load used './data/' -- confirm both locations hold the same training file.
dtrain = get_dmatrix('../input/irdm-data/train_data.npz')

In [None]:
# The validation matrix is not used by the final fit below, so release it.
# Note: this makes the cell non re-runnable without re-creating `dval`.
del dval
# Final model: a fixed 100 boosting rounds with no eval set or early stopping.
# NOTE(review): the search ran up to 1000 rounds with early stopping, so 100
# rounds may not match the setting the tuned score was measured at -- confirm.
model = xgb.train(params, dtrain, num_boost_round=100)
# The training matrix is no longer needed once the booster exists.
del dtrain

In [None]:
# Save Results
# Score every query stored in the archive, write its top-100 ranking to disk
# and collect per-query AP and nDCG@100.
ndcg, ap = [], []
# NOTE(review): the output file is named 'LR.txt' while every line is tagged
# 'LM' -- confirm the file name is intended.
# Both the .npz archive and the output file are closed when the block exits.
with np.load('../input/irdm-data/val_data.npz') as test_data, open('LR.txt', 'w') as f:
    keys = list(test_data.keys())
    qids = [int(key) for key in keys]
    # Look arrays up by their original key; the int form is only for output.
    for key, qid in zip(keys, qids):
        data = test_data[key]
        # Column 0: passage id, column 1: relevance, remaining columns: features.
        # Ids are cast to int so they are written as e.g. '123', not '123.0'.
        pids, rels, x = data[:,0].astype(int), data[:,1], data[:,2:]
        dtest = xgb.DMatrix(create_features(x))
        scores = model.predict(dtest)
        # Sort by descending score.
        idxs = np.argsort(-scores)
        pids, scores, rels = pids[idxs], scores[idxs], rels[idxs]
        ap.append(average_precision_score(rels))
        ndcg.append(norm_disc_cum_gain_score(rels, k=100))
        # Write at most the 100 best-scored passages, ranks starting at 1.
        for i in range(scores[:100].size):
            rank = i+1
            f.write(f'{qid} A1 {pids[i]} {rank} {scores[i]} LM\n')

print(f'Mean AP: {sum(ap) / len(ap)}')  # 0.03648974746107205
print(f'Mean nDCG: {sum(ndcg) / len(ndcg)}')  # 0.09014276245733374