In [1]:
import numpy as np
from xgboost import XGBRegressor

In [2]:
# Parse the training file. Each line is read as
#   <label> qid:<query id> <feature id>:<value> ...
# and turned into one dense row of 699 + 1 values.
X_train = []
y_train = []
with open('l2r/train.txt', 'r') as file:
    for line in file:
        buf = line.split()
        y_train.append(float(buf[0]))

        # Column 0 holds the qid, columns 1..699 hold the features (+1 for qid).
        # Features that do not appear on the line keep the sentinel value -9.0.
        x = np.full(699 + 1, -9.0)
        # NOTE(review): the slice skips the LAST token of every line. If the file
        # has no trailing non-feature token, this silently drops the final
        # feature of each row -- confirm against the data format.
        for elem in buf[1:-1]:
            key, value = elem.split(':')
            column = 0 if key == 'qid' else int(key)
            x[column] = float(value)
        X_train.append(x)

X_train = np.array(X_train)
y_train = np.array(y_train)

In [3]:
class Data:
    """Ranking dataset split into features, labels and query ids.

    Parameters
    ----------
    X : np.ndarray
        2-D array whose column 0 holds the query id (qid) of each row and whose
        remaining columns hold the features.
    y : np.ndarray
        Per-row labels, stored as given.
    """

    def __init__(self, X, y):
        self.X = X[:, 1:]  # features only, without the qid column
        self.y = y
        self.qid = X[:, 0]
        self.qid_unique = np.unique(self.qid)
        # Filled by _qid_unique_index(): entry i holds the row indices whose
        # qid equals qid_unique[i].
        self.qid_unique_index = []

    def _qid_unique_index(self):
        """Build ``qid_unique_index``, one array of row indices per unique qid.

        The list is rebuilt from scratch on every call, so calling this method
        more than once does not duplicate the groups.
        """
        self.qid_unique_index = [
            np.where(self.qid == query_id)[0] for query_id in self.qid_unique
        ]

In [4]:
# Wrap the training matrix: Data separates the qid column from the features,
# and _qid_unique_index() fills data.qid_unique_index with the row indices of
# every query (it must be called before the per-query groups are used).
data = Data(X_train, y_train)
data._qid_unique_index()

In [5]:
def dcg(y_predict, y):
    """Discounted cumulative gain of labels ``y`` ranked by ``y_predict``.

    Documents are ordered by descending predicted score. The document at
    position k (1-based) contributes ``(2**label - 1) / ln(k + 1)``; the
    discount uses the natural logarithm.

    Parameters
    ----------
    y_predict : np.ndarray
        1-D predicted scores.
    y : np.ndarray
        1-D labels aligned with ``y_predict``.

    Returns
    -------
    float
        The summed discounted gain.
    """
    order = np.argsort(y_predict)[::-1]
    gains = np.power(2.0, y[order]) - 1
    positions = np.arange(1, y.shape[0] + 1)
    discounts = np.log(positions + 1.0)
    return np.sum(gains / discounts)

def maxdcg(y):
    """Ideal DCG: the DCG obtained when labels are sorted in descending order.

    Uses the gain ``2**label - 1`` and the natural-log discount ``ln(k + 1)``
    for the 1-based position k.

    Parameters
    ----------
    y : np.ndarray
        1-D labels of one query.

    Returns
    -------
    float
        The largest DCG achievable for these labels.
    """
    n = y.shape[0]
    ideal_order = np.sort(y[:n])[::-1]
    gains = np.power(2., ideal_order[:n]) - 1.
    discounts = np.log(np.arange(1, n + 1) + 1)
    return np.sum(gains / discounts)

def ndcg(y_predict, y):
    """Normalized DCG: ``dcg(y_predict, y)`` divided by the ideal ``maxdcg(y)``.

    Parameters
    ----------
    y_predict : np.ndarray
        1-D predicted scores.
    y : np.ndarray
        1-D labels aligned with ``y_predict``.

    Returns
    -------
    float
        DCG divided by ideal DCG, or ``0.`` when the ideal DCG is (close to)
        zero, which avoids a division by zero.
    """
    ideal = maxdcg(y)
    if np.isclose(ideal, 0.):
        return 0.
    return dcg(y_predict, y) / ideal

def delta_z_func(y_predict, y):
    """Pairwise |delta NDCG| from swapping the ranks of two documents.

    For documents i and j the entry ``[i, j]`` is

        |(2**y_i - 2**y_j) * (1/ln(1 + r_i) - 1/ln(1 + r_j))| / maxdcg(y)

    where ``r_i`` is the 1-based rank of document i when the documents are
    sorted by descending ``y_predict``.

    Parameters
    ----------
    y_predict : np.ndarray
        1-D predicted scores of one query.
    y : np.ndarray
        1-D labels aligned with ``y_predict``.

    Returns
    -------
    np.ndarray or float
        ``(n, n)`` matrix of swap deltas, or the scalar ``0.`` when the ideal
        DCG is (close to) zero.
    """
    max_dcg = maxdcg(y)
    if np.isclose(max_dcg, 0.):
        return 0.

    n = y.shape[0]
    # argsort gives "which document sits at each position"; invert that
    # permutation to get "which position each document sits at".
    order = np.argsort(y_predict)[::-1]
    ranks = np.empty(n)
    ranks[order] = np.arange(1, n + 1)

    gain = np.power(2., y)
    discount = 1. / np.log(ranks + 1.)
    gain_diff = gain[:, None] - gain[None, :]
    discount_diff = discount[:, None] - discount[None, :]
    return np.abs(gain_diff * discount_diff) / max_dcg

def lambda_ij(y_predict, y, delta_z):
    """Per-document gradient: pairwise lambdas summed over every partner.

    For a pair (i, j) with ``sign = sign(y_i - y_j)`` the contribution to
    document i is

        -sign * delta_z[i, j] / (1 + exp(sign * (s_i - s_j)))

    so a pair the model currently mis-orders gets a larger magnitude than a
    pair it already orders correctly. Pairs with equal labels contribute 0.

    Parameters
    ----------
    y_predict : np.ndarray
        1-D predicted scores of one query.
    y : np.ndarray
        1-D labels aligned with ``y_predict``.
    delta_z : np.ndarray or float
        ``(n, n)`` pair weights, or a scalar that is broadcast over all pairs.

    Returns
    -------
    np.ndarray
        1-D gradient with one entry per document.
    """
    sign = np.sign(y[:, None] - y[None, :])
    score_diff = y_predict[:, None] - y_predict[None, :]
    # The exponent is oriented by the label order; taking abs() here would
    # shrink the gradient for exactly the pairs that are ranked wrongly.
    return np.sum(-sign * delta_z / (1 + np.exp(sign * score_diff)), axis=1)


def hessian_func(y_predict, y, delta_z):
    """Per-document second-order term summed over every partner document.

    Each pair contributes ``delta_z / (1 + exp(d)) / (1 + exp(-d))`` with
    ``d = |s_i - s_j|``. Row sums that are (close to) zero are replaced by 1.

    Parameters
    ----------
    y_predict : np.ndarray
        1-D predicted scores of one query.
    y : np.ndarray
        Labels; accepted for signature symmetry but not read here.
    delta_z : np.ndarray or float
        ``(n, n)`` pair weights, or a scalar that is broadcast over all pairs.

    Returns
    -------
    np.ndarray
        1-D array with one entry per document.
    """
    gap = np.abs(y_predict[:, np.newaxis] - y_predict[np.newaxis, :])
    pair_terms = delta_z / (1 + np.exp(gap)) / (1 + np.exp(-gap))
    row_sums = np.sum(pair_terms, axis=1)
    near_zero = np.isclose(row_sums, 0.)
    row_sums[near_zero] = 1.
    return row_sums

In [9]:
def objective(dataset):
    """Create a custom objective callable bound to ``dataset``'s query groups.

    Parameters
    ----------
    dataset : Data
        Must have ``qid_unique`` and a populated ``qid_unique_index`` (one
        array of row indices per unique qid).

    Returns
    -------
    callable
        ``calc_objective(y, y_predict) -> (gradient, hessian)``; both results
        are 1-D arrays with one entry per row of ``y``.
    """
    def calc_objective(y, y_predict):
        # NOTE(review): rows of y / y_predict are assumed to be in the same
        # order as the rows of `dataset`, since the groups are addressed by
        # row index -- confirm the caller passes the same data.
        gradient = np.zeros(y.shape[0])
        hessian = np.zeros(y.shape[0])
        for i in range(dataset.qid_unique.shape[0]):
            indexes = np.asarray(dataset.qid_unique_index[i])
            group_pred = y_predict[indexes]
            group_y = y[indexes]
            delta_z = delta_z_func(group_pred, group_y)

            gradient[indexes] = lambda_ij(group_pred, group_y, delta_z)
            hessian[indexes] = hessian_func(group_pred, group_y, delta_z)

        return gradient, hessian

    return calc_objective

In [10]:
# Estimator settings; the custom objective is bound to the training `data`,
# so its per-query row indices refer to rows of that dataset.
params = dict(
    objective=objective(data),
    max_depth=8,
    tree_method='gpu_hist',
    nthread=-1,
    n_estimators=1000,
    learning_rate=0.4,
)
model = XGBRegressor(**params)

In [11]:
%%time
# Fit on the training features and labels. The custom objective given to the
# model addresses rows through data.qid_unique_index, so the rows passed here
# must be the rows of `data`, in the same order.
model.fit(data.X, data.y)

CPU times: user 1h 24s, sys: 3min 57s, total: 1h 4min 21s
Wall time: 1h 4min 47s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.4, max_delta_step=0,
             max_depth=8, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=1, nthread=-1,
             objective=<function objective.<locals>.calc_objective at 0x7f93f5da6268>,
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, tree_method='gpu_hist',
             verbosity=1)

In [13]:
# Parse the test file. Each line is read as
#   <label> qid:<query id> <feature id>:<value> ...
# and turned into one dense row of 699 + 1 values.
X_test = []
y_test = []
with open('l2r/test.txt', 'r') as file:
    for line in file:
        buf = line.split()
        y_test.append(float(buf[0]))

        # Column 0 holds the qid, columns 1..699 hold the features (+1 for qid).
        # Features that do not appear on the line keep the sentinel value -9.0.
        x = np.full(699 + 1, -9.0)
        # NOTE(review): the slice skips the LAST token of every line. If the file
        # has no trailing non-feature token, this silently drops the final
        # feature of each row -- confirm against the data format.
        for elem in buf[1:-1]:
            key, value = elem.split(':')
            column = 0 if key == 'qid' else int(key)
            x[column] = float(value)
        X_test.append(x)

X_test = np.array(X_test)
y_test = np.array(y_test)

In [14]:
# Wrap the test matrix the same way as the training one: Data separates the
# qid column from the features, and _qid_unique_index() fills
# data_test.qid_unique_index with the row indices of every query.
data_test = Data(X_test, y_test)
data_test._qid_unique_index()

In [15]:
# Write the submission: for every query, list its documents from the highest
# to the lowest predicted score. DocumentId is the 1-based row number of the
# document in the test matrix.
with open('subm_' + 'v1' + '.csv', 'w') as file:
    file.write('QueryId,DocumentId\n')
    # Score all rows once instead of calling predict() once per query.
    scores = model.predict(data_test.X).reshape(-1)
    for i in range(len(data_test.qid_unique)):
        qid = data_test.qid_unique[i]
        rows = np.asarray(data_test.qid_unique_index[i])
        # Use the real row indices rather than a running counter, so the ids
        # stay correct even if the file is not grouped by ascending qid.
        ranked_rows = rows[np.argsort(scores[rows])[::-1]]
        for row in ranked_rows:
            file.write(str(int(qid)) + ',' + str(row + 1) + '\n')