In [1]:
import os
from collections import Counter
from copy import deepcopy

import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool, MetricVisualizer

# Reformatting the provided text files into the required format

In [2]:
# Open both raw data files in text mode; the handles are consumed by the
# following cell.
train_file, test_file = (
    open('imat2009/imat2009_train_new.txt'),
    open('imat2009/imat2009_test_new.txt'),
)

In [3]:
# Load every line of both files into memory, then release the file handles.
# Closing in `finally` guarantees the descriptors are freed even if a read
# fails; it also makes an accidental re-run of this cell fail loudly instead
# of silently replacing `train`/`test` with empty lists read from exhausted
# handles.
try:
    train = train_file.readlines()
    test = test_file.readlines()
finally:
    train_file.close()
    test_file.close()

In [4]:
# Split each raw training line on whitespace into a list of tokens.
train_new = [raw_line.split() for raw_line in train]

In [5]:
# Turn the tokenised training lines into a dense feature matrix, a label
# list and a query-id list.
X_train, y_train, queries_train = [], [], []

for tokens in train_new:
    # Dense row with 245 slots; features not mentioned on the line stay 0.
    features = [0] * 245
    # tokens[0] is the label and tokens[-1] the query id; the token before
    # the query id is skipped as well, so only tokens[1:-2] are parsed as
    # "index:value" pairs (indices are 1-based in the file).
    for pair in tokens[1:-2]:
        position, raw_value = pair.split(':')
        features[int(position) - 1] = float(raw_value)
    X_train.append(features)
    y_train.append(float(tokens[0]))
    queries_train.append(int(tokens[-1]))

In [6]:
# Split each raw test line on whitespace into a list of tokens.
test_new = [raw_line.split() for raw_line in test]

In [7]:
# Turn the tokenised test lines into a dense feature matrix, a label list
# and a query-id list (same layout as the training data).
X_test, y_test, queries_test = [], [], []

for tokens in test_new:
    # Dense row with 245 slots; features not mentioned on the line stay 0.
    features = [0] * 245
    # tokens[0] is the label and tokens[-1] the query id; the token before
    # the query id is skipped as well, so only tokens[1:-2] are parsed as
    # "index:value" pairs (indices are 1-based in the file).
    for pair in tokens[1:-2]:
        position, raw_value = pair.split(':')
        features[int(position) - 1] = float(raw_value)
    X_test.append(features)
    y_test.append(float(tokens[0]))
    queries_test.append(int(tokens[-1]))

In [8]:
# Convert the parsed Python lists into NumPy arrays, one split at a time.
X_train, y_train, queries_train = (
    np.array(X_train),
    np.array(y_train),
    np.array(queries_train),
)
X_test, y_test, queries_test = (
    np.array(X_test),
    np.array(y_test),
    np.array(queries_test),
)

# CatBoost Tutorial

In [9]:
# Number of training documents = number of rows in the feature matrix.
num_documents = len(X_train)
print(num_documents)

77714


In [10]:
# Size of the second axis of the feature matrix, i.e. the number of feature
# columns per document.
X_train.shape[1]

245

In [11]:
# Count how many training documents carry each distinct relevance label.
# `Counter` is imported in the notebook's top import cell rather than here,
# so all dependencies are visible in one place.
Counter(y_train).items()

dict_items([(1.0, 20086), (0.0, 25776), (2.0, 24424), (4.0, 952), (3.0, 1744), (0.5, 1982), (1.5, 1033), (0.25, 77), (1.33333, 110), (1.2, 3), (2.37037, 39), (0.666671, 340), (2.33333, 79), (0.333329, 268), (2.16049, 19), (2.5, 337), (2.87037, 26), (1.66667, 107), (2.12037, 4), (2.25, 19), (2.24074, 25), (0.2, 10), (1.6, 6), (0.8, 5), (0.6, 10), (0.875, 1), (2.66667, 31), (3.1625, 2), (1.75, 12), (0.75, 55), (2.61111, 4), (0.222229, 1), (0.4, 5), (1.25, 23), (1.97143, 2), (3.5, 16), (2.24691, 10), (2.16667, 1), (1.95239, 1), (1.4, 4), (3.66667, 5), (3.8, 2), (0.125, 1), (2.05556, 2), (3.33333, 4), (2.2, 5), (2.58025, 2), (1.16667, 2), (2.91358, 1), (2.07407, 3), (2.11729, 1), (3.25, 1), (2.375, 1), (3.21666, 1), (2.74074, 5), (2.12346, 3), (0.166671, 8), (0.833329, 5), (1.14286, 1), (3.53, 1), (3.4, 1), (2.75, 1), (3.58125, 1), (2.40741, 1), (0.583329, 1), (1.8, 1), (2.42857, 1), (2.0463, 1), (1.77143, 1), (3.75, 1), (0.888886, 1)])

In [12]:
# Scale labels by the largest training relevance; the test labels are
# divided by the same training maximum.
max_relevance = y_train.max()
y_train = y_train / max_relevance
y_test = y_test / max_relevance

In [13]:
# Number of distinct query ids in the training split.
num_queries = len(np.unique(queries_train))
num_queries

7300

In [14]:
# Wrap each split in a CatBoost Pool, attaching the query id of every
# document as its group id.
# NOTE: this rebinds `train` / `test`, which until here held the raw lines
# read from the input files.
train = Pool(data=X_train, label=y_train, group_id=queries_train)

test = Pool(data=X_test, label=y_test, group_id=queries_test)

In [15]:
# Baseline settings shared by every training run; fit_model deep-copies
# this dict before adding run-specific entries.
default_parameters = dict(
    iterations=2000,
    custom_metric=['NDCG', 'PFound', 'AverageGain:top=10'],
    verbose=False,
    random_seed=0,
)

parameters = dict()

In [16]:
def fit_model(loss_function, additional_params=None, train_pool=None, test_pool=None):
    """Train a CatBoostRanker with the shared defaults and the given loss.

    Parameters
    ----------
    loss_function : str
        Stored under the ``loss_function`` key and also used as ``train_dir``,
        so each loss writes its training artefacts to its own directory.
    additional_params : dict, optional
        Extra settings applied on top of ``default_parameters`` via
        ``dict.update``; they may override any key set here.
    train_pool : optional
        Training data. Defaults to the module-level ``train``, looked up at
        call time.
    test_pool : optional
        Evaluation data passed as ``eval_set``. Defaults to the module-level
        ``test``, looked up at call time.

    Returns
    -------
    The fitted ``CatBoostRanker`` instance.
    """
    # Resolve the defaults when the function is called rather than when it is
    # defined: a default of `train_pool=train` would freeze whatever object
    # `train` referred to when this cell ran, so rebuilding the pools later
    # would silently keep training on the stale ones.
    if train_pool is None:
        train_pool = train
    if test_pool is None:
        test_pool = test

    # Work on a copy so the shared defaults are never mutated between runs.
    params = deepcopy(default_parameters)
    params['loss_function'] = loss_function
    params['train_dir'] = loss_function

    if additional_params is not None:
        params.update(additional_params)

    model = CatBoostRanker(**params)
    model.fit(train_pool, eval_set=test_pool, plot=True)

    return model

In [22]:
# RMSE run; the custom_metric entry replaces the default metric list.
model = fit_model(
    loss_function='RMSE',
    additional_params={'custom_metric': ['NDCG', 'RecallAt:top=10', 'MAP:top=10']},
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [18]:
# PairLogit run with the default parameters.
fit_model(loss_function='PairLogit')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x2b658e4f0>

In [20]:
# YetiRank run with the default parameters.
fit_model(loss_function='YetiRank')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x2aad61340>