In [0]:
# Install the fastrank package; the %pip magic targets the running kernel's environment.
# NOTE(review): no version is pinned, so re-runs may pick up a different release.
%pip install fastrank



In [0]:
# Download data files:
import urllib.request
import gzip
import json
import shutil

DATA_URL_BASE = "https://raw.githubusercontent.com/jjfiv/fastrank/master/examples/"

# (local file name, remote URL) pairs for the feature description and the
# train/test splits. DATA_URL_BASE already ends with "/", so the remote file
# name is appended directly; adding another separator would yield
# ".../examples//trec_news_2018...".
urls = [("ex{0}".format(x), "{0}trec_news_2018{1}".format(DATA_URL_BASE, x)) for x in [".features.json", ".train", ".test"]]

for (fname, url) in urls:
  # Open the remote resource first so a failed request does not leave an empty
  # local file behind; the `with` closes both handles even if the copy fails.
  with urllib.request.urlopen(url) as handle, open(fname, 'wb') as fp:
    shutil.copyfileobj(handle, fp)

In [0]:
import fastrank
import numpy as np
from fastrank import CModel, CDataset, TrainRequest

# Open the train and test splits; both are read with the same feature file.
dataset = CDataset.open_ranksvm("ex.train", "ex.features.json")
test_dataset = CDataset.open_ranksvm("ex.test", "ex.features.json")

# Report the size of each split: query count, instance count, feature count.
train_summary = "Train Dataset has {0} queries, {1} documents, and {2} features.".format(
    len(dataset.queries()),
    dataset.num_instances(),
    dataset.num_features(),
)
print(train_summary)

test_summary = "Test Dataset has {0} queries, {1} documents, and {2} features.".format(
    len(test_dataset.queries()),
    test_dataset.num_instances(),
    test_dataset.num_features(),
)
print(test_summary)

Train Dataset has 45 queries, 782 documents, and 6 features.
Test Dataset has 5 queries, 41 documents, and 6 features.


In [0]:
# Build a random-forest training request and override selected parameters.
train_request = TrainRequest.random_forest()
params = train_request.params

# Overrides are applied in the listed order; the seed is fixed, presumably so
# that repeated runs give the same model (confirm against fastrank docs).
rf_settings = {
    "num_trees": 100,
    "feature_sampling_rate": 0.5,
    "instance_sampling_rate": 0.5,
    "seed": 1234567,
}
for option, value in rf_settings.items():
  setattr(params, option, value)

# Fit the model on the training split.
random_forest = dataset.train_model(train_request)

In [0]:
# Build a coordinate-ascent training request and override selected parameters.
train_request = TrainRequest.coordinate_ascent()
params = train_request.params

# Overrides are applied in the listed order.
ca_settings = {
    "init_random": True,
    "normalize": True,
    "seed": 1234567,
}
for option, value in ca_settings.items():
  setattr(params, option, value)

# Fit the model on the training split.
coordinate_ascent = dataset.train_model(train_request)

# The trained model can be exported as a plain Python dict; how to serialize
# it further is left to the user. Kept as the last expression so the notebook
# displays it.
coordinate_ascent.to_dict()

{'Linear': {'weights': [2.1789363075430108e-09,
   3.229859575980813e-06,
   -6.60633974465588e-07,
   0.0,
   -0.9999773758216621,
   1.9152791882e-05]}}

In [0]:
# Trained models to compare, keyed by the short label used in the report.
models = {'CA': coordinate_ascent, 'RF': random_forest}

for name, model in models.items():
  # evaluate() yields a mapping whose values are averaged below
  # (presumably one NDCG@5 score per test query).
  q_to_measure = test_dataset.evaluate(model, "NDCG@5")
  mean_measure = np.mean(list(q_to_measure.values()))
  print("{0}: NDCG@5: {1:.3}".format(name, mean_measure))


CA: NDCG@5: 0.928
RF: NDCG@5: 0.901
