# Learning-to-Rank example

In [1]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

## A class for a pointwise learning-to-rank model

In [2]:
class PointWiseLTRModel(object):
    """Pointwise learning-to-rank model built on top of a regressor.

    Every query-document pair is scored independently by the regressor and
    the documents are then ordered by decreasing predicted score.
    """

    def __init__(self, regressor):
        """
        Args:
            regressor: an instance of scikit-learn regressor
        """
        self.regressor = regressor
        # Defined up front so that rank() can detect an untrained model via its
        # assertion instead of failing with AttributeError on a missing attribute.
        self.model = None

    def _train(self, X, y):
        """Trains an LTR model.

        Args:
            X: features of training instances
            y: relevance assessments of training instances
        """
        assert self.regressor is not None
        # Whatever fit() returns is the object rank() later calls predict() on.
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """Predicts relevance labels and ranks documents for a given query.

        Args:
            ft: a list of features for query-doc pairs
            doc_ids: a list of document ids

        Returns:
            A list of (doc_id, predicted relevance) tuples ordered by
            decreasing predicted relevance. Empty if there is nothing to rank.

        Raises:
            AssertionError: if the model has not been trained yet.
        """
        assert self.model is not None
        if len(ft) == 0:
            # Nothing to score; avoid handing an empty batch to predict().
            return []
        rel_labels = self.model.predict(ft)
        # argsort is ascending, so reverse it to get the best document first.
        sort_indices = np.argsort(rel_labels)[::-1]
        return [(doc_ids[i], rel_labels[i]) for i in sort_indices]

## Read data from file

In [3]:
def read_data_from_file(filename):
    """Loads pre-computed features from file.

    Each non-blank line is split on whitespace into an integer relevance
    label, a query id, a document id and then one ``name:value`` token per
    feature. Blank lines are skipped.

    Args:
        filename: File name

    Returns:
        A tuple (X, y, qids, doc_ids) of parallel lists with one entry per
        query-document pair: X holds the feature vectors (numpy arrays of
        floats), y the integer relevance labels, and qids / doc_ids the
        query and document id tokens exactly as they appear in the file.
    """
    X, y, qids, doc_ids = [], [], [], []
    with open(filename, 'r') as f:
        for line in f:
            items = line.split()
            if not items:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            label = int(items[0])
            qid = items[1]
            doc_id = items[2]
            # Only the value part after the colon of each feature token is kept.
            features = np.array([float(token.split(":")[1]) for token in items[3:]])
            X.append(features)
            y.append(label)
            qids.append(qid)
            doc_ids.append(doc_id)

    return X, y, qids, doc_ids

## Main

#### Read input data

In [4]:
X, y, qids, doc_ids = read_data_from_file('data/features_sample.txt')
# Sort the unique query ids: the iteration order of a set of strings can differ
# between Python processes (string hashing is randomized), which would make
# anything derived from this order -- such as a train/test split -- change on
# every kernel restart.
qids_unique = sorted(set(qids))

print('#queries: ', len(qids_unique))
print('#query-doc pairs: ', len(y))

#queries:  339
#query-doc pairs:  14013


Sample of feature vectors

In [5]:
# Show the first five query-document pairs together with their feature vectors.
for row in range(5):
    feature_text = ', '.join(map(str, X[row]))
    print('{:10} {:20} rel: {}\nfeatures: {}\n'.format(qids[row], doc_ids[row], y[row], feature_text))

qid:10     GX000-00-0000000     rel: 0
features: 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.001348, 0.0, 0.222222, 0.0, 0.001282, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.017241, 0.0, 0.0, 0.0

qid:10     GX000-24-12369390    rel: 1
features: 0.03131, 0.666667, 0.5, 0.166667, 0.033206, 0.0, 0.0, 0.0, 0.0, 0.0, 0.023327, 0.641157, 0.498951, 0.323153, 0.026674, 0.029246, 0.5, 0.222222, 0.111111, 0.029398, 0.689128, 0.636228, 0.869764, 0.7164, 0.725186, 0.554961, 0.695985, 0.50406, 0.602946, 0.679534, 0.730286, 0.687414, 0.529688, 0.436996, 0.643739, 0.372337, 0.64689, 0.686107, 0.823908, 0.750092, 0.385426, 0.923077, 0.086207, 0.333333, 0.448276, 0.0

qid:10     GX000-62-7863450     rel: 1
features: 0.078682, 0.166667, 0.5, 0.333333, 0.080022, 0.0, 0.0, 0.0, 0.0, 0.0, 0.108216, 0.174635, 0.501049, 0.351114, 0.112102, 0.118642, 0.5625, 0.944444, 0.333333, 0.119184, 0.696967, 0.486387, 

#### Split data into train and test sets (80% and 20%, respectively)

In [6]:
# Split at the query level so that all documents of a query end up on the same
# side: every fifth query goes to the test set, the rest are used for training.
train_qids = []
test_qids = []

for i, qid in enumerate(qids_unique):
    if i % 5 == 0:  # test query
        test_qids.append(qid)
    else:  # train query
        train_qids.append(qid)

# Set copy for constant-time membership tests; checking against the list
# would rescan it for every single query-doc pair.
train_qid_set = set(train_qids)

train_X, train_y = [], []
test_X, test_y = [], []

for features, label, instance_qid in zip(X, y, qids):
    if instance_qid in train_qid_set:
        train_X.append(features)
        train_y.append(label)
    else:
        test_X.append(features)
        test_y.append(label)

# Every instance must land in exactly one of the two splits.
assert len(train_y) + len(test_y) == len(y)

#### Create a regression model and an LTR instance that wraps it

In [7]:
# Small random forest (10 trees of depth at most 3) with a fixed seed,
# wrapped in the pointwise LTR model.
clf = RandomForestRegressor(
    n_estimators=10,
    max_depth=3,
    random_state=0,
)
ltr = PointWiseLTRModel(regressor=clf)

#### Train LTR model

In [8]:
# Fit the wrapped regressor on the training split.
ltr._train(X=train_X, y=train_y)

#### Generate ranking for a test query

In [9]:
qid = test_qids[0]  # first test query

# Get the doc_ids and feature vectors for the documents that are to be ranked
# for this query. Note that this is a really inefficient way of doing this:
# it rescans every instance for a single query.
# The features are collected under `query_X` rather than `test_X` so that the
# test split created earlier is not overwritten and stays aligned with `test_y`.
query_X = []
test_doc_ids = []

for features, doc_id, instance_qid in zip(X, doc_ids, qids):
    if instance_qid == qid:
        query_X.append(features)
        test_doc_ids.append(doc_id)

r = ltr.rank(query_X, test_doc_ids)

# Print top-10 results
for (doc_id, score) in r[:10]:
    print('{:20} {:06.4f}'.format(doc_id, score))

GX232-33-5643547     0.3913
GX017-25-0246746     0.3716
GX007-11-15268600    0.3252
GX003-21-16226598    0.3146
GX243-75-2642608     0.3014
GX000-79-1969938     0.2945
GX001-57-11634684    0.2945
GX235-05-9989317     0.2906
GX042-90-13281691    0.2842
GX050-75-7675845     0.2842
