Commit

Initial commit
jma127 committed Aug 18, 2015
0 parents commit 8e1e992
Showing 19 changed files with 1,316 additions and 0 deletions.
16 changes: 16 additions & 0 deletions .gitignore
@@ -0,0 +1,16 @@
# Compiler/Interpreter Output #
###############################
*.py[cod]

# Folders #
###########
build/
dist/
local/
*.egg-info/

# Junk #
########
.DS_Store*
.*.swp
*.swp
12 changes: 12 additions & 0 deletions LICENSE.txt
@@ -0,0 +1,12 @@
Copyright (c) 2015, Jerry Ma
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
60 changes: 60 additions & 0 deletions README.rst
@@ -0,0 +1,60 @@
pyltr
=====

pyltr is a Python learning-to-rank toolkit with ranking models, evaluation
metrics, data wrangling helpers, and more.

This software is licensed under the BSD 3-clause license (see ``LICENSE.txt``).

The author may be contacted at ``ma127jerry <@t> gmail`` with general
feedback, questions, or bug reports.


Example
=======

Import pyltr::

    import pyltr

Import a `LETOR
<http://research.microsoft.com/en-us/um/beijing/projects/letor/>`_ dataset
(e.g. `MQ2007
<http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar>`_
)::

    with open('train.txt') as trainfile, \
            open('vali.txt') as valifile, \
            open('test.txt') as evalfile:
        TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
        VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
        EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

Train a `LambdaMART
<http://research.microsoft.com/pubs/132652/MSR-TR-2010-82.pdf>`_ model, using
the validation set for early stopping and trimming::

    metric = pyltr.metrics.dcg.NDCG(k=10)

    # Only needed if you want to perform validation (early stopping & trimming)
    monitor = pyltr.models.monitors.ValidationMonitor(
        VX, Vy, Vqids, metric=metric, stop_after=250)

    model = pyltr.models.lambdamart.LambdaMART(
        metric=metric,
        n_estimators=1000,
        learning_rate=0.02,
        max_features=0.5,
        query_subsample=0.5,
        max_leaf_nodes=10,
        min_samples_leaf=64,
        verbose=1,
    )

    model.fit(TX, Ty, Tqids, monitor=monitor)

Evaluate the model on test data::

    Epred = model.predict(EX)
    print 'Random ranking:', metric.calc_mean_random(Eqids, Ey)
    print 'Our model:', metric.calc_mean(Eqids, Ey, Epred)
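
To rank the documents of a single query, you can argsort its predicted
scores (a minimal sketch using plain numpy; ``Eqids`` and ``Epred`` are the
arrays from the snippets above)::

    import numpy as np

    # Boolean mask selecting the rows of the first test query.
    mask = Eqids == Eqids[0]
    # Positions of that query's documents, best-scoring first.
    ranking = np.argsort(-Epred[mask])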
14 changes: 14 additions & 0 deletions TODO.txt
@@ -0,0 +1,14 @@
- Models
  - AdaRank
  - RankNet
  - LambdaRank
- Metrics
  - MAP
  - P@k
  - MRR
  - ERR
  - Spearman coeff (rho)
  - Kendall coeff (tau)
- Documentation
  - Finish numpy-style documentation.
  - Set up Sphinx.
12 changes: 12 additions & 0 deletions pyltr/__init__.py
@@ -0,0 +1,12 @@
"""
Base module for pyltr.
We import all packages since it's not particularly expensive.
"""

import data
import metrics
import models
import util
7 changes: 7 additions & 0 deletions pyltr/data/__init__.py
@@ -0,0 +1,7 @@
"""
Various data wrangling utilities.
"""

import letor
131 changes: 131 additions & 0 deletions pyltr/data/letor.py
@@ -0,0 +1,131 @@
"""
Various utilities for converting data from/to Microsoft's LETOR format.
"""

import numpy as np
import sklearn.externals.six
from sklearn.externals.six.moves import range


def iter_lines(lines, has_targets=True, one_indexed=True, missing=0.0):
    """Transforms an iterator of lines to an iterator of LETOR rows.

    Each row is represented by a (x, y, qid, comment) tuple.

    Parameters
    ----------
    lines : iterable of lines
        Lines to parse.
    has_targets : bool, optional
        Whether the file contains targets. If True, will expect the first
        token of every line to be a real representing the sample's target
        (i.e. score). If False, will use -1 as a placeholder for all targets.
    one_indexed : bool, optional
        Whether feature ids are one-indexed. If True, will subtract 1 from
        each feature id.
    missing : float, optional
        Placeholder to use if a feature value is not provided for a sample.

    Yields
    ------
    x : array of floats
        Feature vector of the sample.
    y : float
        Target value (score) of the sample, or -1 if no target was parsed.
    qid : object
        Query id of the sample. This is currently guaranteed to be a string.
    comment : str
        Comment accompanying the sample.
    """
    for line in lines:
        data, _, comment = line.rstrip().partition('#')
        toks = data.split()

        num_features = 0
        # Start with a small feature buffer; it is grown by doubling below.
        x = np.repeat(missing, 8)
        y = -1.0
        if has_targets:
            y = float(toks[0])
            toks = toks[1:]

        qid = _parse_qid_tok(toks[0])

        for tok in toks[1:]:
            fid, _, val = tok.partition(':')
            fid = int(fid)
            val = float(val)
            if one_indexed:
                fid -= 1
            assert fid >= 0
            while len(x) <= fid:
                orig = len(x)
                x.resize(len(x) * 2)
                x[orig:orig * 2] = missing

            x[fid] = val
            num_features = max(fid + 1, num_features)

        assert num_features > 0
        x.resize(num_features)

        yield (x, y, qid, comment)


def read_dataset(source, has_targets=True, one_indexed=True, missing=0.0):
    """Parses a LETOR dataset from `source`.

    Parameters
    ----------
    source : string or iterable of lines
        String, file, or other file-like object to parse.
    has_targets : bool, optional
        See `iter_lines`.
    one_indexed : bool, optional
        See `iter_lines`.
    missing : float, optional
        See `iter_lines`.

    Returns
    -------
    X : array of arrays of floats
        Feature matrix (see `iter_lines`).
    y : array of floats
        Target vector (see `iter_lines`).
    qids : array of objects
        Query id vector (see `iter_lines`).
    comments : array of strs
        Comment vector (see `iter_lines`).
    """
    if isinstance(source, sklearn.externals.six.string_types):
        source = source.splitlines()

    max_width = 0
    xs, ys, qids, comments = [], [], [], []
    it = iter_lines(source, has_targets=has_targets,
                    one_indexed=one_indexed, missing=missing)
    for x, y, qid, comment in it:
        xs.append(x)
        ys.append(y)
        qids.append(qid)
        comments.append(comment)
        max_width = max(max_width, len(x))

    assert max_width > 0
    X = np.ndarray((len(xs), max_width), dtype=np.float64)
    X.fill(missing)
    for i, x in enumerate(xs):
        X[i, :len(x)] = x
    ys = np.array(ys) if has_targets else None
    qids = np.array(qids)
    comments = np.array(comments)

    return (X, ys, qids, comments)


def _parse_qid_tok(tok):
    assert tok.startswith('qid:')
    return tok[4:]
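
As a quick illustration of the format this module expects, ``read_dataset``
also accepts a raw string (a minimal sketch; the two rows below are made-up
samples in LETOR syntax)::

    data = ("2 qid:1 1:0.5 3:1.0 # doc-a\n"
            "0 qid:1 1:0.1 2:0.3 # doc-b\n")
    X, y, qids, comments = read_dataset(data)
    # X has shape (2, 3): feature ids are one-indexed by default, and
    # features absent from a row are filled with `missing` (0.0 here).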
9 changes: 9 additions & 0 deletions pyltr/metrics/__init__.py
@@ -0,0 +1,9 @@
"""
Various metrics classes.
"""

from _metrics import *
import dcg
import gains
89 changes: 89 additions & 0 deletions pyltr/metrics/_metrics.py
@@ -0,0 +1,89 @@
import numpy as np
from sklearn.externals.six.moves import range
from ..util.group import check_qids, get_groups
from ..util.sort import get_sorted_y


class Metric(object):
    """Base LTR metric class.

    Subclasses must override evaluate() and can optionally override various
    other methods.
    """
    def evaluate(self, qid, targets):
        """Evaluates the metric on a ranked list of targets.

        qid is guaranteed to be a hashable type s.t.
        sorted(targets1) == sorted(targets2) iff qid1 == qid2.
        """
        raise NotImplementedError()

    def calc_swap_deltas(self, qid, targets):
        """Returns an upper triangular matrix.

        Each entry (i, j) contains the change in the metric from swapping
        targets[i] and targets[j].

        Can be overridden for efficiency.
        """
        n_targets = len(targets)
        deltas = np.zeros((n_targets, n_targets))
        original = self.evaluate(qid, targets)
        max_k = self.max_k()
        if max_k is None or n_targets < max_k:
            max_k = n_targets

        for i in range(max_k):
            for j in range(i + 1, n_targets):
                targets[i], targets[j] = targets[j], targets[i]
                deltas[i, j] = self.evaluate(qid, targets) - original
                # Swap back to restore the original ordering.
                targets[i], targets[j] = targets[j], targets[i]

        return deltas

    def max_k(self):
        """Returns a value k for which:
        ``calc_swap_deltas()[i][j] == 0 for all i, j >= k``
        Returns None if no such value exists.
        """
        return None

    def evaluate_preds(self, qid, targets, preds):
        """Evaluates the metric on targets ranked according to preds."""
        return self.evaluate(qid, get_sorted_y(targets, preds))

    def calc_random_ev(self, qid, targets):
        """Calculates the expected value of the metric on randomized targets.

        The default implementation (averaging over repeated shuffles) may be
        overridden with something smarter.
        """
        targets = np.copy(targets)
        scores = []
        for _ in range(50):
            np.random.shuffle(targets)
            scores.append(self.evaluate(qid, targets))
        return np.mean(scores)

    def calc_mean(self, qids, y, y_pred):
        """Calculates the mean of the metric among the provided predictions."""
        check_qids(qids)
        query_groups = get_groups(qids)
        return np.mean([self.evaluate_preds(qid, y[a:b], y_pred[a:b])
                        for qid, a, b in query_groups])

    def calc_mean_random(self, qids, y):
        """Calculates the EV of the mean of the metric with random ranking."""
        check_qids(qids)
        query_groups = get_groups(qids)
        return np.mean([self.calc_random_ev(qid, y[a:b])
                        for qid, a, b in query_groups])
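
For illustration, a subclass only needs to implement ``evaluate()``; the base
class then supplies swap deltas and the mean/random-ranking helpers. This is a
made-up toy metric, not part of the package::

    class MeanTopK(Metric):
        """Toy metric: mean target value among the top k ranked results."""
        def __init__(self, k=10):
            self.k = k

        def evaluate(self, qid, targets):
            return np.mean(targets[:self.k])

        def max_k(self):
            # Swaps entirely below rank k cannot change the top-k mean.
            return self.k

With this in place, ``MeanTopK(5).calc_mean(qids, y, y_pred)`` averages the
per-query scores over a whole dataset.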
