Commit

Initial commit
jma127 committed Aug 18, 2015
0 parents commit 8e1e992
Showing 19 changed files with 1,316 additions and 0 deletions.
16 changes: 16 additions & 0 deletions .gitignore
@@ -0,0 +1,16 @@
# Compiler/Interpreter Output #
###############################
*.py[cod]

# Folders #
###########
build/
dist/
local/
*.egg-info/

# Junk #
########
.DS_Store*
.*.swp
*.swp
12 changes: 12 additions & 0 deletions LICENSE.txt
@@ -0,0 +1,12 @@
Copyright (c) 2015, Jerry Ma
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
60 changes: 60 additions & 0 deletions README.rst
@@ -0,0 +1,60 @@
pyltr
=====

pyltr is a Python learning-to-rank toolkit with ranking models, evaluation
metrics, data wrangling helpers, and more.

This software is licensed under the BSD 3-clause license (see ``LICENSE.txt``).

The author may be contacted at ``ma127jerry <@t> gmail`` with general
feedback, questions, or bug reports.


Example
=======

Import pyltr::

    import pyltr

Import a `LETOR
<http://research.microsoft.com/en-us/um/beijing/projects/letor/>`_ dataset
(e.g. `MQ2007
<http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar>`_
)::

    with open('train.txt') as trainfile, \
            open('vali.txt') as valifile, \
            open('test.txt') as evalfile:
        TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
        VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
        EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

Train a `LambdaMART
<http://research.microsoft.com/pubs/132652/MSR-TR-2010-82.pdf>`_ model, using
the validation set for early stopping and trimming::

    metric = pyltr.metrics.dcg.NDCG(k=10)

    # Only needed if you want to perform validation (early stopping & trimming)
    monitor = pyltr.models.monitors.ValidationMonitor(
        VX, Vy, Vqids, metric=metric, stop_after=250)

    model = pyltr.models.lambdamart.LambdaMART(
        metric=metric,
        n_estimators=1000,
        learning_rate=0.02,
        max_features=0.5,
        query_subsample=0.5,
        max_leaf_nodes=10,
        min_samples_leaf=64,
        verbose=1,
    )

    model.fit(TX, Ty, Tqids, monitor=monitor)

Evaluate the model on test data::

    Epred = model.predict(EX)
    print 'Random ranking:', metric.calc_mean_random(Eqids, Ey)
    print 'Our model:', metric.calc_mean(Eqids, Ey, Epred)
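
To rank the documents of a single query, you can argsort its predicted
scores (a minimal sketch using plain numpy; ``Eqids`` and ``Epred`` are the
arrays from the snippets above)::

    import numpy as np

    # Boolean mask selecting the rows of the first test query.
    mask = Eqids == Eqids[0]
    # Positions of that query's documents, best-scoring first.
    ranking = np.argsort(-Epred[mask])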
14 changes: 14 additions & 0 deletions TODO.txt
@@ -0,0 +1,14 @@
- Models
  - AdaRank
  - RankNet
  - LambdaRank
- Metrics
  - MAP
  - P@k
  - MRR
  - ERR
  - Spearman coeff (rho)
  - Kendall coeff (tau)
- Documentation
  - Finish numpy-style documentation.
  - Set up Sphinx.
12 changes: 12 additions & 0 deletions pyltr/__init__.py
@@ -0,0 +1,12 @@
"""
Base module for pyltr.
We import all packages since it's not particularly expensive.
"""

import data
import metrics
import models
import util
7 changes: 7 additions & 0 deletions pyltr/data/__init__.py
@@ -0,0 +1,7 @@
"""
Various data wrangling utilities.
"""

import letor
131 changes: 131 additions & 0 deletions pyltr/data/letor.py
@@ -0,0 +1,131 @@
"""
Various utilities for converting data from/to Microsoft's LETOR format.
"""

import numpy as np
import sklearn.externals.six
from sklearn.externals.six.moves import range


def iter_lines(lines, has_targets=True, one_indexed=True, missing=0.0):
    """Transforms an iterator of lines to an iterator of LETOR rows.

    Each row is represented by a (x, y, qid, comment) tuple.

    Parameters
    ----------
    lines : iterable of lines
        Lines to parse.
    has_targets : bool, optional
        Whether the file contains targets. If True, will expect the first
        token of every line to be a real representing the sample's target
        (i.e. score). If False, will use -1 as a placeholder for all targets.
    one_indexed : bool, optional
        Whether feature ids are one-indexed. If True, will subtract 1 from
        each feature id.
    missing : float, optional
        Placeholder to use if a feature value is not provided for a sample.

    Yields
    ------
    x : array of floats
        Feature vector of the sample.
    y : float
        Target value (score) of the sample, or -1 if no target was parsed.
    qid : object
        Query id of the sample. This is currently guaranteed to be a string.
    comment : str
        Comment accompanying the sample.
    """
    for line in lines:
        data, _, comment = line.rstrip().partition('#')
        toks = data.split()

        num_features = 0
        # Start with a small feature buffer; it is grown by doubling below.
        x = np.repeat(missing, 8)
        y = -1.0
        if has_targets:
            y = float(toks[0])
            toks = toks[1:]

        qid = _parse_qid_tok(toks[0])

        for tok in toks[1:]:
            fid, _, val = tok.partition(':')
            fid = int(fid)
            val = float(val)
            if one_indexed:
                fid -= 1
            assert fid >= 0
            while len(x) <= fid:
                orig = len(x)
                x.resize(len(x) * 2)
                x[orig:orig * 2] = missing

            x[fid] = val
            num_features = max(fid + 1, num_features)

        assert num_features > 0
        x.resize(num_features)

        yield (x, y, qid, comment)


def read_dataset(source, has_targets=True, one_indexed=True, missing=0.0):
    """Parses a LETOR dataset from `source`.

    Parameters
    ----------
    source : string or iterable of lines
        String, file, or other file-like object to parse.
    has_targets : bool, optional
        See `iter_lines`.
    one_indexed : bool, optional
        See `iter_lines`.
    missing : float, optional
        See `iter_lines`.

    Returns
    -------
    X : array of arrays of floats
        Feature matrix (see `iter_lines`).
    y : array of floats
        Target vector (see `iter_lines`).
    qids : array of objects
        Query id vector (see `iter_lines`).
    comments : array of strs
        Comment vector (see `iter_lines`).
    """
    if isinstance(source, sklearn.externals.six.string_types):
        source = source.splitlines()

    max_width = 0
    xs, ys, qids, comments = [], [], [], []
    it = iter_lines(source, has_targets=has_targets,
                    one_indexed=one_indexed, missing=missing)
    for x, y, qid, comment in it:
        xs.append(x)
        ys.append(y)
        qids.append(qid)
        comments.append(comment)
        max_width = max(max_width, len(x))

    assert max_width > 0
    X = np.ndarray((len(xs), max_width), dtype=np.float64)
    X.fill(missing)
    for i, x in enumerate(xs):
        X[i, :len(x)] = x
    ys = np.array(ys) if has_targets else None
    qids = np.array(qids)
    comments = np.array(comments)

    return (X, ys, qids, comments)


def _parse_qid_tok(tok):
    assert tok.startswith('qid:')
    return tok[4:]
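
As a quick illustration of the format this module expects, ``read_dataset``
also accepts a raw string (a minimal sketch; the two rows below are made-up
samples in LETOR syntax)::

    data = ("2 qid:1 1:0.5 3:1.0 # doc-a\n"
            "0 qid:1 1:0.1 2:0.3 # doc-b\n")
    X, y, qids, comments = read_dataset(data)
    # X has shape (2, 3): feature ids are one-indexed by default, and
    # features absent from a row are filled with `missing` (0.0 here).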
9 changes: 9 additions & 0 deletions pyltr/metrics/__init__.py
@@ -0,0 +1,9 @@
"""
Various metrics classes.
"""

from _metrics import *
import dcg
import gains
89 changes: 89 additions & 0 deletions pyltr/metrics/_metrics.py
@@ -0,0 +1,89 @@
import numpy as np
from sklearn.externals.six.moves import range
from ..util.group import check_qids, get_groups
from ..util.sort import get_sorted_y


class Metric(object):
    """Base LTR metric class.

    Subclasses must override evaluate() and can optionally override various
    other methods.
    """
    def evaluate(self, qid, targets):
        """Evaluates the metric on a ranked list of targets.

        qid is guaranteed to be a hashable type s.t.
        sorted(targets1) == sorted(targets2) iff qid1 == qid2.
        """
        raise NotImplementedError()

    def calc_swap_deltas(self, qid, targets):
        """Returns an upper triangular matrix.

        Each entry (i, j) contains the change in the metric from swapping
        targets[i] and targets[j].

        Can be overridden for efficiency.
        """
        n_targets = len(targets)
        deltas = np.zeros((n_targets, n_targets))
        original = self.evaluate(qid, targets)
        max_k = self.max_k()
        if max_k is None or n_targets < max_k:
            max_k = n_targets

        for i in range(max_k):
            for j in range(i + 1, n_targets):
                targets[i], targets[j] = targets[j], targets[i]
                deltas[i, j] = self.evaluate(qid, targets) - original
                # Swap back to restore the original ordering.
                targets[i], targets[j] = targets[j], targets[i]

        return deltas

    def max_k(self):
        """Returns a value k for which:
        ``calc_swap_deltas()[i][j] == 0 for all i, j >= k``
        Returns None if no such value exists.
        """
        return None

    def evaluate_preds(self, qid, targets, preds):
        """Evaluates the metric on targets ranked according to preds."""
        return self.evaluate(qid, get_sorted_y(targets, preds))

    def calc_random_ev(self, qid, targets):
        """Calculates the expected value of the metric on randomized targets.

        The default implementation (averaging over repeated shuffles) may be
        overridden with something smarter.
        """
        targets = np.copy(targets)
        scores = []
        for _ in range(50):
            np.random.shuffle(targets)
            scores.append(self.evaluate(qid, targets))
        return np.mean(scores)

    def calc_mean(self, qids, y, y_pred):
        """Calculates the mean of the metric among the provided predictions."""
        check_qids(qids)
        query_groups = get_groups(qids)
        return np.mean([self.evaluate_preds(qid, y[a:b], y_pred[a:b])
                        for qid, a, b in query_groups])

    def calc_mean_random(self, qids, y):
        """Calculates the EV of the mean of the metric with random ranking."""
        check_qids(qids)
        query_groups = get_groups(qids)
        return np.mean([self.calc_random_ev(qid, y[a:b])
                        for qid, a, b in query_groups])
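
For illustration, a subclass only needs to implement ``evaluate()``; the base
class then supplies swap deltas and the mean/random-ranking helpers. This is a
made-up toy metric, not part of the package::

    class MeanTopK(Metric):
        """Toy metric: mean target value among the top k ranked results."""
        def __init__(self, k=10):
            self.k = k

        def evaluate(self, qid, targets):
            return np.mean(targets[:self.k])

        def max_k(self):
            # Swaps entirely below rank k cannot change the top-k mean.
            return self.k

With this in place, ``MeanTopK(5).calc_mean(qids, y, y_pred)`` averages the
per-query scores over a whole dataset.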
