## Dask LightGBMRanker

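This notebook tests `lightgbm.dask.DaskLGBMRanker`, proposed in https://github.com/microsoft/LightGBM/pull/3708, by training it on a synthetic learning-to-rank dataset and comparing its ranking quality against the single-process `lightgbm.sklearn.LGBMRanker`.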

In [31]:
import itertools
import time

import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client, LocalCluster, wait
from scipy.stats import spearmanr
from sklearn.utils import check_random_state

from lightgbm.dask import DaskLGBMRanker
from lightgbm.sklearn import LGBMRanker

In [12]:
# Start a local Dask cluster with `n_workers` workers and connect a client to it.
n_workers = 4
cluster = LocalCluster(n_workers=n_workers)
client = Client(cluster)
# Do not continue until the requested number of workers is available.
client.wait_for_workers(n_workers)

# Print the actual dashboard URL: when the default port is already taken (for example
# by a cluster left over from an earlier run of this cell), Dask serves it elsewhere.
print(f"View the dashboard: {cluster.dashboard_link}")

Perhaps you already have a cluster running?
Hosting the HTTP server on port 38719 instead


View the dashboard: http://127.0.0.1:38719/status


In [13]:
def _make_ranking(
    n_samples=100,
    n_features=20,
    n_informative=5,
    gmax=1,
    random_gs=False,
    avg_gs=10,
    random_state=0,
):
    """Generate a synthetic learning-to-rank dataset.

    Rows are laid out as consecutive query groups, and every row receives an
    integer graded relevance label drawn uniformly from ``0..gmax``. The first
    ``n_informative`` feature columns depend on the label; the remaining columns
    are uniform noise. Replace this with a sklearn.datasets function if a ranking
    objective becomes supported in the sklearn.datasets module.

    Parameters
    ----------
    n_samples : int
        Number of rows to return. The last group is truncated if needed so that
        exactly ``n_samples`` rows come back.
    n_features : int
        Total number of feature columns.
    n_informative : int
        Number of leading columns tied to the label; clipped to ``[0, n_features]``.
    gmax : int
        Largest relevance grade.
    random_gs : bool
        If True, each group size is drawn from ``Poisson(avg_gs)`` (zero-sized
        draws are discarded and redrawn); otherwise every group has ``avg_gs`` rows.
    avg_gs : int
        Group size (or its Poisson mean). Must be positive when ``n_samples > 0``.
    random_state : int, RandomState instance or None
        Forwarded to ``sklearn.utils.check_random_state``.

    Returns
    -------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y_vec : ndarray of shape (n_samples,)
        Integer relevance labels.
    group_vec : ndarray of shape (n_samples,)
        Group id of each row (0, 0, ..., 1, 1, ...).

    Raises
    ------
    ValueError
        If ``n_samples > 0`` and ``avg_gs <= 0``. With ``avg_gs == 0`` no group
        could ever be produced and the generation loop would never terminate.
    """
    if n_samples > 0 and avg_gs <= 0:
        raise ValueError(f"avg_gs must be positive when n_samples > 0; got {avg_gs}")

    rnd_generator = check_random_state(random_state)

    # build target, group ID vectors. Pieces are collected in lists and joined once,
    # which avoids re-copying the growing arrays on every iteration. The empty int
    # seeds keep the result dtype/shape well-defined even when no group is generated.
    relvalues = range(gmax + 1)
    y_parts = [np.empty((0,), dtype=int)]
    group_parts = [np.empty((0,), dtype=int)]
    n_rows = 0
    gid = 0
    while n_rows < n_samples:
        gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)
        if not gsize:
            # a Poisson draw of zero is not a usable group; draw again.
            continue

        y_parts.append(rnd_generator.choice(relvalues, size=gsize, replace=True))
        group_parts.append(np.full(gsize, gid, dtype=int))
        n_rows += gsize
        gid += 1

    # the final group may overshoot n_samples; trim it.
    y_vec = np.concatenate(y_parts)[0:n_samples]
    group_vec = np.concatenate(group_parts)[0:n_samples]

    # build feature data, X. Transform first few into informative features.
    n_informative = max(min(n_features, n_informative), 0)
    x_grid = np.linspace(0, stop=1, num=gmax + 2)
    X = rnd_generator.uniform(size=(n_samples, n_features))

    # each relevance grade k owns the bucket [x_grid[k], x_grid[k + 1]); the bucket
    # edges per row depend only on y_vec, so compute them once outside the loop.
    bucket_low = x_grid[y_vec]
    bucket_high = x_grid[y_vec + 1]

    # make first n_informative features values bucketed according to relevance scores,
    # then apply a random affine transform per column.
    for j in range(n_informative):
        bias, coef = rnd_generator.normal(size=2)
        X[:, j] = bias + coef * rnd_generator.uniform(bucket_low, high=bucket_high)

    return X, y_vec, group_vec


def _create_ranking_data(n_samples=100, output="array", chunk_size=50):
    """Build matching local and Dask versions of a synthetic ranking dataset.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    output : {"array", "dataframe"}
        Kind of Dask collections to build.
    chunk_size : int
        Rows per partition handed to ``dd.from_pandas``. Only used when
        ``output == "dataframe"``; array output always uses one chunk per group.

    Returns
    -------
    X, y, w, g_rle, dX, dy, dw, dg
        ``X``, ``y``, ``w`` are the local features, labels and sample weights
        (``X`` is a pandas DataFrame for dataframe output, a NumPy array otherwise).
        ``g_rle`` holds the size of each consecutive group. ``dX``, ``dy``, ``dw``,
        ``dg`` are the Dask counterparts, partitioned so that no group is split
        across partitions/chunks.

    Raises
    ------
    ValueError
        If ``output`` is neither "array" nor "dataframe".
    """
    # fail fast, before any data is generated.
    if output not in ("array", "dataframe"):
        raise ValueError("ranking data creation only supported for Dask arrays and dataframes")

    X, y, g = _make_ranking(n_samples=n_samples, random_state=42)
    rnd = np.random.RandomState(42)
    w = rnd.rand(X.shape[0]) * 0.01

    # run-length encode the per-row group ids: one entry per consecutive group.
    g_rle = np.array([len(list(grp)) for _, grp in itertools.groupby(g)])

    if output == "dataframe":

        # add target, weight, and group to DataFrame so that partitions abide by group boundaries.
        X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
        X = X_df.copy()

        # set_index ensures partitions are based on group id. See https://bit.ly/3pAWyNw.
        X_df = X_df.assign(y=y, g=g, w=w).set_index("g")
        dX = dd.from_pandas(X_df, chunksize=chunk_size)

        # separate target, weight from features.
        dy = dX["y"]
        dw = dX["w"]
        dX = dX.drop(columns=["y", "w"])
        dg = dX.index.to_series()

        # encode group identifiers into run-length encoding, the format LightGBMRanker is expecting
        # so that within each partition, sum(g) = n_samples.
        dg = dg.map_partitions(lambda p: p.groupby("g", sort=False).apply(lambda z: z.shape[0]))

    else:

        # ranking arrays: one chunk per group. Each chunk must include all columns.
        p = X.shape[1]
        stops = np.cumsum(g_rle)
        starts = stops - g_rle
        dX, dy, dw, dg = list(), list(), list(), list()
        for lhs, rhs, gsize in zip(starts, stops, g_rle):
            dX.append(da.from_array(X[lhs:rhs, :], chunks=(rhs - lhs, p)))
            dy.append(da.from_array(y[lhs:rhs]))
            dw.append(da.from_array(w[lhs:rhs]))
            dg.append(da.from_array(np.array([gsize])))

        dX = da.concatenate(dX, axis=0)
        dy = da.concatenate(dy, axis=0)
        dw = da.concatenate(dw, axis=0)
        dg = da.concatenate(dg, axis=0)

    return X, y, w, g_rle, dX, dy, dw, dg

## Test with Dask array

In [14]:
# X, y, w are the local features, labels and sample weights; g holds the size of each
# query group (run-length encoding, not per-row ids). dX, dy, dw, dg are the Dask array
# counterparts, built with one chunk per query group.
X, y, w, g, dX, dy, dw, dg = _create_ranking_data(output="array")

In [20]:
# Sanity check: materialise the distributed group sizes (one entry per query group).
dg.compute()

array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10])

In [23]:
# TCP port handed to LightGBM as `local_listen_port` for distributed training.
listen_port = 12410

# `tree_learner` is set explicitly: when it is left unset, LightGBM's Dask interface
# logs a warning and falls back to "data" anyway, so this only silences the warning.
dask_ranker = DaskLGBMRanker(
    time_out=5,
    local_listen_port=listen_port,
    tree_learner="data",
    seed=42,
    min_child_samples=1,
)

dask_ranker = dask_ranker.fit(X=dX, y=dy, sample_weight=dw, group=dg, client=client)

# predict() on the Dask collection is followed by compute() so that `rnkvec_dask` is
# only ever bound to the materialised scores, never to the intermediate lazy collection.
rnkvec_dask = dask_ranker.predict(dX).compute()

Parameter tree_learner not set or set to incorrect value (None), using "data" as default


In [24]:
# Inspect the raw ranking scores predicted by the distributed model.
rnkvec_dask

array([-1.61911428,  1.34558111, -1.68706133, -1.6254889 , -1.62548888,
        2.1857188 , -1.59656968, -1.62548889, -1.61626712,  2.78163879,
       -1.60519821, -1.62266898, -1.76722149, -1.60593613,  1.31288066,
       -1.61377047,  1.59345141,  1.73398405,  2.09383409, -1.76648355,
        2.64820997, -1.75394921,  1.3018359 ,  2.64820998,  1.32681614,
        1.31383015,  1.73398404,  1.33805625,  2.12685499,  2.17598738,
       -1.60779518, -1.59997695,  1.66347386,  2.07799638,  2.7816388 ,
       -1.6690408 ,  1.56922532, -1.60071491, -1.75958592, -1.79631722,
       -1.59186463, -1.70317437,  2.16717095,  2.61456174,  2.8584333 ,
        1.297471  ,  1.34986058, -1.67453332,  2.64820998,  2.09903914,
       -1.59649273,  2.09708282, -1.67861995,  2.54080954, -1.58664337,
        2.10144774,  1.59922552, -1.59649959, -1.59656969, -1.58942456,
       -1.60484607, -1.62548889, -1.66474169, -1.68591056, -1.68418729,
        2.08540358,  2.06515072, -1.66789002,  1.5944009 ,  2.06

In [28]:
# Single-process baseline: same seed and min_child_samples as the Dask ranker, trained
# on the local copies of the data with run-length encoded group sizes.
local_ranker = LGBMRanker(seed=42, min_child_samples=1).fit(
    X=X, y=y, sample_weight=w, group=g
)
rnkvec_local = local_ranker.predict(X)

In [32]:
# distributed ranker should be able to rank decently well: the Spearman rank correlation
# between its scores and the true relevance labels must exceed 0.6. Note the scores were
# predicted on the same data the model was fitted on.
dcor = spearmanr(rnkvec_dask, y).correlation
assert dcor > 0.6
dcor

0.8598233007392299

In [36]:
# absolute difference between the distributed and local rankers' spearman correlations
# should be small.
lcor = spearmanr(rnkvec_local, y).correlation
cor_gap = np.abs(dcor - lcor)
print(cor_gap)
assert cor_gap < 0.003

0.0007107967896998746
