## Dask LightGBMRanker

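This notebook tests `lightgbm.DaskLGBMRanker` (and, for comparison, `DaskLGBMRegressor` and `DaskLGBMClassifier`) by repeatedly fitting each Dask estimator and its local scikit-learn-style counterpart on the same data and tallying the number of trees each fit produces.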

In [31]:
import itertools

import dask.array as da
import dask.dataframe as dd
import lightgbm as lgb
import numpy as np
import pandas as pd
from dask.distributed import Client, LocalCluster
from scipy.stats import spearmanr
from sklearn.datasets import make_blobs, make_regression

In [2]:
# Start a local Dask cluster and block until all requested workers have connected.
n_workers = 3
cluster = LocalCluster(n_workers=n_workers)
client = Client(address=cluster)
client.wait_for_workers(n_workers=n_workers)

print(f"View the dashboard: {cluster.dashboard_link}")

View the dashboard: http://127.0.0.1:8787/status


In [29]:
from itertools import groupby

from sklearn.utils import check_random_state


def make_ranking(
    n_samples=100,
    n_features=20,
    n_informative=5,
    gmax=2,
    group=None,
    random_gs=False,
    avg_gs=10,
    random_state=0,
):
    """Generate a synthetic learning-to-rank dataset.

    Relevance labels are drawn uniformly (with replacement) from ``0..gmax`` and
    rows are assigned to consecutive, zero-based group ids. The first
    ``n_informative`` feature columns are overwritten with ``bias + coef * label``
    (one random ``bias``/``coef`` pair per column); all other columns are uniform
    noise on ``[0, 1)``.

    Parameters
    ----------
    n_samples : int, default=100
        Number of rows to generate. Ignored when ``group`` is given, in which
        case it is replaced by ``sum(group)``.
    n_features : int, default=20
        Total number of feature columns.
    n_informative : int, default=5
        Number of leading columns made label-dependent; clipped to
        ``[0, n_features]``.
    gmax : int, default=2
        Largest relevance label.
    group : sequence of int or None, default=None
        Explicit group sizes. Used only when it is not None and has ``__len__``.
    random_gs : bool, default=False
        If True, each group size is drawn from ``Poisson(avg_gs)``; otherwise
        every group has exactly ``avg_gs`` rows (the last one may be truncated).
    avg_gs : int, default=10
        Fixed (or mean, when ``random_gs`` is True) group size.
    random_state : int, RandomState instance or None, default=0
        Passed to ``check_random_state``.

    Returns
    -------
    X : ndarray of shape (n_samples, n_features)
    y_vec : ndarray of shape (n_samples,)
        Integer relevance labels.
    group_id_vec : ndarray of shape (n_samples,)
        Integer group id of every row.

    Raises
    ------
    ValueError
        If groups have to be generated (``group`` not usable, ``n_samples > 0``)
        but ``avg_gs`` can never yield a non-empty group. Previously this case
        looped forever.
    """
    rnd_generator = check_random_state(random_state)
    relvalues = range(gmax + 1)

    # Collect per-group chunks and join them once at the end instead of
    # re-allocating the full vectors on every iteration. Seeding with an empty
    # int array keeps an integer dtype even if no (or only empty) groups exist.
    y_chunks = [np.empty((0,), dtype=int)]
    gid_chunks = [np.empty((0,), dtype=int)]

    user_groups = group is not None and hasattr(group, "__len__")

    if user_groups:
        # build y/target and group-id vectors with user-specified group sizes.
        n_samples = np.sum(group)

        for i, gsize in enumerate(group):
            y_chunks.append(rnd_generator.choice(relvalues, size=gsize, replace=True))
            gid_chunks.append(np.full(gsize, i, dtype=int))
    else:
        # build y/target and group-id vectors according to n_samples, avg_gs, and random_gs.
        never_positive = avg_gs <= 0 if random_gs else avg_gs < 1
        if n_samples > 0 and never_positive:
            raise ValueError(
                f"avg_gs={avg_gs!r} can never produce a non-empty group "
                f"(random_gs={random_gs!r}); use a larger avg_gs."
            )

        gid = 0
        n_generated = 0
        while n_generated < n_samples:
            gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)

            # a Poisson draw can be 0; skip empty groups and draw again.
            if gsize < 1:
                continue

            y_chunks.append(rnd_generator.choice(relvalues, size=gsize, replace=True))
            gid_chunks.append(np.full(gsize, gid, dtype=int))
            n_generated += gsize
            gid += 1

    y_vec = np.concatenate(y_chunks)
    group_id_vec = np.concatenate(gid_chunks)

    if not user_groups:
        # the last generated group may overshoot n_samples; cut it back.
        y_vec, group_id_vec = y_vec[:n_samples], group_id_vec[:n_samples]

    # build feature data, X. Transform first few into informative features.
    n_informative = max(min(n_features, n_informative), 0)
    X = rnd_generator.uniform(size=(n_samples, n_features))

    for j in range(n_informative):
        bias, coef = rnd_generator.normal(size=2)
        X[:, j] = bias + coef * y_vec

    return X, y_vec, group_id_vec


def _create_ranking_data(n_samples=100, chunk_size=50, **kwargs):
    """Build a ranking dataset as NumPy arrays plus Dask-array counterparts.

    Data comes from ``make_ranking(n_samples=n_samples, random_state=42, **kwargs)``.
    Sample weights are uniform draws from ``RandomState(42)`` scaled by 0.01.
    The Dask arrays are assembled from one piece per query group, so a group
    is never split across pieces. ``chunk_size`` is accepted but not used by
    this function.

    Returns
    -------
    tuple
        ``(X, y, w, g_rle, dX, dy, dw, dg)`` where ``g_rle`` holds the size of
        each run of equal group ids and the ``d*`` values are the Dask versions
        of features, labels, weights and group sizes.
    """
    X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs)
    w = np.random.RandomState(42).rand(X.shape[0]) * 0.01

    # run-length encode the group ids: one entry (the run length) per group.
    g_rle = np.array([len(list(run)) for _, run in groupby(g)])

    # ranking arrays: one chunk per group. Each chunk must include all columns.
    n_cols = X.shape[1]
    X_parts, y_parts, w_parts, g_parts = [], [], [], []
    for size, stop in zip(g_rle, np.cumsum(g_rle)):
        start = stop - size
        rows = slice(start, stop)
        X_parts.append(da.from_array(X[rows, :], chunks=(stop - start, n_cols)))
        y_parts.append(da.from_array(y[rows]))
        w_parts.append(da.from_array(w[rows]))
        g_parts.append(da.from_array(np.array([size])))

    dX = da.concatenate(X_parts, axis=0)
    dy = da.concatenate(y_parts, axis=0)
    dw = da.concatenate(w_parts, axis=0)
    dg = da.concatenate(g_parts, axis=0)

    return X, y, w, g_rle, dX, dy, dw, dg


def _create_data(objective, n_samples=100, centers=2, chunk_size=50):
    if objective == "classification":
        X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
    elif objective == "regression":
        X, y = make_regression(n_samples=n_samples, random_state=42)
    rnd = np.random.RandomState(42)
    weights = rnd.random(X.shape[0]) * 0.01
    dX = da.from_array(X, (chunk_size, X.shape[1]))
    dy = da.from_array(y, chunk_size)
    dw = da.from_array(weights, chunk_size)
    return X, y, weights, dX, dy, dw

## Test with Dask array

In [25]:
# Hyperparameters shared by every estimator, Dask and local alike.
params = {
    "random_state": 42,
    "n_estimators": 50,
    "num_leaves": 20,
    "min_child_samples": 1,
}

# Extra keyword arguments passed only to the Dask estimators.
dask_kwargs = {
    "client": client,
    "time_out": 5,
    "local_listen_port": 12400,
    "tree_learner_type": "data_parallel",
}

# One Dask estimator and one local estimator per task.
dask_ranker = lgb.DaskLGBMRanker(**dask_kwargs, **params)
local_ranker = lgb.LGBMRanker(**params)

dask_regressor = lgb.DaskLGBMRegressor(**dask_kwargs, **params)
local_regressor = lgb.LGBMRegressor(**params)

dask_classifier = lgb.DaskLGBMClassifier(**dask_kwargs, **params)
local_classifier = lgb.LGBMClassifier(**params)

In [23]:
# ----- ranking ----- #
X, y, w, g, dX, dy, dw, dg = _create_ranking_data()

# For each backend, count how many of the repeated fits ended with a given
# number of trees: {num_trees: number of fits}.
dask_summaries, local_summaries = {}, {}
for _ in range(100):
    dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
    n_dask_trees = dask_ranker.booster_.num_trees()
    dask_summaries[n_dask_trees] = 1 + dask_summaries.get(n_dask_trees, 0)

    local_ranker.fit(X, y, sample_weight=w, group=g)
    n_local_trees = local_ranker.booster_.num_trees()
    local_summaries[n_local_trees] = 1 + local_summaries.get(n_local_trees, 0)

print(f"   dask: {dask_summaries}")
print(f"sklearn: {local_summaries}")

In [34]:
# ----- regression ----- #
X, y, w, dX, dy, dw = _create_data(objective="regression")

# For each backend, count how many of the repeated fits ended with a given
# number of trees: {num_trees: number of fits}.
dask_summaries, local_summaries = {}, {}
for _ in range(100):
    dask_regressor.fit(dX, dy, sample_weight=dw)
    n_dask_trees = dask_regressor.booster_.num_trees()
    dask_summaries[n_dask_trees] = 1 + dask_summaries.get(n_dask_trees, 0)

    local_regressor.fit(X, y, sample_weight=w)
    n_local_trees = local_regressor.booster_.num_trees()
    local_summaries[n_local_trees] = 1 + local_summaries.get(n_local_trees, 0)

print(f"   dask: {dask_summaries}")
print(f"sklearn: {local_summaries}")

   dask: {50: 100}
sklearn: {50: 100}


In [33]:
# ----- binary classification ----- #
X, y, w, dX, dy, dw = _create_data(objective="classification")

# For each backend, count how many of the repeated fits ended with a given
# number of trees: {num_trees: number of fits}.
dask_summaries, local_summaries = {}, {}
for _ in range(100):
    dask_classifier.fit(dX, dy, sample_weight=dw)
    n_dask_trees = dask_classifier.booster_.num_trees()
    dask_summaries[n_dask_trees] = 1 + dask_summaries.get(n_dask_trees, 0)

    local_classifier.fit(X, y, sample_weight=w)
    n_local_trees = local_classifier.booster_.num_trees()
    local_summaries[n_local_trees] = 1 + local_summaries.get(n_local_trees, 0)

print(f"   dask: {dask_summaries}")
print(f"sklearn: {local_summaries}")

   dask: {47: 53, 46: 47}
sklearn: {48: 100}


In [36]:
# ----- multiclass classification ----- #
# Fresh estimators, so nothing is carried over from the binary run.
local_classifier = lgb.LGBMClassifier(**params)
dask_classifier = lgb.DaskLGBMClassifier(
    client=client,
    time_out=5,
    local_listen_port=12400,
    tree_learner_type="data_parallel",
    **params,
)

X, y, w, dX, dy, dw = _create_data(objective="classification", centers=3)

# For each backend, count how many of the repeated fits ended with a given
# number of trees: {num_trees: number of fits}.
dask_summaries, local_summaries = {}, {}
for _ in range(100):
    dask_classifier.fit(dX, dy, sample_weight=dw)
    n_dask_trees = dask_classifier.booster_.num_trees()
    dask_summaries[n_dask_trees] = 1 + dask_summaries.get(n_dask_trees, 0)

    local_classifier.fit(X, y, sample_weight=w)
    n_local_trees = local_classifier.booster_.num_trees()
    local_summaries[n_local_trees] = 1 + local_summaries.get(n_local_trees, 0)

print(f"   dask: {dask_summaries}")
print(f"sklearn: {local_summaries}")

   dask: {117: 54, 111: 46}
sklearn: {150: 100}
