This notebook tests `lightgbm.dask`'s behavior when sparse inputs are passed to `predict()` with `pred_contrib=True`.

In [1]:
import dask.array as da
import numpy as np

from dask.distributed import Client, LocalCluster, wait

from lightgbm.dask import DaskLGBMClassifier
from lightgbm.sklearn import LGBMClassifier
from scipy.sparse import csc_matrix
from sklearn.datasets import make_blobs

In [4]:
# Start a local Dask cluster and connect a client to it.
n_workers = 3
cluster = LocalCluster(n_workers=n_workers)
client = Client(cluster)

# Do not continue until the requested number of workers has registered.
client.wait_for_workers(n_workers=n_workers)

print(f"View the dashboard: {cluster.dashboard_link}")

View the dashboard: http://127.0.0.1:8787/status


In [3]:
# Rows per Dask chunk; with 100 samples this yields two row-wise partitions.
chunk_size = 50

# Small, seeded 3-cluster dataset so the notebook is reproducible.
X, y = make_blobs(n_samples=100, centers=3, random_state=42)

# NOTE(review): `rnd` is not used by any cell visible here -- confirm before removing.
rnd = np.random.RandomState(42)

# Chunk along rows only (each chunk keeps every column), then convert each
# dense block to a scipy CSC matrix so the estimator receives sparse input.
dense_dX = da.from_array(X, chunks=(chunk_size, X.shape[1]))
dX = dense_dX.map_blocks(csc_matrix)
dy = da.from_array(y, chunks=chunk_size)

In [20]:
# Train a deliberately tiny distributed model (5 trees, 2 leaves each): this
# cell is about the *type and shape* of the contribution output, not accuracy.
dask_clf = DaskLGBMClassifier(n_estimators=5, num_leaves=2, tree_learner="data")
dask_clf.fit(dX, dy, client=client)

# `preds` is a lazy Dask array; materialize it once and reuse the result below.
preds = dask_clf.predict(dX, pred_contrib=True)
preds_computed = preds.compute()

# Compare the container types: the lazy collection, one computed partition,
# and the fully computed result.
print(
    type(preds),
    type(preds.partitions[0].compute()),
    type(preds_computed),
    f"{dask_clf.n_classes_} classes, {dX.shape[1]} features",
)
print("---")
print(dX.partitions[0].compute())
print("---")
# Reuse the already-computed result instead of triggering another distributed
# compute. The recorded shape (100, 9) matches
# n_samples x (n_classes * (n_features + 1)) for 3 classes and 2 features.
preds_computed.shape

<class 'dask.array.core.Array'> <class 'scipy.sparse.csc.csc_matrix'> <class 'scipy.sparse.coo.coo_matrix'> 3 classes, 2 features
---
  (0, 0)	-7.726420909219675
  (1, 0)	5.453396053597771
  (2, 0)	-2.978672008987702
  (3, 0)	6.042673147164201
  (4, 0)	-6.521839830802987
  (5, 0)	3.649342511097413
  (6, 0)	-2.1779341916491863
  (7, 0)	4.4202069483905895
  (8, 0)	4.736956385576142
  (9, 0)	-3.6601912004750528
  (10, 0)	-3.053580347577933
  (11, 0)	-6.65216725654714
  (12, 0)	-6.357685625534373
  (13, 0)	-3.6155325970587784
  (14, 0)	-1.7707310430573397
  (15, 0)	-7.950519689212382
  (16, 0)	-6.602936391821251
  (17, 0)	-2.581207744633084
  (18, 0)	-7.763484627352403
  (19, 0)	-6.406389566577725
  (20, 0)	-2.9726153158652124
  (21, 0)	-6.956728900565374
  (22, 0)	-7.326142143218291
  (23, 0)	-2.147802017544336
  (24, 0)	-2.5450236621627016
  :	:
  (25, 1)	10.071408354417237
  (26, 1)	1.552524361175373
  (27, 1)	-7.737267149692229
  (28, 1)	-6.093024989533495
  (29, 1)	-8.200566206360223


(100, 9)

In [22]:
# Materialize only the first partition of the lazy contribution array to
# inspect what a single worker-side block looks like.
first_pred_partition = preds.partitions[0]
first_pred_partition.compute()

<50x9 sparse matrix of type '<class 'numpy.float64'>'
	with 350 stored elements in Compressed Sparse Column format>

In [14]:
# Materialize the Dask collections exactly once and reuse them below, rather
# than recomputing `dX` for every call.
# Note: this rebinds `X` and `y` (created in the data cell) to the computed
# versions of `dX` and `dy`.
X = dX.compute()
y = dy.compute()

# Fit the non-distributed estimator on the same data for comparison.
local_clf = LGBMClassifier()
local_clf.fit(X=X, y=y)

# Convert to CSC explicitly before predicting, since the computed
# (concatenated) matrix is not necessarily in CSC format.
local_preds = local_clf.predict(X.tocsc(), pred_contrib=True)

print(local_clf.n_classes_, type(local_preds))
print("---")
print(local_preds)

3 <class 'list'>
---
[<100x3 sparse matrix of type '<class 'numpy.float64'>'
	with 300 stored elements in Compressed Sparse Column format>, <100x3 sparse matrix of type '<class 'numpy.float64'>'
	with 300 stored elements in Compressed Sparse Column format>, <100x3 sparse matrix of type '<class 'numpy.float64'>'
	with 300 stored elements in Compressed Sparse Column format>]


In [16]:
# `local_preds` is a list of sparse matrices (three of them in the recorded
# output, matching the 3 classes); look at the first entry on its own.
first_local_contrib = local_preds[0]
first_local_contrib

<100x3 sparse matrix of type '<class 'numpy.float64'>'
	with 300 stored elements in Compressed Sparse Column format>