In [1]:
import pickle

import lance
from lance.sampler import build_shuffle_sample

import pyarrow as pa
import pyarrow.compute as pc

In [None]:
# Build a synthetic example dataset: an integer "id" column plus a
# fixed-size-list "vec" column holding `ndim` random float32 values per row.
nrows = 2 ** 20
ndim = 768
vecs = pa.FixedSizeListArray.from_arrays(
    pc.cast(pc.random(nrows * ndim), pa.float32()),
    ndim,
)
tab = pa.Table.from_arrays(
    [pa.array(range(nrows)), vecs],
    names=["id", "vec"],
)
ds = lance.write_dataset(tab, "sample_dataset", mode="overwrite")
# Drop the notebook's reference to the in-memory table once it is written.
# NOTE(review): `vecs` is still bound afterwards, so the vector data
# presumably stays in memory -- confirm whether that is intended.
del tab

In [2]:
# Open the dataset stored at "sample_dataset" (the path the creation cell
# writes to) and rebind `ds` to it, so the following cells can run without
# regenerating the data.
ds = lance.dataset("sample_dataset")

In [25]:
# Build a sample over the rows of `ds` that satisfy the predicate, grouped
# into batches of `batch_size`. No seed is passed here; the repr displayed
# below shows the seed that ended up in the sample's params.
sample = build_shuffle_sample(ds, predicate="id > 200", batch_size=128)
# A bare expression on the last line makes the notebook display its repr.
sample

DatasetSample(params=SampleParams(predicate='id > 200', batch_size=128, shuffle=True, sample_rate=None, seed=782193499428547259), row_id_mask=Mask<n=1048375>, batch_starts=PrimitiveArray<UInt32>
[
  994761,
  220617,
  282825,
  927689,
  864585,
  20937,
  565193,
  191945,
  546761,
  56265,
  ...8171 elements...,
  832201,
  879817,
  92233,
  349001,
  573385,
  1010377,
  526537,
  787785,
  444233,
  922185,
], batch_lengths=PrimitiveArray<UInt16>
[
  128,
  128,
  128,
  128,
  128,
  128,
  128,
  128,
  128,
  128,
  ...8171 elements...,
  128,
  128,
  128,
  128,
  128,
  128,
  128,
  128,
  128,
  128,
], metrics=SampleMetrics(dataset_size=1048576, matched_rows=1048375, sampled_rows=1048375))

## Samples allow introspection

Samples store information about the dataset, which can be used to understand the
sample and assess its quality. This includes the original parameters, as well
as the seed chosen (or the one that was provided) so the sample can be reproduced.

There are also metrics which can show how many rows were matched by the predicate
and how many were sampled (if `sample_rate` was specified).

In [4]:
# Per the message text: `sample.num_rows` is the number of rows covered and
# `len(sample)` is the number of batches.
print("Sample covers {} rows in {} batches".format(sample.num_rows, len(sample)))

Sample covers 1048375 rows in 8191 batches


In [5]:
# Display the parameters recorded on the sample (predicate, batch size,
# shuffle flag, sample rate and seed).
# NOTE(review): the seed in the saved output below differs from the one in
# the repr printed by the cell that builds `sample`, so the two outputs
# appear to come from different runs -- re-run top to bottom to confirm.
sample.params

SampleParams(predicate='id > 200', batch_size=128, shuffle=True, sample_rate=None, seed=4160125278387179401)

In [6]:
# Share of the dataset's rows that satisfied the predicate, as a 0-1 fraction.
matched_percent = sample.metrics.matched_rows / sample.metrics.dataset_size
# "{:.2%}" scales the fraction by 100 and prints it with two decimals.
print("The predicate matched {:.2%} of the dataset".format(matched_percent))

The predicate matched 99.98% of the dataset


In [7]:
# Rebuild the sample with the same predicate and batch size, now also passing
# sample_rate=0.2. This rebinds `sample`, so every later cell works with this
# new sample rather than the one built earlier.
sample = build_shuffle_sample(ds, predicate="id > 200", batch_size=128, sample_rate=0.2)

In [8]:
# Share of the whole dataset (not only of the matched rows) that was sampled,
# as a 0-1 fraction.
sample_percent = sample.metrics.sampled_rows / sample.metrics.dataset_size
# "{:.2%}" scales the fraction by 100 and prints it with two decimals.
print("The sample retrieved {:.2%} of the dataset".format(sample_percent))

The sample retrieved 19.89% of the dataset


## Slicing Samples

Samples can be sliced to get a subset of the sample. This is useful for distributed
training, where each worker might select a different slice of the data.

In [9]:
# Example sharding setup: 10 workers in total, and this process acts as worker 3.
num_workers = 10
worker_i = 3

# Extended-slice syntax: start at position `worker_i` and step by `num_workers`.
# NOTE(review): presumably different `worker_i` values give disjoint sets of
# batches -- confirm against the sample's slicing implementation.
sample_slice = sample[worker_i::num_workers]
# Size of this worker's slice.
len(sample_slice)

794

Slicing can also be used to skip batches, which is useful for resuming training.

In [10]:
# Number of leading batches to drop, e.g. ones already consumed before a restart.
skip = 10
# Slicing from `skip` onward keeps everything after the skipped batches.
len(sample_slice[skip:])

784

## Serializing Samples

Samples can be serialized, which is useful for distributed training.

The simplest way to serialize is using pickle, which can either be written into
a file or saved as bytes in-memory.

In [11]:
# `pickle` is imported in the notebook's imports cell at the top, so that all
# dependencies are declared in one place.
# Binary mode ("wb") is required because pickle writes bytes; the `with`
# block closes the file even if dump() raises.
with open("sample_slice.pkl", "wb") as f:
    pickle.dump(sample_slice, f)

For debugging purposes, you can also save the sample as a GZIP-compressed TAR archive.
This can be opened and inspected.

In [12]:
# Write the slice to an archive at the given path; the next cell unpacks it
# and lists the files it contains.
sample_slice.serialize_into("sample_slice.tar.gz")

In [17]:
# decompress and list contents of the archive
# Recreate the target directory so files from an earlier run do not linger.
!rm -rf sample_slice
!mkdir sample_slice
# tar flags: -x extract, -v print each member name, -f archive file,
# -C directory to extract into.
!tar -vxf sample_slice.tar.gz -C ./sample_slice/
# -l long listing, -h human-readable sizes.
!ls -lh sample_slice

x params.json
x metrics.json
x row_id_mask.bin
x batches.arrow
total 120
-rw-r--r--  1 willjones  staff   5.6K Dec 31  1969 batches.arrow
-rw-r--r--  1 willjones  staff    82B Dec 31  1969 metrics.json
-rw-r--r--  1 willjones  staff   122B Dec 31  1969 params.json
-rw-r--r--  1 willjones  staff    41K Dec 31  1969 row_id_mask.bin


## Loading data from a Sample

The sample can be passed to data loader methods.

The underlying mechanism is simply that the sample is iterable: iterating over
it yields one array of row indices per batch.

In [18]:
# Iterate over the sample and print the first item it yields (an array of
# indices, as shown below); `break` stops after that single batch.
for batch in sample:
    print(batch)
    break

[
  218427,
  218428,
  218430,
  218431,
  218441,
  218443,
  218446,
  218447,
  218449,
  218453,
  ...
  218532,
  218533,
  218538,
  218543,
  218544,
  218545,
  218548,
  218550,
  218551,
  218554
]


In [24]:
def batch_iter(sample, dataset=None):
    """Yield the rows for each batch of ``sample``, one batch at a time.

    Parameters
    ----------
    sample
        Iterable of index arrays; each item must provide ``to_pylist()``.
    dataset
        Object whose ``take(list_of_indices)`` fetches the rows. When omitted,
        the notebook-level ``ds`` is used, so existing ``batch_iter(sample)``
        calls keep working unchanged.

    Yields
    ------
    The result of ``take(...)`` for each batch, in the order the sample
    yields them (a ``pyarrow.Table`` per batch in this notebook).
    """
    # Resolve the fallback here so an explicit dataset never touches the global.
    source = ds if dataset is None else dataset
    for indices in sample:
        # TODO: LanceDataset.take should accept a pyarrow array
        yield source.take(indices.to_pylist())

# batch_iter(...) already returns a generator, so next() can pull the first
# batch from it directly.
next(batch_iter(sample))

pyarrow.Table
id: int64
vec: fixed_size_list<item: float>[768]
  child 0, item: float
----
id: [[218427,218428,218430,218431,218441,...,218545,218548,218550,218551,218554]]
vec: [[[0.039365917,0.9347307,0.17798844,0.49374506,0.7373791,...,0.67922693,0.569175,0.82238936,0.45353127,0.3069117],[0.28323245,0.93013906,0.7872263,0.44097242,0.5217537,...,0.0035082009,0.054516055,0.8780995,0.009865927,0.30937225],...,[0.50098234,0.9499888,0.69663125,0.9786897,0.4081945,...,0.9559047,0.60340804,0.06629784,0.90485907,0.14169064],[0.1103932,0.62449497,0.39245197,0.12508623,0.2733724,...,0.1418554,0.9146605,0.3044245,0.52509713,0.9617853]]]

This is only single-threaded, so in most production use cases you'll instead want
to use one of the data loaders, which will use multiple threads to read batches
ahead.