lance-format · eddyxu · Dec 8, 2024 · Dec 7, 2024 · Dec 8, 2024 · chebbyChefNEQ
diff --git a/python/python/lance/torch/data.py b/python/python/lance/torch/data.py
@@ -7,6 +7,7 @@
 from __future__ import annotations
 
 import json
+import logging
 import math
 import warnings
 from pathlib import Path
@@ -45,17 +46,32 @@ def _fsl_to_tensor(arr: pa.FixedSizeListArray, dimension: int) -> torch.Tensor:
 
 
 def _to_tensor(
-    batch: pa.RecordBatch,
+    batch: Union[pa.RecordBatch, Dict[str, pa.Array]],
     *,
     uint64_as_int64: bool = True,
     hf_converter: Optional[dict] = None,
+    use_blob_api: bool = False,
+    **kwargs,
 ) -> Union[dict[str, torch.Tensor], torch.Tensor]:
     """Convert a pyarrow RecordBatch to torch Tensor."""
     ret = {}
 
-    for col in batch.schema.names:
+    cols = (
+        batch.column_names if isinstance(batch, pa.RecordBatch) else list(batch.keys())
+    )
+    for col in cols:
         arr: pa.Array = batch[col]
 
+        if (
+            use_blob_api
+            and isinstance(arr, list)
+            and arr
+            and isinstance(arr[0], lance.BlobFile)
+        ):
+            raise NotImplementedError(
+                'Need user-provided "to_tensor_fn" for Blob files'
+            )
+
         tensor: torch.Tensor = None
         if (isinstance(arr.type, pa.FixedShapeTensorType)) and (
             pa.types.is_floating(arr.type.value_type)
@@ -234,6 +250,10 @@ def __init__(
         self._to_tensor_fn = to_tensor_fn
         self._hf_converter = None
 
+        self._blob_columns = self._blob_columns()
+        if self._blob_columns:
+            self.with_row_id = True
+
         # As Shared Dataset
         self.shard_granularity = shard_granularity
         self.rank = rank
@@ -258,6 +278,13 @@ def __init__(
     def __repr__(self) -> str:
         return f"LanceTorchDataset({self.dataset.uri}, size={self.samples})"
 
+    @property
+    def schema(self) -> pa.Schema:  # dataset schema restricted to self.columns, if set
+        if not self.columns:
+            return self.dataset.schema  # no projection requested
+        fields = [self.dataset.schema.field(col) for col in self.columns]
+        return pa.schema(fields, metadata=self.dataset.schema.metadata)  # keep metadata
+
     def __iter__(self):
         if self.sampler is None:
             if self.rank is not None and self.world_size is not None:
@@ -280,6 +307,12 @@ def __iter__(self):
         else:
             sampler = self.sampler
 
+        projected_columns = self.columns or self.dataset.schema.names
+        if self._blob_columns:
+            projected_columns = [
+                c for c in projected_columns if c not in self._blob_columns
+            ]
+
         stream: Iterable[pa.RecordBatch]
         if self.cached_ds:
             stream = self.cached_ds
@@ -288,14 +321,14 @@ def __iter__(self):
                 raw_stream = maybe_sample(
                     self.dataset,
                     n=self.samples,
-                    columns=self.columns,
+                    columns=projected_columns,
                     batch_size=self.batch_size,
                     filt=self.filter,
                 )
             else:
                 raw_stream = sampler(
                     self.dataset,
-                    columns=self.columns,
+                    columns=projected_columns,
                     filter=self.filter,
                     batch_size=self.batch_size,
                     with_row_id=self.with_row_id,
@@ -308,8 +341,39 @@ def __iter__(self):
                 self.cached_ds = CachedDataset(stream, cache=self.cache)
                 stream = self.cached_ds
 
+        use_blob_api = bool(self._blob_columns)
         for batch in stream:
+            if use_blob_api:
+                dict_batch = {}
+                assert "_rowid" in batch.column_names
+                row_ids = batch["_rowid"]
+                for col in batch.column_names:
+                    dict_batch[col] = batch[col]
+                for col in self._blob_columns:
+                    dict_batch[col] = self.dataset.take_blobs(
+                        row_ids=row_ids.to_pylist(), blob_column=col
+                    )
+                batch = dict_batch
             if self._to_tensor_fn is not None:
-                batch = self._to_tensor_fn(batch, hf_converter=self._hf_converter)
+                batch = self._to_tensor_fn(
+                    batch, hf_converter=self._hf_converter, use_blob_api=use_blob_api
+                )
             yield batch
             del batch
+
+    def _blob_columns(self) -> List[str]:  # NOTE: shadowed by attr set in __init__
+        """Return the names of the projected columns that are Large Blob encoded."""
+        cols = self.columns
+        if not cols:
+            cols = self.dataset.schema.names  # no projection: check every column
+        blob_cols = []
+        for col in cols:
+            field = self.dataset.schema.field(col)
+            if (  # large_binary type tagged as blob in the field metadata
+                field.type == pa.large_binary()
+                and field.metadata is not None
+                and field.metadata.get(b"lance-encoding:blob") == b"true"
+            ):
+                logging.debug("Column %s is a Large Blob column", col)
+                blob_cols.append(col)
+        return blob_cols
diff --git a/python/python/tests/torch_tests/test_data.py b/python/python/tests/torch_tests/test_data.py
@@ -277,3 +277,49 @@ def test_convert_int_tensors(tmp_path: Path, dtype):
     first = next(iter(torch_ds))
     assert first["vec"].dtype == torch.uint8 if dtype == np.uint8 else torch.int64
     assert first["vec"].shape == (4, 32)
+
+
+def test_blob_api(tmp_path: Path):  # blob columns need a custom to_tensor_fn
+    ints = pa.array(range(100), type=pa.int64())
+    vals = pa.array([b"0" * 1024 for _ in range(100)], pa.large_binary())
+    schema = pa.schema(
+        [
+            pa.field("int", ints.type),
+            pa.field(  # this metadata flag marks "val" as a blob column
+                "val", pa.large_binary(), metadata={"lance-encoding:blob": "true"}
+            ),
+        ]
+    )
+    tbl = pa.Table.from_arrays([ints, vals], schema=schema)
+
+    ds = lance.write_dataset(tbl, tmp_path / "data.lance")
+    torch_ds = LanceDataset(
+        ds,
+        batch_size=4,
+    )
+    with pytest.raises(NotImplementedError):  # no to_tensor_fn was supplied
+        next(iter(torch_ds))
+
+    def to_tensor_fn(batch, *args, **kwargs):
+        ints = torch.tensor(batch["int"].to_numpy())
+        vals = []
+        for blob in batch["val"]:
+            blob.seek(100)
+            data = blob.read(100)  # ask for 100 bytes from the seek position
+            tensor = torch.tensor(np.frombuffer(data, dtype=np.uint8))
+            vals.append(tensor)
+
+        # Combine the per-blob tensors into one tensor for the batch.
+        vals = torch.stack(vals)
+        return {"int": ints, "val": vals}
+
+    torch_ds = LanceDataset(
+        ds,
+        batch_size=4,
+        to_tensor_fn=to_tensor_fn,
+    )
+    first = next(iter(torch_ds))
+    assert first["int"].dtype == torch.int64
+    assert first["int"].shape == (4,)
+    assert first["val"].dtype == torch.uint8
+    assert first["val"].shape == (4, 100)