Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: rename use_experimental_writer to use_legacy_format #2433

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2385,7 +2385,7 @@ def write_dataset(
commit_lock: Optional[CommitLock] = None,
progress: Optional[FragmentWriteProgress] = None,
storage_options: Optional[Dict[str, str]] = None,
use_experimental_writer: bool = False,
use_legacy_format: bool = True,
) -> LanceDataset:
"""Write a given data_obj to the given uri

Expand Down Expand Up @@ -2425,9 +2425,9 @@ def write_dataset(
storage_options : optional, dict
Extra options that make sense for a particular storage connection. This is
used to store connection parameters like credentials, endpoint, etc.
use_experimental_writer : optional, bool
Use the Lance v2 writer to write Lance v2 files. This is not recommended
at this time as there are several known limitations in the v2 writer.
use_legacy_format : optional, bool, default True
Use the Lance v1 writer to write Lance v1 files. The default is currently
True but will change as we roll out the v2 format.
"""
if _check_for_hugging_face(data_obj):
# Huggingface datasets
Expand All @@ -2449,7 +2449,7 @@ def write_dataset(
"max_bytes_per_file": max_bytes_per_file,
"progress": progress,
"storage_options": storage_options,
"use_experimental_writer": use_experimental_writer,
"use_legacy_format": use_legacy_format,
}

if commit_lock:
Expand Down
17 changes: 10 additions & 7 deletions python/python/lance/fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def create(
progress: Optional[FragmentWriteProgress] = None,
mode: str = "append",
*,
use_experimental_writer=False,
use_legacy_format=True,
) -> FragmentMetadata:
"""Create a :class:`FragmentMetadata` from the given data.

Expand Down Expand Up @@ -177,6 +177,9 @@ def create(
The write mode. If "append" is specified, the data will be checked
against the existing dataset's schema. Otherwise, pass "create" or
"overwrite" to assign new field ids to the schema.
use_legacy_format: bool, default True
Use the legacy format to write Lance files. The default is True
while the v2 format is still in beta.

See Also
--------
Expand Down Expand Up @@ -215,7 +218,7 @@ def create(
max_rows_per_group=max_rows_per_group,
progress=progress,
mode=mode,
use_experimental_writer=use_experimental_writer,
use_legacy_format=use_legacy_format,
)
return FragmentMetadata(inner_meta.json())

Expand Down Expand Up @@ -504,7 +507,7 @@ def write_fragments(
max_rows_per_group: int = 1024,
max_bytes_per_file: int = DEFAULT_MAX_BYTES_PER_FILE,
progress: Optional[FragmentWriteProgress] = None,
use_experimental_writer: bool = False,
use_legacy_format: bool = True,
storage_options: Optional[Dict[str, str]] = None,
) -> List[FragmentMetadata]:
"""
Expand Down Expand Up @@ -542,9 +545,9 @@ def write_fragments(
*Experimental API*. Progress tracking for writing the fragment. Pass
a custom class that defines hooks to be called when each fragment is
starting to write and finishing writing.
use_experimental_writer : optional, bool
Use the Lance v2 writer to write Lance v2 files. This is not recommended
at this time as there are several known limitations in the v2 writer.
use_legacy_format : optional, bool, default True
Use the Lance v1 writer to write Lance v1 files. The default is currently
True while the v2 format is in beta.
storage_options : Optional[Dict[str, str]]
Extra options that make sense for a particular storage connection. This is
used to store connection parameters like credentials, endpoint, etc.
Expand Down Expand Up @@ -578,7 +581,7 @@ def write_fragments(
max_rows_per_group=max_rows_per_group,
max_bytes_per_file=max_bytes_per_file,
progress=progress,
use_experimental_writer=use_experimental_writer,
use_legacy_format=use_legacy_format,
storage_options=storage_options,
)
return [FragmentMetadata.from_metadata(frag) for frag in fragments]
25 changes: 12 additions & 13 deletions python/python/lance/ray/sink.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def _write_fragment(
max_rows_per_file: int = 1024 * 1024,
max_bytes_per_file: Optional[int] = None,
max_rows_per_group: int = 1024, # Only useful for v1 writer.
use_experimental_writer: bool = False,
use_legacy_format: bool = True,
storage_options: Optional[Dict[str, Any]] = None,
) -> Tuple[FragmentMetadata, pa.Schema]:
from ..dependencies import _PANDAS_AVAILABLE
Expand Down Expand Up @@ -88,7 +88,7 @@ def record_batch_converter():
max_rows_per_file=max_rows_per_file,
max_rows_per_group=max_rows_per_group,
max_bytes_per_file=max_bytes_per_file,
use_experimental_writer=use_experimental_writer,
use_legacy_format=use_legacy_format,
storage_options=storage_options,
)
return [(fragment, schema) for fragment in fragments]
Expand Down Expand Up @@ -161,9 +161,8 @@ class LanceDatasink(_BaseLanceDatasink):
Choices are 'append', 'create', 'overwrite'.
max_rows_per_file : int, optional
The maximum number of rows per file. Default is 1024 * 1024.
use_experimental_writer : bool, optional
Set true to use v2 writer. Default is False now. Will be removed once
v2 writer become the default.
use_legacy_format : bool, optional
Set True to use the legacy v1 format. Default is False.
"""

NAME = "Lance"
Expand All @@ -174,14 +173,14 @@ def __init__(
schema: Optional[pa.Schema] = None,
mode: Literal["create", "append", "overwrite"] = "create",
max_rows_per_file: int = 1024 * 1024,
use_experimental_writer: bool = True,
use_legacy_format: bool = False,
*args,
**kwargs,
):
super().__init__(uri, schema=schema, mode=mode, *args, **kwargs)

self.max_rows_per_file = max_rows_per_file
self.use_experimental_writer = use_experimental_writer
self.use_legacy_format = use_legacy_format
# if mode is append, read_version is read from existing dataset.
self.read_version: int | None = None

Expand All @@ -206,7 +205,7 @@ def write(
self.uri,
schema=self.schema,
max_rows_per_file=self.max_rows_per_file,
use_experimental_writer=self.use_experimental_writer,
use_legacy_format=self.use_legacy_format,
)
return [
(pickle.dumps(fragment), pickle.dumps(schema))
Expand Down Expand Up @@ -235,8 +234,8 @@ class LanceFragmentWriter:
max_rows_per_group : int, optional
The maximum number of rows per group. Default is 1024.
Only useful for v1 writer.
use_experimental_writer : bool, optional
Set true to use v2 writer. Default is True.
use_legacy_format : bool, optional
Set True to use the legacy v1 writer. Default is False.
storage_options : Dict[str, Any], optional
The storage options for the writer. Default is None.

Expand All @@ -251,7 +250,7 @@ def __init__(
max_rows_per_file: int = 1024 * 1024,
max_bytes_per_file: Optional[int] = None,
max_rows_per_group: Optional[int] = None, # Only useful for v1 writer.
use_experimental_writer: bool = True,
use_legacy_format: bool = False,
storage_options: Optional[Dict[str, Any]] = None,
):
self.uri = uri
Expand All @@ -261,7 +260,7 @@ def __init__(
self.max_rows_per_group = max_rows_per_group
self.max_rows_per_file = max_rows_per_file
self.max_bytes_per_file = max_bytes_per_file
self.use_experimental_writer = use_experimental_writer
self.use_legacy_format = use_legacy_format
self.storage_options = storage_options

def __call__(self, batch: Union[pa.Table, "pd.DataFrame"]) -> Dict[str, Any]:
Expand All @@ -277,7 +276,7 @@ def __call__(self, batch: Union[pa.Table, "pd.DataFrame"]) -> Dict[str, Any]:
schema=self.schema,
max_rows_per_file=self.max_rows_per_file,
max_rows_per_group=self.max_rows_per_group,
use_experimental_writer=self.use_experimental_writer,
use_legacy_format=self.use_legacy_format,
storage_options=self.storage_options,
)
return pa.Table.from_pydict(
Expand Down
2 changes: 1 addition & 1 deletion python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1728,7 +1728,7 @@ def test_migrate_manifest(tmp_path: Path):

def test_v2_dataset(tmp_path: Path):
table = pa.table({"a": range(100), "b": range(100)})
dataset = lance.write_dataset(table, tmp_path, use_experimental_writer=True)
dataset = lance.write_dataset(table, tmp_path, use_legacy_format=False)
batches = list(dataset.to_batches())
assert len(batches) == 1
assert pa.Table.from_batches(batches) == table
Expand Down
2 changes: 1 addition & 1 deletion python/python/tests/test_fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def test_fragment_v2(tmp_path):
fragments = write_fragments(
tab,
tmp_path,
use_experimental_writer=True,
use_legacy_format=False,
)
assert len(fragments) == 1
ds = lance.dataset(dataset_uri)
Expand Down
6 changes: 2 additions & 4 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1171,10 +1171,8 @@ pub fn get_write_params(options: &PyDict) -> PyResult<Option<WriteParams>> {
if let Some(maybe_nbytes) = get_dict_opt::<usize>(options, "max_bytes_per_file")? {
p.max_bytes_per_file = maybe_nbytes;
}
if let Some(use_experimental_writer) =
get_dict_opt::<bool>(options, "use_experimental_writer")?
{
p.use_experimental_writer = use_experimental_writer;
if let Some(use_legacy_format) = get_dict_opt::<bool>(options, "use_legacy_format")? {
p.use_legacy_format = use_legacy_format;
}
if let Some(progress) = get_dict_opt::<PyObject>(options, "progress")? {
p.progress = Arc::new(PyWriteProgress::new(progress.to_object(options.py())));
Expand Down
Loading
Loading