Add features parameter to CSV (#685)
* specify column types in csv

* take features into account when building cache dir

* add test

* minor typo in text dataset
lhoestq authored Sep 30, 2020
1 parent e406c3e commit 6460a2e
Showing 4 changed files with 94 additions and 10 deletions.
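Taken together, the changes let callers pin CSV column types instead of relying on pyarrow's type inference. A minimal usage sketch, hypothetical rather than from the commit (file contents, column names, and the int8 choice are all illustrative; the tests below exercise the same `features=` keyword against the local `./datasets/csv` script):

```python
# Hypothetical usage of the new `features` parameter (names are illustrative).
from datasets import Features, Value, load_dataset

with open("table.csv", "w", encoding="utf-8") as f:
    f.write("foo,bar\n1,2\n3,4\n")

# Without `features`, the CSV reader infers column types (int64 here);
# with it, the declared types are forwarded to pyarrow's CSV reader.
features = Features({"foo": Value("int8"), "bar": Value("int8")})
ds = load_dataset("csv", data_files="table.csv", split="train", features=features)
print(ds.features)  # should report int8 columns rather than inferred int64
```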
7 changes: 5 additions & 2 deletions datasets/csv/csv.py
@@ -20,6 +20,7 @@ class CsvConfig(datasets.BuilderConfig):
     read_options: pac.ReadOptions = None
     parse_options: pac.ParseOptions = None
     convert_options: pac.ConvertOptions = None
+    features: datasets.Features = None
 
     @property
     def pa_read_options(self):
@@ -43,7 +44,9 @@ def pa_parse_options(self):
 
     @property
     def pa_convert_options(self):
-        convert_options = self.convert_options or pac.ConvertOptions()
+        convert_options = self.convert_options or pac.ConvertOptions(
+            column_types=self.features.type if self.features is not None else None
+        )
         return convert_options
 
 
@@ -78,6 +81,6 @@ def _generate_tables(self, files):
                 file,
                 read_options=self.config.pa_read_options,
                 parse_options=self.config.pa_parse_options,
-                convert_options=self.config.convert_options,
+                convert_options=self.config.pa_convert_options,
             )
             yield i, pa_table
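In plain pyarrow terms, the new `pa_convert_options` property amounts to the following. A minimal sketch: the file and column names are illustrative, and the builder actually passes `self.features.type` (the pyarrow struct type of the declared `Features`) where this sketch uses an explicit dict.

```python
# Plain-pyarrow sketch of what `pa_convert_options` sets up: declare column
# types up front instead of letting the CSV reader infer them.
import pyarrow as pa
import pyarrow.csv as pac

with open("table.csv", "w", encoding="utf-8") as f:
    f.write("foo,bar\n1,2\n3,4\n")

convert_options = pac.ConvertOptions(column_types={"foo": pa.int8(), "bar": pa.int8()})
table = pac.read_csv("table.csv", convert_options=convert_options)
print(table.schema)  # foo: int8, bar: int8 (int64 would be inferred otherwise)
```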
2 changes: 1 addition & 1 deletion datasets/text/text.py
@@ -104,7 +104,7 @@ def _generate_tables(self, files):
                 file,
                 read_options=self.config.pa_read_options,
                 parse_options=self.config.pa_parse_options,
-                convert_options=self.config.convert_options,
+                convert_options=self.config.pa_convert_options,
             )
             # Uncomment for debugging (will print the Arrow table size and elements)
             # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
14 changes: 9 additions & 5 deletions src/datasets/builder.py
@@ -25,14 +25,14 @@
 from functools import partial
 from typing import Dict, List, Optional, Union
 
-import xxhash
 from filelock import FileLock
 
 from . import utils
 from .arrow_dataset import Dataset
 from .arrow_reader import HF_GCP_BASE_URL, ArrowReader, DatasetNotOnHfGcs, MissingFilesOnHfGcs
 from .arrow_writer import ArrowWriter, BeamWriter
 from .dataset_dict import DatasetDict
+from .fingerprint import Hasher
 from .info import (
     DATASET_INFO_FILENAME,
     DATASET_INFOS_DICT_FILE_NAME,
@@ -153,6 +153,8 @@ def __init__(
 
         # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
        config_kwargs = dict((key, value) for key, value in config_kwargs.items() if value is not None)
+        if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:
+            config_kwargs["features"] = features
         self.config = self._create_builder_config(
             name,
             **config_kwargs,
@@ -256,7 +258,7 @@ def _create_builder_config(self, name=None, **config_kwargs):
         # if not builder_config.description:
         #     raise ValueError("BuilderConfig %s must have a description" % name)
         if builder_config.data_files is not None:
-            m = xxhash.xxh64()
+            m = Hasher()
             if isinstance(builder_config.data_files, str):
                 data_files = {"train": [builder_config.data_files]}
             elif isinstance(builder_config.data_files, (tuple, list)):
@@ -269,10 +271,12 @@ def _create_builder_config(self, name=None, **config_kwargs):
             else:
                 raise ValueError("Please provide a valid `data_files` in `DatasetBuilder`")
             for key in sorted(data_files.keys()):
-                m.update(key.encode("utf-8"))
+                m.update(key)
                 for data_file in data_files[key]:
-                    m.update(os.path.abspath(data_file).encode("utf-8"))
-                    m.update(str(os.path.getmtime(data_file)).encode("utf-8"))
+                    m.update(os.path.abspath(data_file))
+                    m.update(str(os.path.getmtime(data_file)))
+            if hasattr(builder_config, "features"):
+                m.update(builder_config.features)
             builder_config.name += "-" + m.hexdigest()
         return builder_config
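The `__init__` hunk guards the forwarding with `inspect.signature`, so builders whose config class declares no `features` field are unaffected. A standalone sketch of that check, using two hypothetical config classes:

```python
# Sketch of the `features` forwarding guard: the keyword is only passed
# through when the builder's config class declares it. Both config classes
# here are hypothetical stand-ins, not from the library.
import inspect


class PlainConfig:
    def __init__(self, name=None):
        self.name = name


class CsvLikeConfig:
    def __init__(self, name=None, features=None):
        self.name = name
        self.features = features


for cls in (PlainConfig, CsvLikeConfig):
    accepts = "features" in inspect.signature(cls.__init__).parameters
    print(f"{cls.__name__} accepts features: {accepts}")
# PlainConfig accepts features: False
# CsvLikeConfig accepts features: True
```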

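The `_create_builder_config` hunk folds the features into the cache-directory name, so loading the same file with different features yields a different config name and therefore a fresh Arrow cache. A rough sketch of the idea, with `hashlib` standing in for the library's xxhash-based `Hasher` (which serializes arbitrary objects such as a `Features` instance); all names here are illustrative:

```python
# Rough sketch of the config-name suffix logic: hash the split names, file
# paths, file mtimes, and (when present) the requested features.
import hashlib
import os


def config_name_suffix(data_files: dict, features=None) -> str:
    m = hashlib.sha256()
    for key in sorted(data_files):
        m.update(key.encode("utf-8"))
        for data_file in data_files[key]:
            m.update(os.path.abspath(data_file).encode("utf-8"))
            m.update(str(os.path.getmtime(data_file)).encode("utf-8"))
    if features is not None:
        # Hasher hashes the Features object itself; repr() approximates that.
        m.update(repr(features).encode("utf-8"))
    return m.hexdigest()

# e.g. builder_config.name += "-" + config_name_suffix({"train": ["table.csv"]}, features)
```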
81 changes: 79 additions & 2 deletions tests/test_dataset_common.py
@@ -28,8 +28,10 @@
     BuilderConfig,
     DatasetBuilder,
     DownloadConfig,
+    Features,
     GenerateMode,
     MockDownloadManager,
+    Value,
     cached_path,
     hf_api,
     hf_bucket_url,
@@ -375,13 +377,17 @@ def test_load_real_dataset_all_configs(self, dataset_name):
 
 class TextTest(TestCase):
     def test_caching(self):
+        n_samples = 10
         with tempfile.TemporaryDirectory() as tmp_dir:
-            open(os.path.join(tmp_dir, "text.txt"), "w", encoding="utf-8").write("\n".join("foo" for _ in range(10)))
+            open(os.path.join(tmp_dir, "text.txt"), "w", encoding="utf-8").write(
+                "\n".join("foo" for _ in range(n_samples))
+            )
             ds = load_dataset(
                 "./datasets/text", data_files=os.path.join(tmp_dir, "text.txt"), cache_dir=tmp_dir, split="train"
             )
             data_file = ds._data_files[0]
             fingerprint = ds._fingerprint
+            self.assertEqual(len(ds), n_samples)
             del ds
             ds = load_dataset(
                 "./datasets/text", data_files=os.path.join(tmp_dir, "text.txt"), cache_dir=tmp_dir, split="train"
@@ -390,10 +396,81 @@ def test_caching(self):
             self.assertEqual(ds._fingerprint, fingerprint)
             del ds
 
-            open(os.path.join(tmp_dir, "text.txt"), "w", encoding="utf-8").write("\n".join("bar" for _ in range(10)))
+            open(os.path.join(tmp_dir, "text.txt"), "w", encoding="utf-8").write(
+                "\n".join("bar" for _ in range(n_samples))
+            )
             ds = load_dataset(
                 "./datasets/text", data_files=os.path.join(tmp_dir, "text.txt"), cache_dir=tmp_dir, split="train"
             )
             self.assertNotEqual(ds._data_files[0], data_file)
             self.assertNotEqual(ds._fingerprint, fingerprint)
             del ds
+
+
+class CsvTest(TestCase):
+    def test_caching(self):
+        n_rows = 10
+
+        features = Features({"foo": Value("string"), "bar": Value("string")})
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write(
+                "\n".join(",".join(["foo", "bar"]) for _ in range(n_rows + 1))
+            )
+            ds = load_dataset(
+                "./datasets/csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train"
+            )
+            data_file = ds._data_files[0]
+            fingerprint = ds._fingerprint
+            self.assertEqual(len(ds), n_rows)
+            del ds
+            ds = load_dataset(
+                "./datasets/csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train"
+            )
+            self.assertEqual(ds._data_files[0], data_file)
+            self.assertEqual(ds._fingerprint, fingerprint)
+            del ds
+            ds = load_dataset(
+                "./datasets/csv",
+                data_files=os.path.join(tmp_dir, "table.csv"),
+                cache_dir=tmp_dir,
+                split="train",
+                features=features,
+            )
+            self.assertNotEqual(ds._data_files[0], data_file)
+            self.assertNotEqual(ds._fingerprint, fingerprint)
+            del ds
+
+            open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write(
+                "\n".join(",".join(["Foo", "Bar"]) for _ in range(n_rows + 1))
+            )
+            ds = load_dataset(
+                "./datasets/csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train"
+            )
+            self.assertNotEqual(ds._data_files[0], data_file)
+            self.assertNotEqual(ds._fingerprint, fingerprint)
+            del ds
+
+    def test_features(self):
+        n_rows = 10
+        n_cols = 3
+
+        def get_features(type):
+            return Features({str(i): Value(type) for i in range(n_cols)})
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write(
+                "\n".join(",".join([str(i) for i in range(n_cols)]) for _ in range(n_rows + 1))
+            )
+            for type in ["float64", "int8"]:
+                features = get_features(type)
+                ds = load_dataset(
+                    "./datasets/csv",
+                    data_files=os.path.join(tmp_dir, "table.csv"),
+                    cache_dir=tmp_dir,
+                    split="train",
+                    features=features,
+                )
+                self.assertEqual(len(ds), n_rows)
+                self.assertDictEqual(ds.features, features)
+                del ds

1 comment on commit 6460a2e

@github-actions

PyArrow==0.17.1


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.019185 / 0.011353 (0.007832) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.016150 / 0.011008 (0.005142) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.048201 / 0.038508 (0.009693) |
| read_batch_unformated after write_array2d | 0.034356 / 0.023109 (0.011247) |
| read_batch_unformated after write_flattened_sequence | 0.219298 / 0.275898 (-0.056600) |
| read_batch_unformated after write_nested_sequence | 0.252352 / 0.323480 (-0.071128) |
| read_col_formatted_as_numpy after write_array2d | 0.009916 / 0.007986 (0.001931) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004578 / 0.004328 (0.000250) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.007130 / 0.004250 (0.002880) |
| read_col_unformated after write_array2d | 0.053790 / 0.037052 (0.016737) |
| read_col_unformated after write_flattened_sequence | 0.215957 / 0.258489 (-0.042532) |
| read_col_unformated after write_nested_sequence | 0.255976 / 0.293841 (-0.037865) |
| read_formatted_as_numpy after write_array2d | 0.169495 / 0.128546 (0.040949) |
| read_formatted_as_numpy after write_flattened_sequence | 0.133075 / 0.075646 (0.057429) |
| read_formatted_as_numpy after write_nested_sequence | 0.457982 / 0.419271 (0.038710) |
| read_unformated after write_array2d | 0.544566 / 0.043533 (0.501034) |
| read_unformated after write_flattened_sequence | 0.226756 / 0.255139 (-0.028383) |
| read_unformated after write_nested_sequence | 0.231501 / 0.283200 (-0.051699) |
| write_array2d | 0.088038 / 0.141683 (-0.053645) |
| write_flattened_sequence | 1.906630 / 1.452155 (0.454475) |
| write_nested_sequence | 2.136945 / 1.492716 (0.644229) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.042289 / 0.037411 (0.004877) |
| shard | 0.022847 / 0.014526 (0.008321) |
| shuffle | 0.104051 / 0.176557 (-0.072505) |
| sort | 0.163401 / 0.737135 (-0.573735) |
| train_test_split | 0.176001 / 0.296338 (-0.120338) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.219199 / 0.215209 (0.003990) |
| read 50000 | 2.295272 / 2.077655 (0.217617) |
| read_batch 50000 10 | 1.323735 / 1.504120 (-0.180385) |
| read_batch 50000 100 | 1.265044 / 1.541195 (-0.276151) |
| read_batch 50000 1000 | 1.268431 / 1.468490 (-0.200059) |
| read_formatted numpy 5000 | 7.349419 / 4.584777 (2.764643) |
| read_formatted pandas 5000 | 6.075362 / 3.745712 (2.329650) |
| read_formatted tensorflow 5000 | 8.726593 / 5.269862 (3.456732) |
| read_formatted torch 5000 | 7.556636 / 4.565676 (2.990960) |
| read_formatted_batch numpy 5000 10 | 0.734642 / 0.424275 (0.310367) |
| read_formatted_batch numpy 5000 1000 | 0.012066 / 0.007607 (0.004459) |
| shuffled read 5000 | 0.254715 / 0.226044 (0.028670) |
| shuffled read 50000 | 2.615114 / 2.268929 (0.346185) |
| shuffled read_batch 50000 10 | 1.875116 / 55.444624 (-53.569508) |
| shuffled read_batch 50000 100 | 1.657482 / 6.876477 (-5.218994) |
| shuffled read_batch 50000 1000 | 1.657979 / 2.142072 (-0.484094) |
| shuffled read_formatted numpy 5000 | 7.436684 / 4.805227 (2.631456) |
| shuffled read_formatted_batch numpy 5000 10 | 5.632197 / 6.500664 (-0.868467) |
| shuffled read_formatted_batch numpy 5000 1000 | 7.601722 / 0.075469 (7.526253) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 15.158783 / 1.841788 (13.316995) |
| map fast-tokenizer batched | 15.162164 / 8.074308 (7.087856) |
| map identity | 16.924511 / 10.191392 (6.733119) |
| map identity batched | 0.478746 / 0.680424 (-0.201678) |
| map no-op batched | 0.319805 / 0.534201 (-0.214396) |
| map no-op batched numpy | 0.882723 / 0.579283 (0.303440) |
| map no-op batched pandas | 0.673427 / 0.434364 (0.239063) |
| map no-op batched pytorch | 0.850088 / 0.540337 (0.309750) |
| map no-op batched tensorflow | 1.706280 / 1.386936 (0.319344) |

PyArrow==1.0

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.020443 / 0.011353 (0.009090) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.016109 / 0.011008 (0.005101) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.050488 / 0.038508 (0.011980) |
| read_batch_unformated after write_array2d | 0.032935 / 0.023109 (0.009826) |
| read_batch_unformated after write_flattened_sequence | 0.353075 / 0.275898 (0.077177) |
| read_batch_unformated after write_nested_sequence | 0.410183 / 0.323480 (0.086703) |
| read_col_formatted_as_numpy after write_array2d | 0.008530 / 0.007986 (0.000545) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004752 / 0.004328 (0.000423) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.007218 / 0.004250 (0.002968) |
| read_col_unformated after write_array2d | 0.047773 / 0.037052 (0.010720) |
| read_col_unformated after write_flattened_sequence | 0.359589 / 0.258489 (0.101100) |
| read_col_unformated after write_nested_sequence | 0.405747 / 0.293841 (0.111906) |
| read_formatted_as_numpy after write_array2d | 0.163632 / 0.128546 (0.035085) |
| read_formatted_as_numpy after write_flattened_sequence | 0.128246 / 0.075646 (0.052600) |
| read_formatted_as_numpy after write_nested_sequence | 0.480453 / 0.419271 (0.061181) |
| read_unformated after write_array2d | 0.432787 / 0.043533 (0.389254) |
| read_unformated after write_flattened_sequence | 0.343324 / 0.255139 (0.088185) |
| read_unformated after write_nested_sequence | 0.376367 / 0.283200 (0.093167) |
| write_array2d | 0.105838 / 0.141683 (-0.035845) |
| write_flattened_sequence | 1.948552 / 1.452155 (0.496397) |
| write_nested_sequence | 1.983928 / 1.492716 (0.491212) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.043469 / 0.037411 (0.006057) |
| shard | 0.024915 / 0.014526 (0.010390) |
| shuffle | 0.110801 / 0.176557 (-0.065756) |
| sort | 0.175083 / 0.737135 (-0.562053) |
| train_test_split | 0.029582 / 0.296338 (-0.266756) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.289305 / 0.215209 (0.074096) |
| read 50000 | 2.930518 / 2.077655 (0.852864) |
| read_batch 50000 10 | 2.035530 / 1.504120 (0.531410) |
| read_batch 50000 100 | 1.879286 / 1.541195 (0.338092) |
| read_batch 50000 1000 | 1.901125 / 1.468490 (0.432635) |
| read_formatted numpy 5000 | 7.178347 / 4.584777 (2.593570) |
| read_formatted pandas 5000 | 5.952152 / 3.745712 (2.206440) |
| read_formatted tensorflow 5000 | 8.635854 / 5.269862 (3.365992) |
| read_formatted torch 5000 | 7.517351 / 4.565676 (2.951675) |
| read_formatted_batch numpy 5000 10 | 0.734739 / 0.424275 (0.310463) |
| read_formatted_batch numpy 5000 1000 | 0.011890 / 0.007607 (0.004283) |
| shuffled read 5000 | 0.330166 / 0.226044 (0.104122) |
| shuffled read 50000 | 3.397968 / 2.268929 (1.129039) |
| shuffled read_batch 50000 10 | 2.487351 / 55.444624 (-52.957273) |
| shuffled read_batch 50000 100 | 2.267610 / 6.876477 (-4.608867) |
| shuffled read_batch 50000 1000 | 2.313160 / 2.142072 (0.171088) |
| shuffled read_formatted numpy 5000 | 7.191309 / 4.805227 (2.386082) |
| shuffled read_formatted_batch numpy 5000 10 | 5.094784 / 6.500664 (-1.405880) |
| shuffled read_formatted_batch numpy 5000 1000 | 8.962091 / 0.075469 (8.886622) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 15.586475 / 1.841788 (13.744687) |
| map fast-tokenizer batched | 14.416421 / 8.074308 (6.342113) |
| map identity | 16.075526 / 10.191392 (5.884134) |
| map identity batched | 0.873789 / 0.680424 (0.193365) |
| map no-op batched | 0.651331 / 0.534201 (0.117130) |
| map no-op batched numpy | 0.878610 / 0.579283 (0.299326) |
| map no-op batched pandas | 0.653391 / 0.434364 (0.219028) |
| map no-op batched pytorch | 0.830989 / 0.540337 (0.290651) |
| map no-op batched tensorflow | 1.746276 / 1.386936 (0.359340) |
