Skip to content

Commit

Permalink
use python read for text dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq committed Oct 5, 2020
1 parent 0ab8829 commit 7fd8f90
Showing 1 changed file with 18 additions and 18 deletions.
36 changes: 18 additions & 18 deletions datasets/text/text.py
Expand Up @@ -21,7 +21,7 @@ class TextConfig(datasets.BuilderConfig):
"""BuilderConfig for text files."""

encoding: str = None
chunksize: int = 10_000
chunksize: int = 10 << 20 # 10MB


class Text(datasets.ArrowBasedBuilder):
Expand Down Expand Up @@ -55,20 +55,20 @@ def _split_generators(self, dl_manager):

def _generate_tables(self, files):
    """Yield Arrow tables of text lines, one chunk at a time per file.

    Each input file is read in chunks of ``self.config.chunksize``
    characters; every chunk is split into lines and emitted as a
    single-column (``text``) Arrow table.

    Args:
        files: iterable of file paths to read.

    Yields:
        ((file_index, chunk_index), pyarrow.Table) pairs; the key pair
        uniquely identifies each emitted table.
    """
    for file_idx, file in enumerate(files):
        chunk_idx = 0
        with open(file, "r", encoding=self.config.encoding) as f:
            while True:
                batch = f.read(self.config.chunksize)
                if not batch:
                    break
                # The chunk boundary almost certainly falls mid-line;
                # read up to the next newline so no line is split
                # across two tables.
                batch += f.readline()
                # NOTE(review): str.splitlines() breaks on all Unicode
                # line boundaries (\v, \f, \u2028, ...), not only \n —
                # confirm this matches the intended line semantics.
                lines = batch.splitlines()
                pa_table = pa.Table.from_arrays(
                    [pa.array(lines)],
                    schema=pa.schema({"text": pa.string()}),
                )
                # Uncomment for debugging (will print the Arrow table size and elements)
                # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
                # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
                yield (file_idx, chunk_idx), pa_table
                chunk_idx += 1

1 comment on commit 7fd8f90

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==0.17.1

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.019038 / 0.011353 (0.007685) 0.016423 / 0.011008 (0.005415) 0.049536 / 0.038508 (0.011028) 0.032454 / 0.023109 (0.009345) 0.228006 / 0.275898 (-0.047892) 0.249134 / 0.323480 (-0.074346) 0.009697 / 0.007986 (0.001711) 0.004559 / 0.004328 (0.000231) 0.007102 / 0.004250 (0.002852) 0.050047 / 0.037052 (0.012994) 0.223271 / 0.258489 (-0.035218) 0.246303 / 0.293841 (-0.047538) 0.172109 / 0.128546 (0.043563) 0.137336 / 0.075646 (0.061689) 0.467841 / 0.419271 (0.048570) 0.539741 / 0.043533 (0.496209) 0.220866 / 0.255139 (-0.034273) 0.241748 / 0.283200 (-0.041452) 0.085573 / 0.141683 (-0.056110) 1.921393 / 1.452155 (0.469239) 2.033510 / 1.492716 (0.540793)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.042523 / 0.037411 (0.005112) 0.021306 / 0.014526 (0.006780) 0.095341 / 0.176557 (-0.081216) 0.095902 / 0.737135 (-0.641234) 0.028043 / 0.296338 (-0.268296)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.225301 / 0.215209 (0.010092) 2.322096 / 2.077655 (0.244441) 1.340682 / 1.504120 (-0.163438) 1.223872 / 1.541195 (-0.317322) 1.255743 / 1.468490 (-0.212747) 7.445400 / 4.584777 (2.860623) 6.242450 / 3.745712 (2.496737) 8.917471 / 5.269862 (3.647609) 7.787794 / 4.565676 (3.222117) 0.742726 / 0.424275 (0.318451) 0.011752 / 0.007607 (0.004145) 0.259459 / 0.226044 (0.033414) 2.750922 / 2.268929 (0.481994) 1.883758 / 55.444624 (-53.560866) 1.736027 / 6.876477 (-5.140450) 1.753745 / 2.142072 (-0.388328) 7.538416 / 4.805227 (2.733188) 7.380705 / 6.500664 (0.880041) 7.763637 / 0.075469 (7.688168)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 15.202906 / 1.841788 (13.361119) 33.566550 / 8.074308 (25.492242) 17.029816 / 10.191392 (6.838424) 0.945586 / 0.680424 (0.265162) 0.330781 / 0.534201 (-0.203420) 0.891540 / 0.579283 (0.312256) 0.694054 / 0.434364 (0.259690) 0.861278 / 0.540337 (0.320941) 1.768515 / 1.386936 (0.381579)
PyArrow==1.0
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.020602 / 0.011353 (0.009249) 0.016769 / 0.011008 (0.005761) 0.052320 / 0.038508 (0.013812) 0.033735 / 0.023109 (0.010626) 0.354634 / 0.275898 (0.078736) 0.395529 / 0.323480 (0.072049) 0.009767 / 0.007986 (0.001782) 0.004888 / 0.004328 (0.000560) 0.007717 / 0.004250 (0.003466) 0.049609 / 0.037052 (0.012557) 0.361125 / 0.258489 (0.102636) 0.410469 / 0.293841 (0.116629) 0.170270 / 0.128546 (0.041724) 0.132993 / 0.075646 (0.057347) 0.490881 / 0.419271 (0.071609) 0.434196 / 0.043533 (0.390663) 0.350630 / 0.255139 (0.095491) 0.379571 / 0.283200 (0.096371) 0.095801 / 0.141683 (-0.045882) 1.996369 / 1.452155 (0.544214) 2.095058 / 1.492716 (0.602342)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.046228 / 0.037411 (0.008817) 0.024037 / 0.014526 (0.009511) 0.028237 / 0.176557 (-0.148320) 0.092636 / 0.737135 (-0.644499) 0.029520 / 0.296338 (-0.266818)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.299936 / 0.215209 (0.084727) 2.998736 / 2.077655 (0.921082) 2.111802 / 1.504120 (0.607682) 1.982003 / 1.541195 (0.440808) 2.050963 / 1.468490 (0.582473) 7.494550 / 4.584777 (2.909773) 6.310609 / 3.745712 (2.564897) 8.766707 / 5.269862 (3.496846) 7.791955 / 4.565676 (3.226279) 0.709728 / 0.424275 (0.285452) 0.011969 / 0.007607 (0.004361) 0.313011 / 0.226044 (0.086967) 3.339834 / 2.268929 (1.070906) 2.468728 / 55.444624 (-52.975897) 2.300868 / 6.876477 (-4.575608) 2.340173 / 2.142072 (0.198100) 7.358400 / 4.805227 (2.553172) 5.124291 / 6.500664 (-1.376373) 9.033251 / 0.075469 (8.957782)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 15.224254 / 1.841788 (13.382466) 16.108016 / 8.074308 (8.033708) 16.794289 / 10.191392 (6.602897) 0.902288 / 0.680424 (0.221864) 0.662208 / 0.534201 (0.128007) 0.891203 / 0.579283 (0.311920) 0.685648 / 0.434364 (0.251284) 0.856392 / 0.540337 (0.316055) 1.805421 / 1.386936 (0.418485)

Please sign in to comment.