Use python read for text dataset #715

Merged
8 commits merged on Oct 5, 2020
Changes from 4 commits
Binary file modified datasets/text/dummy/0.0.0/dummy_data.zip
32 changes: 15 additions & 17 deletions datasets/text/text.py
@@ -21,7 +21,7 @@ class TextConfig(datasets.BuilderConfig):
     """BuilderConfig for text files."""
 
     encoding: str = None
-    chunksize: int = 10_000
+    chunksize: int = 10 << 20  # 10MB
 
 
 class Text(datasets.ArrowBasedBuilder):
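The new default is written as a bit shift rather than a row count. A quick check of what it evaluates to (illustrative only, not part of the change):

print(10 << 20)                        # 10485760, i.e. 10 * 2**20
print((10 << 20) == 10 * 1024 * 1024)  # True: roughly 10MB of text per read call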
@@ -55,19 +55,17 @@ def _split_generators(self, dl_manager):
 
     def _generate_tables(self, files):
         for i, file in enumerate(files):
-            text_file_reader = pd.read_csv(
-                file,
-                dtype={"text": str},
-                names=["text"],
-                header=None,
-                iterator=True,
-                chunksize=self.config.chunksize,
-                encoding=self.config.encoding,
-                sep="\n",
-            )
-            for j, df in enumerate(text_file_reader):
-                pa_table = pa.Table.from_pandas(df)
-                # Uncomment for debugging (will print the Arrow table size and elements)
-                # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
-                # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
-                yield (i, j), pa_table
+            j = 0
+            with open(file, "r", encoding=self.config.encoding) as f:
+                while True:
+                    batch = f.read(self.config.chunksize)
+                    if not batch:
+                        break
+                    batch += f.readline()  # finish current line
+                    batch = batch.splitlines()
+                    pa_table = pa.Table.from_arrays([pa.array(batch)], schema=pa.schema({"text": pa.string()}))
+                    # Uncomment for debugging (will print the Arrow table size and elements)
+                    # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
+                    # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
+                    yield (i, j), pa_table
+                    j += 1
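The point of the extra f.readline() above is that f.read(self.config.chunksize) can stop in the middle of a line; the readline() call finishes that line before splitlines() turns the batch into rows, and splitlines() also strips \r\n endings. Below is a minimal standalone sketch of the same pattern, outside the builder; the helper name, example file name, and printed output are illustrative only:

import pyarrow as pa


def iter_line_batches(path, encoding=None, chunksize=10 << 20):
    """Yield lists of complete lines, reading the file in fixed-size character chunks."""
    with open(path, "r", encoding=encoding) as f:
        while True:
            batch = f.read(chunksize)
            if not batch:
                break
            batch += f.readline()  # finish the line that the chunk boundary cut off
            yield batch.splitlines()  # handles "\n" as well as "\r\n"


for j, lines in enumerate(iter_line_batches("example.txt")):
    table = pa.Table.from_arrays([pa.array(lines)], schema=pa.schema({"text": pa.string()}))
    print(j, table.num_rows)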
7 changes: 5 additions & 2 deletions tests/test_dataset_common.py
@@ -187,6 +187,7 @@ def check_load_dataset(self, dataset_name, configs, is_local=False):
                 dataset = dataset_builder.as_dataset()
 
                 # check that dataset is not empty
+                self.parent.assertListEqual(sorted(dataset_builder.info.splits.keys()), sorted(dataset))
                 for split in dataset_builder.info.splits.keys():
                     # check that loaded datset is not empty
                     self.parent.assertTrue(len(dataset[split]) > 0)
@@ -364,7 +365,7 @@ def test_caching(self):
         n_samples = 10
         with tempfile.TemporaryDirectory() as tmp_dir:
             open(os.path.join(tmp_dir, "text.txt"), "w", encoding="utf-8").write(
-                "\n".join("foo" for _ in range(n_samples))
+                "\r\n".join("foo" for _ in range(n_samples))
             )
             ds = load_dataset(
                 "./datasets/text", data_files=os.path.join(tmp_dir, "text.txt"), cache_dir=tmp_dir, split="train"
@@ -388,6 +389,7 @@ def test_caching(self):
             )
             self.assertNotEqual(ds._data_files[0], data_file)
             self.assertNotEqual(ds._fingerprint, fingerprint)
+            self.assertEqual(len(ds), n_samples)
             del ds
 
 
@@ -399,7 +401,7 @@
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write(
-                "\n".join(",".join(["foo", "bar"]) for _ in range(n_rows + 1))
+                "\r\n".join(",".join(["foo", "bar"]) for _ in range(n_rows + 1))
             )
             ds = load_dataset(
                 "./datasets/csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train"
@@ -433,6 +435,7 @@ def test_caching(self):
             )
             self.assertNotEqual(ds._data_files[0], data_file)
             self.assertNotEqual(ds._fingerprint, fingerprint)
+            self.assertEqual(len(ds), n_rows)
             del ds
 
     def test_features(self):
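The fixtures above switch from "\n" to "\r\n" to check that Windows-style line endings load cleanly and that the row counts still match. For the text reader this relies on str.splitlines() recognizing CRLF (pandas handles CRLF on its own in the csv case), whereas a plain split on "\n" would leave stray carriage returns; a quick illustration with a made-up string:

text = "foo\r\nfoo\r\nfoo"

print(text.splitlines())  # ['foo', 'foo', 'foo'] - no trailing "\r"
print(text.split("\n"))   # ['foo\r', 'foo\r', 'foo'] - a plain split keeps the "\r"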