Use python read for text dataset #715

Merged
8 commits merged on Oct 5, 2020
Changes from 4 commits
Binary file modified datasets/text/dummy/0.0.0/dummy_data.zip
32 changes: 15 additions & 17 deletions datasets/text/text.py
@@ -21,7 +21,7 @@ class TextConfig(datasets.BuilderConfig):
     """BuilderConfig for text files."""
 
     encoding: str = None
-    chunksize: int = 10_000
+    chunksize: int = 10 << 20  # 10MB
 
 
 class Text(datasets.ArrowBasedBuilder):
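The new default is written as a bit shift rather than a row count. A quick check of what it evaluates to (illustrative only, not part of the change):

print(10 << 20)                        # 10485760, i.e. 10 * 2**20
print((10 << 20) == 10 * 1024 * 1024)  # True: roughly 10MB of text per read call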
@@ -55,19 +55,17 @@ def _split_generators(self, dl_manager):
 
     def _generate_tables(self, files):
         for i, file in enumerate(files):
-            text_file_reader = pd.read_csv(
-                file,
-                dtype={"text": str},
-                names=["text"],
-                header=None,
-                iterator=True,
-                chunksize=self.config.chunksize,
-                encoding=self.config.encoding,
-                sep="\n",
-            )
-            for j, df in enumerate(text_file_reader):
-                pa_table = pa.Table.from_pandas(df)
-                # Uncomment for debugging (will print the Arrow table size and elements)
-                # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
-                # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
-                yield (i, j), pa_table
+            j = 0
+            with open(file, "r", encoding=self.config.encoding) as f:
+                while True:
+                    batch = f.read(self.config.chunksize)
+                    if not batch:
+                        break
+                    batch += f.readline()  # finish current line
+                    batch = batch.splitlines()
+                    pa_table = pa.Table.from_arrays([pa.array(batch)], schema=pa.schema({"text": pa.string()}))
+                    # Uncomment for debugging (will print the Arrow table size and elements)
+                    # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
+                    # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
+                    yield (i, j), pa_table
+                    j += 1
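The point of the extra f.readline() above is that f.read(self.config.chunksize) can stop in the middle of a line; the readline() call finishes that line before splitlines() turns the batch into rows, and splitlines() also strips \r\n endings. Below is a minimal standalone sketch of the same pattern, outside the builder; the helper name, example file name, and printed output are illustrative only:

import pyarrow as pa


def iter_line_batches(path, encoding=None, chunksize=10 << 20):
    """Yield lists of complete lines, reading the file in fixed-size character chunks."""
    with open(path, "r", encoding=encoding) as f:
        while True:
            batch = f.read(chunksize)
            if not batch:
                break
            batch += f.readline()  # finish the line that the chunk boundary cut off
            yield batch.splitlines()  # handles "\n" as well as "\r\n"


for j, lines in enumerate(iter_line_batches("example.txt")):
    table = pa.Table.from_arrays([pa.array(lines)], schema=pa.schema({"text": pa.string()}))
    print(j, table.num_rows)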
7 changes: 5 additions & 2 deletions tests/test_dataset_common.py
@@ -187,6 +187,7 @@ def check_load_dataset(self, dataset_name, configs, is_local=False):
                 dataset = dataset_builder.as_dataset()
 
                 # check that dataset is not empty
+                self.parent.assertListEqual(sorted(dataset_builder.info.splits.keys()), sorted(dataset))
                 for split in dataset_builder.info.splits.keys():
                     # check that loaded datset is not empty
                     self.parent.assertTrue(len(dataset[split]) > 0)
@@ -364,7 +365,7 @@ def test_caching(self):
         n_samples = 10
         with tempfile.TemporaryDirectory() as tmp_dir:
             open(os.path.join(tmp_dir, "text.txt"), "w", encoding="utf-8").write(
-                "\n".join("foo" for _ in range(n_samples))
+                "\r\n".join("foo" for _ in range(n_samples))
             )
             ds = load_dataset(
                 "./datasets/text", data_files=os.path.join(tmp_dir, "text.txt"), cache_dir=tmp_dir, split="train"
@@ -388,6 +389,7 @@ def test_caching(self):
             )
             self.assertNotEqual(ds._data_files[0], data_file)
             self.assertNotEqual(ds._fingerprint, fingerprint)
+            self.assertEqual(len(ds), n_samples)
             del ds
 
 
@@ -399,7 +401,7 @@
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write(
-                "\n".join(",".join(["foo", "bar"]) for _ in range(n_rows + 1))
+                "\r\n".join(",".join(["foo", "bar"]) for _ in range(n_rows + 1))
             )
             ds = load_dataset(
                 "./datasets/csv", data_files=os.path.join(tmp_dir, "table.csv"), cache_dir=tmp_dir, split="train"
@@ -433,6 +435,7 @@ def test_caching(self):
             )
             self.assertNotEqual(ds._data_files[0], data_file)
             self.assertNotEqual(ds._fingerprint, fingerprint)
+            self.assertEqual(len(ds), n_rows)
             del ds
 
     def test_features(self):
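The fixtures above switch from "\n" to "\r\n" to check that Windows-style line endings load cleanly and that the row counts still match. For the text reader this relies on str.splitlines() recognizing CRLF (pandas handles CRLF on its own in the csv case), whereas a plain split on "\n" would leave stray carriage returns; a quick illustration with a made-up string:

text = "foo\r\nfoo\r\nfoo"

print(text.splitlines())  # ['foo', 'foo', 'foo'] - no trailing "\r"
print(text.split("\n"))   # ['foo\r', 'foo\r', 'foo'] - a plain split keeps the "\r"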