Merge pull request #14 from zillow/tz/gather_statistics_kwargs
Resolves #13 Surface the gather_statistics argument
martindurant committed Aug 22, 2019
2 parents 2e419af + c20180d commit e38d720
Showing 2 changed files with 28 additions and 6 deletions.
18 changes: 18 additions & 0 deletions docs/source/quickstart.rst
@@ -54,6 +54,24 @@ Arguments to ``open_parquet``:
be loaded, but partitions containing *at least one* value which passes the filter will be
loaded.

- ``engine`` : 'fastparquet' or 'pyarrow'. Which backend to read with.

- ``gather_statistics`` : bool or None (default). Gather the statistics for
each dataset partition. By default, this will only be done if the _metadata
file is available. Otherwise, statistics will only be gathered if True,
because the footer of every file will be parsed (which is very slow on some
systems).

- see ``dd.read_parquet()`` for the other named parameters that can be passed through.

.. _documentation : http://dask.pydata.org/en/latest/remote-data-services.html

A source so defined will provide the usual methods such as ``discover`` and ``read_partition``.
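
As an illustration of the arguments documented above, here is a minimal usage sketch; the dataset path and the eager/lazy read calls are assumptions for illustration, not part of this commit.

# Minimal sketch (hypothetical dataset path; not from this commit):
import intake

source = intake.open_parquet('s3://mybucket/data.parq',   # hypothetical path
                             engine='fastparquet',        # or 'pyarrow'
                             gather_statistics=True)      # parse every file footer
df = source.to_dask()    # lazy dask DataFrame
# pdf = source.read()    # or read eagerly into pandas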
16 changes: 10 additions & 6 deletions intake_parquet/source.py
@@ -31,6 +31,15 @@ class ParquetSource(base.DataSource):
- engine: 'fastparquet' or 'pyarrow'
Which backend to read with.
- gather_statistics : bool or None (default).
Gather the statistics for each dataset partition. By default,
this will only be done if the _metadata file is available. Otherwise,
statistics will only be gathered if True, because the footer of
every file will be parsed (which is very slow on some systems).
- see dd.read_parquet() for the other named parameters that can be passed through.
"""
container = 'dataframe'
name = 'parquet'
@@ -96,13 +105,8 @@ def _to_dask(self):
"""
import dask.dataframe as dd
urlpath = self._get_cache(self._urlpath)[0]
kw = dict(columns=self._kwargs.get('columns', None),
index=self._kwargs.get('index', None),
engine=self._kwargs.get('engine', 'auto'))
if 'filters' in self._kwargs:
kw['filters'] = self._kwargs['filters']
self._df = dd.read_parquet(urlpath,
storage_options=self._storage_options, **kw)
storage_options=self._storage_options, **self._kwargs)
self._load_metadata()
return self._df

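The effect of the ``_to_dask`` change above is that every keyword stored on the source is now forwarded to ``dask.dataframe.read_parquet`` unchanged, instead of only ``columns``, ``index``, ``engine`` and ``filters``. A rough sketch of the resulting call, with hypothetical values:

# Rough sketch of the kwargs pass-through (all values hypothetical):
import dask.dataframe as dd

kwargs = {'columns': ['value'],
          'index': 'timestamp',
          'engine': 'fastparquet',
          'gather_statistics': None}   # any dd.read_parquet keyword now gets through

df = dd.read_parquet('s3://mybucket/data.parq',        # hypothetical path
                     storage_options={'anon': True},   # hypothetical options
                     **kwargs)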
