ENH: initial support for a file-like backend with csv & hdf5 implementations
jreback committed Nov 2, 2017
1 parent b530707 commit d5bb902
Showing 19 changed files with 641 additions and 3 deletions.
3 changes: 2 additions & 1 deletion appveyor.yml
@@ -30,7 +30,8 @@ test_script:
- "%CONDA% install conda=4.3.22 --channel conda-forge"
- "%CONDA% create --name \"ibis_%PYTHON_VERSION%\" python=%PYTHON_VERSION% --channel conda-forge"
- "%ACTIVATE% \"ibis_%PYTHON_VERSION%\""
- "pip install -e .\"[sqlite, postgres, visualization, pandas]\""
- "%CONDA% install pytables"
- "pip install -e .\"[sqlite, postgres, visualization, pandas, csv, hdf5]\""
- "pip install flake8 mock pytest click \"pbs==0.110\""
- "flake8"
- "python ci\\datamgr.py download --directory \"%USERPROFILE%\""
2 changes: 2 additions & 0 deletions ci/requirements-dev-2.7.yml
@@ -15,6 +15,8 @@ dependencies:
- pytest
- python=2.7
- python-graphviz
- pytables
- pathlib2
- sh
- six
- sqlalchemy>=1.0.0
1 change: 1 addition & 0 deletions ci/requirements-dev-3.6.yml
@@ -13,6 +13,7 @@ dependencies:
- pytest
- python=3.6
- python-graphviz
- pytables
- sh
- six
- sqlalchemy>=1.0.0
1 change: 1 addition & 0 deletions ci/requirements-docs-3.6.yml
@@ -16,6 +16,7 @@ dependencies:
- pytest
- python=3.6
- python-graphviz
- pytables
- sh
- six
- sphinx_rtd_theme
4 changes: 4 additions & 0 deletions conda-recipes/ibis-framework/meta.yaml
@@ -13,6 +13,7 @@ source:
requirements:
build:
- enum34 # [py27]
- pathlib2 # [py27]
- numpy >=1.10.0
- pandas >=0.18.1
- python
@@ -21,6 +22,7 @@ requirements:
- toolz
run:
- enum34 # [py27]
- pathlib2 # [py27]
- numpy >=1.10.0
- pandas >=0.18.1
- python
@@ -56,6 +58,8 @@ test:
- ibis.sql.tests
- ibis.sql.vertica
- ibis.sql.vertica.tests
- ibis.file
- ibis.file.tests
- ibis.tests
commands:
- pytest --version
14 changes: 13 additions & 1 deletion docs/source/release.rst
@@ -9,7 +9,19 @@ Release Notes
releases (e.g., ``0.5.1``) will generally not be found here and contain
only bug fixes.

v0.12.0 (October 28, 2017)
v0.13.0 (????)
--------------

This release brings a new file-based backend, with CSV and HDF5 support, along
with a number of bug fixes and reliability enhancements. We recommend that all
users upgrade from earlier versions of Ibis.

New Backends
~~~~~~~~~~~~

* File Support for CSV & HDF5 (:issue:`1165`)

v0.12.0 (October 28, 2017)
--------------------------

This release brings Clickhouse and BigQuery SQL support along with a number of
11 changes: 11 additions & 0 deletions ibis/__init__.py
@@ -57,6 +57,17 @@
except ImportError: # pip install ibis-framework[pandas]
pass

try:
import ibis.file.csv as csv
except ImportError: # pip install ibis-framework[csv]
pass


try:
import ibis.file.hdf5 as hdf5
except ImportError: # pip install ibis-framework[hdf5]
pass

import ibis.config_init
from ibis.config import options
import ibis.util as util
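With the guarded imports above, the new backends are exposed as ibis.csv and ibis.hdf5 only when their optional dependencies (pandas, plus pytables for HDF5) are installed; otherwise the attribute is simply absent from the top-level namespace. A minimal sketch of probing for them (the directory path is invented for illustration):

    # Sketch: pick whichever file backend is available in this environment.
    # '/data/warehouse' is a made-up path, used only for illustration.
    import ibis

    if hasattr(ibis, 'hdf5'):
        client = ibis.hdf5.connect('/data/warehouse')
    elif hasattr(ibis, 'csv'):
        client = ibis.csv.connect('/data/warehouse')
    else:
        raise ImportError('install ibis-framework[csv] or ibis-framework[hdf5]')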
Empty file added ibis/file/__init__.py
Empty file.
88 changes: 88 additions & 0 deletions ibis/file/client.py
@@ -0,0 +1,88 @@
import ibis
import ibis.expr.types as ir
from ibis.file.utils import pathlib
from ibis.pandas.core import execute, execute_with_scope # noqa


class FileClient(ibis.client.Client):

def __init__(self, root):
super(FileClient, self).__init__()
self.root = pathlib.Path(str(root))
self.dictionary = {}

def insert(self, path, expr, **kwargs):
raise NotImplementedError

def table(self, name, path):
raise NotImplementedError

def database(self, name=None, path=None):
if name is None:
return FileDatabase('root', self, path=path)

if name not in self.list_databases(path):
raise AttributeError(name)
if path is None:
path = self.root

new_name = "{}.{}".format(name, self.extension)
if (self.root / name).is_dir():
path = path / name
elif not str(path).endswith(new_name):
path = path / new_name

return FileDatabase(name, self, path=path)

def execute(self, expr, params=None, **kwargs): # noqa
assert isinstance(expr, ir.Expr)
scope = kwargs.pop('scope', {})
return execute_with_scope(
expr, scope=scope,
params=params, **kwargs)

def list_tables(self, path=None):
raise NotImplementedError

def list_databases(self, path=None):
raise NotImplementedError


class FileDatabase(ibis.client.Database):

def __init__(self, name, client, path=None):
super(FileDatabase, self).__init__(name, client)
self.path = path

def __str__(self):
return '{0.__class__.__name__}({0.name})'.format(self)

def __dir__(self):
dbs = self.list_databases(path=self.path)
tables = self.list_tables(path=self.path)
return sorted(list(set(dbs).union(set(tables))))

def __getattr__(self, name):
try:
return object.__getattribute__(self, name)
except AttributeError:
try:
return self.table(name, path=self.path)
except AttributeError:
return self.database(name, path=self.path)

def table(self, name, path):
return self.client.table(name, path=path)

def database(self, name=None, path=None):
return self.client.database(name=name, path=path)

def list_databases(self, path=None):
if path is None:
path = self.path
return sorted(self.client.list_databases(path=path))

def list_tables(self, path=None):
if path is None:
path = self.path
return sorted(self.client.list_tables(path=path))
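Because FileDatabase resolves unknown attributes first as tables and then as sub-databases, a directory tree can be navigated with plain attribute access. A hedged sketch against the CSV client (the tree under /data/csvs is invented for illustration):

    # Sketch: navigating a FileDatabase, assuming a directory tree like
    #   /data/csvs/prices.csv
    #   /data/csvs/2017/trades.csv
    import ibis

    client = ibis.csv.connect('/data/csvs')
    db = client.database()           # root database spanning the whole tree
    print(db.list_tables())          # ['prices']
    print(db.list_databases())       # ['2017']

    prices = db.prices               # __getattr__ falls through to table()
    sub = db.database('2017')        # subdirectories behave as databases
    trades = sub.trades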
111 changes: 111 additions & 0 deletions ibis/file/csv.py
@@ -0,0 +1,111 @@
import pandas as pd
import ibis.expr.operations as ops
from ibis.file.client import FileClient
from ibis.pandas.core import pre_execute, execute # noqa
from ibis.pandas.client import pandas_dtypes_to_ibis_schema
from ibis.pandas.execution.selection import physical_tables


def connect(path):
"""Create a CSVClient for use with Ibis
Parameters
----------
path: str or pathlib.Path
Returns
-------
CSVClient
"""

return CSVClient(path)


class CSVTable(ops.DatabaseTable):
pass


class CSVClient(FileClient):
extension = 'csv'

def insert(self, path, t, index=False, **kwargs):
path = self.root / path
data = execute(t)
data.to_csv(str(path), index=index, **kwargs)

def table(self, name, path=None):
if name not in self.list_tables(path):
raise AttributeError(name)

if path is None:
path = self.root

# get the schema
f = path / "{}.{}".format(name, self.extension)
df = pd.read_csv(str(f), header=0, nrows=10)
schema = pandas_dtypes_to_ibis_schema(df, {})

t = CSVTable(name, schema, self).to_expr()
self.dictionary[name] = f
return t

def list_tables(self, path=None):
        # tables are files in a directory
if path is None:
path = self.root

tables = []
if path.is_dir():
for d in path.iterdir():
if d.is_file():
if str(d).endswith(self.extension):
tables.append(d.stem)
elif path.is_file():
if str(path).endswith(self.extension):
tables.append(path.stem)
return tables

def list_databases(self, path=None):
        # databases are directories
if path is None:
path = self.root

tables = []
if path.is_dir():
for d in path.iterdir():
if d.is_dir():
tables.append(d.name)
return tables


@pre_execute.register(CSVTable, CSVClient)
def csv_pre_execute_table(op, client, scope=None, **kwargs):
path = client.dictionary[op.name]
df = pd.read_csv(str(path), header=0)
return {op: df}


@pre_execute.register(ops.Selection, CSVClient)
def csv_pre_execute(op, client, scope=None, **kwargs):

pt = physical_tables(op.table.op())
pt = pt[0]

path = client.dictionary[pt.name]

if op.selections:

header = pd.read_csv(str(path), header=0, nrows=1)
usecols = [getattr(s.op(), 'name', None) or s.get_name()
for s in op.selections]

        # we cannot read all of the columns that we would like
if len(pd.Index(usecols) & header.columns) != len(usecols):
usecols = None

else:

usecols = None

df = pd.read_csv(str(path), usecols=usecols, header=0)
return {op: df}
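For context, a short usage sketch of the CSVClient above; the directory, file, and column names are invented for illustration and are not part of this commit:

    # Sketch: query a directory of CSV files through ibis, assuming the
    # directory /tmp/csv_root already exists.
    import pandas as pd
    import ibis

    client = ibis.csv.connect('/tmp/csv_root')

    # seed a table by hand; the column names are made up
    pd.DataFrame({'symbol': ['a', 'b'], 'price': [1.0, 2.0]}).to_csv(
        '/tmp/csv_root/prices.csv', index=False)

    t = client.table('prices')              # schema inferred from the first rows
    expr = t[['symbol', 'price']]           # column projections map onto usecols
    result = client.execute(expr)           # evaluated via the pandas backend

    client.insert('prices_copy.csv', expr)  # write an expression back out as CSV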
89 changes: 89 additions & 0 deletions ibis/file/hdf5.py
@@ -0,0 +1,89 @@
import pandas as pd
import ibis.expr.operations as ops
from ibis.file.client import FileClient
from ibis.pandas.core import pre_execute, execute # noqa
from ibis.pandas.client import pandas_dtypes_to_ibis_schema


def connect(path):
"""Create a HDF5Client for use with Ibis
Parameters
----------
path: str or pathlib.Path
Returns
-------
HDF5Client
"""
return HDFClient(path)


class HDFTable(ops.DatabaseTable):
pass


class HDFClient(FileClient):
extension = 'h5'

def insert(self, path, key, t, format='table',
data_columns=True, **kwargs):

path = self.root / path
data = execute(t)
data.to_hdf(str(path), key, format=format,
data_columns=data_columns, **kwargs)

def table(self, name, path):
if name not in self.list_tables(path):
raise AttributeError(name)

# get the schema
with pd.HDFStore(str(path), mode='r') as store:
df = store.select(name, start=0, stop=0)
schema = pandas_dtypes_to_ibis_schema(df, {})

t = HDFTable(name, schema, self).to_expr()
self.dictionary[name] = path
return t

def list_tables(self, path=None):
# tables are individual tables within a file

if path is None:
path = self.root

if path.is_file() and str(path).endswith(self.extension):

with pd.HDFStore(str(path), mode='r') as store:
# strip leading /
return [k[1:] for k in store.keys()]

return []

def list_databases(self, path=None):
        # databases are directories and .h5 files
if path is None:
path = self.root

tables = []
if path.is_dir():
for d in path.iterdir():
if d.is_dir():
tables.append(d.name)
elif d.is_file():
if str(d).endswith(self.extension):
tables.append(d.stem)
elif path.is_file():
# by definition we are at the db level at this point
pass

return tables


@pre_execute.register(HDFTable, HDFClient)
def hdf_pre_execute_table(op, client, scope=None, **kwargs):
key = op.name
path = client.dictionary[key]
df = pd.read_hdf(str(path), key, mode='r')
return {op: df}
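And a comparable sketch for the HDF client; the file name and key are invented for illustration. Note that, unlike the CSV client, table() takes an explicit path to the .h5 file:

    # Sketch: read a table stored in an HDF5 file, assuming /tmp/h5_root
    # exists and pytables is installed.
    import pandas as pd
    import ibis

    client = ibis.hdf5.connect('/tmp/h5_root')

    # seed an .h5 file by hand; 'prices' is both the HDF key and the table name
    pd.DataFrame({'symbol': ['a', 'b'], 'price': [1.0, 2.0]}).to_hdf(
        '/tmp/h5_root/prices.h5', 'prices', format='table', data_columns=True)

    path = client.root / 'prices.h5'
    t = client.table('prices', path)        # schema inferred without reading the data
    result = client.execute(t)              # materialize the table as a DataFrame

    client.insert('prices_copy.h5', 'prices', t)   # (path, key, expression)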
Empty file added ibis/file/tests/__init__.py
Empty file.
