ENH: initial support for a file-like backend with csv & hdf5 implementations
jreback committed Nov 2, 2017
1 parent b530707 commit d5bb902
Showing 19 changed files with 641 additions and 3 deletions.
3 changes: 2 additions & 1 deletion appveyor.yml
@@ -30,7 +30,8 @@ test_script:
- "%CONDA% install conda=4.3.22 --channel conda-forge"
- "%CONDA% create --name \"ibis_%PYTHON_VERSION%\" python=%PYTHON_VERSION% --channel conda-forge"
- "%ACTIVATE% \"ibis_%PYTHON_VERSION%\""
- "pip install -e .\"[sqlite, postgres, visualization, pandas]\""
- "%CONDA% install pytables"
- "pip install -e .\"[sqlite, postgres, visualization, pandas, csv, hdf5]\""
- "pip install flake8 mock pytest click \"pbs==0.110\""
- "flake8"
- "python ci\\datamgr.py download --directory \"%USERPROFILE%\""
2 changes: 2 additions & 0 deletions ci/requirements-dev-2.7.yml
@@ -15,6 +15,8 @@ dependencies:
- pytest
- python=2.7
- python-graphviz
- pytables
- pathlib2
- sh
- six
- sqlalchemy>=1.0.0
1 change: 1 addition & 0 deletions ci/requirements-dev-3.6.yml
@@ -13,6 +13,7 @@ dependencies:
- pytest
- python=3.6
- python-graphviz
- pytables
- sh
- six
- sqlalchemy>=1.0.0
1 change: 1 addition & 0 deletions ci/requirements-docs-3.6.yml
@@ -16,6 +16,7 @@ dependencies:
- pytest
- python=3.6
- python-graphviz
- pytables
- sh
- six
- sphinx_rtd_theme
4 changes: 4 additions & 0 deletions conda-recipes/ibis-framework/meta.yaml
@@ -13,6 +13,7 @@ source:
requirements:
build:
- enum34 # [py27]
- pathlib2 # [py27]
- numpy >=1.10.0
- pandas >=0.18.1
- python
@@ -21,6 +22,7 @@ requirements:
- toolz
run:
- enum34 # [py27]
- pathlib2 # [py27]
- numpy >=1.10.0
- pandas >=0.18.1
- python
@@ -56,6 +58,8 @@ test:
- ibis.sql.tests
- ibis.sql.vertica
- ibis.sql.vertica.tests
- ibis.file
- ibis.file.tests
- ibis.tests
commands:
- pytest --version
14 changes: 13 additions & 1 deletion docs/source/release.rst
@@ -9,7 +9,19 @@ Release Notes
releases (e.g., ``0.5.1``) will generally not be found here and contain
only bug fixes.

v0.12.0 (October 28, 2017)
v0.13.0 (????)
--------------

This release brings a new file-based backend, with CSV and HDF5 support, along
with a number of bug fixes and reliability enhancements. We recommend that all
users upgrade from earlier versions of Ibis.

New Backends
~~~~~~~~~~~~

* File Support for CSV & HDF5 (:issue:`1165`)

v0.12.0 (October 28, 2017)
--------------------------

This release brings Clickhouse and BigQuery SQL support along with a number of
11 changes: 11 additions & 0 deletions ibis/__init__.py
@@ -57,6 +57,17 @@
except ImportError: # pip install ibis-framework[pandas]
pass

try:
import ibis.file.csv as csv
except ImportError: # pip install ibis-framework[csv]
pass


try:
import ibis.file.hdf5 as hdf5
except ImportError: # pip install ibis-framework[hdf5]
pass

import ibis.config_init
from ibis.config import options
import ibis.util as util
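With the guarded imports above, the new backends are exposed as ibis.csv and ibis.hdf5 only when their optional dependencies (pandas, plus pytables for HDF5) are installed; otherwise the attribute is simply absent from the top-level namespace. A minimal sketch of probing for them (the directory path is invented for illustration):

    # Sketch: pick whichever file backend is available in this environment.
    # '/data/warehouse' is a made-up path, used only for illustration.
    import ibis

    if hasattr(ibis, 'hdf5'):
        client = ibis.hdf5.connect('/data/warehouse')
    elif hasattr(ibis, 'csv'):
        client = ibis.csv.connect('/data/warehouse')
    else:
        raise ImportError('install ibis-framework[csv] or ibis-framework[hdf5]')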
Empty file added ibis/file/__init__.py
Empty file.
88 changes: 88 additions & 0 deletions ibis/file/client.py
@@ -0,0 +1,88 @@
import ibis
import ibis.expr.types as ir
from ibis.file.utils import pathlib
from ibis.pandas.core import execute, execute_with_scope # noqa


class FileClient(ibis.client.Client):

def __init__(self, root):
super(FileClient, self).__init__()
self.root = pathlib.Path(str(root))
self.dictionary = {}

def insert(self, path, expr, **kwargs):
raise NotImplementedError

def table(self, name, path):
raise NotImplementedError

def database(self, name=None, path=None):
if name is None:
return FileDatabase('root', self, path=path)

if name not in self.list_databases(path):
raise AttributeError(name)
if path is None:
path = self.root

new_name = "{}.{}".format(name, self.extension)
if (self.root / name).is_dir():
path = path / name
elif not str(path).endswith(new_name):
path = path / new_name

return FileDatabase(name, self, path=path)

def execute(self, expr, params=None, **kwargs): # noqa
assert isinstance(expr, ir.Expr)
scope = kwargs.pop('scope', {})
return execute_with_scope(
expr, scope=scope,
params=params, **kwargs)

def list_tables(self, path=None):
raise NotImplementedError

def list_databases(self, path=None):
raise NotImplementedError


class FileDatabase(ibis.client.Database):

def __init__(self, name, client, path=None):
super(FileDatabase, self).__init__(name, client)
self.path = path

def __str__(self):
return '{0.__class__.__name__}({0.name})'.format(self)

def __dir__(self):
dbs = self.list_databases(path=self.path)
tables = self.list_tables(path=self.path)
return sorted(list(set(dbs).union(set(tables))))

def __getattr__(self, name):
try:
return object.__getattribute__(self, name)
except AttributeError:
try:
return self.table(name, path=self.path)
except AttributeError:
return self.database(name, path=self.path)

def table(self, name, path):
return self.client.table(name, path=path)

def database(self, name=None, path=None):
return self.client.database(name=name, path=path)

def list_databases(self, path=None):
if path is None:
path = self.path
return sorted(self.client.list_databases(path=path))

def list_tables(self, path=None):
if path is None:
path = self.path
return sorted(self.client.list_tables(path=path))
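Because FileDatabase resolves unknown attributes first as tables and then as sub-databases, a directory tree can be navigated with plain attribute access. A hedged sketch against the CSV client (the tree under /data/csvs is invented for illustration):

    # Sketch: navigating a FileDatabase, assuming a directory tree like
    #   /data/csvs/prices.csv
    #   /data/csvs/2017/trades.csv
    import ibis

    client = ibis.csv.connect('/data/csvs')
    db = client.database()           # root database spanning the whole tree
    print(db.list_tables())          # ['prices']
    print(db.list_databases())       # ['2017']

    prices = db.prices               # __getattr__ falls through to table()
    sub = db.database('2017')        # subdirectories behave as databases
    trades = sub.trades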
111 changes: 111 additions & 0 deletions ibis/file/csv.py
@@ -0,0 +1,111 @@
import pandas as pd
import ibis.expr.operations as ops
from ibis.file.client import FileClient
from ibis.pandas.core import pre_execute, execute # noqa
from ibis.pandas.client import pandas_dtypes_to_ibis_schema
from ibis.pandas.execution.selection import physical_tables


def connect(path):
"""Create a CSVClient for use with Ibis
Parameters
----------
path: str or pathlib.Path
Returns
-------
CSVClient
"""

return CSVClient(path)


class CSVTable(ops.DatabaseTable):
pass


class CSVClient(FileClient):
extension = 'csv'

def insert(self, path, t, index=False, **kwargs):
path = self.root / path
data = execute(t)
data.to_csv(str(path), index=index, **kwargs)

def table(self, name, path=None):
if name not in self.list_tables(path):
raise AttributeError(name)

if path is None:
path = self.root

# get the schema
f = path / "{}.{}".format(name, self.extension)
df = pd.read_csv(str(f), header=0, nrows=10)
schema = pandas_dtypes_to_ibis_schema(df, {})

t = CSVTable(name, schema, self).to_expr()
self.dictionary[name] = f
return t

def list_tables(self, path=None):
        # tables are files in a directory
if path is None:
path = self.root

tables = []
if path.is_dir():
for d in path.iterdir():
if d.is_file():
if str(d).endswith(self.extension):
tables.append(d.stem)
elif path.is_file():
if str(path).endswith(self.extension):
tables.append(path.stem)
return tables

def list_databases(self, path=None):
        # databases are directories
if path is None:
path = self.root

tables = []
if path.is_dir():
for d in path.iterdir():
if d.is_dir():
tables.append(d.name)
return tables


@pre_execute.register(CSVTable, CSVClient)
def csv_pre_execute_table(op, client, scope=None, **kwargs):
path = client.dictionary[op.name]
df = pd.read_csv(str(path), header=0)
return {op: df}


@pre_execute.register(ops.Selection, CSVClient)
def csv_pre_execute(op, client, scope=None, **kwargs):

pt = physical_tables(op.table.op())
pt = pt[0]

path = client.dictionary[pt.name]

if op.selections:

header = pd.read_csv(str(path), header=0, nrows=1)
usecols = [getattr(s.op(), 'name', None) or s.get_name()
for s in op.selections]

        # we cannot read all of the columns that we would like
if len(pd.Index(usecols) & header.columns) != len(usecols):
usecols = None

else:

usecols = None

df = pd.read_csv(str(path), usecols=usecols, header=0)
return {op: df}
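For context, a short usage sketch of the CSVClient above; the directory, file, and column names are invented for illustration and are not part of this commit:

    # Sketch: query a directory of CSV files through ibis, assuming the
    # directory /tmp/csv_root already exists.
    import pandas as pd
    import ibis

    client = ibis.csv.connect('/tmp/csv_root')

    # seed a table by hand; the column names are made up
    pd.DataFrame({'symbol': ['a', 'b'], 'price': [1.0, 2.0]}).to_csv(
        '/tmp/csv_root/prices.csv', index=False)

    t = client.table('prices')              # schema inferred from the first rows
    expr = t[['symbol', 'price']]           # column projections map onto usecols
    result = client.execute(expr)           # evaluated via the pandas backend

    client.insert('prices_copy.csv', expr)  # write an expression back out as CSV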
89 changes: 89 additions & 0 deletions ibis/file/hdf5.py
@@ -0,0 +1,89 @@
import pandas as pd
import ibis.expr.operations as ops
from ibis.file.client import FileClient
from ibis.pandas.core import pre_execute, execute # noqa
from ibis.pandas.client import pandas_dtypes_to_ibis_schema


def connect(path):
"""Create a HDF5Client for use with Ibis
Parameters
----------
path: str or pathlib.Path
Returns
-------
HDF5Client
"""
return HDFClient(path)


class HDFTable(ops.DatabaseTable):
pass


class HDFClient(FileClient):
extension = 'h5'

def insert(self, path, key, t, format='table',
data_columns=True, **kwargs):

path = self.root / path
data = execute(t)
data.to_hdf(str(path), key, format=format,
data_columns=data_columns, **kwargs)

def table(self, name, path):
if name not in self.list_tables(path):
raise AttributeError(name)

# get the schema
with pd.HDFStore(str(path), mode='r') as store:
df = store.select(name, start=0, stop=0)
schema = pandas_dtypes_to_ibis_schema(df, {})

t = HDFTable(name, schema, self).to_expr()
self.dictionary[name] = path
return t

def list_tables(self, path=None):
# tables are individual tables within a file

if path is None:
path = self.root

if path.is_file() and str(path).endswith(self.extension):

with pd.HDFStore(str(path), mode='r') as store:
# strip leading /
return [k[1:] for k in store.keys()]

return []

def list_databases(self, path=None):
        # databases are directories and .h5 files
if path is None:
path = self.root

tables = []
if path.is_dir():
for d in path.iterdir():
if d.is_dir():
tables.append(d.name)
elif d.is_file():
if str(d).endswith(self.extension):
tables.append(d.stem)
elif path.is_file():
# by definition we are at the db level at this point
pass

return tables


@pre_execute.register(HDFTable, HDFClient)
def hdf_pre_execute_table(op, client, scope=None, **kwargs):
key = op.name
path = client.dictionary[key]
df = pd.read_hdf(str(path), key, mode='r')
return {op: df}
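And a comparable sketch for the HDF client; the file name and key are invented for illustration. Note that, unlike the CSV client, table() takes an explicit path to the .h5 file:

    # Sketch: read a table stored in an HDF5 file, assuming /tmp/h5_root
    # exists and pytables is installed.
    import pandas as pd
    import ibis

    client = ibis.hdf5.connect('/tmp/h5_root')

    # seed an .h5 file by hand; 'prices' is both the HDF key and the table name
    pd.DataFrame({'symbol': ['a', 'b'], 'price': [1.0, 2.0]}).to_hdf(
        '/tmp/h5_root/prices.h5', 'prices', format='table', data_columns=True)

    path = client.root / 'prices.h5'
    t = client.table('prices', path)        # schema inferred without reading the data
    result = client.execute(t)              # materialize the table as a DataFrame

    client.insert('prices_copy.h5', 'prices', t)   # (path, key, expression)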
Empty file added ibis/file/tests/__init__.py
Empty file.
