@@ -0,0 +1,96 @@
DROP TABLE IF EXISTS diamonds;

CREATE TABLE diamonds (
    `date` Date DEFAULT today(),
    carat Float64,
    cut String,
    color String,
    clarity String,
    depth Float64,
    `table` Float64,
    price Int64,
    x Float64,
    y Float64,
    z Float64
) ENGINE = MergeTree(date, (`carat`), 8192);

DROP TABLE IF EXISTS batting;

CREATE TABLE batting (
    `date` Date DEFAULT today(),
    `playerID` String,
    `yearID` Int64,
    stint Int64,
    `teamID` String,
    `lgID` String,
    `G` Int64,
    `AB` Int64,
    `R` Int64,
    `H` Int64,
    `X2B` Int64,
    `X3B` Int64,
    `HR` Int64,
    `RBI` Int64,
    `SB` Int64,
    `CS` Int64,
    `BB` Int64,
    `SO` Int64,
    `IBB` Int64,
    `HBP` Int64,
    `SH` Int64,
    `SF` Int64,
    `GIDP` Int64
) ENGINE = MergeTree(date, (`playerID`), 8192);

DROP TABLE IF EXISTS awards_players;

CREATE TABLE awards_players (
    `date` Date DEFAULT today(),
    `playerID` String,
    `awardID` String,
    `yearID` Int64,
    `lgID` String,
    tie String,
    notes String
) ENGINE = MergeTree(date, (`playerID`), 8192);

DROP TABLE IF EXISTS functional_alltypes;

CREATE TABLE functional_alltypes (
    `date` Date DEFAULT toDate(timestamp_col),
    `index` Int64,
    `Unnamed_0` Int64,
    id Int32,
    bool_col UInt8,
    tinyint_col Int8,
    smallint_col Int16,
    int_col Int32,
    bigint_col Int64,
    float_col Float32,
    double_col Float64,
    date_string_col String,
    string_col String,
    timestamp_col DateTime,
    year Int32,
    month Int32
) ENGINE = MergeTree(date, (`index`), 8192);

DROP TABLE IF EXISTS tzone;

CREATE TABLE tzone (
    `date` Date DEFAULT today(),
    ts DateTime,
    key String,
    value Float64
) ENGINE = MergeTree(date, (key), 8192);

DROP TABLE IF EXISTS array_types;

CREATE TABLE IF NOT EXISTS array_types (
    `date` Date DEFAULT today(),
    x Array(Int64),
    y Array(String),
    z Array(Float64),
    grouper String,
    scalar_column Float64
) ENGINE = MergeTree(date, (scalar_column), 8192);
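
For orientation, a minimal sketch (not part of the diff) of exercising this schema with clickhouse-driver, assuming a server on localhost and the `ibis_testing` database used by the test tooling. Every table carries a defaulted `date` column because the old-style `MergeTree(date_col, key, granularity)` engine requires a Date partition column:

```python
# Minimal sketch, assuming a ClickHouse server on localhost and that the
# schema above has already been applied to the ibis_testing database.
from datetime import datetime

from clickhouse_driver import Client

client = Client('localhost', database='ibis_testing')

# The defaulted `date` column is filled in automatically on insert.
client.execute(
    'INSERT INTO tzone (ts, key, value) VALUES',
    [(datetime(2017, 5, 28, 11, 1, 31), 'a', 1.1)],
)
print(client.execute('SELECT count() FROM tzone'))  # [(1,)]
```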
@@ -0,0 +1,286 @@
#!/usr/bin/env python

import os
import getpass
import tempfile
import tarfile
import operator

import sqlalchemy as sa

import numpy as np
import pandas as pd

import click

try:
    import sh
except ImportError:
    import pbs as sh


@click.group()
def cli():
    pass


@cli.command()
@click.argument('tables', nargs=-1)
@click.option('-S', '--script', type=click.File('rt'), required=True)
@click.option(
    '-d', '--database',
    default=os.environ.get('IBIS_TEST_CLICKHOUSE_DB', 'ibis_testing')
)
@click.option(
    '-D', '--data-directory',
    default=tempfile.gettempdir(), type=click.Path(exists=True)
)
def clickhouse(script, tables, database, data_directory):
    username = os.environ.get('IBIS_CLICKHOUSE_USER', 'default')
    host = os.environ.get('IBIS_CLICKHOUSE_HOST', 'localhost')
    password = os.environ.get('IBIS_CLICKHOUSE_PASS', '')

    url = sa.engine.url.URL(
        'clickhouse+native',
        username=username,
        host=host,
        password=password,
    )
    engine = sa.create_engine(str(url))
    engine.execute('DROP DATABASE IF EXISTS "{}"'.format(database))
    engine.execute('CREATE DATABASE "{}"'.format(database))

    url = sa.engine.url.URL(
        'clickhouse+native',
        username=username,
        host=host,
        password=password,
        database=database,
    )
    engine = sa.create_engine(str(url))
    script_text = script.read()

    # missing stmt
    # INSERT INTO array_types (x, y, z, grouper, scalar_column) VALUES
    #     ([1, 2, 3], ['a', 'b', 'c'], [1.0, 2.0, 3.0], 'a', 1.0),
    #     ([4, 5], ['d', 'e'], [4.0, 5.0], 'a', 2.0),
    #     ([6], ['f'], [6.0], 'a', 3.0),
    #     ([1], ['a'], [], 'b', 4.0),
    #     ([2, 3], ['b', 'c'], [], 'b', 5.0),
    #     ([4, 5], ['d', 'e'], [4.0, 5.0], 'c', 6.0);

    with engine.begin() as con:
        # the driver doesn't support multiple statements per execute call
        for stmt in script_text.split(';'):
            if len(stmt.strip()):
                con.execute(stmt)

    table_paths = [
        os.path.join(data_directory, '{}.csv'.format(table))
        for table in tables
    ]
    dtype = {'bool_col': np.bool_}
    for table, path in zip(tables, table_paths):
        # correct dtypes per table to be able to insert
        # TODO: clean this up, it's kind of ugly
        df = pd.read_csv(path, index_col=None, header=0, dtype=dtype)
        if table == 'functional_alltypes':
            df = df.rename(columns={'Unnamed: 0': 'Unnamed_0'})
            cols = ['date_string_col', 'string_col']
            df[cols] = df[cols].astype(str)
            df.timestamp_col = df.timestamp_col.astype('datetime64[s]')
        elif table == 'batting':
            cols = ['playerID', 'teamID', 'lgID']
            df[cols] = df[cols].astype(str)
            cols = df.select_dtypes([float]).columns
            df[cols] = df[cols].fillna(0).astype(int)
        elif table == 'awards_players':
            cols = ['playerID', 'awardID', 'lgID', 'tie', 'notes']
            df[cols] = df[cols].astype(str)

        df.to_sql(table, engine, index=False, if_exists='append')


@cli.command()
@click.argument('tables', nargs=-1)
@click.option('-S', '--script', type=click.File('rt'), required=True)
@click.option(
    '-d', '--database',
    default=os.environ.get(
        'IBIS_TEST_POSTGRES_DB', os.environ.get('PGDATABASE', 'ibis_testing')
    ),
)
@click.option(
    '-D', '--data-directory',
    default=tempfile.gettempdir(), type=click.Path(exists=True)
)
def postgres(script, tables, database, data_directory):
    username = os.environ.get(
        'IBIS_POSTGRES_USER', os.environ.get('PGUSER', getpass.getuser())
    )
    host = os.environ.get('PGHOST', 'localhost')
    password = os.environ.get('IBIS_POSTGRES_PASS', os.environ.get('PGPASS'))
    url = sa.engine.url.URL(
        'postgresql',
        username=username,
        host=host,
        password=password,
    )
    engine = sa.create_engine(str(url), isolation_level='AUTOCOMMIT')
    engine.execute('DROP DATABASE IF EXISTS "{}"'.format(database))
    engine.execute('CREATE DATABASE "{}"'.format(database))

    url = sa.engine.url.URL(
        'postgresql',
        username=username,
        host=host,
        password=password,
        database=database,
    )
    engine = sa.create_engine(str(url))
    script_text = script.read()
    with engine.begin() as con:
        con.execute(script_text)

    table_paths = [
        os.path.join(data_directory, '{}.csv'.format(table))
        for table in tables
    ]
    dtype = {'bool_col': np.bool_}
    for table, path in zip(tables, table_paths):
        df = pd.read_csv(path, index_col=None, header=0, dtype=dtype)
        df.to_sql(table, engine, index=False, if_exists='append')
    engine = sa.create_engine(str(url), isolation_level='AUTOCOMMIT')
    engine.execute('VACUUM FULL ANALYZE')


@cli.command()
@click.argument('tables', nargs=-1)
@click.option('-S', '--script', type=click.File('rt'), required=True)
@click.option(
    '-d', '--database',
    default=os.environ.get('IBIS_TEST_SQLITE_DB_PATH', 'ibis_testing.db')
)
@click.option(
    '-D', '--data-directory',
    default=tempfile.gettempdir(), type=click.Path(exists=True)
)
def sqlite(script, tables, database, data_directory):
    database = os.path.abspath(database)
    if os.path.exists(database):
        try:
            os.remove(database)
        except OSError:
            pass
    engine = sa.create_engine('sqlite:///{}'.format(database))
    script_text = script.read()
    with engine.begin() as con:
        con.connection.connection.executescript(script_text)
    table_paths = [
        os.path.join(data_directory, '{}.csv'.format(table))
        for table in tables
    ]
    click.echo(tables)
    click.echo(table_paths)
    for table, path in zip(tables, table_paths):
        df = pd.read_csv(path, index_col=None, header=0)
        with engine.begin() as con:
            df.to_sql(table, con, index=False, if_exists='append')
    engine.execute('VACUUM')
    # SQLite has no VACUUM ANALYZE; ANALYZE is a separate statement
    engine.execute('ANALYZE')


if os.environ.get('APPVEYOR', None) is not None:
    curl = sh.Command('C:\\Tools\\curl\\bin\\curl.exe')
else:
    curl = sh.curl


@cli.command()
@click.argument(
    'base_url',
    required=False,
    default='https://storage.googleapis.com/ibis-ci-data'
)
@click.option('-d', '--data', multiple=True)
@click.option('-D', '--directory', default='.', type=click.Path(exists=False))
def download(base_url, data, directory):
    if not data:
        data = ('ibis-testing-data.tar.gz',)

    if not os.path.exists(directory):
        os.mkdir(directory)

    for piece in data:
        data_url = '{}/{}'.format(base_url, piece)
        path = os.path.join(directory, piece)

        curl(
            data_url, o=path, L=True,
            _out=click.get_binary_stream('stdout'),
            _err=click.get_binary_stream('stderr'),
        )

        if piece.endswith(('.tar', '.gz', '.bz2', '.xz')):
            # 'r:*' lets tarfile detect the compression instead of assuming
            # gzip for every matched extension
            with tarfile.open(path, mode='r:*') as f:
                f.extractall(path=directory)


def parse_env(ctx, param, values):
    pairs = []
    for envar in values:
        try:
            name, value = envar.split('=', 1)
        except ValueError:
            raise click.ClickException(
                'Environment variables must be of the form NAME=VALUE. '
                '{} is not in this format'.format(envar)
            )
        pairs.append((name, value))
    return dict(pairs)


@cli.command()
@click.argument('data_directory', type=click.Path(exists=True))
@click.option('-e', '--environment', multiple=True, callback=parse_env)
def env(data_directory, environment):
    envars = dict([
        ('IBIS_TEST_IMPALA_HOST', 'impala'),
        ('IBIS_TEST_NN_HOST', 'impala'),
        ('IBIS_TEST_IMPALA_PORT', 21050),
        ('IBIS_TEST_WEBHDFS_PORT', 50070),
        ('IBIS_TEST_WEBHDFS_USER', 'ubuntu'),
        (
            'IBIS_TEST_SQLITE_DB_PATH',
            os.path.join(data_directory, 'ibis_testing.db'),
        ),
        (
            'DIAMONDS_CSV',
            os.path.join(data_directory, 'diamonds.csv')
        ),
        (
            'BATTING_CSV',
            os.path.join(data_directory, 'batting.csv')
        ),
        (
            'AWARDS_PLAYERS_CSV',
            os.path.join(data_directory, 'awards_players.csv')
        ),
        (
            'FUNCTIONAL_ALLTYPES_CSV',
            os.path.join(data_directory, 'functional_alltypes.csv')
        ),
        ('IBIS_TEST_POSTGRES_DB', 'ibis_testing'),
        ('IBIS_POSTGRES_USER', getpass.getuser()),
        ('IBIS_POSTGRES_PASS', ''),
    ])
    envars.update(environment)
    string = '\n'.join(
        '='.join((name, str(value)))
        for name, value in sorted(envars.items(), key=operator.itemgetter(0))
    )
    click.echo(string)


if __name__ == '__main__':
    cli()
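
A quick smoke-test sketch for the CLI above; the module name `datamgr` and the data path are assumptions, not part of the diff:

```python
# Minimal sketch using click's test runner; assumes this script is saved as
# datamgr.py and that /tmp/ibis-testing-data already exists.
from click.testing import CliRunner

from datamgr import cli  # hypothetical module name

runner = CliRunner()
result = runner.invoke(cli, ['env', '/tmp/ibis-testing-data'])
assert result.exit_code == 0
print(result.output)  # sorted NAME=VALUE lines, one per variable
```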
@@ -0,0 +1,108 @@
DROP TABLE IF EXISTS diamonds CASCADE;

CREATE TABLE diamonds (
    carat FLOAT,
    cut TEXT,
    color TEXT,
    clarity TEXT,
    depth FLOAT,
    "table" FLOAT,
    price BIGINT,
    x FLOAT,
    y FLOAT,
    z FLOAT
);

DROP TABLE IF EXISTS batting CASCADE;

CREATE TABLE batting (
    "playerID" TEXT,
    "yearID" BIGINT,
    stint BIGINT,
    "teamID" TEXT,
    "lgID" TEXT,
    "G" BIGINT,
    "AB" BIGINT,
    "R" BIGINT,
    "H" BIGINT,
    "X2B" BIGINT,
    "X3B" BIGINT,
    "HR" BIGINT,
    "RBI" BIGINT,
    "SB" BIGINT,
    "CS" BIGINT,
    "BB" BIGINT,
    "SO" BIGINT,
    "IBB" BIGINT,
    "HBP" BIGINT,
    "SH" BIGINT,
    "SF" BIGINT,
    "GIDP" BIGINT
);

DROP TABLE IF EXISTS awards_players CASCADE;

CREATE TABLE awards_players (
    "playerID" TEXT,
    "awardID" TEXT,
    "yearID" BIGINT,
    "lgID" TEXT,
    tie TEXT,
    notes TEXT
);

DROP TABLE IF EXISTS functional_alltypes CASCADE;

CREATE TABLE functional_alltypes (
    "index" BIGINT,
    "Unnamed: 0" BIGINT,
    id INTEGER,
    bool_col BOOLEAN,
    tinyint_col SMALLINT,
    smallint_col SMALLINT,
    int_col INTEGER,
    bigint_col BIGINT,
    float_col REAL,
    double_col DOUBLE PRECISION,
    date_string_col TEXT,
    string_col TEXT,
    timestamp_col TIMESTAMP WITHOUT TIME ZONE,
    year INTEGER,
    month INTEGER
);

CREATE INDEX "ix_functional_alltypes_index" ON functional_alltypes ("index");

DROP TABLE IF EXISTS tzone CASCADE;

CREATE TABLE tzone (
    ts TIMESTAMP WITH TIME ZONE,
    key TEXT,
    value DOUBLE PRECISION
);

INSERT INTO tzone
SELECT
    CAST('2017-05-28 11:01:31.000400' AS TIMESTAMP WITH TIME ZONE) +
        t * INTERVAL '1 day 1 microsecond' AS ts,
    CHR(97 + t) AS key,
    t + t / 10.0 AS value
FROM generate_series(0, 9) AS t;

DROP TABLE IF EXISTS array_types CASCADE;

CREATE TABLE IF NOT EXISTS array_types (
    x BIGINT[],
    y TEXT[],
    z DOUBLE PRECISION[],
    grouper TEXT,
    scalar_column DOUBLE PRECISION
);

INSERT INTO array_types VALUES
    (ARRAY[1, 2, 3], ARRAY['a', 'b', 'c'], ARRAY[1.0, 2.0, 3.0], 'a', 1.0),
    (ARRAY[4, 5], ARRAY['d', 'e'], ARRAY[4.0, 5.0], 'a', 2.0),
    (ARRAY[6, NULL], ARRAY['f', NULL], ARRAY[6.0, NULL], 'a', 3.0),
    (ARRAY[NULL, 1, NULL], ARRAY[NULL, 'a', NULL], ARRAY[]::DOUBLE PRECISION[], 'b', 4.0),
    (ARRAY[2, NULL, 3], ARRAY['b', NULL, 'c'], NULL, 'b', 5.0),
    (ARRAY[4, NULL, NULL, 5], ARRAY['d', NULL, NULL, 'e'], ARRAY[4.0, NULL, NULL, 5.0], 'c', 6.0);
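
A small sketch (connection parameters are placeholders) confirming that the arrays above come back as Python lists through psycopg2, which is what the ibis array tests depend on:

```python
# Minimal sketch, assuming the schema above was loaded into ibis_testing on
# localhost; psycopg2 maps BIGINT[]/TEXT[] columns to Python lists.
import psycopg2

con = psycopg2.connect(dbname='ibis_testing', host='localhost')
cur = con.cursor()
cur.execute('SELECT x, y, grouper FROM array_types ORDER BY scalar_column')
print(cur.fetchone())  # ([1, 2, 3], ['a', 'b', 'c'], 'a')
```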
@@ -0,0 +1,28 @@
channels:
  - conda-forge
dependencies:
  - click
  - cmake
  - enum34
  - flake8
  - graphviz
  - impyla>=0.13.7
  - mock
  - multipledispatch
  - numpy=1.10.0
  - pandas=0.18.1
  - psycopg2
  - pytest
  - python=2.7
  - python-graphviz
  - sh
  - six
  - sqlalchemy>=1.0.0
  - thrift<=0.9.3
  - thriftpy<=0.3.9
  - toolz
  - clickhouse-driver>=0.0.8
  - clickhouse-sqlalchemy
  - pip:
      - hdfs>=2.0.0
      - google-cloud-bigquery
@@ -0,0 +1,26 @@
channels:
  - conda-forge
dependencies:
  - click
  - cmake
  - flake8
  - graphviz
  - impyla>=0.13.7
  - multipledispatch
  - numpy=1.11.0
  - pandas=0.19.0
  - psycopg2
  - pytest
  - python=3.4
  - python-graphviz
  - sh
  - six
  - sqlalchemy>=1.0.0
  - thrift<=0.9.3
  - thriftpy<=0.3.9
  - toolz
  - pip:
      - hdfs>=2.0.0
      - clickhouse-driver>=0.0.8
      - clickhouse-sqlalchemy
      - google-cloud-bigquery
@@ -0,0 +1,26 @@
channels:
  - conda-forge
dependencies:
  - click
  - cmake
  - flake8
  - graphviz
  - impyla>=0.13.7
  - multipledispatch
  - numpy=1.12.0
  - pandas
  - psycopg2
  - pytest
  - python=3.5
  - python-graphviz
  - six
  - sh
  - sqlalchemy>=1.0.0
  - thrift<=0.9.3
  - thriftpy<=0.3.9
  - toolz
  - clickhouse-driver>=0.0.8
  - clickhouse-sqlalchemy
  - pip:
      - hdfs>=2.0.0
      - google-cloud-bigquery
@@ -0,0 +1,26 @@
channels:
  - conda-forge
dependencies:
  - click
  - cmake
  - flake8
  - graphviz
  - impyla>=0.13.7
  - multipledispatch
  - numpy
  - pandas
  - psycopg2
  - pytest
  - python=3.6
  - python-graphviz
  - sh
  - six
  - sqlalchemy>=1.0.0
  - thrift
  - thriftpy<=0.3.9
  - toolz
  - clickhouse-driver>=0.0.8
  - clickhouse-sqlalchemy
  - pip:
      - hdfs>=2.0.0
      - google-cloud-bigquery
@@ -0,0 +1,30 @@
channels:
  - conda-forge
dependencies:
  - click
  - cmake
  - flake8
  - graphviz
  - impyla>=0.13.7
  - ipython
  - matplotlib
  - multipledispatch
  - numpy
  - numpydoc
  - pandas
  - psycopg2
  - pytest
  - python=3.6
  - python-graphviz
  - sh
  - six
  - sphinx_rtd_theme
  - sqlalchemy>=1.0.0
  - thrift
  - thriftpy<=0.3.9
  - toolz
  - clickhouse-driver>=0.0.8
  - clickhouse-sqlalchemy
  - pip:
      - hdfs>=2.0.0
      - google-cloud-bigquery
@@ -0,0 +1,67 @@
CREATE TABLE functional_alltypes (
    "index" BIGINT,
    "Unnamed: 0" BIGINT,
    id BIGINT,
    bool_col BOOLEAN,
    tinyint_col BIGINT,
    smallint_col BIGINT,
    int_col BIGINT,
    bigint_col BIGINT,
    float_col FLOAT,
    double_col FLOAT,
    date_string_col TEXT,
    string_col TEXT,
    timestamp_col TEXT,
    year BIGINT,
    month BIGINT,
    CHECK (bool_col IN (0, 1))
);

CREATE INDEX ix_functional_alltypes_index ON "functional_alltypes" ("index");

CREATE TABLE awards_players (
    "playerID" TEXT,
    "awardID" TEXT,
    "yearID" BIGINT,
    "lgID" TEXT,
    tie TEXT,
    notes TEXT
);

CREATE TABLE batting (
    "playerID" TEXT,
    "yearID" BIGINT,
    stint BIGINT,
    "teamID" TEXT,
    "lgID" TEXT,
    "G" BIGINT,
    "AB" BIGINT,
    "R" BIGINT,
    "H" BIGINT,
    "X2B" BIGINT,
    "X3B" BIGINT,
    "HR" BIGINT,
    "RBI" BIGINT,
    "SB" BIGINT,
    "CS" BIGINT,
    "BB" BIGINT,
    "SO" BIGINT,
    "IBB" BIGINT,
    "HBP" BIGINT,
    "SH" BIGINT,
    "SF" BIGINT,
    "GIDP" BIGINT
);

CREATE TABLE diamonds (
    carat FLOAT,
    cut TEXT,
    color TEXT,
    clarity TEXT,
    depth FLOAT,
    "table" FLOAT,
    price BIGINT,
    x FLOAT,
    y FLOAT,
    z FLOAT
);
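
For reference, a sketch of consuming this file the way the `sqlite` command in the data manager script does; the schema path is a placeholder. The `CHECK (bool_col IN (0, 1))` constraint stands in for a boolean type, which SQLite lacks:

```python
# Minimal sketch; 'schema.sql' is a placeholder path for the DDL above.
import sqlite3

con = sqlite3.connect(':memory:')
with open('schema.sql') as f:
    con.executescript(f.read())

con.execute(
    'INSERT INTO diamonds (carat, cut, price) VALUES (?, ?, ?)',
    (0.23, 'Ideal', 326),
)
print(con.execute('SELECT count(*) FROM diamonds').fetchone())  # (1,)
```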
@@ -1,2 +1,4 @@
sphinx_rtd_theme
numpydoc
ipython
matplotlib
@@ -0,0 +1,44 @@
import ibis.common as com
from ibis.config import options  # noqa: F401
from ibis.bigquery.client import BigQueryClient


def compile(expr):
    """
    Force compilation of an expression as though it were an expression
    depending on BigQuery. Note you can also call ``expr.compile()``.

    Returns
    -------
    compiled : string
    """
    from .compiler import to_sql
    return to_sql(expr)


def verify(expr):
    """
    Determine if an expression can be successfully translated to execute
    on BigQuery.
    """
    try:
        compile(expr)
        return True
    except com.TranslationError:
        return False


def connect(project_id, dataset_id):
    """Create a BigQueryClient for use with Ibis.

    Parameters
    ----------
    project_id : str
    dataset_id : str

    Returns
    -------
    BigQueryClient
    """
    return BigQueryClient(project_id, dataset_id)
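
Because compilation is local, the helpers above can be exercised against an unbound table with no credentials; the expected SQL below mirrors the `test_compile_toplevel` test later in this diff:

```python
import ibis

t = ibis.table([('foo', 'double')], name='t0')
print(ibis.bigquery.compile(t.foo.sum()))
# SELECT sum(`foo`) AS `sum`
# FROM t0
assert ibis.bigquery.verify(t.foo.sum())
```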
@@ -0,0 +1,210 @@
import re

import pandas as pd

import ibis
import ibis.expr.types as ir
import ibis.expr.datatypes as dt
from ibis.client import Database, Query, SQLClient
from ibis.bigquery import compiler as comp
import google.cloud.bigquery


def _ensure_split(table_id, dataset_id):
    split = table_id.split('.')
    if len(split) > 1:
        assert len(split) == 2
        if dataset_id:
            raise ValueError(
                "Can't pass a fully qualified table name *AND* a dataset_id"
            )
        (dataset_id, table_id) = split
    return (table_id, dataset_id)


class BigQueryCursor(object):
    """Cursor to allow the BigQuery client to reuse machinery in
    ibis/client.py.
    """

    def __init__(self, query):
        self.query = query

    def fetchall(self):
        return list(self.query.fetch_data())

    @property
    def columns(self):
        return [field.name for field in self.query.schema]

    def __enter__(self):
        # For compatibility when constructed from Query.execute()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        pass


class BigQuery(Query):

    def _fetch(self, cursor):
        return pd.DataFrame(cursor.fetchall(), columns=cursor.columns)


class BigQueryAPIProxy(object):

    def __init__(self, project_id):
        self._client = google.cloud.bigquery.Client(project_id)

    @property
    def client(self):
        return self._client

    @property
    def project_id(self):
        return self.client.project

    def get_datasets(self):
        return list(self.client.list_datasets())

    def get_dataset(self, dataset_id):
        return self.client.dataset(dataset_id)

    def get_table(self, table_id, dataset_id, reload=True):
        (table_id, dataset_id) = _ensure_split(table_id, dataset_id)
        table = self.client.dataset(dataset_id).table(table_id)
        if reload:
            table.reload()
        return table

    def get_schema(self, table_id, dataset_id):
        return self.get_table(table_id, dataset_id).schema


class BigQueryDatabase(Database):
    pass


class BigQueryClient(SQLClient):

    sync_query = BigQuery
    database_class = BigQueryDatabase
    proxy_class = BigQueryAPIProxy

    def __init__(self, project_id, dataset_id):
        self._proxy = self.__class__.proxy_class(project_id)
        self._dataset_id = dataset_id

    @property
    def project_id(self):
        return self._proxy.project_id

    @property
    def dataset_id(self):
        return self._dataset_id

    @property
    def _table_expr_klass(self):
        return ir.TableExpr

    def _build_ast(self, expr, params=None):
        return comp.build_ast(expr, params=params)

    def _fully_qualified_name(self, name, database):
        dataset_id = database or self.dataset_id
        return dataset_id + '.' + name

    def _get_table_schema(self, qualified_name):
        return self.get_schema(qualified_name)

    def _execute(self, stmt, results=True):
        # TODO(phillipc): Allow **kwargs in calls to execute
        query = self._proxy.client.run_sync_query(stmt)
        query.use_legacy_sql = False
        query.run()
        return BigQueryCursor(query)

    def database(self, name=None):
        if name is None:
            name = self.dataset_id
        return self.database_class(name, self)

    @property
    def current_database(self):
        return self.database(self.dataset_id)

    def set_database(self, name):
        self._dataset_id = name

    def exists_database(self, name):
        return self._proxy.get_dataset(name).exists()

    def list_databases(self, like=None):
        results = [dataset.name for dataset in self._proxy.get_datasets()]
        if like:
            results = [
                dataset_name for dataset_name in results
                if re.match(like, dataset_name)
            ]
        return results

    def exists_table(self, name, database=None):
        (table_id, dataset_id) = _ensure_split(name, database)
        return self._proxy.get_table(table_id, dataset_id).exists()

    def list_tables(self, like=None, database=None):
        dataset = self._proxy.get_dataset(database or self.dataset_id)
        result = [table.name for table in dataset.list_tables()]
        if like:
            result = [
                table_name for table_name in result
                if re.match(like, table_name)
            ]
        return result

    def get_schema(self, name, database=None):
        (table_id, dataset_id) = _ensure_split(name, database)
        bq_table = self._proxy.get_table(table_id, dataset_id)
        return bigquery_table_to_ibis_schema(bq_table)


_DTYPE_TO_IBIS_TYPE = {
    'INT64': dt.int64,
    'FLOAT64': dt.double,
    'BOOL': dt.boolean,
    'STRING': dt.string,
    'DATE': dt.date,
    # FIXME: enforce no tz info
    'DATETIME': dt.timestamp,
    'TIME': dt.time,
    'TIMESTAMP': dt.timestamp,
    'BYTES': dt.binary,
}


_LEGACY_TO_STANDARD = {
    'INTEGER': 'INT64',
    'FLOAT': 'FLOAT64',
    'BOOLEAN': 'BOOL',
}


def _discover_type(field):
    typ = field.field_type
    if typ == 'RECORD':
        fields = field.fields
        assert fields
        names = [el.name for el in fields]
        ibis_types = [_discover_type(el) for el in fields]
        ibis_type = dt.Struct(names, ibis_types)
    else:
        ibis_type = _LEGACY_TO_STANDARD.get(typ, typ)
        ibis_type = _DTYPE_TO_IBIS_TYPE.get(ibis_type, ibis_type)
    if field.mode == 'REPEATED':
        ibis_type = dt.Array(ibis_type)
    return ibis_type


def bigquery_table_to_ibis_schema(table):
    pairs = ((el.name, _discover_type(el)) for el in table.schema)
    return ibis.schema(pairs)
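
As a behavior sketch for the name handling in `_ensure_split` above (the names are placeholders):

```python
assert _ensure_split('my_table', 'my_dataset') == ('my_table', 'my_dataset')
assert _ensure_split('my_dataset.my_table', None) == ('my_table', 'my_dataset')

# Supplying both a qualified name and an explicit dataset_id is an error.
try:
    _ensure_split('my_dataset.my_table', 'other_dataset')
except ValueError:
    pass
```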
@@ -0,0 +1,154 @@
import ibis.sql.compiler as comp
import ibis.expr.operations as ops
from ibis.impala.compiler import ImpalaSelect
from ibis.impala import compiler as impala_compiler


class BigQuerySelectBuilder(comp.SelectBuilder):

    @property
    def _select_class(self):
        return BigQuerySelect


class BigQueryQueryBuilder(comp.QueryBuilder):

    select_builder = BigQuerySelectBuilder

    def __init__(self, expr, context=None, params=None):
        super(BigQueryQueryBuilder, self).__init__(
            expr, context=context, params=params
        )

    def _make_context(self):
        return BigQueryContext()

    @property
    def _union_class(self):
        # return BigQueryUnion
        raise NotImplementedError()


def build_ast(expr, context=None, params=None):
    builder = BigQueryQueryBuilder(expr, context=context, params=params)
    return builder.get_result()


def _get_query(expr, context, params=None):
    ast = build_ast(expr, context, params=params)
    (query, rest) = (ast.queries[0], ast.queries[1:])
    assert not rest
    return query


def to_sql(expr, context=None, params=None):
    query = _get_query(expr, context, params=params)
    compiled = query.compile()
    return compiled


class BigQueryContext(comp.QueryContext):

    def _to_sql(self, expr, ctx):
        return to_sql(expr, context=ctx)


class BigQuerySelect(ImpalaSelect):

    @property
    def translator(self):
        return BigQueryExprTranslator


def _extract_field(sql_attr):
    def extract_field_formatter(translator, expr):
        op = expr.op()
        arg = translator.translate(op.args[0])
        return "extract({0!s} from {1!s})".format(sql_attr, arg)
    return extract_field_formatter


def _ifnull(translator, expr):
    (a, b) = (translator.translate(arg) for arg in expr.op().args)
    return ('CASE WHEN {0!s} IS NULL THEN {1!s} ELSE {0!s} END'
            .format(a, b))


_sql_type_names = {
    'int8': 'int64',
    'int16': 'int64',
    'int32': 'int64',
    'int64': 'int64',
    'float': 'float64',
    'double': 'float64',
    'string': 'string',
    'boolean': 'boolean',
    'timestamp': 'timestamp',
}


def _cast(translator, expr):
    op = expr.op()
    arg, target_type = op.args
    arg_formatted = translator.translate(arg)
    sql_type = _sql_type_names[target_type.name.lower()]
    return 'CAST({0!s} AS {1!s})'.format(arg_formatted, sql_type)


def _struct_field(translator, expr):
    arg, field = expr.op().args
    arg_formatted = translator.translate(arg)
    return '{}.`{}`'.format(arg_formatted, field)


def _array_collect(translator, expr):
    return 'ARRAY_AGG({})'.format(*map(translator.translate, expr.op().args))


def _array_concat(translator, expr):
    return 'ARRAY_CONCAT({})'.format(
        ', '.join(map(translator.translate, expr.op().args))
    )


def _array_index(translator, expr):
    # SAFE_OFFSET returns NULL if the index is out of bounds
    return '{}[SAFE_OFFSET({})]'.format(
        *map(translator.translate, expr.op().args)
    )


def _array_length(translator, expr):
    return 'ARRAY_LENGTH({})'.format(
        *map(translator.translate, expr.op().args)
    )


_operation_registry = impala_compiler._operation_registry.copy()
_operation_registry.update({
    ops.ExtractYear: _extract_field('year'),
    ops.ExtractMonth: _extract_field('month'),
    ops.ExtractDay: _extract_field('day'),
    ops.ExtractHour: _extract_field('hour'),
    ops.ExtractMinute: _extract_field('minute'),
    ops.ExtractSecond: _extract_field('second'),
    ops.ExtractMillisecond: _extract_field('millisecond'),

    ops.IfNull: _ifnull,
    ops.Cast: _cast,

    ops.StructField: _struct_field,

    ops.ArrayCollect: _array_collect,
    ops.ArrayConcat: _array_concat,
    ops.ArrayIndex: _array_index,
    ops.ArrayLength: _array_length,

    # BigQuery doesn't have these operations built in.
    # ops.ArrayRepeat: _array_repeat,
    # ops.ArraySlice: _array_slice,
})


class BigQueryExprTranslator(impala_compiler.ImpalaExprTranslator):
    _registry = _operation_registry
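
A local sketch of the registry above; `to_sql` runs entirely client-side, so no BigQuery project is needed, and the exact formatting of the emitted SELECT is illustrative:

```python
import ibis

t = ibis.table([('ts', 'timestamp')], name='t')

# ops.ExtractYear is routed through _extract_field('year') above, so the
# compiled query contains extract(year from `ts`).
print(to_sql(t.ts.year()))
```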
@@ -0,0 +1,34 @@
import os

import pytest

import ibis


PROJECT_ID = os.environ.get('GOOGLE_BIGQUERY_PROJECT_ID')
DATASET_ID = 'testing'


@pytest.fixture(scope='session')
def client():
    ga = pytest.importorskip('google.auth')

    try:
        return ibis.bigquery.connect(PROJECT_ID, DATASET_ID)
    except ga.exceptions.DefaultCredentialsError:
        pytest.skip("no credentials found, skipping")


@pytest.fixture(scope='session')
def alltypes(client):
    return client.table('functional_alltypes')


@pytest.fixture(scope='session')
def df(alltypes):
    return alltypes.execute()


@pytest.fixture(scope='session')
def struct_table(client):
    return client.table('struct_table')
@@ -0,0 +1,144 @@
import pytest

import numpy as np
import pandas as pd
import pandas.util.testing as tm

import ibis
import ibis.expr.types as ir


pytestmark = pytest.mark.bigquery
pytest.importorskip('google.cloud.bigquery')


def test_table(alltypes):
    assert isinstance(alltypes, ir.TableExpr)


def test_column_execute(alltypes, df):
    col_name = 'float_col'
    expr = alltypes[col_name]
    result = expr.execute()
    expected = df[col_name]
    tm.assert_series_equal(result, expected)


def test_literal_execute(client):
    expected = '1234'
    expr = ibis.literal(expected)
    result = client.execute(expr)
    assert result == expected


def test_simple_aggregate_execute(alltypes, df):
    col_name = 'float_col'
    expr = alltypes[col_name].sum()
    result = expr.execute()
    expected = df[col_name].sum()
    np.testing.assert_allclose(result, expected)


def test_list_tables(client):
    assert len(client.list_tables(like='functional_alltypes')) == 1


def test_current_database(client):
    assert client.current_database.name == 'testing'
    assert client.current_database.name == client.dataset_id
    assert client.current_database.tables == client.list_tables()


def test_database(client):
    database = client.database(client.dataset_id)
    assert database.list_tables() == client.list_tables()


def test_database_layer(client):
    bq_dataset = client._proxy.get_dataset(client.dataset_id)
    actual = client.list_tables()
    expected = [el.name for el in bq_dataset.list_tables()]
    assert sorted(actual) == sorted(expected)


def test_compile_verify(alltypes):
    column = alltypes['string_col']
    unsupported_expr = column.replace('foo', 'bar')
    supported_expr = column.lower()
    assert not unsupported_expr.verify()
    assert supported_expr.verify()


def test_compile_toplevel():
    t = ibis.table([('foo', 'double')], name='t0')

    # it works!
    expr = t.foo.sum()
    result = ibis.bigquery.compile(expr)
    # FIXME: remove quotes because bigquery can't use anything that needs
    # quoting?
    expected = """\
SELECT sum(`foo`) AS `sum`
FROM t0"""
    assert str(result) == expected


def test_struct_field_access(struct_table):
    expr = struct_table.struct_col.string_field
    result = expr.execute()
    expected = pd.Series([None, 'a'], name='tmp')
    tm.assert_series_equal(result, expected)


def test_array_index(struct_table):
    expr = struct_table.array_of_structs_col[1]
    result = expr.execute()
    expected = pd.Series(
        [
            {'int_field': None, 'string_field': None},
            {'int_field': None, 'string_field': 'hijklmnop'},
        ],
        name='tmp',
    )
    tm.assert_series_equal(result, expected)


def test_array_concat(struct_table):
    c = struct_table.array_of_structs_col
    expr = c + c
    result = expr.execute()
    expected = pd.Series(
        [
            [
                {'int_field': 12345, 'string_field': 'abcdefg'},
                {'int_field': None, 'string_field': None},
                {'int_field': 12345, 'string_field': 'abcdefg'},
                {'int_field': None, 'string_field': None},
            ],
            [
                {'int_field': 12345, 'string_field': 'abcdefg'},
                {'int_field': None, 'string_field': 'hijklmnop'},
                {'int_field': 12345, 'string_field': 'abcdefg'},
                {'int_field': None, 'string_field': 'hijklmnop'},
            ],
        ],
        name='tmp',
    )
    tm.assert_series_equal(result, expected)


def test_array_length(struct_table):
    expr = struct_table.array_of_structs_col.length()
    result = expr.execute()
    expected = pd.Series([2, 2], name='tmp')
    tm.assert_series_equal(result, expected)


@pytest.mark.xfail
def test_array_collect(struct_table):
    key = struct_table.array_of_structs_col[0].string_field
    expr = struct_table.groupby(key).aggregate(
        foo=lambda t: t.array_of_structs_col[0].int_field.collect()
    )
    result = expr.execute()
    assert result == -1
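
For orientation, the `struct_table` fixture these tests rely on implies a schema along the following lines; this is an inference from the expected values above, not something stated in the diff:

```python
import ibis.expr.datatypes as dt

# Assumed schema of struct_table, reconstructed from the assertions above.
struct = dt.Struct(['int_field', 'string_field'], [dt.int64, dt.string])
schema = {
    'struct_col': struct,
    'array_of_structs_col': dt.Array(struct),
}
```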
@@ -0,0 +1,81 @@
import ibis.common as com

from ibis.config import options
from ibis.clickhouse.client import ClickhouseClient


def compile(expr):
    """
    Force compilation of an expression as though it were an expression
    depending on Clickhouse. Note you can also call ``expr.compile()``.

    Returns
    -------
    compiled : string
    """
    from .compiler import to_sql
    return to_sql(expr)


def verify(expr):
    """
    Determine if an expression can be successfully translated to execute
    on Clickhouse.
    """
    try:
        compile(expr)
        return True
    except com.TranslationError:
        return False


def connect(host='localhost', port=9000, database='default', user='default',
            password='', client_name='ibis', compression=False):
    """Create a ClickhouseClient for use with Ibis.

    Parameters
    ----------
    host : str, optional
        Host name of the clickhouse server
    port : int, optional
        Clickhouse server's port
    database : str, optional
        Default database when executing queries
    user : str, optional
        User to authenticate with
    password : str, optional
        Password to authenticate with
    client_name : str, optional
        This will appear in clickhouse server logs
    compression : bool or str, optional
        Whether or not to use compression. Default is False.
        Possible choices: lz4, lz4hc, quicklz, zstd.
        True is equivalent to 'lz4'.

    Examples
    --------
    >>> import ibis
    >>> import os
    >>> clickhouse_host = os.environ.get('IBIS_TEST_CLICKHOUSE_HOST',
    ...                                  'localhost')
    >>> clickhouse_port = int(os.environ.get('IBIS_TEST_CLICKHOUSE_PORT',
    ...                                      9000))
    >>> client = ibis.clickhouse.connect(
    ...     host=clickhouse_host,
    ...     port=clickhouse_port
    ... )
    >>> client  # doctest: +ELLIPSIS
    <ibis.clickhouse.client.ClickhouseClient object at 0x...>

    Returns
    -------
    ClickhouseClient
    """
    client = ClickhouseClient(host, port=port, database=database, user=user,
                              password=password, client_name=client_name,
                              compression=compression)
    if options.default_backend is None:
        options.default_backend = client

    return client
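
A minimal sketch of `verify` above; translation happens locally, so no running ClickHouse server is needed (assuming `lower` is an operation this backend supports):

```python
import ibis

t = ibis.table([('s', 'string')], name='t')
assert ibis.clickhouse.verify(t.s.lower())
```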
@@ -0,0 +1,304 @@
import re
import pandas as pd

import ibis.common as com
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
import ibis.expr.types as ir

from ibis.config import options
from ibis.compat import zip as czip
from ibis.client import Query, Database, DatabaseEntity, SQLClient
from ibis.clickhouse.compiler import build_ast
from ibis.util import log
from ibis.sql.compiler import DDL

from clickhouse_driver.client import Client as _DriverClient

from .types import clickhouse_to_pandas, clickhouse_to_ibis


fully_qualified_re = re.compile(r"(.*)\.(?:`(.*)`|(.*))")


class ClickhouseDatabase(Database):
    pass


class ClickhouseQuery(Query):

    def execute(self):
        # synchronous by default
        cursor = self.client._execute(self.compiled_ddl)
        result = self._fetch(cursor)
        return self._wrap_result(result)

    def _fetch(self, cursor):
        data, columns = cursor
        names, types = czip(*columns)

        cols = {}
        for (col, name, db_type) in czip(data, names, types):
            dtype = self._db_type_to_dtype(db_type, name)
            try:
                cols[name] = pd.Series(col, dtype=dtype)
            except TypeError:
                cols[name] = pd.Series(col)

        return pd.DataFrame(cols, columns=names)

    def _db_type_to_dtype(self, db_type, column):
        return clickhouse_to_pandas[db_type]


class ClickhouseClient(SQLClient):
    """An Ibis client interface that uses Clickhouse"""

    database_class = ClickhouseDatabase
    sync_query = ClickhouseQuery

    def __init__(self, *args, **kwargs):
        self.con = _DriverClient(*args, **kwargs)

    def _build_ast(self, expr, params=None):
        return build_ast(expr, params=params)

    @property
    def current_database(self):
        # might be better to use driver.Connection instead of Client
        return self.con.connection.database

    @property
    def _table_expr_klass(self):
        return ClickhouseTable

    def log(self, msg):
        log(msg)

    def close(self):
        """Close Clickhouse connection and drop any temporary objects"""
        self.con.disconnect()

    def _execute(self, query):
        if isinstance(query, DDL):
            query = query.compile()
        self.log(query)

        return self.con.execute(query, columnar=True, with_column_types=True)

    def _fully_qualified_name(self, name, database):
        if bool(fully_qualified_re.search(name)):
            return name

        database = database or self.current_database
        return '{0}.`{1}`'.format(database, name)

    def list_tables(self, like=None, database=None):
        """
        List tables in the current (or indicated) database. Like the SHOW
        TABLES command in the clickhouse-shell.

        Parameters
        ----------
        like : string, default None
            e.g. 'foo*' to match all tables starting with 'foo'
        database : string, default None
            If not passed, uses the current/default database

        Returns
        -------
        tables : list of strings
        """
        statement = 'SHOW TABLES'
        if database:
            statement += " FROM `{0}`".format(database)
        if like:
            m = fully_qualified_re.match(like)
            if m:
                database, quoted, unquoted = m.groups()
                like = quoted or unquoted
                return self.list_tables(like=like, database=database)
            statement += " LIKE '{0}'".format(like)

        return self._execute(statement)

    def set_database(self, name):
        """
        Set the default database scope for the client
        """
        self.con.database = name

    def exists_database(self, name):
        """
        Checks if a given database exists

        Parameters
        ----------
        name : string
            Database name

        Returns
        -------
        if_exists : boolean
        """
        return len(self.list_databases(like=name)) > 0

    def list_databases(self, like=None):
        """
        List databases in the Clickhouse cluster.
        Like the SHOW DATABASES command in the clickhouse-shell.

        Parameters
        ----------
        like : string, default None
            e.g. 'foo*' to match all tables starting with 'foo'

        Returns
        -------
        databases : list of strings
        """
        statement = 'SELECT name FROM system.databases'
        if like:
            statement += " WHERE name LIKE '{0}'".format(like)

        return self._execute(statement)

    def get_schema(self, table_name, database=None):
        """
        Return a Schema object for the indicated table and database

        Parameters
        ----------
        table_name : string
            May be fully qualified
        database : string, default None

        Returns
        -------
        schema : ibis Schema
        """
        qualified_name = self._fully_qualified_name(table_name, database)
        query = 'DESC {0}'.format(qualified_name)
        data, _ = self._execute(query)

        names, types = data[:2]
        ibis_types = map(clickhouse_to_ibis.get, types)

        return dt.Schema(names, ibis_types)

    @property
    def client_options(self):
        return self.con.options

    def set_options(self, options):
        self.con.set_options(options)

    def reset_options(self):
        # Must nuke all cursors
        raise NotImplementedError

    def exists_table(self, name, database=None):
        """
        Determine if the indicated table or view exists

        Parameters
        ----------
        name : string
        database : string, default None

        Returns
        -------
        if_exists : boolean
        """
        return len(self.list_tables(like=name, database=database)) > 0

    def _ensure_temp_db_exists(self):
        name = options.clickhouse.temp_db
        if not self.exists_database(name):
            self.create_database(name, force=True)

    def _get_table_schema(self, tname):
        return self.get_schema(tname)

    def _get_schema_using_query(self, query):
        _, types = self._execute(query)
        names, clickhouse_types = zip(*types)
        ibis_types = map(clickhouse_to_ibis.get, clickhouse_types)
        return dt.Schema(names, ibis_types)

    def _exec_statement(self, stmt, adapter=None):
        query = ClickhouseQuery(self, stmt)
        result = query.execute()
        if adapter is not None:
            result = adapter(result)
        return result

    def _table_command(self, cmd, name, database=None):
        qualified_name = self._fully_qualified_name(name, database)
        return '{0} {1}'.format(cmd, qualified_name)


class ClickhouseTable(ir.TableExpr, DatabaseEntity):
    """References a physical table in Clickhouse"""

    @property
    def _qualified_name(self):
        return self.op().args[0]

    @property
    def _unqualified_name(self):
        return self._match_name()[1]

    @property
    def _client(self):
        return self.op().args[2]

    def _match_name(self):
        m = fully_qualified_re.match(self._qualified_name)
        if not m:
            raise com.IbisError('Cannot determine database name from {0}'
                                .format(self._qualified_name))
        db, quoted, unquoted = m.groups()
        return db, quoted or unquoted

    @property
    def _database(self):
        return self._match_name()[0]

    def invalidate_metadata(self):
        self._client.invalidate_metadata(self._qualified_name)

    def metadata(self):
        """
        Return parsed results of a DESCRIBE FORMATTED statement

        Returns
        -------
        meta : TableMetadata
        """
        return self._client.describe_formatted(self._qualified_name)

    describe_formatted = metadata

    @property
    def name(self):
        return self.op().name

    def _execute(self, stmt):
        return self._client._execute(stmt)


class ClickhouseTemporaryTable(ops.DatabaseTable):

    def __del__(self):
        try:
            self.drop()
        except com.IbisError:
            pass

    def drop(self):
        try:
            self.source.drop_table(self.name)
        except Exception:  # ClickhouseError
            # the database might have been dropped
            pass
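
Finally, a sketch of the columnar result shape that `ClickhouseQuery._fetch` above consumes: `_execute` calls the driver with `columnar=True, with_column_types=True`, which yields `(data, column_metadata)`. The values here are illustrative:

```python
import pandas as pd

# Illustrative driver output: one list per column, plus (name, type) pairs.
data = [[1, 2, 3], ['a', 'b', 'c']]
columns = [('id', 'Int64'), ('name', 'String')]

names, types = zip(*columns)
df = pd.DataFrame(dict(zip(names, data)), columns=list(names))
print(df)
```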