22 changes: 21 additions & 1 deletion benchmarks/benchmarks.py
@@ -92,7 +92,7 @@ def time_impala_large_expr_compile(self):
class PandasBackend:

def setup(self):
n = int(5e6)
n = 30 * int(2e5)
data = pd.DataFrame({
'key': np.random.choice(16000, size=n),
'low_card_key': np.random.choice(30, size=n),
@@ -103,6 +103,8 @@ def setup(self):
'timestamp_strings': pd.date_range(
start='now', periods=n, freq='s'
).values.astype(str),
'repeated_timestamps': pd.date_range(
start='2018-09-01', periods=30).repeat(int(n / 30))
})

t = ibis.pandas.connect({'df': data}).table('df')
@@ -130,6 +132,18 @@ def setup(self):
'low_card_key', 'key', 'value'
]].sort_by(['low_card_key', 'key'])
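
# Grouped rolling aggregations: a two-day trailing range window over the
# repeated timestamps, grouped by a low- vs. a high-cardinality key.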

low_card_window = ibis.trailing_range_window(
2 * ibis.day(),
order_by=t.repeated_timestamps,
group_by=t.low_card_key)
self.low_card_grouped_rolling = t.value.mean().over(low_card_window)

high_card_window = ibis.trailing_range_window(
2 * ibis.day(),
order_by=t.repeated_timestamps,
group_by=t.key)
self.high_card_grouped_rolling = t.value.mean().over(high_card_window)

def time_high_cardinality_group_by(self):
self.high_card_group_by.execute()

@@ -153,3 +167,9 @@ def time_simple_sort_projection(self):

def time_multikey_sort_projection(self):
self.multikey_sort_projection.execute()

def time_low_card_grouped_rolling(self):
self.low_card_grouped_rolling.execute()

def time_high_card_grouped_rolling(self):
self.high_card_grouped_rolling.execute()
14 changes: 8 additions & 6 deletions ci/Dockerfile
@@ -1,15 +1,16 @@
FROM ibisproject/miniconda3
FROM continuumio/miniconda3

# fonts are for docs
RUN apt-get -qq update -y \
&& apt-get -qq install -y --no-install-recommends ttf-dejavu \
git gcc make clang libboost-dev postgresql-client ca-certificates \
RUN apt-get -qq update --yes \
&& apt-get -qq install --yes --no-install-recommends \
build-essential ttf-dejavu \
git make clang libboost-dev postgresql-client ca-certificates \
&& rm -rf /var/lib/apt/lists/*

ARG PYTHON
ARG ENVKIND

ADD ci/requirements-${ENVKIND}-${PYTHON}.yml /
COPY ci/requirements-${ENVKIND}-${PYTHON}.yml /

RUN conda env create -q -n ibis-${ENVKIND}-${PYTHON} -f /requirements-${ENVKIND}-${PYTHON}.yml \
&& conda install conda-build -y -q
@@ -20,8 +21,9 @@ RUN conda env create -q -n ibis-${ENVKIND}-${PYTHON} -f /requirements-${ENVKIND}

RUN echo 'source activate ibis-'${ENVKIND}-${PYTHON}' && exec "$@"' > activate.sh

ADD . /ibis
COPY . /ibis
WORKDIR /ibis

RUN bash /activate.sh python setup.py develop

ENTRYPOINT ["bash", "/activate.sh"]
22 changes: 17 additions & 5 deletions ci/build.sh
@@ -1,7 +1,19 @@
#!/bin/bash -e

docker-compose rm --force --stop
docker-compose up -d --no-build postgres mysql clickhouse impala
docker-compose run --rm waiter
docker-compose build --pull ibis
docker-compose run --rm ibis ci/load-data.sh
compose_file=$(dirname "$0")/docker-compose.yml

# stop all running docker compose services
docker-compose -f "$compose_file" rm --force --stop

# build the ibis image
docker-compose -f "$compose_file" build --pull ibis

# start all docker compose services
docker-compose -f "$compose_file" up -d --no-build \
mapd postgres mysql clickhouse impala kudu-master kudu-tserver

# wait for services to start
docker-compose -f "$compose_file" run --rm waiter

# load data
docker-compose -f "$compose_file" run -e LOGLEVEL --rm ibis ci/load-data.sh
276 changes: 258 additions & 18 deletions ci/datamgr.py
@@ -1,16 +1,23 @@
#!/usr/bin/env python

import json
import os
import six
import click
import sys
import tarfile
import tempfile
import warnings

import click
import six

import pandas as pd
import sqlalchemy as sa

from toolz import dissoc
from plumbum import local
from plumbum.cmd import curl, psql

import ibis
from ibis.compat import Path


@@ -22,6 +29,9 @@
'awards_players']


logger = ibis.util.get_logger('datamgr')


def recreate_database(driver, params, **kwargs):
url = sa.engine.url.URL(driver, **dissoc(params, 'database'))
engine = sa.create_engine(url, **kwargs)
@@ -54,7 +64,6 @@ def init_database(driver, params, schema=None, recreate=True, **kwargs):
def read_tables(names, data_directory):
for name in names:
path = data_directory / '{}.csv'.format(name)
click.echo(path)
df = pd.read_csv(str(path), index_col=None, header=0)

if name == 'functional_alltypes':
@@ -65,13 +74,37 @@ def read_tables(names, data_directory):
# timestamp_col has object dtype
df['timestamp_col'] = pd.to_datetime(df['timestamp_col'])

yield (name, df)
yield name, df


def convert_to_database_compatible_value(value):
"""Pandas 0.23 broke DataFrame.to_sql, so we work around it by rolling our
own extremely low-tech conversion routine.
"""
if pd.isnull(value):
return None
if isinstance(value, pd.Timestamp):
return value.to_pydatetime()
try:
return value.item()
except AttributeError:
return value
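
# Illustrative only (not part of the loader): roughly how this conversion is
# expected to behave for a few typical values; the sample inputs below are
# made up, and NumPy scalars are unboxed via ``.item()``.
#
#   >>> convert_to_database_compatible_value(pd.Timestamp('2018-09-01'))
#   datetime.datetime(2018, 9, 1, 0, 0)
#   >>> convert_to_database_compatible_value(float('nan')) is None
#   True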


def insert(engine, tablename, df):
keys = df.columns
rows = [
dict(zip(keys, map(convert_to_database_compatible_value, row)))
for row in df.itertuples(index=False, name=None)
]
t = sa.Table(tablename, sa.MetaData(bind=engine), autoload=True)
engine.execute(t.insert(), rows)


def insert_tables(engine, names, data_directory):
for table, df in read_tables(names, data_directory):
with engine.begin() as connection:
df.to_sql(table, connection, index=False, if_exists='append')
insert(connection, table, df)


@click.group()
@@ -82,11 +115,10 @@ def cli():
@cli.command()
@click.argument('name', default='ibis-testing-data.tar.gz')
@click.option('--base-url',
default='https://storage.googleapis.com/ibis-ci-data')
default='https://storage.googleapis.com/ibis-testing-data')
@click.option('-d', '--directory', default=SCRIPT_DIR)
def download(base_url, directory, name):
directory = Path(directory)
# There is no exist_ok python 3.4
if not directory.exists():
directory.mkdir()

@@ -98,9 +130,9 @@ def download(base_url, directory, name):
download(stdout=click.get_binary_stream('stdout'),
stderr=click.get_binary_stream('stderr'))
else:
click.echo('Skipping download due to {} already exists.'.format(name))
logger.info('Skipping download: %s already exists', name)

click.echo('Extracting archive to {} ...'.format(directory))
logger.info('Extracting archive to %s', directory)
if path.suffix in ('.tar', '.gz', '.bz2', '.xz'):
with tarfile.open(str(path), mode='r|gz') as f:
f.extractall(path=str(directory))
@@ -117,7 +149,7 @@ def parquet(tables, data_directory, ignore_missing_dependency, **params):
except ImportError:
msg = 'PyArrow dependency is missing'
if ignore_missing_dependency:
click.echo('Ignored: {}'.format(msg))
logger.warning('Ignored: %s', msg)
return 0
else:
raise click.ClickException(msg)
@@ -149,15 +181,14 @@ def parquet(tables, data_directory, ignore_missing_dependency, **params):
@click.option('-d', '--data-directory', default=DATA_DIR)
def postgres(schema, tables, data_directory, **params):
data_directory = Path(data_directory)
click.echo('Initializing PostgreSQL...')
logger.info('Initializing PostgreSQL...')
engine = init_database('postgresql', params, schema,
isolation_level='AUTOCOMMIT')

query = "COPY {} FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',')"
database = params['database']
for table in tables:
src = data_directory / '{}.csv'.format(table)
click.echo(src)
load = psql['--host', params['host'], '--port', params['port'],
'--username', params['user'], '--dbname', database,
'--command', query.format(table)]
@@ -177,7 +208,7 @@ def postgres(schema, tables, data_directory, **params):
def sqlite(database, schema, tables, data_directory, **params):
database = Path(database)
data_directory = Path(data_directory)
click.echo('Initializing SQLite...')
logger.info('Initializing SQLite...')

try:
database.unlink()
@@ -189,6 +220,85 @@ def sqlite(database, schema, tables, data_directory, **params):
insert_tables(engine, tables, data_directory)


@cli.command()
@click.option('-h', '--host', default='localhost')
@click.option('-P', '--port', default=9091, type=int)
@click.option('-u', '--user', default='mapd')
@click.option('-p', '--password', default='HyperInteractive')
@click.option('-D', '--database', default='ibis_testing')
@click.option('-S', '--schema', type=click.File('rt'),
default=str(SCRIPT_DIR / 'schema' / 'mapd.sql'))
@click.option('-t', '--tables', multiple=True, default=TEST_TABLES)
@click.option('-d', '--data-directory', default=DATA_DIR)
def mapd(schema, tables, data_directory, **params):
if sys.version_info.major < 3:
logger.info('MapD backend is unavailable for Python 2.')
return

import pymapd

data_directory = Path(data_directory)
reserved_words = ['table', 'year', 'month']

# connection
logger.info('Initializing MapD...')
if params['database'] != 'mapd':
conn = pymapd.connect(
host=params['host'],
user=params['user'],
password=params['password'],
port=params['port'],
dbname='mapd'
)
stmt = 'CREATE DATABASE {}'.format(params['database'])
try:
conn.execute(stmt)
except Exception:
logger.exception('MapD DDL statement %r failed', stmt)
conn.close()

conn = pymapd.connect(
host=params['host'], user=params['user'],
password=params['password'],
port=params['port'], dbname=params['database']
)

# create tables
for stmt in filter(None, map(str.strip, schema.read().split(';'))):
try:
conn.execute(stmt)
except Exception:
logger.exception('MapD DDL statement \n%r\n failed', stmt)

# import data
for table, df in read_tables(tables, data_directory):
if table == 'batting':
# float nan problem
cols = df.select_dtypes([float]).columns
df[cols] = df[cols].fillna(0).astype(int)

# string None driver problem
cols = df.select_dtypes([object]).columns
df[cols] = df[cols].fillna('')
elif table == 'awards_players':
# string None driver problem
cols = df.select_dtypes([object]).columns
df[cols] = df[cols].fillna('')

# rename fields
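# e.g. 'Unnamed: 0' becomes 'Unnamed__0', and reserved names such as 'year'
# become 'year_' (matching the column names in ci/schema/mapd.sql)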
for df_col in df.columns:
if ' ' in df_col or ':' in df_col:
column = df_col.replace(' ', '_').replace(':', '_')
elif df_col in reserved_words:
column = '{}_'.format(df_col)
else:
continue
df.rename(columns={df_col: column}, inplace=True)
conn.load_table_columnar(table, df)

conn.close()


@cli.command()
@click.option('-h', '--host', default='localhost')
@click.option('-P', '--port', default=3306, type=int)
@@ -201,9 +311,11 @@ def sqlite(database, schema, tables, data_directory, **params):
@click.option('-d', '--data-directory', default=DATA_DIR)
def mysql(schema, tables, data_directory, **params):
data_directory = Path(data_directory)
click.echo('Initializing MySQL...')
engine = init_database('mysql+pymysql', params, schema,
isolation_level='AUTOCOMMIT')
logger.info('Initializing MySQL...')
with warnings.catch_warnings():
warnings.simplefilter("ignore")
engine = init_database('mysql+pymysql', params, schema,
isolation_level='AUTOCOMMIT')
insert_tables(engine, tables, data_directory)


@@ -219,7 +331,7 @@ def mysql(schema, tables, data_directory, **params):
@click.option('-d', '--data-directory', default=DATA_DIR)
def clickhouse(schema, tables, data_directory, **params):
data_directory = Path(data_directory)
click.echo('Initializing ClickHouse...')
logger.info('Initializing ClickHouse...')
engine = init_database('clickhouse+native', params, schema)

for table, df in read_tables(tables, data_directory):
@@ -234,8 +346,136 @@ def clickhouse(schema, tables, data_directory, **params):
# string None driver problem
cols = df.select_dtypes([object]).columns
df[cols] = df[cols].fillna('')
insert(engine, table, df)


df.to_sql(table, engine, index=False, if_exists='append')
@cli.command()
@click.option('-d', '--data-directory', default=DATA_DIR)
@click.option('-i', '--ignore-missing-dependency', is_flag=True, default=False)
def bigquery(data_directory, ignore_missing_dependency, **params):
try:
import google.api_core.exceptions
from google.cloud import bigquery
except ImportError:
msg = 'google-cloud-bigquery dependency is missing'
if ignore_missing_dependency:
logger.warning('Ignored: %s', msg)
return 0
else:
raise click.ClickException(msg)

project_id = os.environ['GOOGLE_BIGQUERY_PROJECT_ID']
bqclient = bigquery.Client(project=project_id)

# Create testing dataset.
testing_dataset = bqclient.dataset('testing')
try:
bqclient.create_dataset(bigquery.Dataset(testing_dataset))
except google.api_core.exceptions.Conflict:
pass # Skip if already created.

# Set up main data table.
data_directory = Path(data_directory)
functional_alltypes_path = data_directory / 'functional_alltypes.csv'
functional_alltypes_schema = []
schema_path = data_directory / 'functional_alltypes_bigquery_schema.json'
with open(str(schema_path)) as schemafile:
schema_json = json.load(schemafile)
for field in schema_json:
functional_alltypes_schema.append(
bigquery.SchemaField.from_api_repr(field))
load_config = bigquery.LoadJobConfig()
load_config.skip_leading_rows = 1 # skip the header row.
load_config.schema = functional_alltypes_schema

# Load main data table.
functional_alltypes_schema = []
with open(str(functional_alltypes_path), 'rb') as csvfile:
job = bqclient.load_table_from_file(
csvfile,
testing_dataset.table('functional_alltypes'),
job_config=load_config).result()

if job.error_result:
raise click.ClickException(str(job.error_result))

# Load an ingestion time partitioned table.
functional_alltypes_path = data_directory / 'functional_alltypes.csv'
with open(str(functional_alltypes_path), 'rb') as csvfile:
load_config.time_partitioning = bigquery.TimePartitioning()
job = bqclient.load_table_from_file(
csvfile,
testing_dataset.table('functional_alltypes_parted'),
job_config=load_config).result()

if job.error_result:
raise click.ClickException(str(job.error_result))

# Create a table with complex data types (nested and repeated).
struct_table_path = data_directory / 'struct_table.avro'
with open(str(struct_table_path), 'rb') as avrofile:
load_config = bigquery.LoadJobConfig()
load_config.source_format = 'AVRO'
job = bqclient.load_table_from_file(
avrofile,
testing_dataset.table('struct_table'),
job_config=load_config)

if job.error_result:
raise click.ClickException(str(job.error_result))

# Create empty date-partitioned table.
date_table = bigquery.Table(testing_dataset.table('date_column_parted'))
date_table.schema = [
bigquery.SchemaField('my_date_parted_col', 'DATE'),
bigquery.SchemaField('string_col', 'STRING'),
bigquery.SchemaField('int_col', 'INTEGER'),
]
date_table.time_partitioning = bigquery.TimePartitioning(
field='my_date_parted_col')
bqclient.create_table(date_table)

# Create empty timestamp-partitioned tables.
timestamp_table = bigquery.Table(
testing_dataset.table('timestamp_column_parted'))
timestamp_table.schema = [
bigquery.SchemaField('my_timestamp_parted_col', 'DATE'),
bigquery.SchemaField('string_col', 'STRING'),
bigquery.SchemaField('int_col', 'INTEGER'),
]
timestamp_table.time_partitioning = bigquery.TimePartitioning(
field='my_timestamp_parted_col')
bqclient.create_table(timestamp_table)

# Create a table with a numeric column
numeric_table = bigquery.Table(
testing_dataset.table('numeric_table'))
numeric_table.schema = [
bigquery.SchemaField('string_col', 'STRING'),
bigquery.SchemaField('numeric_col', 'NUMERIC'),
]
bqclient.create_table(numeric_table)

df = pd.read_csv(
str(data_directory / 'functional_alltypes.csv'),
usecols=['string_col', 'double_col'],
header=0,
)
with tempfile.NamedTemporaryFile(mode='a+b') as csvfile:
df.to_csv(csvfile, header=False, index=False)
csvfile.seek(0)

load_config = bigquery.LoadJobConfig()
load_config.skip_leading_rows = 1 # skip the header row.
load_config.schema = numeric_table.schema

job = bqclient.load_table_from_file(
csvfile,
testing_dataset.table('numeric_table'),
job_config=load_config).result()

if job.error_result:
raise click.ClickException(str(job.error_result))


if __name__ == '__main__':
107 changes: 77 additions & 30 deletions ci/docker-compose.yml
@@ -13,13 +13,13 @@ services:
ports:
- 3306:3306
environment:
- MYSQL_ALLOW_EMPTY_PASSWORD=1
- MYSQL_DATABASE=ibis_testing
- MYSQL_USER=ibis
- MYSQL_PASSWORD=ibis
MYSQL_ALLOW_EMPTY_PASSWORD: 1
MYSQL_DATABASE: ibis_testing
MYSQL_USER: ibis
MYSQL_PASSWORD: ibis

impala:
image: ibisproject/impala
image: ibisproject/impala:latest
hostname: impala
networks:
default:
@@ -36,6 +36,7 @@
- 8042:8042
# Hive
- 9083:9083

# Impala
- 21000:21000
- 21050:21050
@@ -44,48 +45,94 @@
- 25020:25020

clickhouse:
image: yandex/clickhouse-server:1.1.54327
image: yandex/clickhouse-server:1.1.54388
ports:
- 8123:8123
- 9000:9000

kudu-master:
image: ibisproject/kudu:latest
networks:
default:
aliases:
- kudu
cap_add:
- SYS_TIME
ports:
- 7051:7051
- 8051:8051
environment:
KUDU_MASTER: "true"

kudu-tserver:
image: ibisproject/kudu:latest
cap_add:
- SYS_TIME
ports:
- 7050:7050
- 8050:8050
environment:
KUDU_MASTER: "false"

mapd:
image: mapd/mapd-ce-cpu:v3.6.0
ports:
- "9091-9092:9091-9092"
environment:
- MAPD_HOST=mapd
- MAPD_PORT=9091
- MAPD_DATABASE=ibis_testing
- MAPD_USER=mapd
volumes:
- ./mapd.conf:/mapd-storage/mapd.conf

waiter:
image: jwilder/dockerize
command: |
dockerize -wait tcp://mysql:3306
dockerize -wait tcp://mapd:9091
-wait tcp://mysql:3306
-wait tcp://postgres:5432
-wait tcp://impala:21050
-wait tcp://impala:50070
-wait tcp://kudu-master:7051
-wait tcp://kudu-master:8051
-wait tcp://kudu-tserver:7050
-wait tcp://kudu-tserver:8050
-wait tcp://clickhouse:9000
-wait-retry-interval 5s
-timeout 5m
ibis:
image: ibis:${PYTHON_VERSION:-3.6}
environment:
- IBIS_TEST_DOWNLOAD_DIRECTORY=/tmp
- IBIS_TEST_DATA_DIRECTORY=/tmp/ibis-testing-data
- IBIS_TEST_SQLITE_DATABASE=/tmp/ibis_testing.db
- IBIS_TEST_NN_HOST=impala
- IBIS_TEST_IMPALA_HOST=impala
- IBIS_TEST_IMPALA_PORT=21050
- IBIS_TEST_WEBHDFS_PORT=50070
- IBIS_TEST_WEBHDFS_USER=hdfs
- IBIS_TEST_MYSQL_HOST=mysql
- IBIS_TEST_MYSQL_PORT=3306
- IBIS_TEST_MYSQL_USER=ibis
- IBIS_TEST_MYSQL_PASSWORD=ibis
- IBIS_TEST_MYSQL_DATABASE=ibis_testing
- IBIS_TEST_POSTGRES_HOST=postgres
- IBIS_TEST_POSTGRES_PORT=5432
- IBIS_TEST_POSTGRES_USER=postgres
- IBIS_TEST_POSTGRES_PASSWORD=postgres
- IBIS_TEST_POSTGRES_DATABASE=ibis_testing
- IBIS_TEST_CLICKHOUSE_HOST=clickhouse
- IBIS_TEST_CLICKHOUSE_PORT=9000
- IBIS_TEST_CLICKHOUSE_DATABASE=ibis_testing
- GOOGLE_BIGQUERY_PROJECT_ID=ibis-gbq
- GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcloud-service-key.json
IBIS_TEST_DOWNLOAD_DIRECTORY: /tmp
IBIS_TEST_DATA_DIRECTORY: /tmp/ibis-testing-data
IBIS_TEST_SQLITE_DATABASE: /tmp/ibis_testing.db
IBIS_TEST_NN_HOST: impala
IBIS_TEST_IMPALA_HOST: impala
IBIS_TEST_IMPALA_PORT: 21050
IBIS_TEST_WEBHDFS_PORT: 50070
IBIS_TEST_WEBHDFS_USER: hdfs
IBIS_TEST_MYSQL_HOST: mysql
IBIS_TEST_MYSQL_PORT: 3306
IBIS_TEST_MYSQL_USER: ibis
IBIS_TEST_MYSQL_PASSWORD: ibis
IBIS_TEST_MYSQL_DATABASE: ibis_testing
IBIS_TEST_POSTGRES_HOST: postgres
IBIS_TEST_POSTGRES_PORT: 5432
IBIS_TEST_POSTGRES_USER: postgres
IBIS_TEST_POSTGRES_PASSWORD: postgres
IBIS_TEST_POSTGRES_DATABASE: ibis_testing
IBIS_TEST_CLICKHOUSE_HOST: clickhouse
IBIS_TEST_CLICKHOUSE_PORT: 9000
IBIS_TEST_CLICKHOUSE_DATABASE: ibis_testing
IBIS_TEST_MAPD_HOST: mapd
IBIS_TEST_MAPD_PORT: 9091
IBIS_TEST_MAPD_DATABASE: ibis_testing
IBIS_TEST_MAPD_USER: mapd
IBIS_TEST_MAPD_PASSWORD: HyperInteractive
GOOGLE_BIGQUERY_PROJECT_ID: ibis-gbq
GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcloud-service-key.json
volumes:
- /tmp/ibis:/tmp
build:
115 changes: 115 additions & 0 deletions ci/feedstock.py
@@ -0,0 +1,115 @@
#!/usr/bin/env python

import shutil
import sys

import click
import ruamel.yaml

from jinja2 import Environment, FileSystemLoader
from plumbum.cmd import git, conda

import ibis
from ibis.compat import Path, PY2


IBIS_DIR = Path(__file__).parent.parent.absolute()


def render(path):
env = Environment(loader=FileSystemLoader(str(path.parent)))
template = env.get_template(path.name)
return template.render()


@click.group()
def cli():
pass


default_repo = 'https://github.com/conda-forge/ibis-framework-feedstock'
default_dest = '/tmp/ibis-framework-feedstock'


@cli.command()
@click.argument('repo-uri', default=default_repo)
@click.argument('destination', default=default_dest)
def clone(repo_uri, destination):
if Path(destination).exists():
return

cmd = git['clone', repo_uri, destination]

cmd(stdout=click.get_binary_stream('stdout'),
stderr=click.get_binary_stream('stderr'))


@cli.command()
@click.argument('meta', default=default_dest + '/recipe/meta.yaml')
@click.option('--source-path', default=str(IBIS_DIR))
def update(meta, source_path):
path = Path(meta)

click.echo('\nUpdating {} recipe...'.format(path.parent))

content = render(path)
recipe = ruamel.yaml.round_trip_load(content)

# update the necessary fields, skip leading 'v' in the version
recipe['package']['version'] = ibis.__version__[1:]
recipe['source'] = {'path': source_path}

updated_content = ruamel.yaml.round_trip_dump(
recipe, default_flow_style=False)

if PY2:
updated_content = updated_content.decode('utf-8')

path.write_text(updated_content)


@cli.command()
@click.argument('recipe', default=default_dest + '/recipe')
def build(recipe):
click.echo('\nBuilding {} recipe...'.format(recipe))

python_version = '.'.join(map(str, sys.version_info[:3]))

cmd = conda['build', recipe,
'--channel', 'conda-forge',
'--python', python_version]

cmd(stdout=click.get_binary_stream('stdout'),
stderr=click.get_binary_stream('stderr'))


@cli.command()
@click.argument('package_location', default='/opt/conda/conda-bld')
@click.argument('artifact_directory', default='/tmp/packages')
@click.argument('architectures', default=('linux-64', 'noarch'))
def deploy(package_location, artifact_directory, architectures):
artifact_dir = Path(artifact_directory)
artifact_dir.mkdir(parents=True, exist_ok=True)
package_loc = Path(package_location)
assert package_loc.exists(), 'Path {} does not exist'.format(package_loc)

for architecture in architectures:
arch_artifact_directory = str(artifact_dir / architecture)
arch_package_directory = str(package_loc / architecture)
shutil.copytree(arch_package_directory, arch_artifact_directory)
cmd = conda['index', artifact_directory]
cmd(stdout=click.get_binary_stream('stdout'),
stderr=click.get_binary_stream('stderr'))


@cli.command()
@click.pass_context
def test(ctx):
ctx.invoke(clone)
ctx.invoke(update)
ctx.invoke(build)
ctx.invoke(deploy)


if __name__ == '__main__':
cli()
197 changes: 100 additions & 97 deletions ci/impalamgr.py
@@ -1,55 +1,56 @@
#!/usr/bin/env python
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import concurrent.futures

import itertools
import os
import ibis

import click
import tempfile
import toolz

from plumbum import local, CommandNotFound
from plumbum.cmd import rm, make, cmake
from plumbum.cmd import make, cmake

import ibis

from ibis.compat import BytesIO, Path
from ibis.common import IbisError
from ibis.impala.tests.common import IbisTestEnv
from ibis.impala.tests.conftest import IbisTestEnv


SCRIPT_DIR = Path(__file__).parent.absolute()
DATA_DIR = Path(os.environ.get('IBIS_TEST_DATA_DIRECTORY',
SCRIPT_DIR / 'ibis-testing-data'))


logger = ibis.util.get_logger('impalamgr')

ENV = IbisTestEnv()

env_items = ENV.items()
maxlen = max(map(len, map(toolz.first, env_items))) + len('IbisTestEnv[""]')
format_string = '%-{:d}s == %r'.format(maxlen)
for key, value in env_items:
logger.info(format_string, 'IbisTestEnv[{!r}]'.format(key), value)


def make_ibis_client():
def make_ibis_client(env):
hc = ibis.hdfs_connect(
host=ENV.nn_host,
port=ENV.webhdfs_port,
auth_mechanism=ENV.auth_mechanism,
verify=ENV.auth_mechanism not in ['GSSAPI', 'LDAP'],
user=ENV.webhdfs_user
host=env.nn_host,
port=env.webhdfs_port,
auth_mechanism=env.auth_mechanism,
verify=env.auth_mechanism not in ['GSSAPI', 'LDAP'],
user=env.webhdfs_user
)
auth_mechanism = ENV.auth_mechanism
auth_mechanism = env.auth_mechanism
if auth_mechanism == 'GSSAPI' or auth_mechanism == 'LDAP':
print("Warning: ignoring invalid Certificate Authority errors")
logger.warning('Ignoring invalid Certificate Authority errors')
return ibis.impala.connect(
host=ENV.impala_host,
port=ENV.impala_port,
auth_mechanism=ENV.auth_mechanism,
hdfs_client=hc
host=env.impala_host,
port=env.impala_port,
auth_mechanism=env.auth_mechanism,
hdfs_client=hc,
pool_size=16
)


@@ -61,56 +62,53 @@ def can_write_to_hdfs(con):
con.hdfs.rm(test_path)
return True
except Exception:
logger.exception('Could not write to HDFS')
return False


def can_build_udfs():
try:
local.which('cmake')
except CommandNotFound:
print('Could not find cmake on PATH')
logger.exception('Could not find cmake on PATH')
return False
try:
local.which('make')
except CommandNotFound:
print('Could not find make on PATH')
logger.exception('Could not find make on PATH')
return False
try:
local.which('clang++')
except CommandNotFound:
print('Could not find LLVM on PATH; if IBIS_TEST_LLVM_CONFIG is set, '
'try setting PATH="$($IBIS_TEST_LLVM_CONFIG --bindir):$PATH"')
logger.exception(
'Could not find LLVM on PATH; if IBIS_TEST_LLVM_CONFIG is set, '
'try setting PATH="$($IBIS_TEST_LLVM_CONFIG --bindir):$PATH"'
)
return False
return True


def is_impala_loaded(con):
if not con.hdfs.exists(ENV.test_data_dir):
return False
if not con.exists_database(ENV.test_data_db):
return False
return True
return con.hdfs.exists(ENV.test_data_dir) and con.exists_database(
ENV.test_data_db)


def is_udf_loaded(con):
bitcode_dir = os.path.join(ENV.test_data_dir, 'udf')
if con.hdfs.exists(bitcode_dir):
return True
return False
return con.hdfs.exists(os.path.join(ENV.test_data_dir, 'udf'))


def upload_ibis_test_data_to_hdfs(con, data_path):
hdfs = con.hdfs
if hdfs.exists(ENV.test_data_dir):
hdfs.rmdir(ENV.test_data_dir)
hdfs.put(ENV.test_data_dir, data_path, verbose=True)
hdfs.put(ENV.test_data_dir, data_path)


def create_test_database(con):
if con.exists_database(ENV.test_data_db):
con.drop_database(ENV.test_data_db, force=True)
con.create_database(ENV.test_data_db)
print('Created database {}'.format(ENV.test_data_db))
logger.info('Created database %s', ENV.test_data_db)

con.create_table(
'alltypes',
@@ -127,10 +125,18 @@ def create_test_database(con):
]),
database=ENV.test_data_db
)
print('Created empty table {}.`alltypes`'.format(ENV.test_data_db))
logger.info('Created empty table %s.`alltypes`', ENV.test_data_db)


def create_parquet_tables(con):
def create_parquet_tables(con, executor):
def create_table(table_name):
logger.info('Creating %s', table_name)
schema = schemas.get(table_name)
path = os.path.join(ENV.test_data_dir, 'parquet', table_name)
table = con.parquet_file(path, schema=schema, name=table_name,
database=ENV.test_data_db, persist=True)
return table

parquet_files = con.hdfs.ls(os.path.join(ENV.test_data_dir, 'parquet'))
schemas = {
'functional_alltypes': ibis.schema(
@@ -151,19 +157,21 @@ def create_parquet_tables(con):
[('r_regionkey', 'int16'),
('r_name', 'string'),
('r_comment', 'string')])}
tables = []
for table_name in parquet_files:
print('Creating {}'.format(table_name))
# if no schema infer!
schema = schemas.get(table_name)
path = os.path.join(ENV.test_data_dir, 'parquet', table_name)
table = con.parquet_file(path, schema=schema, name=table_name,
database=ENV.test_data_db, persist=True)
tables.append(table)
return tables
return (
executor.submit(create_table, table_name)
for table_name in parquet_files
)


def create_avro_tables(con, executor):
def create_table(table_name):
logger.info('Creating %s', table_name)
schema = schemas[table_name]
path = os.path.join(ENV.test_data_dir, 'avro', table_name)
table = con.avro_file(path, schema, name=table_name,
database=ENV.test_data_db, persist=True)
return table

def create_avro_tables(con):
avro_files = con.hdfs.ls(os.path.join(ENV.test_data_dir, 'avro'))
schemas = {
'tpch_region_avro': {
Expand All @@ -173,31 +181,25 @@ def create_avro_tables(con):
{'name': 'R_REGIONKEY', 'type': ['null', 'int']},
{'name': 'R_NAME', 'type': ['null', 'string']},
{'name': 'R_COMMENT', 'type': ['null', 'string']}]}}
tables = []
for table_name in avro_files:
print('Creating {}'.format(table_name))
schema = schemas[table_name]
path = os.path.join(ENV.test_data_dir, 'avro', table_name)
table = con.avro_file(path, schema, name=table_name,
database=ENV.test_data_db, persist=True)
tables.append(table)
return tables
return (
executor.submit(create_table, table_name) for table_name in avro_files
)


def build_udfs():
print('Building UDFs')
logger.info('Building UDFs')
ibis_home_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
udf_dir = os.path.join(ibis_home_dir, 'ci', 'udf')

with local.cwd(udf_dir):
assert (cmake('.') and make('VERBOSE=1'))
assert cmake('.') and make('VERBOSE=1')


def upload_udfs(con):
ibis_home_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
build_dir = os.path.join(ibis_home_dir, 'ci', 'udf', 'build')
bitcode_dir = os.path.join(ENV.test_data_dir, 'udf')
print('Uploading UDFs to {}'.format(bitcode_dir))
logger.info('Uploading UDFs to %s', bitcode_dir)
if con.hdfs.exists(bitcode_dir):
con.hdfs.rmdir(bitcode_dir)
con.hdfs.put(bitcode_dir, build_dir, verbose=True)
Expand All @@ -206,7 +208,7 @@ def upload_udfs(con):
# ==========================================


@click.group(context_settings={'help_option_names': ['-h', '--help']})
@click.group(context_settings=dict(help_option_names=['-h', '--help']))
def main():
"""Manage test data for Ibis"""
pass
@@ -232,9 +234,7 @@ def main():
)
def load(data, udf, data_dir, overwrite):
"""Load Ibis test data and build/upload UDFs"""
print(str(ENV))

con = make_ibis_client()
con = make_ibis_client(ENV)

# validate our environment before performing possibly expensive operations
if not can_write_to_hdfs(con):
@@ -244,50 +244,55 @@ def load(data, udf, data_dir, overwrite):

# load the data files
if data:
tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_')
try:
load_impala_data(con, str(data_dir), overwrite)
finally:
rm('-rf', tmp_dir)
load_impala_data(con, str(data_dir), overwrite)
else:
print('Skipping Ibis test data load (--no-data)')
logger.info('Skipping Ibis test data load (--no-data)')

# build and upload the UDFs
if udf:
already_loaded = is_udf_loaded(con)
print('Attempting to build and load test UDFs')
logger.info('Attempting to build and load test UDFs')
if already_loaded and not overwrite:
print('UDFs already loaded and not overwriting; moving on')
logger.info('UDFs already loaded and not overwriting; moving on')
else:
if already_loaded:
print('UDFs already loaded; attempting to overwrite')
print('Building UDFs')
logger.info('UDFs already loaded; attempting to overwrite')
logger.info('Building UDFs')
build_udfs()
print('Uploading UDFs')
logger.info('Uploading UDFs')
upload_udfs(con)
else:
print('Skipping UDF build/load (--no-udf)')
logger.info('Skipping UDF build/load (--no-udf)')


def load_impala_data(con, data_dir, overwrite=False):
already_loaded = is_impala_loaded(con)
print('Attempting to load Ibis Impala test data (--data)')
logger.info('Attempting to load Ibis Impala test data (--data)')
if already_loaded and not overwrite:
print('Data is already loaded and not overwriting; moving on')
logger.info('Data is already loaded and not overwriting; moving on')
else:
if already_loaded:
print('Data is already loaded; attempting to overwrite')
logger.info('Data is already loaded; attempting to overwrite')

print('Uploading to HDFS')
logger.info('Uploading to HDFS')
upload_ibis_test_data_to_hdfs(con, data_dir)
print('Creating Ibis test data database')

logger.info('Creating Ibis test data database')
create_test_database(con)
parquet_tables = create_parquet_tables(con)
avro_tables = create_avro_tables(con)
for table in parquet_tables + avro_tables:
print('Computing stats for', table.op().name)

def compute_stats(table):
logger.info('Computing stats for %s', table.op().name)
table.compute_stats()

with concurrent.futures.ThreadPoolExecutor() as executor:
parquet_tables = create_parquet_tables(con, executor)
avro_tables = create_avro_tables(con, executor)
completed_futures = concurrent.futures.as_completed(
itertools.chain(parquet_tables, avro_tables)
)
results = [future.result() for future in completed_futures]
list(executor.map(compute_stats, results))


@main.command()
@click.option(
@@ -302,9 +307,7 @@ def load_impala_data(con, data_dir, overwrite=False):
@click.option('--tmp-db', is_flag=True, help='Cleanup Ibis temporary database')
def cleanup(test_data, udfs, tmp_data, tmp_db):
"""Cleanup Ibis test data and UDFs"""
print(str(ENV))

con = make_ibis_client()
con = make_ibis_client(ENV)

if udfs:
# this comes before test_data bc the latter clobbers this too
3 changes: 1 addition & 2 deletions ci/load-data.sh
@@ -6,6 +6,7 @@ declare -A argcommands=([sqlite]=sqlite
[parquet]="parquet -i"
[postgres]=postgres
[clickhouse]=clickhouse
[mapd]=mapd
[mysql]=mysql
[impala]=impala)

@@ -33,9 +34,7 @@
done

if [[ "${FAIL}" == 0 ]]; then
echo "Done loading ${ARGS[@]}"
exit 0
else
echo "Failed loading ${ARGS[@]}" >&2
exit 1
fi
1 change: 1 addition & 0 deletions ci/mapd.conf
@@ -0,0 +1 @@
enable-watchdog = false
11 changes: 8 additions & 3 deletions ci/requirements-dev-2.7.yml
@@ -10,28 +10,33 @@ dependencies:
- flake8
- funcsigs
- functools32
- google-cloud-bigquery<0.28
- google-cloud-bigquery>=1.0.0
- graphviz
- impyla>=0.14.0
- jinja2
- lz4
- mock
- multipledispatch
- numpy=1.11.*
- pandas
- pandas=0.20
- pathlib2
- plumbum
- psycopg2
- pyarrow>=0.6.0
- pymysql
- pytables
- pytest
- pytest-cov
- pytest-xdist
- python=2.7
- python-graphviz
- python-hdfs>=2.0.16
- pytz
- regex
- requests
- ruamel.yaml
- six
- sqlalchemy>=1.0.0,<1.1.15
- sqlalchemy
- thriftpy<=0.3.9
- thrift<=0.9.3
- toolz
28 changes: 0 additions & 28 deletions ci/requirements-dev-3.4.yml

This file was deleted.

12 changes: 9 additions & 3 deletions ci/requirements-dev-3.5.yml
@@ -7,25 +7,31 @@ dependencies:
- clickhouse-sqlalchemy
- cmake
- flake8
- google-cloud-bigquery<0.28
- google-cloud-bigquery>=1.0.0
- graphviz
- impyla>=0.14.0
- jinja2
- lz4
- multipledispatch
- numpy=1.12.0
- pandas
- pandas=0.22
- plumbum
- psycopg2
- pyarrow>=0.6.0
- pymapd>=0.3.2
- pymysql
- pytest
- pytest-cov
- pytest-xdist
- python=3.5
- python-graphviz
- python-hdfs>=2.0.16
- pytz
- regex
- requests
- ruamel.yaml
- six
- sqlalchemy>=1.0.0,<1.1.15
- sqlalchemy
- toolz
- xorg-libxpm
- xorg-libxrender
10 changes: 8 additions & 2 deletions ci/requirements-dev-3.6.yml
@@ -7,26 +7,32 @@ dependencies:
- clickhouse-sqlalchemy
- cmake
- flake8
- google-cloud-bigquery<0.28
- google-cloud-bigquery>=1.0.0
- graphviz
- impyla>=0.14.0
- jinja2
- lz4
- multipledispatch
- numpy
- pandas
- plumbum
- psycopg2
- pyarrow>=0.6.0
- pymapd>=0.3.2
- pymysql
- pytables
- pytest
- pytest-cov
- pytest-xdist
- python=3.6
- python-graphviz
- python-hdfs>=2.0.16
- pytz
- regex
- requests
- ruamel.yaml
- six
- sqlalchemy>=1.0.0,<1.1.15
- sqlalchemy
- thrift
- toolz
- xorg-libxpm
14 changes: 11 additions & 3 deletions ci/requirements-docs-3.6.yml
@@ -7,10 +7,11 @@ dependencies:
- clickhouse-sqlalchemy
- cmake
- flake8
- google-cloud-bigquery<0.28
- google-cloud-bigquery>=1.0.0
- graphviz
- impyla>=0.14.0
- ipython
- jinja2
- jupyter
- lz4
- matplotlib
@@ -22,16 +23,23 @@ dependencies:
- plumbum
- psycopg2
- pyarrow>=0.6.0
- pymapd>=0.3.2
- pymysql
- pytables
- pytest
- pytest-cov
- pytest-xdist
- python=3.6
- python-graphviz
- python-hdfs>=2.0.16
- pytz
- regex
- requests
- ruamel.yaml
- six
- sphinx_rtd_theme
- sqlalchemy>=1.0.0,<1.1.15
- sphinx_rtd_theme<0.3
- sqlalchemy
- thrift
- toolz
- xorg-libxpm
- xorg-libxrender
72 changes: 72 additions & 0 deletions ci/schema/mapd.sql
@@ -0,0 +1,72 @@
DROP TABLE IF EXISTS diamonds;

CREATE TABLE diamonds (
carat FLOAT,
cut TEXT,
color TEXT,
clarity TEXT,
depth FLOAT,
table_ FLOAT,
price BIGINT,
x FLOAT,
y FLOAT,
z FLOAT
);

DROP TABLE IF EXISTS batting;

CREATE TABLE batting (
playerID VARCHAR(255),
yearID BIGINT,
stint BIGINT,
teamID VARCHAR(7),
lgID VARCHAR(7),
G BIGINT,
AB BIGINT,
R BIGINT,
H BIGINT,
X2B BIGINT,
X3B BIGINT,
HR BIGINT,
RBI BIGINT,
SB BIGINT,
CS BIGINT,
BB BIGINT,
SO BIGINT,
IBB BIGINT,
HBP BIGINT,
SH BIGINT,
SF BIGINT,
GIDP BIGINT
);

DROP TABLE IF EXISTS awards_players;

CREATE TABLE awards_players (
playerID VARCHAR(255),
awardID VARCHAR(255),
yearID BIGINT,
lgID VARCHAR(7),
tie VARCHAR(7),
notes VARCHAR(255)
);

DROP TABLE IF EXISTS functional_alltypes;

CREATE TABLE functional_alltypes (
index BIGINT,
Unnamed__0 BIGINT,
id INTEGER,
bool_col BOOLEAN,
tinyint_col SMALLINT,
smallint_col SMALLINT,
int_col INTEGER,
bigint_col BIGINT,
float_col FLOAT,
double_col DOUBLE,
date_string_col TEXT,
string_col TEXT,
timestamp_col TIMESTAMP,
year_ INTEGER,
month_ INTEGER
);
38 changes: 38 additions & 0 deletions ci/setup_docker_volume.sh
@@ -0,0 +1,38 @@
#!/bin/bash

if [ -z "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
echo "GOOGLE_APPLICATION_CREDENTIALS environment variable is empty"
exit 1
fi

if [ ! -e "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
echo "GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} does not exist"
exit 1
fi

if [ ! -f "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
echo "GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} is not a file"
exit 1
fi

if [ -z "${IBIS_TEST_DATA_DIRECTORY}" ]; then
echo "IBIS_TEST_DATA_DIRECTORY environment variable is empty"
exit 1
fi

if [ ! -e "${IBIS_TEST_DATA_DIRECTORY}" ]; then
echo "IBIS_TEST_DATA_DIRECTORY=${IBIS_TEST_DATA_DIRECTORY} does not exist"
exit 1
fi

if [ ! -d "${IBIS_TEST_DATA_DIRECTORY}" ]; then
echo "IBIS_TEST_DATA_DIRECTORY=${IBIS_TEST_DATA_DIRECTORY} is not a directory"
exit 1
fi

mkdir -p /tmp/ibis
cp "${GOOGLE_APPLICATION_CREDENTIALS}" /tmp/ibis/gcloud-service-key.json
cp -rf "${IBIS_TEST_DATA_DIRECTORY}" /tmp/ibis

gzipprog="$([ "$(which pigz)" ] && echo pigz || echo gzip)"
tar -I "${gzipprog}" -cf /tmp/ibis/ibis-testing-data.tar.gz "${IBIS_TEST_DATA_DIRECTORY}" 2> /dev/null
6 changes: 4 additions & 2 deletions ci/test.sh
@@ -1,5 +1,7 @@
#!/bin/bash -e

compose_file=$(dirname "$0")/docker-compose.yml

cmd='$(find /ibis -name "*.py[co]" -delete > /dev/null 2>&1 || true) && pytest "$@"'
docker-compose build --pull ibis
docker-compose run --rm ibis bash -c "$cmd" -- "$@"
docker-compose -f "$compose_file" build --pull ibis
docker-compose -f "$compose_file" run --rm ibis bash -c "$cmd" -- "$@"
7 changes: 7 additions & 0 deletions codecov.yml
@@ -0,0 +1,7 @@
comment:
layout: "reach, diff, flags, files"
behavior: default
require_changes: false # if true: only post the comment if coverage changes
require_base: no # [yes :: must have a base report to post]
require_head: yes # [yes :: must have a head report to post]
branches: null
8 changes: 0 additions & 8 deletions conda-recipes/hdfs/bld.bat

This file was deleted.

9 changes: 0 additions & 9 deletions conda-recipes/hdfs/build.sh

This file was deleted.

42 changes: 0 additions & 42 deletions conda-recipes/hdfs/meta.yaml

This file was deleted.

10 changes: 0 additions & 10 deletions conda-recipes/ibis-framework/bld.bat

This file was deleted.

10 changes: 0 additions & 10 deletions conda-recipes/ibis-framework/build.sh

This file was deleted.

83 changes: 0 additions & 83 deletions conda-recipes/ibis-framework/meta.yaml

This file was deleted.

8 changes: 0 additions & 8 deletions conda-recipes/impyla/bld.bat

This file was deleted.

9 changes: 0 additions & 9 deletions conda-recipes/impyla/build.sh

This file was deleted.

39 changes: 0 additions & 39 deletions conda-recipes/impyla/meta.yaml

This file was deleted.

8 changes: 0 additions & 8 deletions conda-recipes/sasl/bld.bat

This file was deleted.

9 changes: 0 additions & 9 deletions conda-recipes/sasl/build.sh

This file was deleted.

63 changes: 0 additions & 63 deletions conda-recipes/sasl/meta.yaml

This file was deleted.

8 changes: 0 additions & 8 deletions conda-recipes/thrift/bld.bat

This file was deleted.

9 changes: 0 additions & 9 deletions conda-recipes/thrift/build.sh

This file was deleted.

28 changes: 0 additions & 28 deletions conda-recipes/thrift/meta.yaml

This file was deleted.

8 changes: 0 additions & 8 deletions conda-recipes/thrift_sasl/bld.bat

This file was deleted.

9 changes: 0 additions & 9 deletions conda-recipes/thrift_sasl/build.sh

This file was deleted.

65 changes: 0 additions & 65 deletions conda-recipes/thrift_sasl/meta.yaml

This file was deleted.

8 changes: 0 additions & 8 deletions conda-recipes/thriftpy/bld.bat

This file was deleted.

9 changes: 0 additions & 9 deletions conda-recipes/thriftpy/build.sh

This file was deleted.

43 changes: 0 additions & 43 deletions conda-recipes/thriftpy/meta.yaml

This file was deleted.

31 changes: 31 additions & 0 deletions conftest.py
@@ -0,0 +1,31 @@
import fnmatch
import os
import sys
import pytest

from ibis.compat import Path


collect_ignore = ['setup.py']

if sys.version_info.major == 2:
this_directory = os.path.dirname(__file__)
bigquery_udf = os.path.join(this_directory, 'ibis', 'bigquery', 'udf')
for root, _, filenames in os.walk(bigquery_udf):
for filename in filenames:
if fnmatch.fnmatch(filename, '*.py'):
collect_ignore.append(os.path.join(root, filename))


@pytest.fixture(scope='session')
def data_directory():
root = Path(__file__).absolute().parent

default = root / 'ci' / 'ibis-testing-data'
datadir = os.environ.get('IBIS_TEST_DATA_DIRECTORY', default)
datadir = Path(datadir)

if not datadir.exists():
pytest.skip('test data directory not found')

return datadir
7 changes: 0 additions & 7 deletions docs/README

This file was deleted.

4 changes: 0 additions & 4 deletions docs/requirements-docs.txt

This file was deleted.

Binary file modified docs/source/_static/favicon.ico
146 changes: 146 additions & 0 deletions docs/source/_static/favicon.svg
135 changes: 135 additions & 0 deletions docs/source/_static/logo-wide.svg
128 changes: 128 additions & 0 deletions docs/source/_static/logo.svg
93 changes: 92 additions & 1 deletion docs/source/api.rst
@@ -440,6 +440,7 @@ All string operations are valid either on scalar or array values
StringValue.capitalize
StringValue.contains
StringValue.like
StringValue.to_timestamp
StringValue.parse_url
StringValue.substr
StringValue.left
@@ -468,14 +469,104 @@ All timestamp operations are valid either on scalar or array values
.. autosummary::
:toctree: generated/

TimestampValue.truncate
TimestampValue.strftime
TimestampValue.year
TimestampValue.month
TimestampValue.day
TimestampValue.day_of_week
TimestampValue.hour
TimestampValue.minute
TimestampValue.second
TimestampValue.millisecond
TimestampValue.truncate
TimestampValue.time
TimestampValue.date
TimestampValue.add
TimestampValue.radd
TimestampValue.sub
TimestampValue.rsub

.. _api.date:

Date methods
------------

.. autosummary::
:toctree: generated/

DateValue.strftime
DateValue.year
DateValue.month
DateValue.day
DateValue.day_of_week
DateValue.truncate
DateValue.add
DateValue.radd
DateValue.sub
DateValue.rsub

.. _api.dow:

Day of week methods
-------------------

.. currentmodule:: ibis.expr.types

.. autosummary::
:toctree: generated/

DayOfWeek.index
DayOfWeek.full_name

.. currentmodule:: ibis.expr.api

.. _api.time:

Time methods
------------

.. autosummary::
:toctree: generated/

TimeValue.between
TimeValue.truncate
TimeValue.hour
TimeValue.minute
TimeValue.second
TimeValue.millisecond
TimeValue.add
TimeValue.radd
TimeValue.sub
TimeValue.rsub

.. _api.interval:

Interval methods
----------------

.. autosummary::
:toctree: generated/

IntervalValue.to_unit
IntervalValue.years
IntervalValue.quarters
IntervalValue.months
IntervalValue.weeks
IntervalValue.days
IntervalValue.hours
IntervalValue.minutes
IntervalValue.seconds
IntervalValue.milliseconds
IntervalValue.microseconds
IntervalValue.nanoseconds
IntervalValue.add
IntervalValue.radd
IntervalValue.sub
IntervalValue.mul
IntervalValue.rmul
IntervalValue.floordiv
IntervalValue.negate


Boolean methods
---------------
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -139,7 +139,7 @@

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
# html_logo = None
html_logo = '_static/logo-wide.svg'

# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
35 changes: 35 additions & 0 deletions docs/source/contributing.rst
@@ -0,0 +1,35 @@
.. _contrib:

********************
Contributing to Ibis
********************

.. note::

Make sure you've read the :ref:`installation section <install>` of the docs
before continuing.

.. _contrib.running_tests:

Running the Test Suite
----------------------

Contributor `Krisztián Szűcs <https://github.com/kszucs>`_ has spent many hours
crafting an easy-to-use `docker-compose <https://docs.docker.com/compose/>`_
setup that enables ibis developers to get up and running quickly.

Here are the steps to clone the repo and run the test suite:

.. code-block:: sh
# clone ibis
git clone https://github.com/ibis-project/ibis
# go to where the docker-compose file is
pushd ibis/ci
# start services, build ibis, and load data into databases
ENVKIND=docs ./build.sh
# optionally run all tests
ENVKIND=docs ./test.sh -m 'not udf' -n auto -o cache_dir=/tmp/.pytest_cache
95 changes: 38 additions & 57 deletions docs/source/developer.rst
@@ -34,111 +34,92 @@ Conda Environment Setup
.. code:: sh
# Create a conda environment ready for ibis development
conda env create --name ibis36 --file=ci/requirements_dev-3.6.yml
# including building the documentation
conda env create --name ibis36 --file=ci/requirements-docs-3.6.yml
# Activate the conda environment
source activate ibis36
# Install ibis
python setup.py develop
make develop
All-in-One Command
------------------

We use `docker-compose <https://docs.docker.com/compose/>`_ for
ibis development to make it easy for developers to test ibis
against databases that have traditionally required a lot of setup,
such as Impala.

The following command does three steps:

#. Downloads the test data
#. Starts each backend via docker-compose
#. Starts each backend as a service via docker-compose
#. Initializes the backends with the test tables

.. code:: sh
cd testing
bash start-all.sh
make init
To use specific backends follow the instructions below.
Take a peek at the Makefile to see what else is available.

Download Test Datasets
----------------------

Download Test Dataset
---------------------
This step isn't necessary, but can sometimes be helpful if you
want to investigate something outside of the docker-compose setup
that ships with ibis.

#. `Install docker <https://docs.docker.com/engine/installation/>`_
#. **Download the test data**:

By default this will download and extract the dataset under
testing/ibis-testing-data.

.. code:: sh
testing/datamgr.py download
ci/datamgr.py download
Setting Up Test Databases
-------------------------

To start each backends
To start every backend as a service using ``docker-compose`` and
load test datasets into each backend, use this command:

.. code:: sh
cd testing
docker-compose up
Impala (with UDFs)
^^^^^^^^^^^^^^^^^^

#. **Start the Impala docker image in another terminal**:

.. code:: sh
make init
# Keeping this running as long as you want to test ibis
docker run --tty --rm --hostname impala cpcloud86/impala:java8
The one ibis backend that cannot be started as a service running in a
docker container is BigQuery.

#. **Load data and UDFs into impala**:
Read the next section for details on how to get set up with
BigQuery and ibis.

.. code:: sh
BigQuery
^^^^^^^^

testing/impalamgr.py load --data --data-dir ibis-testing-data
Before you begin, you must have a `Google Cloud Platform project
<https://cloud.google.com/docs/overview/#projects>`_ with billing set up and
the `BigQuery API enabled
<https://console.cloud.google.com/flows/enableapi?apiid=bigquery>`_.

Clickhouse
^^^^^^^^^^
#. **Set up application default credentials** by following the `getting
started with GCP authentication guide
<https://cloud.google.com/docs/authentication/getting-started>`_.

#. **Start the Clickhouse Server docker image in another terminal**:
#. **Set the ``GOOGLE_BIGQUERY_PROJECT_ID`` environment variable**:

.. code:: sh
# Keeping this running as long as you want to test ibis
docker run --rm -p 9000:9000 --tty yandex/clickhouse-server
export GOOGLE_BIGQUERY_PROJECT_ID=your-project-id
#. **Load data**:
#. **Load data into BigQuery**:

.. code:: sh
testing/datamgr.py clickhouse
PostgreSQL
^^^^^^^^^^

PostgreSQL can be used from either the installation that resides on the Impala
docker image or from your machine directly.

Here's how to load test data into PostgreSQL:

.. code:: sh
testing/datamgr.py postgres
SQLite
^^^^^^

SQLite comes already installed on many systems. If you used the conda setup
instructions above, then SQLite will be available in the conda environment.

.. code:: sh
testing/datamgr.py sqlite
ci/datamgr.py bigquery
Running Tests
-------------
@@ -147,7 +128,7 @@ You are now ready to run the full ibis test suite:

.. code:: sh
pytest ibis
make test
Contribution Ideas
==================
6 changes: 4 additions & 2 deletions docs/source/extending.rst
@@ -26,12 +26,14 @@ Adding a New Expression
docker-compose run waiter
docker-compose run ibis ci/load-data.sh postgres
Here we show how to add a ``sha1`` method to the PostgreSQL backend:
Here we show how to add a ``sha1`` method to the PostgreSQL backend as well as
how to add a new ``bitwise_and`` reduction operation:

.. toctree::
:maxdepth: 1

notebooks/tutorial/9-Adding-a-new-expression.ipynb
notebooks/tutorial/9-Adding-a-new-elementwise-expression.ipynb
notebooks/tutorial/10-Adding-a-new-reduction-expression.ipynb


Adding a New Backend
73 changes: 28 additions & 45 deletions docs/source/getting-started.rst
@@ -4,21 +4,17 @@
Installation and Getting Started
********************************

Getting up and running with Ibis involves installing the Python package and
connecting to HDFS and Impala. If you don't have a Hadoop cluster available
with Impala, see :ref:`install.quickstart` below for instructions to use a VM
to get up and running quickly.

Installation
------------

System dependencies
System Dependencies
~~~~~~~~~~~~~~~~~~~

Ibis requires a working Python 2.7 or >= 3.4 installation. We recommend
`Anaconda <http://continuum.io/downloads>`_.
Ibis requires a working Python 2.7 or 3.5+ installation. We recommend using
`Anaconda <http://continuum.io/downloads>`_ to manage Python versions and
environments.

Installing the Python package
Installing the Python Package
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Install ibis using ``pip`` or ``conda``:
@@ -113,7 +109,7 @@ Create a client by passing a connection string or individual parameters to
... user='bob', port=23569, database='ibis_testing'
... )
.. _install.bigquery:
.. _install.clickhouse:

`Clickhouse <https://clickhouse.yandex/>`_ Quickstart
-----------------------------------------------------
@@ -132,6 +128,8 @@ Create a client by passing in database connection parameters such as ``host``,
>>> con = ibis.clickhouse.connect(host='localhost', port=9000)
.. _install.bigquery:

`BigQuery <https://cloud.google.com/bigquery/>`_ Quickstart
-----------------------------------------------------------

@@ -149,46 +147,31 @@ with:
>>> con = ibis.bigquery.connect(project_id='ibis-gbq', dataset_id='testing')
Learning resources
------------------

We are collecting Jupyter notebooks for learning here:
https://github.com/ibis-project/ibis/tree/master/docs/source/notebooks. Some of
these notebooks will be reproduced as part of the documentation.

.. _install.quickstart:
By default ibis assumes that the BigQuery project that's billed for queries is
also the project where the data lives.

However, it's very easy to query data that does **not** live in the billing
project.

Running Ibis Queries using Docker
---------------------------------
.. note::

Contributor `Krisztián Szűcs <https://github.com/kszucs>`_ has spent many hours
crafting a very easy-to-use ``docker-compose`` setup that enables users and
developers of ibis to get up and running quickly.
When you run queries against data from other projects **the billing project
will still be billed for any and all queries**.

Here are the steps:
If you want to query data that lives in a different project than the billing
project you can use the :meth:`~ibis.bigquery.client.BigQueryClient.database`
method of :class:`~ibis.bigquery.client.BigQueryClient` objects:

.. code-block:: python
.. code-block:: sh
# clone ibis
git clone https://github.com/ibis-project/ibis
# go to where the docker-compose file is
pushd ibis/ci
# build the latest version of ibis
docker-compose build --pull ibis
# spin up containers
docker-compose up -d --no-build postgres impala clickhouse
# wait for things to finish starting
docker-compose run waiter
>>> db = con.database('other-data-project.other-dataset')
>>> t = db.my_awesome_table
>>> t.sweet_column.sum().execute() # runs against the billing project
# load data into databases
docker-compose run ibis ci/load-data.sh
Learning Resources
------------------

# confirm that you can reach impala
impala_ip_address="$(docker inspect -f '{{.NetworkSettings.Networks.ci_default.IPAddress}}' ci_impala_1)"
ping -c 1 "${impala_ip_address}"
We collect Jupyter notebooks for learning how to use ibis here:
https://github.com/ibis-project/ibis/tree/master/docs/source/notebooks/tutorial.
Some of these notebooks will be reproduced as part of the documentation
:ref:`in the tutorial section <tutorial>`.
27 changes: 2 additions & 25 deletions docs/source/impala.rst
@@ -104,8 +104,8 @@ table itself has a method ``drop`` that you can use:
table.drop()
Expression execution and asynchronous queries
---------------------------------------------
Expression execution
--------------------

Ibis expressions have an ``execute`` method which compiles and runs the
expressions on Impala or whichever backend is being referenced.
@@ -122,29 +122,6 @@ For longer-running queries, if you press Control-C (or whatever triggers the
Python ``KeyboardInterrupt`` on your system), Ibis will attempt to cancel the
query in progress.

As of Ibis 0.5.0, there is an explicit asynchronous API:

.. ipython:: python
query = expr.execute(async=True)
With the returned ``AsyncQuery`` object, you have various methods available to
check on the status of the executing expression:

.. ipython:: python
import time
while not query.is_finished():
time.sleep(1)
query.is_finished()
query.get_result()
If the query is still running, you can attempt to cancel it:

.. code-block:: python
query.cancel()
Creating tables
---------------

Expand Down
27 changes: 13 additions & 14 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,13 @@ Or from `conda-forge <http://conda-forge.github.io>`_ with
At this time, Ibis offers some level of support for the following systems:

- `Apache Impala (incubating) <http://impala.io/>`_
- `Apache Kudu (incubating) <http://getkudu.io>`_
- PostgreSQL
- SQLite
- Google BigQuery (experimental)
- Yandex Clickhouse
- Direct execution of ibis expressions against pandas objects (Experimental)
- `Apache Kudu (incubating) <http://getkudu.io/>`_
- `PostgreSQL <https://www.postgresql.org/>`_
- `SQLite <https://www.sqlite.org/>`_
- `Google BigQuery <https://cloud.google.com/bigquery/>`_
- `Yandex Clickhouse <https://clickhouse.yandex/>`_
- Direct execution of ibis expressions against `Pandas
<http://pandas.pydata.org/>`_ objects

Coming from SQL? Check out :ref:`Ibis for SQL Programmers <sql>`.

Expand All @@ -66,14 +67,11 @@ Architecturally, Ibis features:

SQL engine support needing code contributors:

- Redshift
- Vertica
- Spark SQL
- Presto
- Hive

Since this is a young project, the documentation is definitely patchy in
places, but this will improve as things progress.
- `Redshift <https://aws.amazon.com/redshift/>`_
- `Vertica <https://www.vertica.com/>`_
- `Spark SQL <https://spark.apache.org/sql/>`_
- `Presto <https://prestodb.io/>`_
- `Hive <https://hive.apache.org/>`_

.. toctree::
:maxdepth: 1
Expand All @@ -85,6 +83,7 @@ places, but this will improve as things progress.
api
sql
udf
contributing
developer
design
extending
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,356 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extending Ibis Part 2: Adding a New Reduction Expression"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook will show you how to add a new *reduction* operation (`bitwise_and`) to an existing backend (PostgreSQL).\n",
"\n",
"A reduction operation is a function that maps $N$ rows to 1 row, for example the `sum` function."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Description\n",
"\n",
"We're going to add a **`bitwise_and`** function to ibis. `bitwise_and` computes the logical `AND` of the individual bits of an integer.\n",
"\n",
"For example,\n",
"\n",
"```\n",
" 0101\n",
" 0111\n",
" 0011\n",
"& 1101\n",
"------\n",
" 0001\n",
"```"
]
},
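{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, the worked example above can be reproduced with Python's built-in `&` operator:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# bitwise AND of the four example values, written as binary literals\n",
"0b0101 & 0b0111 & 0b0011 & 0b1101"
]
},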
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 1: Define the Operation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's define the `bitwise_and` operation as a function that takes any integer typed column as input and returns an integer\n",
"\n",
"```haskell\n",
"bitwise_and :: Column Int -> Int\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ibis.expr.datatypes as dt\n",
"import ibis.expr.rules as rlz\n",
"\n",
"from ibis.expr.operations import Reduction, Arg\n",
"\n",
"\n",
"class BitwiseAnd(Reduction):\n",
" arg = Arg(rlz.column(rlz.integer))\n",
" where = Arg(rlz.boolean, default=None)\n",
" output_type = rlz.scalar_like('arg')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We just defined a `BitwiseAnd` class that takes one integer column as input, and returns a scalar output of the same type as the input. This matches both the requirements of a reduction and the spepcifics of the function that we want to implement.\n",
"\n",
"**Note**: It is very important that you write the correct argument rules and output type here. The expression *will not work* otherwise."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 2: Define the API"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because every reduction in ibis has the ability to filter out values during aggregation (a typical feature in databases and analytics tools), to make an expression out of ``BitwiseAnd`` we need to pass an additional argument: `where` to our `BitwiseAnd` constructor."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from ibis.expr.types import IntegerColumn # not IntegerValue! reductions are only valid on columns\n",
"\n",
"\n",
"def bitwise_and(integer_column, where=None):\n",
" return BitwiseAnd(integer_column, where=where).to_expr()\n",
"\n",
"\n",
"IntegerColumn.bitwise_and = bitwise_and"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Interlude: Create some expressions using `bitwise_and`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ibis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"t = ibis.table([('bigint_col', 'int64'), ('string_col', 'string')], name='t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"t.bigint_col.bitwise_and()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"t.bigint_col.bitwise_and(t.string_col == '1')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 3: Turn the Expression into SQL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sqlalchemy as sa\n",
"\n",
"\n",
"@ibis.postgres.compiles(BitwiseAnd)\n",
"def compile_sha1(translator, expr):\n",
" # pull out the arguments to the expression\n",
" arg, where = expr.op().args\n",
" \n",
" # compile the argument\n",
" compiled_arg = translator.translate(arg)\n",
" \n",
" # call the appropriate postgres function\n",
" agg = sa.func.bit_and(compiled_arg)\n",
" \n",
" # handle a non-None filter clause\n",
" if where is not None:\n",
" return agg.filter(translator.translate(where))\n",
" return agg"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 4: Putting it all Together"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Connect to the `ibis_testing` database"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**NOTE:**\n",
"\n",
"To be able to execute the rest of this notebook you need to run the following command from your ibis clone:\n",
"\n",
"```sh\n",
"ci/build.sh\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"con = ibis.postgres.connect(\n",
" user='postgres',\n",
" host='postgres',\n",
" password='postgres',\n",
" database='ibis_testing'\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create and execute a `bitwise_and` expression"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"t = con.table('functional_alltypes')\n",
"t"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"expr = t.bigint_col.bitwise_and()\n",
"expr"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sql_expr = expr.compile()\n",
"print(sql_expr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"expr.execute()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Let's see what a `bitwise_and` call looks like with a `where` argument"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"expr = t.bigint_col.bitwise_and(where=(t.bigint_col == 10) | (t.bigint_col == 40))\n",
"expr"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result = expr.execute()\n",
"result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Let's confirm that taking bitwise `AND` of 10 and 40 is in fact 8"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"10 & 40"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(' {:0>8b}'.format(10))\n",
"print('& {:0>8b}'.format(40))\n",
"print('-' * 10)\n",
"print(' {:0>8b}'.format(10 & 40))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extending Ibis Part 1: Adding a New Expression"
"# Extending Ibis Part 1: Adding a New Elementwise Expression"
]
},
{
Expand All @@ -16,7 +16,7 @@
"1. Expressions (for example, by adding a new operation)\n",
"1. Backends\n",
"\n",
"This notebook will show you how to add a new operation (`sha1`) to an existing backend (BigQuery)."
"This notebook will show you how to add a new elementwise operation--`sha1`--to an existing backend (PostgreSQL)."
]
},
{
Expand All @@ -42,7 +42,7 @@
"Let's define the `sha` operation as a function that takes one string input argument and returns a hexidecimal string.\n",
"\n",
"```haskell\n",
"sha1 :: string -> string\n",
"sha1 :: String -> String\n",
"```"
]
},
Expand All @@ -53,15 +53,14 @@
"outputs": [],
"source": [
"import ibis.expr.datatypes as dt\n",
"import ibis.expr.rules as rlz\n",
"\n",
"from ibis.expr import rules\n",
"from ibis.expr.operations import ValueOp\n",
"from ibis.expr.operations import ValueOp, Arg\n",
"\n",
"\n",
"class SHA1(ValueOp):\n",
" \n",
" input_type = [rules.string]\n",
" output_type = rules.shape_like_arg(0, 'string')"
" arg = Arg(rlz.string)\n",
" output_type = rlz.shape_like('arg', 'string')"
]
},
{
Expand Down Expand Up @@ -130,7 +129,7 @@
"metadata": {},
"outputs": [],
"source": [
"t = ibis.table([('string_col', 'string')])"
"t = ibis.table([('string_col', 'string')], name='t')"
]
},
{
Expand Down Expand Up @@ -190,12 +189,10 @@
"source": [
"**NOTE:**\n",
"\n",
"To be able to execute the rest of this notebook you need to run:\n",
"To be able to execute the rest of this notebook you need to run the following command from your ibis clone:\n",
"\n",
"```sh\n",
"docker-compose up -d --no-build postgres impala clickhouse mysql dns\n",
"docker-compose run waiter\n",
"docker-compose run ibis ci/load-data.sh\n",
"ci/build.sh\n",
"```"
]
},
Expand All @@ -205,6 +202,7 @@
"metadata": {},
"outputs": [],
"source": [
"import ibis\n",
"con = ibis.postgres.connect(\n",
" database='ibis_testing', user='postgres', host='postgres', password='postgres')"
]
Expand All @@ -213,7 +211,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Register the [`pgcrypto`](https://www.postgresql.org/docs/10/static/pgcrypto.html) extension"
"### Register the pgcrypto extension\n",
"\n",
"See https://www.postgresql.org/docs/10/static/pgcrypto.html for details about this extension"
]
},
{
Expand All @@ -222,7 +222,9 @@
"metadata": {},
"outputs": [],
"source": [
"con.raw_sql('CREATE EXTENSION IF NOT EXISTS pgcrypto'); # we don't care about the output"
"# the output here is an AlchemyProxy instance that cannot iterate\n",
"# (because there's no output from the database) so we hide it with a semicolon\n",
"con.raw_sql('CREATE EXTENSION IF NOT EXISTS pgcrypto');"
]
},
{
Expand Down Expand Up @@ -332,7 +334,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down