9 changes: 4 additions & 5 deletions asv.conf.json
@@ -56,7 +56,6 @@
"numpy": [],
"pandas": [],
"toolz": [],
"six": [],
"multipledispatch": [],
"impyla": [],
"sqlalchemy": [],
@@ -90,10 +89,10 @@
// {"environment_type": "conda", "six": null}, // don't run without six on conda
// ],
//
"include": [
// additional env for python2.7
{"python": "2.7", "funcsigs": [], "enum34": [], "functools32": []}
],
// "include": [
// // additional env for python2.7
// {"python": "2.7", "funcsigs": [], "enum34": [], "functools32": []}
// ],

// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
159 changes: 159 additions & 0 deletions azure-pipelines.yml
@@ -0,0 +1,159 @@
jobs:
- job: WindowsTest
pool:
vmImage: 'VS2017-Win2016'
strategy:
maxParallel: 3
matrix:
py36:
python.version: "3.6"
conda.env: "ibis_3.6"
py37:
python.version: "3.7"
conda.env: "ibis_3.7"
steps:
- powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
displayName: Add conda to PATH

- script: |
conda config --set always_yes True --set show_channel_urls True
conda config --add channels conda-forge
displayName: 'Set conda configuration'
- script: conda update --all
displayName: 'Update conda'

- script: conda create --name $(conda.env) python=$(python.version) numpy pandas pytables ruamel.yaml jinja2 pyarrow multipledispatch pymysql sqlalchemy psycopg2 graphviz click mock plumbum flake8 pytest-xdist
displayName: 'Create conda environment'

- script: conda info
displayName: 'Show conda info'

- script: conda list --name $(conda.env)
displayName: 'Show installed packages'

- script: |
call activate $(conda.env)
python -c "import numpy; import pandas"
displayName: 'Import numpy and pandas'
- script: |
call activate $(conda.env)
flake8
displayName: 'Lint'
- script: choco install -y mariadb --version=10.3.11
displayName: 'Install mariadb (mysql) from chocolatey'

- script: '"C:\\Program Files\\MariaDB 10.3\\bin\\mysql" -u root -e "CREATE OR REPLACE USER ibis@localhost IDENTIFIED BY ''ibis''"'
displayName: 'Create ibis user and password in MySQL database'

- script: '"C:\\Program Files\\MariaDB 10.3\\bin\\mysql" -u root -e "GRANT ALL PRIVILEGES ON *.* TO ibis@localhost"'
displayName: 'Setup privileges for ibis user in MySQL'

- script: choco install -y postgresql10 --params '/Password:postgres'
displayName: 'Install postgres from chocolatey'

- script: |
call activate $(conda.env)
python setup.py develop
displayName: 'Install ibis'
- script: |
call activate $(conda.env)
python ci/datamgr.py download
displayName: 'Download data'
- script: |
call activate $(conda.env)
python ci/datamgr.py mysql
displayName: 'Load MySQL data'
- script: |
call activate $(conda.env)
python ci/datamgr.py postgres --psql-path="C:/Program Files/PostgreSQL/10/bin/psql.exe"
displayName: 'Load PostgreSQL data'
- script: |
call activate $(conda.env)
python ci/datamgr.py sqlite
displayName: 'Load SQLite data'
- script: |
call activate $(conda.env)
python ci/datamgr.py parquet -i
displayName: 'Load Parquet data'
- script: |
call activate $(conda.env)
pytest --tb=short --junitxml="junit-$(python.version).xml" -n auto -m "not backend and not clickhouse and not impala and not hdfs and not bigquery and not mapd and not mysql" -ra ibis
displayName: 'Run tests'
# publish test results
- task: PublishTestResults@2
displayName: 'Publish test results from pytest JUnitXML'
inputs:
testResultsFiles: junit-$(python.version).xml
testRunTitle: 'Publish test results'
mergeTestResults: False
condition: succeededOrFailed() # pass or fail, but not cancelled

#- job: WindowsCondaBuild
#pool:
#vmImage: 'VS2017-Win2016'
#strategy:
#maxParallel: 3
#matrix:
#py36:
#python.version: "3.6"
#py37:
#python.version: "3.7"
#steps:
#- powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
#displayName: Add conda to PATH

#- script: |
#conda config --set always_yes True
#conda update --all
#conda config --add channels conda-forge
#conda config --set show_channel_urls True
#conda create --quiet --name "ibis_build_$(python.version)" python=$(python.version)
#displayName: Create Anaconda environment

#- script: |
#call activate "ibis_build_$(python.version)"
#conda install python=$(python.version) conda-build click jinja2 ruamel.yaml plumbum
#displayName: Install dependencies

#- script: |
#call activate "ibis_build_$(python.version)"
#python setup.py develop
#displayName: 'Install ibis'

#- script: |
#call activate "ibis_build_$(python.version)"
#python ci/feedstock.py clone
#displayName: 'Clone conda-forge recipe'

#- script: |
#call activate "ibis_build_$(python.version)"
#python ci/feedstock.py update
#displayName: 'Update conda-forge recipe'

#- script: |
#call activate "ibis_build_$(python.version)"
#python ci/feedstock.py build --python=$(python.version)
#displayName: 'Build conda package from conda-forge recipe'

#- script: |
#call activate "ibis_build_$(python.version)"
#python ci/feedstock.py deploy C:/Miniconda/envs/ibis_build_$(python.version)/conda-bld conda win-64
#displayName: 'Copy conda package to artifact directory'

## publish sdist and wheel and conda package
#- task: PublishBuildArtifacts@1
#displayName: 'Publish conda package to Azure'
#inputs:
#pathToPublish: conda
#artifactName: conda
#condition: and(succeeded(), eq(variables['System.PullRequest.IsFork'], 'False'))
210 changes: 130 additions & 80 deletions benchmarks/benchmarks.py
@@ -5,74 +5,121 @@
import ibis.expr.datatypes as dt


def make_t(name='t'):
return ibis.table(
(
('_timestamp', 'int32'),
('dim1', 'int32'),
('dim2', 'int32'),
('valid_seconds', 'int32'),
('meas1', 'int32'),
('meas2', 'int32'),
('year', 'int32'),
('month', 'int32'),
('day', 'int32'),
('hour', 'int32'),
('minute', 'int32'),
),
name=name,
)


def make_base(t):
return (
(t.year > 2016)
| ((t.year == 2016) & (t.month > 6))
| ((t.year == 2016) & (t.month == 6) & (t.day > 6))
| ((t.year == 2016) & (t.month == 6) & (t.day == 6) & (t.hour > 6))
| (
(t.year == 2016)
& (t.month == 6)
& (t.day == 6)
& (t.hour == 6)
& (t.minute >= 5)
)
) & (
(t.year < 2016)
| ((t.year == 2016) & (t.month < 6))
| ((t.year == 2016) & (t.month == 6) & (t.day < 6))
| ((t.year == 2016) & (t.month == 6) & (t.day == 6) & (t.hour < 6))
| (
(t.year == 2016)
& (t.month == 6)
& (t.day == 6)
& (t.hour == 6)
& (t.minute <= 5)
)
)


def make_large_expr(t, base):
src_table = t[base]
src_table = src_table.mutate(
_timestamp=(src_table['_timestamp'] - src_table['_timestamp'] % 3600)
.cast('int32')
.name('_timestamp'),
valid_seconds=300,
)

aggs = []
for meas in ['meas1', 'meas2']:
aggs.append(src_table[meas].sum().cast('float').name(meas))
src_table = src_table.aggregate(
aggs, by=['_timestamp', 'dim1', 'dim2', 'valid_seconds']
)

part_keys = ['year', 'month', 'day', 'hour', 'minute']
ts_col = src_table['_timestamp'].cast('timestamp')
new_cols = {}
for part_key in part_keys:
part_col = getattr(ts_col, part_key)()
new_cols[part_key] = part_col
src_table = src_table.mutate(**new_cols)
return src_table[
[
'_timestamp',
'dim1',
'dim2',
'meas1',
'meas2',
'year',
'month',
'day',
'hour',
'minute',
]
]


class Suite:
def setup(self):
self.t = t = ibis.table((('_timestamp', 'int32'),
('dim1', 'int32'),
('dim2', 'int32'),
('valid_seconds', 'int32'),
('meas1', 'int32'),
('meas2', 'int32'),
('year', 'int32'),
('month', 'int32'),
('day', 'int32'),
('hour', 'int32'),
('minute', 'int32')), name='t')
self.base = (
(t.year > 2016) | (
(t.year == 2016) & (t.month > 6)) | (
(t.year == 2016) & (t.month == 6) &
(t.day > 6)) | (
(t.year == 2016) & (t.month == 6) &
(t.day == 6) & (t.hour > 6)) |
((t.year == 2016) & (t.month == 6) &
(t.day == 6) & (t.hour == 6) &
(t.minute >= 5))) & ((t.year < 2016) | (
(t.year == 2016) & (t.month < 6)) | (
(t.year == 2016) & (t.month == 6) &
(t.day < 6)) | (
(t.year == 2016) & (t.month == 6) &
(t.day == 6) & (t.hour < 6)) | (
(t.year == 2016) &
(t.month == 6) & (t.day == 6) &
(t.hour == 6) &
(t.minute <= 5)))
self.t = t = make_t()
self.base = make_base(t)
self.expr = self.large_expr

@property
def large_expr(self):
src_table = self.t[self.base]
src_table = src_table.mutate(_timestamp=(
src_table['_timestamp'] - src_table['_timestamp'] % 3600
).cast('int32').name('_timestamp'), valid_seconds=300)

aggs = []
for meas in ['meas1', 'meas2']:
aggs.append(src_table[meas].sum().cast('float').name(meas))
src_table = src_table.aggregate(
aggs, by=['_timestamp', 'dim1', 'dim2', 'valid_seconds'])

part_keys = ['year', 'month', 'day', 'hour', 'minute']
ts_col = src_table['_timestamp'].cast('timestamp')
new_cols = {}
for part_key in part_keys:
part_col = getattr(ts_col, part_key)()
new_cols[part_key] = part_col
src_table = src_table.mutate(**new_cols)
return src_table[[
'_timestamp', 'dim1', 'dim2', 'meas1', 'meas2',
'year', 'month', 'day', 'hour', 'minute'
]]
t = make_t()
return make_large_expr(t, make_base(t))


class Construction(Suite):

def time_large_expr_construction(self):
self.large_expr


class Formatting(Suite):
class Hashing(Suite):
def time_hash_small_expr(self):
hash(make_t())

def time_hash_medium_expr(self):
hash(make_base(make_t()))

def time_hash_large_expr(self):
hash(self.large_expr)


class Formatting(Suite):
def time_base_expr_formatting(self):
str(self.base)

@@ -81,7 +128,6 @@ def time_large_expr_formatting(self):


class Compilation(Suite):

def time_impala_base_compile(self):
ibis.impala.compile(self.base)

@@ -90,22 +136,24 @@ def time_impala_large_expr_compile(self):


class PandasBackend:

def setup(self):
n = 30 * int(2e5)
data = pd.DataFrame({
'key': np.random.choice(16000, size=n),
'low_card_key': np.random.choice(30, size=n),
'value': np.random.rand(n),
'timestamps': pd.date_range(
start='now', periods=n, freq='s'
).values,
'timestamp_strings': pd.date_range(
start='now', periods=n, freq='s'
).values.astype(str),
'repeated_timestamps': pd.date_range(
start='2018-09-01', periods=30).repeat(int(n / 30))
})
data = pd.DataFrame(
{
'key': np.random.choice(16000, size=n),
'low_card_key': np.random.choice(30, size=n),
'value': np.random.rand(n),
'timestamps': pd.date_range(
start='now', periods=n, freq='s'
).values,
'timestamp_strings': pd.date_range(
start='now', periods=n, freq='s'
).values.astype(str),
'repeated_timestamps': pd.date_range(
start='2018-09-01', periods=30
).repeat(int(n / 30)),
}
)

t = ibis.pandas.connect({'df': data}).table('df')

@@ -116,10 +164,10 @@ def setup(self):
self.cast_to_dates = t.timestamps.cast(dt.date)
self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

self.multikey_group_by_with_mutate = t.mutate(
dates=t.timestamps.cast('date')
).groupby(['low_card_key', 'dates']).aggregate(
avg_value=lambda t: t.value.mean()
self.multikey_group_by_with_mutate = (
t.mutate(dates=t.timestamps.cast('date'))
.groupby(['low_card_key', 'dates'])
.aggregate(avg_value=lambda t: t.value.mean())
)

self.simple_sort = t.sort_by([t.key])
@@ -128,20 +176,22 @@ def setup(self):

self.multikey_sort = t.sort_by(['low_card_key', 'key'])

self.multikey_sort_projection = t[[
'low_card_key', 'key', 'value'
]].sort_by(['low_card_key', 'key'])
self.multikey_sort_projection = t[
['low_card_key', 'key', 'value']
].sort_by(['low_card_key', 'key'])

low_card_window = ibis.trailing_range_window(
2 * ibis.day(),
ibis.interval(days=2),
order_by=t.repeated_timestamps,
group_by=t.low_card_key)
group_by=t.low_card_key,
)
self.low_card_grouped_rolling = t.value.mean().over(low_card_window)

high_card_window = ibis.trailing_range_window(
2 * ibis.day(),
ibis.interval(days=2),
order_by=t.repeated_timestamps,
group_by=t.key)
group_by=t.key,
)
self.high_card_grouped_rolling = t.value.mean().over(high_card_window)

def time_high_cardinality_group_by(self):
27 changes: 27 additions & 0 deletions ci/.env
@@ -0,0 +1,27 @@
IBIS_TEST_DATA_DIRECTORY=/tmp/ibis-testing-data
IBIS_TEST_SQLITE_DATABASE=/tmp/ibis_testing.db
IBIS_TEST_NN_HOST=impala
IBIS_TEST_IMPALA_HOST=impala
IBIS_TEST_IMPALA_PORT=21050
IBIS_TEST_WEBHDFS_PORT=50070
IBIS_TEST_WEBHDFS_USER=hdfs
IBIS_TEST_MYSQL_HOST=mysql
IBIS_TEST_MYSQL_PORT=3306
IBIS_TEST_MYSQL_USER=ibis
IBIS_TEST_MYSQL_PASSWORD=ibis
IBIS_TEST_MYSQL_DATABASE=ibis_testing
IBIS_TEST_POSTGRES_HOST=postgres
IBIS_TEST_POSTGRES_PORT=5432
IBIS_TEST_POSTGRES_USER=postgres
IBIS_TEST_POSTGRES_PASSWORD=postgres
IBIS_TEST_POSTGRES_DATABASE=ibis_testing
IBIS_TEST_CLICKHOUSE_HOST=clickhouse
IBIS_TEST_CLICKHOUSE_PORT=9000
IBIS_TEST_CLICKHOUSE_DATABASE=ibis_testing
IBIS_TEST_MAPD_HOST=mapd
IBIS_TEST_MAPD_PORT=9091
IBIS_TEST_MAPD_DATABASE=ibis_testing
IBIS_TEST_MAPD_USER=mapd
IBIS_TEST_MAPD_PASSWORD=HyperInteractive
GOOGLE_BIGQUERY_PROJECT_ID=ibis-gbq
GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcloud-service-key.json
29 changes: 0 additions & 29 deletions ci/Dockerfile

This file was deleted.

20 changes: 20 additions & 0 deletions ci/Dockerfile.dev
@@ -0,0 +1,20 @@
FROM continuumio/miniconda3

RUN apt-get -qq update --yes \
&& apt-get -qq install --yes --no-install-recommends \
build-essential git make clang libboost-dev postgresql-client ca-certificates \
&& rm -rf /var/lib/apt/lists/*

ARG PYTHON
ADD ci/requirements-dev.yml /

RUN conda config --add channels conda-forge \
&& conda update --all --yes --quiet \
&& conda install --yes --quiet --file /requirements-dev.yml python=$PYTHON conda-build \
&& conda clean --all --yes \
&& pip install pydata-google-auth

COPY . /ibis
WORKDIR /ibis

RUN python setup.py develop
19 changes: 19 additions & 0 deletions ci/Dockerfile.docs
@@ -0,0 +1,19 @@
ARG PYTHON
FROM ibis:$PYTHON

# fonts are for docs
RUN apt-get -qq update --yes \
&& apt-get -qq install --yes ttf-dejavu \
&& rm -rf /var/lib/apt/lists/*

ADD ci/requirements-docs.yml /
RUN conda config --add channels conda-forge \
&& conda update --all --yes --quiet \
&& conda install --yes --quiet --file /requirements-docs.yml python=$PYTHON conda-build \
&& conda clean --all --yes \
&& pip install pydata-google-auth

COPY . /ibis
WORKDIR /ibis

RUN python setup.py develop
4 changes: 3 additions & 1 deletion ci/asvconfig.py
@@ -14,5 +14,7 @@

machine_info = asv.machine.Machine.get_defaults()
machine_info['machine'] = hostname
machine_info['ram'] = '{:d}GB'.format(int(machine_info['ram']) // 1000000)
machine_info['ram'] = '{:d}GB'.format(
int(machine_info['ram']) // 1_000_000
)
print(json.dumps({hostname: machine_info, 'version': 1}, indent=2))
6 changes: 3 additions & 3 deletions ci/benchmark.sh
@@ -1,9 +1,9 @@
#!/usr/bin/env bash

CWD=$(dirname $0)
CWD="$(dirname "${0}")"

pip install asv
$CWD/asvconfig.py $1 | tee $HOME/.asv-machine.json
"${CWD}"/asvconfig.py "${1}" | tee "${HOME}"/.asv-machine.json
git remote add upstream https://github.com/ibis-project/ibis
git fetch upstream refs/heads/master
asv continuous -f 1.5 -e upstream/master $2 || echo > /dev/null
asv continuous -f 1.5 -e upstream/master "${2}" || echo > /dev/null
225 changes: 144 additions & 81 deletions ci/datamgr.py

Large diffs are not rendered by default.

57 changes: 23 additions & 34 deletions ci/docker-compose.yml
@@ -45,7 +45,7 @@ services:
- 25020:25020

clickhouse:
image: yandex/clickhouse-server:1.1.54388
image: yandex/clickhouse-server:18.12
ports:
- 8123:8123
- 9000:9000
@@ -75,9 +75,12 @@ services:
KUDU_MASTER: "false"

mapd:
image: mapd/mapd-ce-cpu:v3.6.0
image: mapd/mapd-ce-cpu:v4.4.2
ports:
- "9091-9092:9091-9092"
- 9090:9090
- 9091:9091
- 9092:9092
- 9093:9093
environment:
- MAPD_HOST=mapd
- MAPD_PORT=9091
@@ -104,40 +107,26 @@ services:
ibis:
image: ibis:${PYTHON_VERSION:-3.6}
environment:
IBIS_TEST_DOWNLOAD_DIRECTORY: /tmp
IBIS_TEST_DATA_DIRECTORY: /tmp/ibis-testing-data
IBIS_TEST_SQLITE_DATABASE: /tmp/ibis_testing.db
IBIS_TEST_NN_HOST: impala
IBIS_TEST_IMPALA_HOST: impala
IBIS_TEST_IMPALA_PORT: 21050
IBIS_TEST_WEBHDFS_PORT: 50070
IBIS_TEST_WEBHDFS_USER: hdfs
IBIS_TEST_MYSQL_HOST: mysql
IBIS_TEST_MYSQL_PORT: 3306
IBIS_TEST_MYSQL_USER: ibis
IBIS_TEST_MYSQL_PASSWORD: ibis
IBIS_TEST_MYSQL_DATABASE: ibis_testing
IBIS_TEST_POSTGRES_HOST: postgres
IBIS_TEST_POSTGRES_PORT: 5432
IBIS_TEST_POSTGRES_USER: postgres
IBIS_TEST_POSTGRES_PASSWORD: postgres
IBIS_TEST_POSTGRES_DATABASE: ibis_testing
IBIS_TEST_CLICKHOUSE_HOST: clickhouse
IBIS_TEST_CLICKHOUSE_PORT: 9000
IBIS_TEST_CLICKHOUSE_DATABASE: ibis_testing
IBIS_TEST_MAPD_HOST: mapd
IBIS_TEST_MAPD_PORT: 9091
IBIS_TEST_MAPD_DATABASE: ibis_testing
IBIS_TEST_MAPD_USER: mapd
IBIS_TEST_MAPD_PASSWORD: HyperInteractive
GOOGLE_BIGQUERY_PROJECT_ID: ibis-gbq
GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcloud-service-key.json
env_file:
- ./.env
volumes:
- /tmp/ibis:/tmp
- ..:/ibis
build:
context: ..
dockerfile: ci/Dockerfile.dev
args:
PYTHON: ${PYTHON_VERSION:-3.6}

ibis-docs:
image: ibis-docs:${PYTHON_VERSION:-3.6}
env_file:
- ./.env
volumes:
- /tmp/ibis:/tmp
- ..:/ibis
build:
context: ..
dockerfile: ci/Dockerfile
dockerfile: ci/Dockerfile.docs
args:
PYTHON: ${PYTHON_VERSION:-3.6}
ENVKIND: ${ENVKIND:-dev}
17 changes: 10 additions & 7 deletions ci/docs.sh
@@ -1,14 +1,17 @@
#!/bin/bash -e

export ENVKIND=docs
export PYTHON_VERSION="3.6"

docker-compose build --pull ibis
docker-compose run --rm ibis ping -c 1 quickstart.cloudera
docker-compose run --rm ibis rm -rf /tmp/docs.ibis-project.org
docker-compose run --rm ibis git clone \
docker-compose build ibis
docker-compose build ibis-docs

# TODO(kszucs): move the following commands in a single script
docker-compose run --rm ibis-docs ping -c 1 quickstart.cloudera
docker-compose run --rm ibis-docs rm -rf /tmp/docs.ibis-project.org
docker-compose run --rm ibis-docs git clone \
--branch gh-pages \
https://github.com/ibis-project/docs.ibis-project.org /tmp/docs.ibis-project.org

docker-compose run --rm ibis find /tmp/docs.ibis-project.org -maxdepth 1 ! -wholename /tmp/docs.ibis-project.org ! -name '*.git' ! -name '.' ! -name 'CNAME' ! -name '*.nojekyll' -exec rm -rf {} \;
docker-compose run --rm ibis sphinx-build -b html docs/source /tmp/docs.ibis-project.org -W -j auto -T
docker-compose run --rm ibis-docs find /tmp/docs.ibis-project.org -maxdepth 1 ! -wholename /tmp/docs.ibis-project.org ! -name '*.git' ! -name '.' ! -name 'CNAME' ! -name '*.nojekyll' -exec rm -rf {} \;
docker-compose run --rm ibis-docs conda list --export
docker-compose run --rm ibis-docs sphinx-build -b html docs/source /tmp/docs.ibis-project.org -W -T
75 changes: 49 additions & 26 deletions ci/feedstock.py
@@ -1,7 +1,11 @@
#!/usr/bin/env python

import os
import shutil
import sys
import tempfile

from pathlib import Path

import click
import ruamel.yaml
@@ -10,15 +14,15 @@
from plumbum.cmd import git, conda

import ibis
from ibis.compat import Path, PY2


IBIS_DIR = Path(__file__).parent.parent.absolute()


def render(path):
env = Environment(loader=FileSystemLoader(str(path.parent)))
template = env.get_template(path.name)
parent = str(path.parent)
env = Environment(loader=FileSystemLoader(parent))
template = env.get_template(path.name, parent=parent)
return template.render()


@@ -28,7 +32,7 @@ def cli():


default_repo = 'https://github.com/conda-forge/ibis-framework-feedstock'
default_dest = '/tmp/ibis-framework-feedstock'
default_dest = os.path.join(tempfile.gettempdir(), 'ibis-framework-feedstock')


@cli.command()
@@ -40,17 +44,27 @@ def clone(repo_uri, destination):

cmd = git['clone', repo_uri, destination]

cmd(stdout=click.get_binary_stream('stdout'),
stderr=click.get_binary_stream('stderr'))
cmd(
stdout=click.get_binary_stream('stdout'),
stderr=click.get_binary_stream('stderr'),
)


SCRIPT = (
'{{ PYTHON }} -m pip install . --no-deps --ignore-installed '
'--no-cache-dir -vvv'
)


@cli.command()
@click.argument('meta', default=default_dest + '/recipe/meta.yaml')
@click.argument(
'meta', default=os.path.join(default_dest, 'recipe', 'meta.yaml')
)
@click.option('--source-path', default=str(IBIS_DIR))
def update(meta, source_path):
path = Path(meta)

click.echo('\nUpdating {} recipe...'.format(path.parent))
click.echo('Updating {} recipe...'.format(path.parent))

content = render(path)
recipe = ruamel.yaml.round_trip_load(content)
@@ -59,47 +73,56 @@ def update(meta, source_path):
recipe['package']['version'] = ibis.__version__[1:]
recipe['source'] = {'path': source_path}

# XXX: because render will remove the {{ PYTHON }} variable
recipe['build']['script'] = SCRIPT

updated_content = ruamel.yaml.round_trip_dump(
recipe, default_flow_style=False)
recipe, default_flow_style=False, width=sys.maxsize
).strip()

if PY2:
updated_content = updated_content.decode('utf-8')
click.echo(updated_content)

path.write_text(updated_content)


@cli.command()
@click.argument('recipe', default=default_dest + '/recipe')
def build(recipe):
click.echo('\nBuilding {} recipe...'.format(recipe))

python_version = '.'.join(map(str, sys.version_info[:3]))
@click.argument('recipe', default=os.path.join(default_dest, 'recipe'))
@click.option(
'--python',
default='{}.{}'.format(sys.version_info.major, sys.version_info.minor),
)
def build(recipe, python):
click.echo('Building {} recipe...'.format(recipe))

cmd = conda['build', recipe,
'--channel', 'conda-forge',
'--python', python_version]
cmd = conda[
'build', recipe, '--channel', 'conda-forge', '--python', python
]

cmd(stdout=click.get_binary_stream('stdout'),
stderr=click.get_binary_stream('stderr'))
cmd(
stdout=click.get_binary_stream('stdout'),
stderr=click.get_binary_stream('stderr'),
)


@cli.command()
@click.argument('package_location', default='/opt/conda/conda-bld')
@click.argument('artifact_directory', default='/tmp/packages')
@click.argument('architectures', default=('linux-64', 'noarch'))
def deploy(package_location, artifact_directory, architectures):
@click.argument('architecture', default='linux-64')
def deploy(package_location, artifact_directory, architecture):
artifact_dir = Path(artifact_directory)
artifact_dir.mkdir(parents=True, exist_ok=True)
package_loc = Path(package_location)
assert package_loc.exists(), 'Path {} does not exist'.format(package_loc)

for architecture in architectures:
for architecture in (architecture, 'noarch'):
arch_artifact_directory = str(artifact_dir / architecture)
arch_package_directory = str(package_loc / architecture)
shutil.copytree(arch_package_directory, arch_artifact_directory)
cmd = conda['index', artifact_directory]
cmd(stdout=click.get_binary_stream('stdout'),
stderr=click.get_binary_stream('stderr'))
cmd(
stdout=click.get_binary_stream('stdout'),
stderr=click.get_binary_stream('stderr'),
)


@cli.command()
117 changes: 72 additions & 45 deletions ci/impalamgr.py
@@ -1,10 +1,12 @@
#!/usr/bin/env python

import concurrent.futures

import itertools
import os

from io import BytesIO
from pathlib import Path

import click
import toolz

@@ -13,14 +15,16 @@

import ibis

from ibis.compat import BytesIO, Path
from ibis.common import IbisError
from ibis.impala.tests.conftest import IbisTestEnv


SCRIPT_DIR = Path(__file__).parent.absolute()
DATA_DIR = Path(os.environ.get('IBIS_TEST_DATA_DIRECTORY',
SCRIPT_DIR / 'ibis-testing-data'))
DATA_DIR = Path(
os.environ.get(
'IBIS_TEST_DATA_DIRECTORY', SCRIPT_DIR / 'ibis-testing-data'
)
)


logger = ibis.util.get_logger('impalamgr')
Expand All @@ -40,7 +44,7 @@ def make_ibis_client(env):
port=env.webhdfs_port,
auth_mechanism=env.auth_mechanism,
verify=env.auth_mechanism not in ['GSSAPI', 'LDAP'],
user=env.webhdfs_user
user=env.webhdfs_user,
)
auth_mechanism = env.auth_mechanism
if auth_mechanism == 'GSSAPI' or auth_mechanism == 'LDAP':
Expand All @@ -50,7 +54,7 @@ def make_ibis_client(env):
port=env.impala_port,
auth_mechanism=env.auth_mechanism,
hdfs_client=hc,
pool_size=16
pool_size=16,
)


@@ -90,7 +94,8 @@ def can_build_udfs():

def is_impala_loaded(con):
return con.hdfs.exists(ENV.test_data_dir) and con.exists_database(
ENV.test_data_db)
ENV.test_data_db
)


def is_udf_loaded(con):
Expand All @@ -112,18 +117,20 @@ def create_test_database(con):

con.create_table(
'alltypes',
schema=ibis.schema([
('a', 'int8'),
('b', 'int16'),
('c', 'int32'),
('d', 'int64'),
('e', 'float'),
('f', 'double'),
('g', 'string'),
('h', 'boolean'),
('i', 'timestamp')
]),
database=ENV.test_data_db
schema=ibis.schema(
[
('a', 'int8'),
('b', 'int16'),
('c', 'int32'),
('d', 'int64'),
('e', 'float'),
('f', 'double'),
('g', 'string'),
('h', 'boolean'),
('i', 'timestamp'),
]
),
database=ENV.test_data_db,
)
logger.info('Created empty table %s.`alltypes`', ENV.test_data_db)

@@ -133,30 +140,42 @@ def create_table(table_name):
logger.info('Creating %s', table_name)
schema = schemas.get(table_name)
path = os.path.join(ENV.test_data_dir, 'parquet', table_name)
table = con.parquet_file(path, schema=schema, name=table_name,
database=ENV.test_data_db, persist=True)
table = con.parquet_file(
path,
schema=schema,
name=table_name,
database=ENV.test_data_db,
persist=True,
)
return table

parquet_files = con.hdfs.ls(os.path.join(ENV.test_data_dir, 'parquet'))
schemas = {
'functional_alltypes': ibis.schema(
[('id', 'int32'),
('bool_col', 'boolean'),
('tinyint_col', 'int8'),
('smallint_col', 'int16'),
('int_col', 'int32'),
('bigint_col', 'int64'),
('float_col', 'float'),
('double_col', 'double'),
('date_string_col', 'string'),
('string_col', 'string'),
('timestamp_col', 'timestamp'),
('year', 'int32'),
('month', 'int32')]),
[
('id', 'int32'),
('bool_col', 'boolean'),
('tinyint_col', 'int8'),
('smallint_col', 'int16'),
('int_col', 'int32'),
('bigint_col', 'int64'),
('float_col', 'float'),
('double_col', 'double'),
('date_string_col', 'string'),
('string_col', 'string'),
('timestamp_col', 'timestamp'),
('year', 'int32'),
('month', 'int32'),
]
),
'tpch_region': ibis.schema(
[('r_regionkey', 'int16'),
('r_name', 'string'),
('r_comment', 'string')])}
[
('r_regionkey', 'int16'),
('r_name', 'string'),
('r_comment', 'string'),
]
),
}
return (
executor.submit(create_table, table_name)
for table_name in parquet_files
Expand All @@ -168,8 +187,13 @@ def create_table(table_name):
logger.info('Creating %s', table_name)
schema = schemas[table_name]
path = os.path.join(ENV.test_data_dir, 'avro', table_name)
table = con.avro_file(path, schema, name=table_name,
database=ENV.test_data_db, persist=True)
table = con.avro_file(
path,
schema,
name=table_name,
database=ENV.test_data_db,
persist=True,
)
return table

avro_files = con.hdfs.ls(os.path.join(ENV.test_data_dir, 'avro'))
@@ -180,7 +204,10 @@ def create_table(table_name):
'fields': [
{'name': 'R_REGIONKEY', 'type': ['null', 'int']},
{'name': 'R_NAME', 'type': ['null', 'string']},
{'name': 'R_COMMENT', 'type': ['null', 'string']}]}}
{'name': 'R_COMMENT', 'type': ['null', 'string']},
],
}
}
return (
executor.submit(create_table, table_name) for table_name in avro_files
)
@@ -210,8 +237,7 @@ def upload_udfs(con):

@click.group(context_settings=dict(help_option_names=['-h', '--help']))
def main():
"""Manage test data for Ibis"""
pass
"""Manage impala test data for Ibis."""


@main.command()
@@ -227,7 +253,7 @@ def main():
'Path to testing data. This downloads data from Google Cloud Storage '
'if unset'
),
default=DATA_DIR
default=DATA_DIR,
)
@click.option(
'--overwrite', is_flag=True, help='Forces overwriting of data/UDFs'
@@ -296,9 +322,10 @@ def compute_stats(table):

@main.command()
@click.option(
'--test-data', is_flag=True,
'--test-data',
is_flag=True,
help='Cleanup Ibis test data, test database, and also the test UDFs if '
'they are stored in the test data directory/database'
'they are stored in the test data directory/database',
)
@click.option('--udfs', is_flag=True, help='Cleanup Ibis test UDFs only')
@click.option(
3 changes: 2 additions & 1 deletion ci/mapd.conf
@@ -1 +1,2 @@
enable-watchdog = false
enable-watchdog = false
cpu-buffer-mem-bytes = 1000000000
44 changes: 0 additions & 44 deletions ci/requirements-dev-2.7.yml

This file was deleted.

37 changes: 0 additions & 37 deletions ci/requirements-dev-3.5.yml

This file was deleted.

39 changes: 0 additions & 39 deletions ci/requirements-dev-3.6.yml

This file was deleted.

39 changes: 39 additions & 0 deletions ci/requirements-dev.yml
@@ -0,0 +1,39 @@
black
click
clickhouse-cityhash
clickhouse-driver
clickhouse-sqlalchemy
cmake
flake8
google-cloud-bigquery>=1.0.0
graphviz
impyla>=0.14.2
jinja2
lz4
multipledispatch>=0.6.0
mypy
numpy
pandas>=0.21
plumbum
pre_commit
psycopg2
pyarrow>=0.12
pymapd>=0.8.3
pymysql
pytables
pytest
pytest-cov
pytest-xdist
python-graphviz
python-hdfs>=2.0.16
pytz
regex
requests
ruamel.yaml
sqlalchemy
thrift
# required for impyla in case of py3
thriftpy
toolz
xorg-libxpm
xorg-libxrender
45 changes: 0 additions & 45 deletions ci/requirements-docs-3.6.yml

This file was deleted.

11 changes: 11 additions & 0 deletions ci/requirements-docs.yml
@@ -0,0 +1,11 @@
ipython
jupyter
matplotlib
nbconvert
nbsphinx
nomkl
numpydoc
# must pin again otherwise strange things happen
pyarrow>=0.12
sphinx-releases
sphinx_rtd_theme
10 changes: 10 additions & 0 deletions ci/schema/mapd.sql
@@ -70,3 +70,13 @@ CREATE TABLE functional_alltypes (
year_ INTEGER,
month_ INTEGER
);

DROP TABLE IF EXISTS geo;

CREATE TABLE geo (
id INTEGER,
geo_point POINT,
geo_linestring LINESTRING,
geo_polygon POLYGON,
geo_multipolygon MULTIPOLYGON
);
4 changes: 0 additions & 4 deletions ci/setup_docker_volume.sh
@@ -32,7 +32,3 @@ fi

mkdir -p /tmp/ibis
cp "${GOOGLE_APPLICATION_CREDENTIALS}" /tmp/ibis/gcloud-service-key.json
cp -rf "${IBIS_TEST_DATA_DIRECTORY}" /tmp/ibis

gzipprog="$([ "$(which pigz)" ] && echo pigz || echo gzip)"
tar -I "${gzipprog}" -cf /tmp/ibis/ibis-testing-data.tar.gz "${IBIS_TEST_DATA_DIRECTORY}" 2> /dev/null
2 changes: 1 addition & 1 deletion conftest.py
@@ -3,7 +3,7 @@
import sys
import pytest

from ibis.compat import Path
from pathlib import Path


collect_ignore = ['setup.py']
390 changes: 174 additions & 216 deletions dev/merge-pr.py
@@ -18,225 +18,183 @@

# Utility for creating well-formed pull request merges and pushing them to
# Apache.
# usage: ./apache-pr-merge.py (see config env vars below)
# usage: ./dev/merge-pr.py (see config env vars below)
#
# Lightly modified from version of this script in incubator-parquet-format

from __future__ import print_function
"""Command line tool for merging PRs."""

from requests.auth import HTTPBasicAuth
import requests

import os
import subprocess
import sys
import collections
import pathlib
import textwrap

from six.moves import input
import six

if __name__ == '__main__':
IBIS_HOME = os.path.abspath(__file__).rsplit("/", 2)[0]
PROJECT_NAME = 'ibis'
print("IBIS_HOME = " + IBIS_HOME)

# Remote name with the PR
PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "upstream")

# Remote name where results pushed
PUSH_REMOTE_NAME = os.environ.get("PUSH_REMOTE_NAME", "upstream")

GITHUB_BASE = "https://github.com/pandas-dev/" + PROJECT_NAME + "/pull"
GITHUB_API_BASE = "https://api.github.com/repos/pandas-dev/" + PROJECT_NAME

# Prefix added to temporary branches
BRANCH_PREFIX = "PR_TOOL"

os.chdir(IBIS_HOME)

auth_required = False

if auth_required:
GITHUB_USERNAME = os.environ['GITHUB_USER']
import getpass
GITHUB_PASSWORD = getpass.getpass('Enter github.com password for %s:'
% GITHUB_USERNAME)

def get_json_auth(url):
auth = HTTPBasicAuth(GITHUB_USERNAME, GITHUB_PASSWORD)
req = requests.get(url, auth=auth)
return req.json()

get_json = get_json_auth
else:
def get_json_no_auth(url):
req = requests.get(url)
return req.json()

get_json = get_json_no_auth

def fail(msg):
print(msg)
clean_up()
sys.exit(-1)

def run_cmd(cmd):
if isinstance(cmd, six.string_types):
cmd = cmd.split(' ')

try:
output = subprocess.check_output(cmd)
except subprocess.CalledProcessError as e:
# this avoids hiding the stdout / stderr of failed processes
print('Command failed: %s' % cmd)
print('With output:')
print('--------------')
print(e.output)
print('--------------')
raise e

if isinstance(output, six.binary_type):
output = output.decode('utf-8')
return output

def continue_maybe(prompt):
result = input("\n%s (y/n): " % prompt)
if result.lower() != "y":
fail("Okay, exiting")

original_head = run_cmd("git rev-parse HEAD")[:8]

def clean_up():
print("Restoring head pointer to %s" % original_head)
run_cmd("git checkout %s" % original_head)

branches = run_cmd("git branch").replace(" ", "").split("\n")

for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches):
print("Deleting local branch %s" % branch)
run_cmd("git branch -D %s" % branch)

# merge the requested PR and return the merge hash
def merge_pr(pr_num, target_ref):
pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num)
target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num,
target_ref.upper())
run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num,
pr_branch_name))
run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref,
target_branch_name))
run_cmd("git checkout %s" % target_branch_name)

had_conflicts = False
try:
run_cmd(['git', 'merge', pr_branch_name, '--squash'])
except Exception as e:
msg = ("Error merging: %s\nWould you like to "
"manually fix-up this merge?" % e)
continue_maybe(msg)
msg = ("Okay, please fix any conflicts and 'git add' "
"conflicting files... Finished?")
continue_maybe(msg)
had_conflicts = True

commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name,
'--pretty=format:%an <%ae>']).split("\n")
distinct_authors = sorted(set(commit_authors),
key=lambda x: commit_authors.count(x),
reverse=True)
primary_author = distinct_authors[0]
commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name,
'--pretty=format:%h [%an] %s']).split("\n\n")

merge_message_flags = []

merge_message_flags += ["-m", title]
if body is not None:
merge_message_flags += ["-m", '\n'.join(textwrap.wrap(body))]

authors = "\n".join(["Author: %s" % a for a in distinct_authors])

merge_message_flags += ["-m", authors]

if had_conflicts:
committer_name = run_cmd("git config --get user.name").strip()
committer_email = run_cmd("git config --get user.email").strip()
message = ("This patch had conflicts when merged, "
"resolved by\nCommitter: %s <%s>" %
(committer_name, committer_email))
merge_message_flags += ["-m", message]

# The string "Closes #%s" string is required for GitHub to correctly
# close the PR
merge_message_flags += [
"-m",
"Closes #%s from %s and squashes the following commits:"
% (pr_num, pr_repo_desc)]
for c in commits:
merge_message_flags += ["-m", c]

run_cmd(['git', 'commit',
'--no-verify', # do not run commit hooks
'--author="%s"' % primary_author] +
merge_message_flags)

continue_maybe("Merge complete (local ref %s). Push to %s?" % (
target_branch_name, PUSH_REMOTE_NAME))

try:
run_cmd('git push %s %s:%s' % (
PUSH_REMOTE_NAME, target_branch_name, target_ref))
except Exception as e:
clean_up()
fail("Exception while pushing: %s" % e)

merge_hash = run_cmd("git rev-parse %s" % target_branch_name)[:8]
clean_up()
print("Pull request #%s merged!" % pr_num)
print("Merge hash: %s" % merge_hash)
return merge_hash

branches = get_json("%s/branches" % GITHUB_API_BASE)
branch_names = filter(lambda x: x.startswith("branch-"),
[x['name'] for x in branches])

pr_num = input("Which pull request would you like to merge? (e.g. 34): ")
pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num))

url = pr["url"]
title = pr["title"]
body = pr["body"]
target_ref = pr["base"]["ref"]
user_login = pr["user"]["login"]
base_ref = pr["head"]["ref"]
pr_repo_desc = "%s/%s" % (user_login, base_ref)

if pr["merged"] is True:
print("Pull request {0} has already been merged, assuming "
"you want to backport".format(pr_num))
merge_commit_desc = run_cmd([
'git', 'log', '--merges', '--first-parent',
'--grep=pull request #%s' % pr_num, '--oneline']).split("\n")[0]
if merge_commit_desc == "":
fail("Couldn't find any merge commit for #{0}"
", you may need to update HEAD.".format(pr_num))

merge_hash = merge_commit_desc[:7]
message = merge_commit_desc[8:]

print("Found: %s" % message)
sys.exit(0)

if not bool(pr["mergeable"]):
msg = ("Pull request {0} is not mergeable in its current form.\n"
"Continue? (experts only!)".format(pr_num))
continue_maybe(msg)

print("\n=== Pull Request #%s ===" % pr_num)
print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % (
title, pr_repo_desc, target_ref, url))
continue_maybe("Proceed with merging pull request #%s?" % pr_num)

merged_refs = [target_ref]

merge_hash = merge_pr(pr_num, target_ref)
import click

import plumbum

from plumbum import cmd

import requests

IBIS_HOME = pathlib.Path(__file__).parent.parent
GITHUB_API_BASE = "https://api.github.com/repos/ibis-project/ibis"

git = cmd.git["-C", IBIS_HOME]


def merge_pr(
pr_num: int,
base_ref: str,
target_ref: str,
commit_title: str,
body: str,
pr_repo_desc: str,
original_head: str,
remote: str,
merge_method: str,
github_user: str,
password: str,
) -> None:
"""Merge a pull request."""
git_log = git["log", f"{remote}/{target_ref}..{base_ref}"]

commit_authors = git_log["--pretty=format:%an <%ae>"]().splitlines()
author_count = collections.Counter(commit_authors)
distinct_authors = [author for author, _ in author_count.most_common()]
commits = git_log["--pretty=format:%h [%an] %s"]().splitlines()

merge_message_pieces = []
if body:
merge_message_pieces.append("\n".join(textwrap.wrap(body)))
merge_message_pieces.extend(map("Author: {}".format, distinct_authors))

# The string f"Closes #{pull_request_number:d}" is required for GitHub to
# correctly close the PR
merge_message_pieces.append(
f"\nCloses #{pr_num:d} from {pr_repo_desc} and squashes the following "
"commits:\n"
)
merge_message_pieces += commits

commit_message = "\n".join(merge_message_pieces)
# PUT /repos/:owner/:repo/pulls/:number/merge
resp = requests.put(
f"{GITHUB_API_BASE}/pulls/{pr_num:d}/merge",
json=dict(
commit_title=commit_title,
commit_message=commit_message,
merge_method=merge_method,
),
auth=(github_user, password),
)
resp.raise_for_status()
if resp.status_code == 200:
resp_json = resp.json()
merged = resp_json["merged"]
assert merged is True, merged
click.echo(f"Pull request #{pr_num:d} successfully merged.")


@click.command()
@click.option(
"-p",
"--pull-request-number",
type=int,
prompt="Which pull request would you like to merge? (e.g., 34)",
help="The pull request number to merge.",
)
@click.option(
"-M",
"--merge-method",
type=click.Choice(("merge", "squash", "rebase")),
default="squash",
help="The method to use for merging the PR.",
show_default=True,
)
@click.option(
"-r",
"--remote",
default="upstream",
help="A valid git remote.",
show_default=True,
)
@click.option("-u", "--github-user", help="Your GitHub user name.")
@click.option(
"-P",
"--password",
help="Your GitHub password for authentication and authorization.",
)
def main(
pull_request_number: int,
merge_method: str,
remote: str,
github_user: str,
password: str,
) -> None: # noqa: D103
try:
git["fetch", remote]()
except plumbum.commands.processes.ProcessExecutionError as e:
raise click.ClickException(e.stderr)
try:
git["fetch", remote, f"pull/{pull_request_number:d}/head"]()
except plumbum.commands.processes.ProcessExecutionError as e:
raise click.ClickException(e.stderr)

original_head = git["rev-parse", "--abbrev-ref", "HEAD"]().strip()

if not original_head:
original_head = git["rev-parse", "HEAD"]().strip()

resp = requests.get(f"{GITHUB_API_BASE}/pulls/{pull_request_number:d}")
resp.raise_for_status()
pr_json = resp.json()

message = pr_json.get("message", None)
if message is not None and message.lower() == "not found":
raise click.ClickException(
f"PR {pull_request_number:d} does not exist."
)

if not pr_json["mergeable"]:
        raise click.ClickException(
            f"Pull request {pull_request_number:d} cannot be merged in its "
            "current form."
        )

url = pr_json["url"]
commit_title = pr_json["title"]
body = pr_json["body"]
target_ref = pr_json["base"]["ref"]
user_login = pr_json["user"]["login"]
base_ref = pr_json["head"]["ref"]
pr_repo_desc = f"{user_login}/{base_ref}"

click.echo(f"=== Pull Request #{pull_request_number:d} ===")
click.echo(
f"title\t{commit_title}\n"
f"source\t{pr_repo_desc}\n"
f"target\t{remote}/{target_ref}\n"
f"url\t{url}"
)

base_ref_commit = (
git["ls-remote", remote, f"refs/pull/{pull_request_number:d}/head"]()
.strip()
.split()[0]
)
merge_pr(
pull_request_number,
base_ref_commit,
target_ref,
commit_title,
body,
pr_repo_desc,
original_head,
remote,
merge_method,
github_user,
password,
)


if __name__ == "__main__":
main()
4 changes: 3 additions & 1 deletion docs/source/api.rst
@@ -225,8 +225,10 @@ These methods are available directly in the ``ibis`` module namespace.
expr_list
row_number
window
range_window
trailing_window
cumulative_window
trailing_range_window

.. _api.expr:

@@ -254,10 +256,10 @@ Table methods
.. autosummary::
:toctree: generated/

TableExpr.add_column
TableExpr.aggregate
TableExpr.count
TableExpr.distinct
TableExpr.drop
TableExpr.info
TableExpr.filter
TableExpr.get_column
27 changes: 17 additions & 10 deletions docs/source/conf.py
@@ -34,11 +34,15 @@
'sphinx.ext.mathjax',
'numpydoc',
'nbsphinx',

'IPython.sphinxext.ipython_directive',
'IPython.sphinxext.ipython_console_highlighting',
'releases',
]

releases_github_path = "ibis-project/ibis"
releases_unstable_prehistory = True
releases_document_name = ["release"]

autosummary_generate = glob.glob("*.rst")

# autosummary_generate = True
@@ -217,7 +221,7 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'Ibis.tex', 'Ibis Documentation', 'Ibis Developers', 'manual'),
('index', 'Ibis.tex', 'Ibis Documentation', 'Ibis Developers', 'manual')
]

# The name of an image file (relative to this directory) to place at the top of
@@ -242,17 +246,14 @@


# extlinks alias
extlinks = {'issue': ('https://github.com/ibis-project/ibis/issues/%s', '#')}
extlinks = {'ghissue': ('https://github.com/ibis-project/ibis/issues/%s', '#')}


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'ibis', 'Ibis Documentation',
['Ibis Developers'], 1)
]
man_pages = [('index', 'ibis', 'Ibis Documentation', ['Ibis Developers'], 1)]

# If true, show URL addresses after external links.
# man_show_urls = False
@@ -264,9 +265,15 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'Ibis', 'Ibis Documentation',
'Ibis Developers', 'Ibis', 'Pandas-like expressions for analytics',
'Miscellaneous'),
(
'index',
'Ibis',
'Ibis Documentation',
'Ibis Developers',
'Ibis',
'Pandas-like expressions for analytics',
'Miscellaneous',
)
]

# Documents to append as an appendix to all manuals.
4 changes: 2 additions & 2 deletions docs/source/contributing.rst
@@ -29,7 +29,7 @@ Here are the steps to clone the repo and run the test suite:
pushd ibis/ci
# start services, build ibis, and load data into databases
ENVKIND=docs ./build.sh
./build.sh
# optionally run all tests
ENVKIND=docs ./test.sh -m 'not udf' -n auto -o cache_dir=/tmp/.pytest_cache
./test.sh -m 'not udf' -n auto -o cache_dir=/tmp/.pytest_cache
14 changes: 8 additions & 6 deletions docs/source/developer.rst
@@ -35,14 +35,16 @@ Conda Environment Setup
# Create a conda environment ready for ibis development
# including building the documentation
conda env create --name ibis36 --file=ci/requirements-docs-3.6.yml
conda create -n ibis36 -c conda-forge --file=ci/requirements-dev.yml python=3.6
# Activate the conda environment
source activate ibis36
# Install ibis
make develop
*Note: the ``make develop`` command also installs a ``pre-commit`` Git hook.*


All-in-One Command
------------------
@@ -105,17 +107,17 @@ Before you begin, you must have a `Google Cloud Platform project
the `BigQuery API enabled
<https://console.cloud.google.com/flows/enableapi?apiid=bigquery>`_.

#. **Set up application default credentials by following the `getting started with
GCP authentication guide
<https://cloud.google.com/docs/authentication/getting-started>`_.**
#. Set up application default credentials by following the `getting started
with GCP authentication guide
<https://cloud.google.com/docs/authentication/getting-started>`_.

#. **Set the ``GOOGLE_BIGQUERY_PROJECT_ID`` environment variable**:
#. Set the ``GOOGLE_BIGQUERY_PROJECT_ID`` environment variable:

.. code:: sh
export GOOGLE_BIGQUERY_PROJECT_ID=your-project-id
#. **Load data into BigQuery**:
#. Load data into BigQuery:

.. code:: sh
60 changes: 60 additions & 0 deletions docs/source/getting-started.rst
@@ -168,6 +168,66 @@ method of :class:`~ibis.bigquery.client.BigQueryClient` objects:
>>> t = db.my_awesome_table
>>> t.sweet_column.sum().execute() # runs against the billing project
`Pandas <https://pandas.pydata.org/>`_ Quickstart
------------------------------------------------------

Ibis's Pandas backend is part of the core Ibis installation, so no additional dependencies are needed.

Create a client by supplying a dictionary of DataFrames using
:func:`~ibis.pandas.connect`. The keys become the table names:

.. code-block:: python
>>> con = ibis.pandas.connect({
... 'A': pandas.util.testing.makeDataFrame(),
... 'B': pandas.util.testing.makeDataFrame()
... })
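
Once connected, the regular Ibis expression API is available. A minimal
sketch, reusing the ``con`` object above (``A`` is both the table name and one
of the column names that ``makeDataFrame`` generates):

.. code-block:: python

   >>> t = con.table('A')
   >>> t.A.mean().execute()  # executed in memory by pandas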
.. _install.mapd:

`MapD <https://www.omnisci.com/>`_ Quickstart
------------------------------------------------------

Install dependencies for Ibis's MapD dialect:

::

pip install ibis-framework[mapd]

Create a client by passing in database connection parameters such as ``host``,
``port``, ``database``, ``user`` and ``password`` to
:func:`~ibis.mapd.connect`:

.. code-block:: python
>>> con = ibis.mapd.connect(
... host='localhost', database='mapd', port=9091,
... user='mapd', password='HyperInteractive'
... )
.. _install.mysql:

`MySQL <https://www.mysql.com/>`_ Quickstart
------------------------------------------------------

Install dependencies for Ibis's MySQL dialect:

::

pip install ibis-framework[mysql]

Create a client by passing a connection string or individual parameters to
:func:`~ibis.mysql.connect`:

.. code-block:: python
>>> con = ibis.mysql.connect(
... 'mysql://user:pass@host:port/my_database'
... )
>>> con = ibis.mysql.connect(
... user='bob', port=23569, database='ibis_testing'
... )
Learning Resources
------------------

6 changes: 4 additions & 2 deletions docs/source/index.rst
@@ -45,8 +45,8 @@ Or from `conda-forge <http://conda-forge.github.io>`_ with

At this time, Ibis offers some level of support for the following systems:

- `Apache Impala (incubating) <http://impala.io/>`_
- `Apache Kudu (incubating) <http://getkudu.io/>`_
- `Apache Impala <https://impala.apache.org/>`_
- `Apache Kudu <https://kudu.apache.org/>`_
- `PostgreSQL <https://www.postgresql.org/>`_
- `SQLite <https://www.sqlite.org/>`_
- `Google BigQuery <https://cloud.google.com/bigquery/>`_
@@ -88,7 +88,9 @@ SQL engine support needing code contributors:
design
extending
backends
roadmap
release
release-pre-1.0
legal


19 changes: 1 addition & 18 deletions docs/source/notebooks/tutorial/3-Projection-Join-Sort.ipynb
@@ -119,23 +119,6 @@
"proj2.limit(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Adding columns is a shortcut for projection. In Ibis, adding columns always produces a new table reference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"table2 = table.add_column(bigger_expr)\n",
"table2.limit(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -506,7 +489,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
"version": "3.6.7"
}
},
"nbformat": 4,
6 changes: 3 additions & 3 deletions docs/source/notebooks/tutorial/4-More-Value-Expressions.ipynb
@@ -488,7 +488,7 @@
"outputs": [],
"source": [
"table[table.timestamp_col < \n",
" (ibis.timestamp('2010-01-01') + ibis.month(3))].count()"
" (ibis.timestamp('2010-01-01') + ibis.interval(months=3))].count()"
]
},
{
Expand All @@ -497,7 +497,7 @@
"metadata": {},
"outputs": [],
"source": [
"expr = (table.timestamp_col + ibis.day(1) + ibis.hour(4)).name('offset')\n",
"expr = (table.timestamp_col + ibis.interval(days=1) + ibis.interval(hours=4)).name('offset')\n",
"table[table.timestamp_col, expr, ibis.now().name('current_time')].limit(10)"
]
}
@@ -518,7 +518,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
"version": "3.6.7"
}
},
"nbformat": 4,
Original file line number Diff line number Diff line change
@@ -55,9 +55,9 @@
"expr = (t2\n",
" [t2.bigint_col > 30]\n",
" .group_by('string_col')\n",
" .aggregate([t2.foo.min().name('min_foo'),\n",
" t2.foo.max().name('max_foo'),\n",
" t2.foo.sum().name('sum_foo')]))\n",
" .aggregate(min_foo=lambda t: t.foo.min(),\n",
" max_foo=lambda t: t.foo.max(),\n",
" sum_foo=lambda t: t.foo.sum()))\n",
"expr"
]
},
665 changes: 665 additions & 0 deletions docs/source/release-pre-1.0.rst

Large diffs are not rendered by default.

734 changes: 71 additions & 663 deletions docs/source/release.rst

Large diffs are not rendered by default.

155 changes: 155 additions & 0 deletions docs/source/roadmap.rst
@@ -0,0 +1,155 @@
.. _roadmap:

Roadmap
=======
This document is an outline of the next set of major efforts within ibis.

.. _long_term_goals:

Long Term Goals
---------------
This section outlines broader, longer-term goals for the project alongside a
few short-term goals. It provides direction for a few key areas of focus over
the next 1-2 years, possibly longer, depending on how much time the developers
of Ibis can devote to the project.

.. _compiler_structure:

Compiler Structure
~~~~~~~~~~~~~~~~~~

.. _separation_of_concerns:

Separation of Concerns
^^^^^^^^^^^^^^^^^^^^^^
The current architecture of the ibis compiler has a few key problems that need
to be addressed to ensure longevity and maintainability of the project going
forward.

The main structural problem is that there isn’t one place where expression
optimizations and transformations happen. Sometimes optimizations occur as an
expression is being built, and other times they occur on a whole expression.

The solution is to separate the expression construction and optimization into
two phases, and guarantee that a constructed expression, if compiled without
optimization, maps clearly to SQL constructs.

The optimization pass would happen just before compilation and would be free to
optimize whole expression trees.

This approach lets us optimize queries piece by piece, as opposed to having to
provide all optimization implementations in a single pull request.

.. _unifying_table_and_column_compilation:

Unifying Table and Column Compilation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Right now, it is very difficult to customize the way the operations underlying
table expressions are compiled. The logic to compile them is hard-coded in each
backend (or the compiler’s parent class). This needs to be addressed, if only
to ease the burden of implementing the UNNEST operation and make the codebase
easier to understand and maintain.

.. _depth:

Depth
~~~~~
"Depth" goals relate to enhancing Ibis to provide better support for
backend-specific functionality.

.. _backend_specific_operations:

Backend-Specific Operations
^^^^^^^^^^^^^^^^^^^^^^^^^^^
As the number of ibis users and use cases grows there will be an increasing
need for individual backends to support more exotic operations. Many SQL
databases have features that are unique only to themselves and often this is
why people will choose that technology over another. Ibis should support an API
that reflects the backend that underlies an expression and expose the
functionality of that specific backend.

A concrete example of this is the `FARM_FINGERPRINT
<https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#farm_fingerprint>`_
function in BigQuery.

It is unlikely that the main ValueExpr API will ever grow such a method, but a
BigQuery user shouldn’t be restricted to using only the methods this API
provides. Moreover, users should be able to bring their own methods to this API
without having to consult the ibis developers and without the addition of such
operations polluting the namespace of the main API.

One drawback to enabling this is that it provides an incentive for people to
define operations with a backend-specific spelling (presumably in the name of
expediency) that may actually be easily generalizable to or useful for other
backends. This behavior should be discouraged to the extent possible.

.. _standardize_udfs:

Standardize UDFs (User Defined Functions)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A few backends have support for UDFs. Impala, Pandas and BigQuery all have at
least some level of support for user-defined functions. This mechanism should
be extended to other backends where possible. We outline different approaches
to adding UDFs to the backends that are well-supported but currently do not
have a UDF implementation. Development of a standard interface for UDFs is
ideal, so that it’s easy for new backends to implement the interface.

.. _breadth:

Breadth
~~~~~~~
The major breadth-related question ibis faces is how to grow the number of
backends in a scalable, minimum-maintenance way.

Currently there is a test suite that runs across all backends that xfails tests
when a backend doesn’t implement a particular operation.

At minimum we need a way to display which backends implement which operations.
With the ability to provide custom operations we also need a way to display the
custom operations that each backend provides.

.. _backend_specific_goals:

Backend-Specific Goals
----------------------
These goals relate to specific backends.

.. _pandas:

Pandas
~~~~~~

.. _speed_up_grouped_rolling_and_simple_aggregations_using_numba:

Speed up grouped, rolling, and simple aggregations using numba
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Pandas aggregations are quite slow relative to an equivalent numba
implementation, for various reasons. Since ibis hides the implementation
details of a particular expression we can experiment with using different
aggregation implementations.

.. _dask:

Dask
~~~~

.. _implement_a_dask_backed:

Implement a Dask backend
^^^^^^^^^^^^^^^^^^^^^^^^
There is currently no way in ibis to easily parallelize a computation on a
single machine, let alone distribute a computation across machines.

Dask provides APIs for doing such things.

.. _spark:

Spark
~~~~~

.. _implement_a_spark_backend:

Implement a SparkSQL backend
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
SparkSQL provides a way to execute distributed SQL queries similar to other
backends supported by ibis such as Impala and BigQuery.
33 changes: 2 additions & 31 deletions docs/source/sql.rst
Original file line number Diff line number Diff line change
Expand Up @@ -198,34 +198,6 @@ Indeed:
print(ibis.impala.compile(expr))
A useful pattern you can try is that of the *function factory*, which allows
you to create functions that reference a field of interest:

.. code-block:: python
def mad(field):
def closure(table):
return table[field].abs().mean()
return closure
.. ipython:: python
:suppress:
def mad(field):
def closure(table):
return table[field].abs().mean()
return closure
Now you can do:

.. ipython:: python
expr = (t.group_by(['one', 'three'])
.aggregate(the_sum=t.two.sum())
.group_by('one')
.aggregate(mad=mad('the_sum')))
Filtering / ``WHERE``
---------------------

Expand Down Expand Up @@ -663,8 +635,7 @@ which yield boolean arrays:
agged = (expr
[expr.one.notnull()]
.group_by('is_valid')
.aggregate(expr.three.notnull()
.sum().name('three_count')))
.aggregate(three_count=lambda t: t.three.notnull().sum()))
print(ibis.impala.compile(agged))
``BETWEEN``
Expand Down Expand Up @@ -1216,7 +1187,7 @@ arithmetic. For example:

.. ipython:: python
expr = events[events.ts > (ibis.now() - ibis.year())]
expr = events[events.ts > (ibis.now() - ibis.interval(years=1))]
print(ibis.impala.compile(expr))
The implementation of each timedelta offset will depend on the query engine.
Expand Down
134 changes: 92 additions & 42 deletions docs/source/udf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,51 +10,96 @@ topic.
This section of the documentation will discuss some of the backend specific
details of user defined functions.

API
---
.. warning::

.. _udf.api:
The UDF API is provisional and subject to change.

.. warning::
.. _udf.pandas:

Pandas
------
Ibis supports defining three kinds of user-defined functions for operations on
expressions targeting the pandas backend: **element-wise**, **reduction**, and
**analytic**.

.. _udf.elementwise:

The UDF/UDAF API is experimental. It is provisional and subject to change.
Element-wise Functions
~~~~~~~~~~~~~~~~~~~~~~
An **element-wise** function is a function that takes N rows as input and
produces N rows of output. ``log``, ``exp``, and ``floor`` are examples of
element-wise functions.

The API for user defined *scalar* functions will look like this:
Here's how to define an element-wise function:

.. code-block:: python
@udf(input_type=[double], output_type=double)
import ibis.expr.datatypes as dt
from ibis.pandas import udf
@udf.elementwise(input_type=[dt.int64], output_type=dt.double)
def add_one(x):
return x + 1.0
.. _udf.reduction:

User defined *aggregate* functions are nearly identical, with the exception
of using the ``@udaf`` decorator instead of the ``@udf`` decorator.
Reduction Functions
~~~~~~~~~~~~~~~~~~~
A **reduction** is a function that takes N rows as input and produces 1 row
as output. ``sum``, ``mean`` and ``count`` are examples of reductions. In
the context of a ``GROUP BY``, reductions produce 1 row of output *per
group*.

Impala
------
Here's how to define a reduction function:

.. _udf.impala:
.. code-block:: python
TODO
import ibis.expr.datatypes as dt
from ibis.pandas import udf
Pandas
------
@udf.reduction(input_type=[dt.double], output_type=dt.double)
def double_mean(series):
return 2 * series.mean()
.. _udf.pandas:
.. _udf.analytic:

Analytic Functions
~~~~~~~~~~~~~~~~~~
An **analytic** function is like an **element-wise** function in that it
takes N rows as input and produces N rows of output. The key difference is
that analytic functions can be applied *per group* using window functions.
Z-score is an example of an analytic function.

Pandas supports defining both UDFs and UDAFs.
Here's how to define an analytic function:

When you define a UDF you automatically get support for applying that UDF in a
scalar context, as well as in any group by operation.
.. code-block:: python
When you define a UDAF you automatically get support for standard scalar
aggregations, group bys, *as well as* any supported windowing operation.
import ibis.expr.datatypes as dt
from ibis.pandas import udf
@udf.analytic(input_type=[dt.double], output_type=dt.double)
def zscore(series):
return (series - series.mean()) / series.std()
Details of Pandas UDFs
~~~~~~~~~~~~~~~~~~~~~~
- :ref:`Element-wise functions <udf.elementwise>` automatically provide support
for applying your UDF to any combination of scalar values and columns.
- :ref:`Reduction functions <udf.reduction>` automatically provide support for
whole column aggregations, grouped aggregations, and application of your
function over a window.
- :ref:`Analytic functions <udf.analytic>` work in both grouped and non-grouped
settings.
- The objects you receive as input arguments are either ``pandas.Series`` or
Python/NumPy scalars.

The API for these functions is the same as described above.
.. note::

The objects you receive as input arguments are either ``pandas.Series`` or
python or numpy scalars depending on the operation.
Any keyword arguments must be given a default value or the function **will
not work**.

A common Python convention is to set the default value to ``None`` and
handle setting it to something not ``None`` in the body of the function.

Using ``add_one`` from above as an example, the following call will receive a
``pandas.Series`` for the ``x`` argument:
Expand All @@ -74,32 +119,34 @@ And this will receive the ``int`` 1:
>>> expr = add_one(1)
Finally, since the pandas backend passes around ``**kwargs`` you can accept
``**kwargs`` in your function:
Since the pandas backend passes around ``**kwargs`` you can accept ``**kwargs``
in your function:

.. code-block:: python
@udf([double], double)
def add_one(x, **kwargs):
return x + 1.0
import ibis.expr.datatypes as dt
from ibis.pandas import udf
Or you can leave them out as we did in the example above. You can also
optionally accept *specific* keyword arguments. This requires knowledge of how
the pandas backend works for it to be useful:
@udf.elementwise([dt.int64], dt.double)
def add_two(x, **kwargs):
# do stuff with kwargs
return x + 2.0
.. note::

Any keyword arguments (other than ``**kwargs``) must be given a default
value or the UDF/UDAF **will not work**. A standard Python convention is to
set the default value to ``None``.
Or you can leave them out as we did in the example above. You can also
optionally accept specific keyword arguments.

For example:

.. code-block:: python
@udf([double], double)
def add_one(x, scope=None):
return x + 1.0
import ibis.expr.datatypes as dt
from ibis.pandas import udf
@udf.elementwise([dt.int64], dt.double)
def add_two_with_none(x, y=None):
if y is None:
y = 2.0
return x + y
BigQuery
--------
Expand All @@ -108,7 +155,7 @@ BigQuery

.. note::

BigQuery only supports scalar UDFs at this time.
BigQuery only supports element-wise UDFs at this time.

BigQuery supports UDFs through JavaScript. Ibis provides support for this by
turning Python code into JavaScript.
Expand All @@ -117,7 +164,10 @@ The interface is very similar to the pandas UDF API:

.. code-block:: python
@udf([double], double)
import ibis.expr.datatypes as dt
from ibis.bigquery import udf
@udf([dt.double], dt.double)
def my_bigquery_add_one(x):
return x + 1.0
Expand Down
135 changes: 66 additions & 69 deletions ibis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,142 +1,139 @@
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# flake8: noqa
import sys
from multipledispatch import halt_ordering, restart_ordering

import ibis.config_init
import ibis.util as util
import ibis.expr.api as api
import ibis.expr.types as ir

from ibis.config import options
from contextlib import suppress

import ibis.config_init # noqa: F401
import ibis.util as util # noqa: F401
import ibis.expr.api as api # noqa: F401
import ibis.expr.types as ir # noqa: F401

from ibis.config import options # noqa: F401
from ibis.common import IbisError
from ibis.compat import suppress
from ibis.filesystems import HDFS, WebHDFS
from ibis.filesystems import HDFS, WebHDFS # noqa: F401

# __all__ is defined
from ibis.expr.api import *

# speeds up signature registration
halt_ordering()
from ibis.expr.api import * # noqa: F401,F403

# pandas backend is mandatory
import ibis.pandas.api as pandas
import ibis.pandas.api as pandas # noqa: F401

with suppress(ImportError):
# pip install ibis-framework[csv]
import ibis.file.csv as csv
import ibis.file.csv as csv # noqa: F401

with suppress(ImportError):
# pip install ibis-framework[parquet]
import ibis.file.parquet as parquet
import ibis.file.parquet as parquet # noqa: F401

with suppress(ImportError):
# pip install ibis-framework[hdf5]
import ibis.file.hdf5 as hdf5
import ibis.file.hdf5 as hdf5 # noqa: F401

with suppress(ImportError):
# pip install ibis-framework[impala]
import ibis.impala.api as impala
import ibis.impala.api as impala # noqa: F401

with suppress(ImportError):
# pip install ibis-framework[sqlite]
import ibis.sql.sqlite.api as sqlite
import ibis.sql.sqlite.api as sqlite # noqa: F401

with suppress(ImportError):
# pip install ibis-framework[postgres]
import ibis.sql.postgres.api as postgres
import ibis.sql.postgres.api as postgres # noqa: F401

with suppress(ImportError):
# pip install ibis-framework[mysql]
import ibis.sql.mysql.api as mysql
import ibis.sql.mysql.api as mysql # noqa: F401

with suppress(ImportError):
# pip install ibis-framework[clickhouse]
import ibis.clickhouse.api as clickhouse
import ibis.clickhouse.api as clickhouse # noqa: F401

with suppress(ImportError):
# pip install ibis-framework[bigquery]
import ibis.bigquery.api as bigquery
import ibis.bigquery.api as bigquery # noqa: F401

with suppress(ImportError):
# pip install ibis-framework[mapd]
if sys.version_info.major < 3:
raise ImportError('The MapD backend is not supported under Python 2.')
import ibis.mapd.api as mapd
import ibis.mapd.api as mapd # noqa: F401

restart_ordering()


def hdfs_connect(host='localhost', port=50070, protocol='webhdfs',
use_https='default', auth_mechanism='NOSASL',
verify=True, **kwds):
"""
Connect to HDFS
def hdfs_connect(
host='localhost',
port=50070,
protocol='webhdfs',
use_https='default',
auth_mechanism='NOSASL',
verify=True,
session=None,
**kwds,
):
"""Connect to HDFS.
Parameters
----------
host : string, Host name of the HDFS NameNode
port : int, NameNode's WebHDFS port (default 50070)
protocol : {'webhdfs'}
use_https : boolean, default 'default'
host : str
Host name of the HDFS NameNode
port : int
NameNode's WebHDFS port
protocol : str,
The protocol used to communicate with HDFS. The only valid value is
``'webhdfs'``.
use_https : bool
Connect to WebHDFS with HTTPS, otherwise plain HTTP. For secure
authentication, the default for this is True, otherwise False
auth_mechanism : string, Set to NOSASL or PLAIN for non-secure clusters.
authentication, the default for this is True, otherwise False.
auth_mechanism : str
Set to NOSASL or PLAIN for non-secure clusters.
Set to GSSAPI or LDAP for Kerberos-secured clusters.
verify : boolean, Set to False to turn off verifying SSL certificates.
(default True)
verify : bool
Set to :data:`False` to turn off verifying SSL certificates.
session : Optional[requests.Session]
A custom :class:`requests.Session` object.
Other keywords are forwarded to hdfs library classes
Notes
-----
Other keywords are forwarded to HDFS library classes.
Returns
-------
client : WebHDFS
WebHDFS
"""
import requests
session = kwds.setdefault('session', requests.Session())

if session is None:
session = requests.Session()
session.verify = verify
if auth_mechanism in ['GSSAPI', 'LDAP']:
if auth_mechanism in ('GSSAPI', 'LDAP'):
if use_https == 'default':
prefix = 'https'
else:
prefix = 'https' if use_https else 'http'
try:
import requests_kerberos
import requests_kerberos # noqa: F401
except ImportError:
raise IbisError(
"Unable to import requests-kerberos, which is required for "
"Kerberos HDFS support. Install it by executing `pip install "
"requests-kerberos` or `pip install hdfs[kerberos]`.")
"requests-kerberos` or `pip install hdfs[kerberos]`."
)
from hdfs.ext.kerberos import KerberosClient

# note SSL
url = '{0}://{1}:{2}'.format(prefix, host, port)
kwds.setdefault('mutual_auth', 'OPTIONAL')
hdfs_client = KerberosClient(url, **kwds)
hdfs_client = KerberosClient(url, session=session, **kwds)
else:
if use_https == 'default':
prefix = 'http'
else:
prefix = 'https' if use_https else 'http'
from hdfs.client import InsecureClient
url = '{0}://{1}:{2}'.format(prefix, host, port)
hdfs_client = InsecureClient(url, **kwds)

url = '{}://{}:{}'.format(prefix, host, port)
hdfs_client = InsecureClient(url, session=session, **kwds)
return WebHDFS(hdfs_client)


from ._version import get_versions
from ._version import get_versions # noqa: E402

__version__ = get_versions()['version']
del get_versions
121 changes: 75 additions & 46 deletions ibis/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by githubs download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
Expand Down Expand Up @@ -57,6 +56,7 @@ def decorate(f):
HANDLERS[vcs] = {}
HANDLERS[vcs][method] = f
return f

return decorate


Expand All @@ -67,9 +67,12 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False):
try:
dispcmd = str([c] + args)
# remember shell=False, so use git.cmd on windows, not just git
p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE,
stderr=(subprocess.PIPE if hide_stderr
else None))
p = subprocess.Popen(
[c] + args,
cwd=cwd,
stdout=subprocess.PIPE,
stderr=(subprocess.PIPE if hide_stderr else None),
)
break
except EnvironmentError:
e = sys.exc_info()[1]
Expand Down Expand Up @@ -99,12 +102,17 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
dirname = os.path.basename(root)
if not dirname.startswith(parentdir_prefix):
if verbose:
print("guessing rootdir is '%s', but '%s' doesn't start with "
"prefix '%s'" % (root, dirname, parentdir_prefix))
print(
"guessing rootdir is '%s', but '%s' doesn't start with "
"prefix '%s'" % (root, dirname, parentdir_prefix)
)
raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
return {"version": dirname[len(parentdir_prefix):],
"full-revisionid": None,
"dirty": False, "error": None}
return {
"version": dirname[len(parentdir_prefix) :],
"full-revisionid": None,
"dirty": False,
"error": None,
}


@register_vcs_handler("git", "get_keywords")
Expand Down Expand Up @@ -144,7 +152,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
# starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
# just "foo-1.0". If we see a "tag: " prefix, prefer those.
TAG = "tag: "
tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
if not tags:
# Either we're using git < 1.8.3, or there really are no tags. We use
# a heuristic: assume all version tags have a digit. The old git %d
Expand All @@ -155,25 +163,30 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
# "stabilization", as well as "HEAD" and "master".
tags = set([r for r in refs if re.search(r'\d', r)])
if verbose:
print("discarding '%s', no digits" % ",".join(refs-tags))
print("discarding '%s', no digits" % ",".join(refs - tags))
if verbose:
print("likely tags: %s" % ",".join(sorted(tags)))
for ref in sorted(tags):
# sorting will prefer e.g. "2.0" over "2.0rc1"
if ref.startswith(tag_prefix):
r = ref[len(tag_prefix):]
r = ref[len(tag_prefix) :]
if verbose:
print("picking %s" % r)
return {"version": r,
"full-revisionid": keywords["full"].strip(),
"dirty": False, "error": None
}
return {
"version": r,
"full-revisionid": keywords["full"].strip(),
"dirty": False,
"error": None,
}
# no suitable tags, so version is "0+unknown", but full hex is still there
if verbose:
print("no suitable tags, using unknown + full revision id")
return {"version": "0+unknown",
"full-revisionid": keywords["full"].strip(),
"dirty": False, "error": "no suitable tags"}
return {
"version": "0+unknown",
"full-revisionid": keywords["full"].strip(),
"dirty": False,
"error": "no suitable tags",
}


@register_vcs_handler("git", "pieces_from_vcs")
Expand All @@ -193,9 +206,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
GITS = ["git.cmd", "git.exe"]
# if there is a tag, this yields TAG-NUM-gHEX[-dirty]
# if there are no tags, this yields HEX[-dirty] (no NUM)
describe_out = run_command(GITS, ["describe", "--tags", "--dirty",
"--always", "--long"],
cwd=root)
describe_out = run_command(
GITS, ["describe", "--tags", "--dirty", "--always", "--long"], cwd=root
)
# --long was added in git-1.5.5
if describe_out is None:
raise NotThisMethod("'git describe' failed")
Expand All @@ -218,7 +231,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
dirty = git_describe.endswith("-dirty")
pieces["dirty"] = dirty
if dirty:
git_describe = git_describe[:git_describe.rindex("-dirty")]
git_describe = git_describe[: git_describe.rindex("-dirty")]

# now we have TAG-NUM-gHEX or HEX

Expand All @@ -227,8 +240,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
if not mo:
# unparseable. Maybe git-describe is misbehaving?
pieces["error"] = ("unable to parse git-describe output: '%s'"
% describe_out)
pieces["error"] = (
"unable to parse git-describe output: '%s'" % describe_out
)
return pieces

# tag
Expand All @@ -237,10 +251,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
if verbose:
fmt = "tag '%s' doesn't start with prefix '%s'"
print(fmt % (full_tag, tag_prefix))
pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
% (full_tag, tag_prefix))
pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
full_tag,
tag_prefix,
)
return pieces
pieces["closest-tag"] = full_tag[len(tag_prefix):]
pieces["closest-tag"] = full_tag[len(tag_prefix) :]

# distance: number of commits since tag
pieces["distance"] = int(mo.group(2))
Expand All @@ -251,8 +267,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
else:
# HEX: no tags
pieces["closest-tag"] = None
count_out = run_command(GITS, ["rev-list", "HEAD", "--count"],
cwd=root)
count_out = run_command(
GITS, ["rev-list", "HEAD", "--count"], cwd=root
)
pieces["distance"] = int(count_out) # total number of commits

return pieces
Expand Down Expand Up @@ -281,8 +298,7 @@ def render_pep440(pieces):
rendered += ".dirty"
else:
# exception #1
rendered = "0+untagged.%d.g%s" % (pieces["distance"],
pieces["short"])
rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
return rendered
Expand Down Expand Up @@ -389,10 +405,12 @@ def render_git_describe_long(pieces):

def render(pieces, style):
if pieces["error"]:
return {"version": "unknown",
"full-revisionid": pieces.get("long"),
"dirty": None,
"error": pieces["error"]}
return {
"version": "unknown",
"full-revisionid": pieces.get("long"),
"dirty": None,
"error": pieces["error"],
}

if not style or style == "default":
style = "pep440" # the default
Expand All @@ -412,8 +430,12 @@ def render(pieces, style):
else:
raise ValueError("unknown style '%s'" % style)

return {"version": rendered, "full-revisionid": pieces["long"],
"dirty": pieces["dirty"], "error": None}
return {
"version": rendered,
"full-revisionid": pieces["long"],
"dirty": pieces["dirty"],
"error": None,
}


def get_versions():
Expand All @@ -426,8 +448,9 @@ def get_versions():
verbose = cfg.verbose

try:
return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
verbose)
return git_versions_from_keywords(
get_keywords(), cfg.tag_prefix, verbose
)
except NotThisMethod:
pass

Expand All @@ -439,9 +462,12 @@ def get_versions():
for i in cfg.versionfile_source.split('/'):
root = os.path.dirname(root)
except NameError:
return {"version": "0+unknown", "full-revisionid": None,
"dirty": None,
"error": "unable to find root of source tree"}
return {
"version": "0+unknown",
"full-revisionid": None,
"dirty": None,
"error": "unable to find root of source tree",
}

try:
pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
Expand All @@ -455,6 +481,9 @@ def get_versions():
except NotThisMethod:
pass

return {"version": "0+unknown", "full-revisionid": None,
"dirty": None,
"error": "unable to compute version"}
return {
"version": "0+unknown",
"full-revisionid": None,
"dirty": None,
"error": "unable to compute version",
}
1 change: 1 addition & 0 deletions ibis/bigquery/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ibis.bigquery.udf.api import udf # noqa: F401
68 changes: 53 additions & 15 deletions ibis/bigquery/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
import google.cloud.bigquery # noqa: F401 fail early if bigquery is missing
"""BigQuery public API."""

from typing import Optional

import google.cloud.bigquery # noqa: F401, fail early if bigquery is missing
import google.auth.credentials
import pydata_google_auth

import ibis.common as com

from ibis.config import options # noqa: F401
Expand All @@ -11,43 +18,74 @@
pass


__all__ = ('compile', 'connect', 'verify', 'udf')


def compile(expr, params=None):
"""
Force compilation of expression as though it were an expression depending
on BigQuery. Note you can also call expr.compile()
"""Compile an expression for BigQuery.
Returns
-------
compiled : string
compiled : str
See Also
--------
ibis.expr.types.Expr.compile
"""
from ibis.bigquery.compiler import to_sql

return to_sql(expr, dialect.make_context(params=params))


def verify(expr, params=None):
"""
Determine if expression can be successfully translated to execute on
BigQuery
"""
"""Check if an expression can be compiled using BigQuery."""
try:
compile(expr, params=params)
return True
except com.TranslationError:
return False


def connect(project_id, dataset_id, credentials=None):
"""Create a BigQueryClient for use with Ibis
SCOPES = ["https://www.googleapis.com/auth/bigquery"]
CLIENT_ID = (
"546535678771-gvffde27nd83kfl6qbrnletqvkdmsese.apps.googleusercontent.com"
)
CLIENT_SECRET = "iU5ohAF2qcqrujegE3hQ1cPt"


def connect(
project_id: Optional[str] = None,
dataset_id: Optional[str] = None,
credentials: Optional[google.auth.credentials.Credentials] = None,
) -> BigQueryClient:
"""Create a BigQueryClient for use with Ibis.
Parameters
----------
project_id: str
dataset_id: str
credentials : google.auth.credentials.Credentials, optional, default None
project_id : str
A BigQuery project id.
dataset_id : str
A dataset id that lives inside of the project indicated by
`project_id`.
credentials : google.auth.credentials.Credentials
Returns
-------
BigQueryClient
"""
if credentials is None:
credentials_cache = pydata_google_auth.cache.ReadWriteCredentialsCache(
filename="ibis.json"
)
credentials, project_id = pydata_google_auth.default(
SCOPES,
client_id=CLIENT_ID,
client_secret=CLIENT_SECRET,
credentials_cache=credentials_cache,
)

return BigQueryClient(project_id, dataset_id, credentials=credentials)
return BigQueryClient(
project_id, dataset_id=dataset_id, credentials=credentials
)
Loading