Showing with 8,979 additions and 4,753 deletions.
  1. +8 −0 .coveragerc
  2. +1 −0 .gitignore
  3. +25 −0 .landscape.yaml
  4. +3 −0 Makefile
  5. +46 −12 README.md
  6. +22 −8 conda-recipes/ibis-framework/meta.yaml
  7. +31 −20 dev/merge-pr.py
  8. +53 −8 docs/source/api.rst
  9. +13 −12 docs/source/configuration.rst
  10. +15 −9 docs/source/getting-started.rst
  11. +25 −29 docs/source/impala-udf.rst
  12. +46 −6 docs/source/index.rst
  13. +90 −0 docs/source/release.rst
  14. +28 −18 ibis/__init__.py
  15. +305 −37 ibis/client.py
  16. +7 −6 ibis/cloudpickle.py
  17. +30 −0 ibis/compat.py
  18. +1 −0 ibis/config_init.py
  19. +16 −27 ibis/expr/analysis.py
  20. +322 −9 ibis/expr/api.py
  21. +48 −15 ibis/expr/datatypes.py
  22. +5 −3 ibis/expr/format.py
  23. +24 −9 ibis/expr/groupby.py
  24. +91 −61 ibis/expr/operations.py
  25. +56 −20 ibis/expr/rules.py
  26. +15 −0 ibis/expr/tests/conftest.py
  27. +230 −2 ibis/expr/tests/mocks.py
  28. +21 −3 ibis/expr/tests/test_format.py
  29. +5 −0 ibis/expr/tests/test_interactive.py
  30. +80 −7 ibis/expr/tests/test_table.py
  31. +2 −2 ibis/expr/tests/test_temporal.py
  32. +4 −5 ibis/expr/tests/test_value_exprs.py
  33. +70 −1 ibis/expr/tests/test_window_functions.py
  34. +85 −164 ibis/expr/types.py
  35. +31 −5 ibis/expr/window.py
  36. +32 −138 ibis/filesystems.py
  37. +66 −26 ibis/impala/api.py
  38. +356 −269 ibis/impala/client.py
  39. +440 −220 ibis/{sql/exprs.py → impala/compiler.py}
  40. +140 −88 ibis/impala/ddl.py
  41. +91 −0 ibis/impala/madlib.py
  42. +150 −0 ibis/impala/tests/common.py
  43. +85 −1 ibis/impala/tests/test_client.py
  44. +49 −13 ibis/impala/tests/test_ddl.py
  45. +906 −7 ibis/impala/tests/test_exprs.py
  46. +59 −0 ibis/impala/tests/test_madlib.py
  47. +4 −4 ibis/impala/tests/test_pandas_interop.py
  48. +2 −1 ibis/impala/tests/test_partition.py
  49. +33 −0 ibis/impala/tests/test_sql.py
  50. +280 −281 ibis/impala/tests/test_udf.py
  51. +22 −1 ibis/{sql → impala}/tests/test_window.py
  52. +203 −142 ibis/impala/udf.py
  53. +1 −1 ibis/server.py
  54. +847 −0 ibis/sql/alchemy.py
  55. +581 −80 ibis/sql/compiler.py
  56. +0 −170 ibis/sql/context.py
  57. +0 −451 ibis/sql/ddl.py
  58. 0 ibis/sql/postgres/__init__.py
  59. 0 ibis/sql/postgres/tests/__init__.py
  60. +15 −0 ibis/sql/postgres/tests/conftest.py
  61. +44 −0 ibis/sql/sqlite/api.py
  62. +108 −0 ibis/sql/sqlite/client.py
  63. +183 −0 ibis/sql/sqlite/compiler.py
  64. +59 −0 ibis/sql/sqlite/tests/common.py
  65. +15 −0 ibis/sql/sqlite/tests/conftest.py
  66. +97 −0 ibis/sql/sqlite/tests/test_client.py
  67. +339 −0 ibis/sql/sqlite/tests/test_functions.py
  68. +15 −0 ibis/sql/tests/conftest.py
  69. +928 −678 ibis/sql/tests/test_compiler.py
  70. +0 −903 ibis/sql/tests/test_exprs.py
  71. +537 −0 ibis/sql/tests/test_sqlalchemy.py
  72. +6 −9 ibis/sql/transforms.py
  73. +7 −9 ibis/tasks.py
  74. +39 −5 ibis/tests/conftest.py
  75. +64 −42 ibis/tests/test_filesystems.py
  76. +2 −1 ibis/tests/test_server.py
  77. +9 −10 ibis/tests/test_tasks.py
  78. +0 −120 ibis/tests/util.py
  79. +0 −39 ibis/uda.py
  80. +3 −2 ibis/util.py
  81. +2 −1 requirements.txt
  82. +1 −1 scripts/airline.py
  83. +31 −11 scripts/run_jenkins.sh
  84. +186 −48 scripts/test_data_admin.py
  85. +5 −3 setup.py
  86. +1 −3 testing/udf/CMakeLists.txt
  87. +0 −136 testing/udf/hyperloglog-uda.cc
  88. +54 −102 testing/udf/uda-sample.cc
  89. +14 −93 testing/udf/uda-sample.h
  90. +11 −0 testing/udf/udf-sample.cc
  91. +3 −0 testing/udf/udf-sample.h
  92. +0 −146 testing/udf/variance-uda.cc
8 changes: 8 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[run]
branch: True
omit: ibis/tests/*,
ibis/*/tests/*,
ibis/cloudpickle.py
ibis/tasks.py
ibis/server.py
ibis/wire.py
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ dist
*.egg-info
# coverage
.coverage
coverage.xml

# OS generated files
.directory
Expand Down
25 changes: 25 additions & 0 deletions .landscape.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
strictness: medium

output-format: grouped

test-warnings: true
doc-warnings: false

ignore-paths:
- dev/merge-pr.py
- ibis/wire.py

ignore-patterns:
- ^build
- ^scripts

pep8:
run: true
options:
max-line-length: 79

mccabe:
run: false

pylint:
run: false
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@ all:

impala-test:
pushd scripts && python load_test_data.py --udf && popd

clean-pyc:
find . -name "*.pyc" -exec rm -rf {} \;
58 changes: 46 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,51 @@
Ibis
===
[![codecov.io](http://codecov.io/github/cloudera/ibis/coverage.svg?branch=master)](http://codecov.io/github/cloudera/ibis?branch=master)

Ibis is a Python data analysis library enabling a 100% Python user workflow on
top of big data systems for maximum performance and scalability. It was
co-created by the creator of pandas (github.com/pydata/pandas) and designed to
have a familiar user interface for folks used to small data on single machines
in Python.
# Ibis: Python data analysis framework for Hadoop and SQL engines

Ibis targets Impala (github.com/cloudera/impala) as a first class execution
engine, but may grow to encompass other backend systems in the future.
Install Ibis from PyPI with:

Learn much more at http://ibis-project.org
$ pip install ibis-framework

Install ibis from PyPI with:
Ibis is a Python data analysis library with a handful of related goals:

$ pip install ibis-framework
- Enable data analysts to translate analytics on SQL engines into
Python code instead of SQL code.
- Provide high level analytics APIs and workflow tools to accelerate
productivity.
- Provide high performance extensions for the Impala MPP query engine to enable
high performance Python code to operate in a scalable Hadoop-like environment
- Abstract away database-specific SQL differences
- Integrate with the Python data ecosystem using the above tools

At this time, Ibis supports the following SQL-based systems:

- Impala (on HDFS)
- SQLite

Ibis is being designed and led by the creator of pandas
(github.com/pydata/pandas) and is intended to have a familiar user interface
for folks used to small data on single machines in Python.

Architecturally, Ibis features:

- A pandas-like domain specific language (DSL) designed specifically for
analytics, aka **Ibis expressions**, that enable composable, reusable
analytics on structured data. If you can express something with a SQL SELECT
query, you can write it with Ibis.
- A translation system that targets multiple SQL systems
- Tools for wrapping user-defined functions in Impala and eventually other SQL
engines

SQL engine support on the near horizon:

- PostgreSQL
- Redshift
- Vertica
- Spark SQL
- Presto
- Hive
- MySQL / MariaDB

Read the project blog at http://blog.ibis-project.org.

Learn much more at http://ibis-project.org.
30 changes: 22 additions & 8 deletions conda-recipes/ibis-framework/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
package:
name: ibis-framework
version: "0.3.0"
version: "0.4.0"

source:
fn: ibis-framework-0.3.0.tar.gz
url: https://pypi.python.org/packages/source/i/ibis-framework/ibis-framework-0.3.0.tar.gz
md5: 5aafdf43711df3fa45df3e17f68e46d5
fn: ibis-framework-0.4.0.tar.gz
url: https://pypi.python.org/packages/source/i/ibis-framework/ibis-framework-0.4.0.tar.gz
md5: 87323b54e070a538912cbb8d8174854a

requirements:
build:
Expand All @@ -14,28 +14,42 @@ requirements:
- pytest
- numpy >=1.7.0
- pandas >=0.12.0
- impyla >=0.9.1
- impyla >=0.10.0
- psutil ==0.6.1
- hdfs >=1.4.0
- hdfs ==1.4.3
- six

run:
- python
- pytest
- numpy >=1.7.0
- pandas >=0.12.0
- impyla >=0.9.1
- impyla >=0.10.0
- psutil ==0.6.1
- hdfs >=1.4.0
- hdfs ==1.4.3
- six

test:
imports:
- ibis
- ibis.expr
- ibis.expr.tests
- ibis.hive
- ibis.hive.tests
- ibis.impala
- ibis.impala.tests
- ibis.spark
- ibis.spark.tests
- ibis.sql
- ibis.sql.presto
- ibis.sql.presto.tests
- ibis.sql.redshift
- ibis.sql.redshift.tests
- ibis.sql.sqlite
- ibis.sql.sqlite.tests
- ibis.sql.tests
- ibis.sql.vertica
- ibis.sql.vertica.tests
- ibis.tests

about:
Expand Down
51 changes: 31 additions & 20 deletions dev/merge-pr.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,20 @@
#
# Lightly modified from version of this script in incubator-parquet-format

from __future__ import print_function

from requests.auth import HTTPBasicAuth
import requests

import json
import os
import six
import subprocess
import sys
import urllib2
import textwrap

IBIS_HOME = os.path.abspath(__file__).rsplit("/", 2)[0]
PROJECT_NAME = 'ibis'
print "IBIS_HOME = " + IBIS_HOME
print("IBIS_HOME = " + IBIS_HOME)

# Remote name with the PR
PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "upstream")
Expand Down Expand Up @@ -73,14 +74,14 @@ def get_json_no_auth(url):


def fail(msg):
print msg
print(msg)
clean_up()
sys.exit(-1)


def run_cmd(cmd):
# py2.6 does not have subprocess.check_output
if isinstance(cmd, basestring):
if isinstance(cmd, six.string_types):
cmd = cmd.split(' ')

popenargs = [cmd]
Expand All @@ -107,13 +108,13 @@ def continue_maybe(prompt):


def clean_up():
print "Restoring head pointer to %s" % original_head
print("Restoring head pointer to %s" % original_head)
run_cmd("git checkout %s" % original_head)

branches = run_cmd("git branch").replace(" ", "").split("\n")

for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches):
print "Deleting local branch %s" % branch
print("Deleting local branch %s" % branch)
run_cmd("git branch -D %s" % branch)


Expand Down Expand Up @@ -167,13 +168,15 @@ def merge_pr(pr_num, target_ref):
for c in commits:
merge_message_flags += ["-m", c]

run_cmd(['git', 'commit', '--author="%s"' % primary_author] + merge_message_flags)
run_cmd(['git', 'commit', '--author="%s"' % primary_author] +
merge_message_flags)

continue_maybe("Merge complete (local ref %s). Push to %s?" % (
target_branch_name, PUSH_REMOTE_NAME))

try:
run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, target_ref))
run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name,
target_ref))
except Exception as e:
clean_up()
fail("Exception while pushing: %s" % e)
Expand All @@ -190,17 +193,20 @@ def cherry_pick(pr_num, merge_hash, default_branch):
if pick_ref == "":
pick_ref = default_branch

pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, pick_ref.upper())
pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num,
pick_ref.upper())

run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, pick_branch_name))
run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref,
pick_branch_name))
run_cmd("git checkout %s" % pick_branch_name)
run_cmd("git cherry-pick -sx %s" % merge_hash)

continue_maybe("Pick complete (local ref %s). Push to %s?" % (
pick_branch_name, PUSH_REMOTE_NAME))

try:
run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref))
run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name,
pick_ref))
except Exception as e:
clean_up()
fail("Exception while pushing: %s" % e)
Expand All @@ -214,7 +220,8 @@ def cherry_pick(pr_num, merge_hash, default_branch):


def fix_version_from_branch(branch, versions):
# Note: Assumes this is a sorted (newest->oldest) list of un-released versions
# Note: Assumes this is a sorted (newest->oldest) list of un-released
# versions
if branch == "master":
return versions[0]
else:
Expand All @@ -223,7 +230,8 @@ def fix_version_from_branch(branch, versions):


branches = get_json("%s/branches" % GITHUB_API_BASE)
branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches])
branch_names = filter(lambda x: x.startswith("branch-"),
[x['name'] for x in branches])
# Assumes branch names can be sorted lexicographically
# latest_branch = sorted(branch_names, reverse=True)[0]

Expand All @@ -239,23 +247,25 @@ def fix_version_from_branch(branch, versions):
pr_repo_desc = "%s/%s" % (user_login, base_ref)

if pr["merged"] is True:
print "Pull request %s has already been merged, assuming you want to backport" % pr_num
print("Pull request {0} has already been merged, assuming "
"you want to backport".format(pr_num))
merge_commit_desc = run_cmd([
'git', 'log', '--merges', '--first-parent',
'--grep=pull request #%s' % pr_num, '--oneline']).split("\n")[0]
if merge_commit_desc == "":
fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num)
fail("Couldn't find any merge commit for #{0}"
", you may need to update HEAD.".format(pr_num))

merge_hash = merge_commit_desc[:7]
message = merge_commit_desc[8:]

print "Found: %s" % message
print("Found: %s" % message)
maybe_cherry_pick(pr_num, merge_hash, latest_branch)
sys.exit(0)

if not bool(pr["mergeable"]):
msg = "Pull request %s is not mergeable in its current form.\n" % pr_num + \
"Continue? (experts only!)"
msg = ("Pull request {0} is not mergeable in its current form.\n"
"Continue? (experts only!)".format(pr_num))
continue_maybe(msg)

print ("\n=== Pull Request #%s ===" % pr_num)
Expand All @@ -269,4 +279,5 @@ def fix_version_from_branch(branch, versions):

pick_prompt = "Would you like to pick %s into another branch?" % merge_hash
while raw_input("\n%s (y/n): " % pick_prompt).lower() == "y":
merged_refs = merged_refs + [cherry_pick(pr_num, merge_hash, latest_branch)]
merged_refs = merged_refs + [cherry_pick(pr_num, merge_hash,
latest_branch)]
Loading