300 changes: 0 additions & 300 deletions docs/source/conf.py

This file was deleted.

134 changes: 0 additions & 134 deletions docs/source/index.rst

This file was deleted.

154 changes: 0 additions & 154 deletions docs/source/tutorial/data/Create-geography-database.ipynb

This file was deleted.

Binary file removed docs/source/tutorial/data/geography.db
Binary file not shown.
1 change: 0 additions & 1 deletion docs/source/tutorial/data/geography.json

This file was deleted.

17 changes: 0 additions & 17 deletions docs/source/tutorial/index.rst

This file was deleted.

99 changes: 0 additions & 99 deletions docs/source/user_guide/configuration.rst

This file was deleted.

211 changes: 0 additions & 211 deletions docs/source/user_guide/design.rst

This file was deleted.

72 changes: 0 additions & 72 deletions docs/source/user_guide/extending/index.rst

This file was deleted.

24 changes: 0 additions & 24 deletions docs/source/user_guide/index.rst

This file was deleted.

137 changes: 0 additions & 137 deletions docs/source/user_guide/self_joins.rst

This file was deleted.

1,264 changes: 0 additions & 1,264 deletions docs/source/user_guide/sql.rst

This file was deleted.

96 changes: 0 additions & 96 deletions docs/source/user_guide/topk.rst

This file was deleted.

21 changes: 0 additions & 21 deletions docs/source/user_guide/udf.rst

This file was deleted.

29 changes: 29 additions & 0 deletions docs/sqlalchemy_example.py
@@ -0,0 +1,29 @@
import sqlalchemy as sa

# Declare the two tables and the columns referenced below so the attribute
# access (c.c.permalink, i.c.investor_name, ...) resolves.
c = sa.table(
    "companies",
    sa.column("permalink"),
    sa.column("status"),
)
i = sa.table(
    "investments",
    sa.column("company_permalink"),
    sa.column("investor_name"),
)

a = (
    sa.select(
        [
            sa.case(
                [(i.c.investor_name.is_(None), "NO INVESTOR")],
                else_=i.c.investor_name,
            ).label("investor_name"),
            sa.func.count(c.c.permalink.distinct()).label("num_investments"),
            sa.func.count(
                sa.case(
                    [(c.c.status.in_(("ipo", "acquired")), c.c.permalink)]
                ).distinct()
            ).label("acq_ipos"),
        ]
    )
    .select_from(
        c.join(
            i, onclause=c.c.permalink == i.c.company_permalink, isouter=True
        )
    )
    .group_by(1)
    .order_by(sa.desc(2))
)
expr = sa.select([(a.c.acq_ipos / a.c.num_investments).label("acq_rate")])
File renamed without changes
65 changes: 65 additions & 0 deletions docs/stylesheets/extra.css
@@ -0,0 +1,65 @@
:root {
--md-admonition-icon--experimental: url('data:image/svg+xml;charset=utf-8,<svg aria-hidden="true" focusable="false" data-prefix="fas" data-icon="flask" class="svg-inline--fa fa-flask fa-w-14" role="img" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path fill="currentColor" d="M437.2 403.5L320 215V64h8c13.3 0 24-10.7 24-24V24c0-13.3-10.7-24-24-24H120c-13.3 0-24 10.7-24 24v16c0 13.3 10.7 24 24 24h8v151L10.8 403.5C-18.5 450.6 15.3 512 70.9 512h306.2c55.7 0 89.4-61.5 60.1-108.5zM137.9 320l48.2-77.6c3.7-5.2 5.8-11.6 5.8-18.4V64h64v160c0 6.9 2.2 13.2 5.8 18.4l48.2 77.6h-172z"></path></svg>');
}
.md-typeset .admonition.experimental,
.md-typeset details.experimental {
  border-color: rgb(43, 155, 70);
}

.md-typeset .experimental > .admonition-title,
.md-typeset .experimental > summary {
  background-color: rgba(43, 155, 70, 0.1);
  border-color: rgb(43, 155, 70);
}

.md-typeset .experimental > .admonition-title::before,
.md-typeset .experimental > summary::before {
  background-color: rgb(43, 155, 70);
  -webkit-mask-image: var(--md-admonition-icon--experimental);
  mask-image: var(--md-admonition-icon--experimental);
}

.verified {
  color: #00c853;
}

.unverified {
  color: #ff9100;
}

.bug {
  color: #f50057;
}

.cancel {
  color: #ff5252;
}

.download-button {
  text-align: center;
}

.support-matrix .md-typeset__table {
  display: table;
  min-width: 100%;
}

.support-matrix .md-typeset table:not([class]) {
  display: table;
  min-width: 100%;
}

body
  > div.md-container
  > main
  > div
  > div.md-content
  > article
  > div.md-typeset__scrollwrap
  > div
  > table
  > thead
  > tr
  > th:nth-child(1) {
  min-width: 9.8rem;
}

59 changes: 59 additions & 0 deletions docs/user_guide/configuration.md
@@ -0,0 +1,59 @@
# Configuring Ibis

Ibis configuration happens through the `ibis.options` attribute. Attributes can
be read and set like class attributes.

## Interactive mode

Ibis out of the box is in _developer mode_. Expressions display their internal
details when printed to the console. For a better interactive experience, set
the `interactive` option:

```python
ibis.options.interactive = True
```

This will cause expressions to be executed immediately when printed to the
console.
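
For example, with a backend connection in hand (the connection `con` and the
table name below are hypothetical), printing an expression runs it and shows
rows instead of the expression tree:

```python
import ibis

ibis.options.interactive = True

t = con.table("my_table")  # `con` is an assumed, pre-existing connection
t.limit(5)  # executed immediately; the first five rows are printed
```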

## SQL Query Execution

If an Ibis table expression has no row limit set using the `limit` API, a
default one is applied to prevent too much data from being retrieved from the
query engine. The default is currently 10000 rows, but this can be configured
with the `sql.default_limit` option:

```python
ibis.options.sql.default_limit = 100
```

Set this to `None` to retrieve all rows in all queries.

!!! warning "Be careful with `None`"

    Setting the default limit to `None` will result in *all* rows from a query
    coming back to the client from the backend.

```python
ibis.options.sql.default_limit = None
```

## Verbose option and Logging

To see all internal Ibis activity (like queries being executed) set
`ibis.options.verbose`:

```python
ibis.options.verbose = True
```

By default this information is sent to `sys.stdout`, but you can supply a
different logging function:

```python
def cowsay(msg):
print(f"Cow says: {msg}")


ibis.options.verbose_log = cowsay
```
175 changes: 175 additions & 0 deletions docs/user_guide/design.md
@@ -0,0 +1,175 @@
# Design

## Primary Goals

1. Type safety
1. Expressiveness
1. Composability
1. Familiarity

## Flow of Execution

1. User writes expression
1. Each method or function call builds a new expression
1. Expressions are type checked as you create them
1. Expressions have some optimizations that happen as the user builds them
1. Backend specific rewrites
1. Expressions are compiled
1. The SQL string generated by the compiler is sent to the database and
executed (this step is skipped for the pandas backend)
1. The database returns some data that is then turned into a pandas DataFrame
by ibis

## Expressions

The main user-facing component of ibis is expressions. The base class of all
expressions in ibis is the [ibis.expr.types.Expr][] class.

Expressions provide the user facing API, most of which is defined in
`ibis/expr/api.py`.

### Type System

Ibis's type system consists of a set of rules for specifying the types of
inputs to `ibis.expr.types.Node` subclasses. When an instance of a `Node`
subclass is constructed, ibis validates every input to the node based on the
rule that was used to declare that input.

Rules are defined in `ibis.expr.rules`.

<!-- prettier-ignore-start -->
### The [`Expr`][ibis.expr.types.Expr] class
<!-- prettier-ignore-end -->

Expressions are a thin but important abstraction over operations, containing
only type information and shape information, i.e., whether they are tables,
columns, or scalars.

<!-- prettier-ignore-start -->
Examples of expression types include
[`StringValue`][ibis.expr.types.StringValue] and
[`TableExpr`][ibis.expr.types.TableExpr].
<!-- prettier-ignore-end -->

<!-- prettier-ignore-start -->
### The `ibis.expr.types.Node` Class
<!-- prettier-ignore-end -->

`Node` subclasses make up the core set of operations of ibis. Each node
corresponds to a particular operation.

Most nodes are defined in the `ibis.expr.operations` module.

Examples of nodes include `ibis.expr.operations.Add` and
`ibis.expr.operations.Sum`.

Nodes (transitively) inherit from a class that allows node authors to define
their node's input arguments directly in the class body.

Additionally, the `output_dtype` and `output_shape` members of the class are
rules that define the element type and the shape (scalar or column) of the
operation.

An example of usage is a node that represents a logarithm operation:

```python

import ibis.expr.rules as rlz
from ibis.expr.operations import ValueOp

class Log(ValueOp):
    # A double scalar or column
    arg = rlz.double
    # Optional argument, defaults to None
    base = rlz.optional(rlz.double)
    # Output expression's datatype will correspond to arg's datatype
    output_dtype = rlz.dtype_like('arg')
    # Output expression will be scalar if arg is scalar, column otherwise
    output_shape = rlz.shape_like('arg')
```

This class describes an operation called `Log` that takes one required
argument: a double scalar or column, and one optional argument: a double
scalar or column named `base`. The `base` argument defaults to `None` so that
the expression behaves as the underlying database does.

Similar objects are instantiated when you use ibis APIs:

```python
import ibis
t = ibis.table([('a', 'float')], name='t')
log_1p = (1 + t.a).log() # an Add and a Log are instantiated here
```
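
The operation behind an expression can be inspected with the `op` method; as
a quick sketch, for the expression above this returns the corresponding node
objects:

```python
log_1p.op()      # the Log operation node
log_1p.op().arg  # the `arg` input to Log: the expression built by Add
```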

### Expressions vs Operations: Why are they different?

Separating expressions from their underlying operations makes it easy to
generically describe and validate the inputs to particular nodes. In the log
example, it doesn't matter what _operation_ (node) the double-valued arguments
come from; they only need to satisfy the requirement denoted by the rule.

Separation of the `ibis.expr.types.Node` and
`ibis.expr.types.Expr` classes also allows the API to be tied to the
physical type of the expression rather than the particular operation, making it
easy to define the API in terms of types rather than specific operations.

Furthermore, operations often have an output type that depends on the input
type. An example of this is the `greatest` function, which takes the maximum
of all of its arguments. Another example is `CASE` statements, whose `THEN`
expressions determine the output type of the expression.

This allows ibis to provide **only** the APIs that make sense for a particular
type, even when an operation yields a different output type depending on its
input. Concretely, this means that you cannot perform operations that don't
make sense, like computing the average of a string column.
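
A small sketch of what this means in practice, assuming a table `t` with a
string column `s` (created here only for illustration):

```python
import ibis

t = ibis.table([('s', 'string')], name='t')

t.s.length().mean()  # fine: length() yields an integer column, which has mean()
t.s.mean()           # AttributeError: string columns do not expose mean()
```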

## Compilation

The next major component of ibis is the compilers.

The first few versions of ibis generated SQL strings directly, but the
compiler infrastructure was later generalized to support compilation of
[SQLAlchemy](https://docs.sqlalchemy.org/en/latest/core/tutorial.html)-based
expressions.

The compiler works by translating each piece of an ibis expression into a SQL
string or a SQLAlchemy expression.

The main pieces of a `SELECT` statement are:

1. The set of column expressions (`select_set`)
1. `WHERE` clauses (`where`)
1. `GROUP BY` clauses (`group_by`)
1. `HAVING` clauses (`having`)
1. `LIMIT` clauses (`limit`)
1. `ORDER BY` clauses (`order_by`)
1. `DISTINCT` clauses (`distinct`)

Each of these pieces is translated into a SQL string and finally assembled by
an instance of the `ibis.sql.compiler.ExprTranslator` subclass specific to the
backend being compiled. For example, the
`ibis.impala.compiler.ImpalaExprTranslator` is one of the subclasses that
performs this translation.
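
As a rough illustration (the table here is made up, and the exact SQL text and
quoting vary by backend), compiling a simple aggregation looks something like
this:

```python
import ibis

t = ibis.table([('key', 'string'), ('value', 'double')], name='t')
expr = t.group_by('key').aggregate(t.value.sum().name('total'))

# Assuming the Impala backend is installed, this produces SQL along the
# lines of: SELECT `key`, sum(`value`) AS `total` FROM t GROUP BY `key`
sql = ibis.impala.compile(expr)
```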

!!! note "Ibis can target other systems besides SQL"

    While ibis was designed with an explicit goal of first-class SQL support,
    ibis can target other systems such as pandas.

## Execution

Presumably we want to _do_ something with our compiled expressions. This is
where execution comes in.

This is the least complex part of ibis, mostly requiring ibis to correctly
handle whatever the database hands back.

By and large, the execution of compiled SQL is handled by the database to which
SQL is sent from ibis.

However, once the data arrives from the database we need to convert that
data to a pandas DataFrame.

The `ibis.sql.client.Query` class, through its `_fetch` method, provides a way
for `ibis.sql.client.SQLClient` objects to do any additional processing
necessary after the database returns results to the client.

133 changes: 133 additions & 0 deletions docs/user_guide/self_joins.md
@@ -0,0 +1,133 @@
# Self Joins

If you’re a relational data guru, you may have wondered how it’s possible to
join tables with themselves, because join clauses involve column references
back to the original table.

Consider the SQL

```sql
SELECT t1.key, sum(t1.value - t2.value) AS metric
FROM my_table t1
JOIN my_table t2
ON t1.key = t2.subkey
GROUP BY 1
```

Here, we have an unambiguous way to refer to each of the tables through
aliasing.

Let’s consider the TPC-H database, and suppose we want to compute
year-over-year change in total order amounts by region using joins.

```python
>>> region = con.table('tpch_region')
>>> nation = con.table('tpch_nation')
>>> customer = con.table('tpch_customer')
>>> orders = con.table('tpch_orders')
>>> orders.limit(5)
o_orderkey o_custkey o_orderstatus o_totalprice o_orderdate \
0 1 36901 O 173665.47 1996-01-02
1 2 78002 O 46929.18 1996-12-01
2 3 123314 F 193846.25 1993-10-14
3 4 136777 O 32151.78 1995-10-11
4 5 44485 F 144659.20 1994-07-30

o_orderpriority o_clerk o_shippriority \
0 5-LOW Clerk#000000951 0
1 1-URGENT Clerk#000000880 0
2 5-LOW Clerk#000000955 0
3 5-LOW Clerk#000000124 0
4 5-LOW Clerk#000000925 0

o_comment
0 nstructions sleep furiously among
1 foxes. pending accounts at the pending, silen...
2 sly final accounts boost. carefully regular id...
3 sits. slyly regular warthogs cajole. regular, ...
4 quickly. bold deposits sleep slyly. packages u...
```

First, let’s join all the things and select the fields we care about:

```python
>>> fields_of_interest = [region.r_name.name('region'),
... nation.n_name.name('nation'),
... orders.o_totalprice.name('amount'),
... orders.o_orderdate.cast('timestamp').name('odate') # these are strings
... ]
>>> joined_all = (region.join(nation, region.r_regionkey == nation.n_regionkey)
... .join(customer, customer.c_nationkey == nation.n_nationkey)
... .join(orders, orders.o_custkey == customer.c_custkey)
... [fields_of_interest])
```

Okay, great, let’s have a look:

```python
>>> joined_all.limit(5)
region nation amount odate
0 AMERICA UNITED STATES 160843.35 1992-06-22
1 MIDDLE EAST IRAN 78307.91 1996-04-19
2 EUROPE FRANCE 103237.90 1994-10-12
3 EUROPE FRANCE 201463.59 1997-09-12
4 ASIA JAPAN 166098.86 1995-09-12
```

Sweet, now let’s aggregate by year and region:

```python
>>> year = joined_all.odate.year().name('year')
>>> total = joined_all.amount.sum().cast('float').name('total')
>>> annual_amounts = (joined_all
... .group_by(['region', year])
... .aggregate(total))
>>> annual_amounts.limit(5)
region year total
0 EUROPE 1994 6.979473e+09
1 EUROPE 1996 7.015421e+09
2 ASIA 1997 6.910663e+09
3 ASIA 1998 4.058824e+09
4 EUROPE 1992 6.926705e+09
```

Looking good so far. Now, we need to join this table on itself, by
subtracting 1 from one of the year columns.

We do this by creating a “joinable” view of the table that Ibis treats as a
distinct object. To create one, use the `view` method:

```python
>>> current = annual_amounts
>>> prior = annual_amounts.view()
>>> yoy_change = (current.total - prior.total).name('yoy_change')
>>> results = (current.join(prior, ((current.region == prior.region) &
... (current.year == (prior.year - 1))))
... [current.region, current.year, yoy_change])
>>> df = results.execute()
```

```python
>>> df['yoy_pretty'] = df.yoy_change.map(lambda x: '$%.2fmm' % (x / 1000000.))
```

If you’re being fastidious and want to consider the first year occurring
in the dataset for each region to have 0 for the prior year, you will
instead need to do an outer join and treat nulls in the prior side of
the join as zero:

```python
>>> yoy_change = (current.total - prior.total.zeroifnull()).name('yoy_change')
>>> results = (current.outer_join(prior, ((current.region == prior.region) &
... (current.year == (prior.year - 1))))
... [current.region, current.year, current.total,
... prior.total.zeroifnull().name('prior_total'),
... yoy_change])
>>> results.limit(5)
region year total prior_total yoy_change
0 ASIA 1998 4.058824e+09 0.000000e+00 4.058824e+09
1 AFRICA 1994 6.837587e+09 6.908429e+09 -7.084172e+07
2 AMERICA 1996 6.883057e+09 6.922465e+09 -3.940791e+07
3 AFRICA 1996 6.878112e+09 6.848983e+09 2.912979e+07
4 AFRICA 1992 6.873319e+09 6.859733e+09 1.358699e+07
```
88 changes: 88 additions & 0 deletions docs/user_guide/topk.md
@@ -0,0 +1,88 @@
# Top-K Filtering

A common analytical pattern involves subsetting based on some method of
ranking. For example, “the 5 most frequently occurring widgets in a dataset”.
By choosing the right metric, you can obtain the most important or least
important items from some dimension, for some definition of important.

Carrying out this pattern by hand involves the following steps:

- Choose a ranking metric
- Aggregate, computing the ranking metric, by the target dimension
- Order by the ranking metric and take the highest K values
- Use those values as a set filter (either with `semi_join` or
`isin`) in your next query

For example, let’s look at the TPC-H tables and find the 5 customers who
placed the most orders over their lifetime:

```python
>>> orders = con.table('tpch_orders')
>>> top_orders = (orders
... .group_by('o_custkey')
... .size()
... .sort_by(('count', False))
... .limit(5))
>>> top_orders
o_custkey count
0 3451 41
1 102022 41
2 102004 41
3 79300 40
4 117082 40
```

Now, we could use these customer keys as a filter in some other analysis:

```python
>>> # Among the top 5 most frequent customers, what's the histogram of their order statuses?
>>> analysis = (orders[orders.o_custkey.isin(top_orders.o_custkey)]
... .group_by('o_orderstatus')
... .size())
>>> analysis
o_orderstatus count
0 P 5
1 F 85
2 O 113
```
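
The `semi_join` route mentioned above expresses the same filter; a rough
sketch of the equivalent query:

```python
analysis = (
    orders.semi_join(top_orders, orders.o_custkey == top_orders.o_custkey)
    .group_by('o_orderstatus')
    .size()
)
```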

This is such a common pattern that Ibis supports a high-level primitive `topk`
operation, which can be used immediately as a filter:

```python
>>> top_orders = orders.o_custkey.topk(5)
>>> orders[top_orders].group_by('o_orderstatus').size()
o_orderstatus count
0 P 5
1 F 85
2 O 113
```

We can take this a little further. Suppose now we want to rank customers by their
total spending instead of the number of orders, perhaps a more meaningful
metric:

```python
>>> total_spend = orders.o_totalprice.sum().name('total')
>>> top_spenders = (orders
... .group_by('o_custkey')
... .aggregate(total_spend)
... .sort_by(('total', False))
... .limit(5))
>>> top_spenders
o_custkey total
0 143500 7012696.48
1 95257 6563511.23
2 87115 6457526.26
3 131113 6311428.86
4 103834 6306524.23
```

The `by` argument to `topk` expresses the same filter in one step:

```python
>>> top_spenders = orders.o_custkey.topk(5, by=total_spend)
>>> orders[top_spenders].group_by('o_orderstatus').size()
o_orderstatus count
0 P 1
1 F 78
2 O 98
```
20 changes: 20 additions & 0 deletions docs/user_guide/udfs.md
@@ -0,0 +1,20 @@
# User Defined Functions

!!! experimental "UD(A)Fs are unstable"

    The user-defined elementwise and aggregate function APIs are provisional
    and subject to change.

Ibis has mechanisms for writing custom scalar and aggregate functions, with
varying levels of support depending on the backend.

User-defined functions are a complex and interesting topic. Please get involved
if you're interested in working on them!

The following backends provide some level of support for user-defined functions:

- [Google BigQuery](https://github.com/ibis-project/ibis-bigquery)
- [Pandas](../backends/Pandas.md)
- [PostgreSQL](../backends/PostgreSQL.md)
- [Datafusion](../backends/Datafusion.md)
- [Impala](../backends/Impala.md)
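
As a rough sketch of what this can look like on the pandas backend (the exact
decorators and type signatures vary by backend and version, and `add_one` is
only an illustration), an elementwise UDF can be defined and then used like
any other ibis function:

```python
import ibis.expr.datatypes as dt
from ibis.udf.vectorized import elementwise


@elementwise(input_type=[dt.double], output_type=dt.double)
def add_one(series):
    # The pandas backend passes a pandas Series to the function.
    return series + 1.0


# e.g. t.mutate(b=add_one(t.a)) for a table `t` with a double column `a`
```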
23 changes: 0 additions & 23 deletions docs/web/about/index.md

This file was deleted.

22 changes: 0 additions & 22 deletions docs/web/about/team.md

This file was deleted.

415 changes: 0 additions & 415 deletions docs/web/contribute.md

This file was deleted.

30 changes: 0 additions & 30 deletions docs/web/getting_started.md

This file was deleted.

74 changes: 0 additions & 74 deletions docs/web/index.md

This file was deleted.

Binary file removed docs/web/static/img/favicon.ico
Binary file not shown.
Binary file removed docs/web/static/img/ibis_sky.png
Binary file not shown.
128 changes: 0 additions & 128 deletions docs/web/static/img/logo_ibis.svg

This file was deleted.

65 changes: 65 additions & 0 deletions gen_matrix.py
@@ -0,0 +1,65 @@
from pathlib import Path

import pandas as pd
import tomli

import ibis
import ibis.expr.operations as ops


def get_backends():
    # Read the backend entry points declared in pyproject.toml.
    pyproject = tomli.loads(Path("pyproject.toml").read_text())
    backends = pyproject["tool"]["poetry"]["plugins"]["ibis.backends"]
    del backends["spark"]
    return [
        (backend, getattr(ibis, backend))
        for backend in sorted(backends.keys())
    ]


def get_leaf_classes(op):
    # Yield operation classes that have no subclasses of their own.
    for child_class in op.__subclasses__():
        if not child_class.__subclasses__():
            yield child_class
        else:
            yield from get_leaf_classes(child_class)


ICONS = {
    True: ":material-check-decagram:{ .verified }",
    False: ":material-cancel:{ .cancel }",
}


def main():
    # Build the backend-by-operation support table and write it out as CSV.
    possible_ops = frozenset(get_leaf_classes(ops.ValueOp))

    support = {
        "operation": [f"`{op.__name__}`" for op in possible_ops],
    }
    support.update(
        (name, list(map(backend.has_operation, possible_ops)))
        for name, backend in get_backends()
    )

    df = pd.DataFrame(support).set_index("operation").sort_index()

    counts = df.sum().sort_values(ascending=False)
    num_ops = len(possible_ops)
    coverage = (
        counts.map(lambda n: f"_{n} ({round(100 * n / num_ops)}%)_")
        .to_frame(name="**API Coverage**")
        .T
    )

    ops_table = df.loc[:, counts.index].replace(ICONS)
    table = pd.concat([coverage, ops_table])
    dst = Path(__file__).parent.joinpath(
        "docs",
        "backends",
        "support_matrix.csv",
    )
    table.to_csv(dst, index_label="Backends")


main()
40 changes: 3 additions & 37 deletions ibis/__init__.py
@@ -15,46 +15,13 @@
try:
    import importlib.metadata as importlib_metadata
except ImportError:
    # TODO: remove this when Python 3.7 support is dropped
    # TODO: remove this when Python 3.9 support is dropped
    import importlib_metadata

__all__ = ['api', 'ir', 'util', 'IbisError', 'options']
__all__ += api.__all__

ibis.config.register_option(
    'interactive', False, validator=ibis.config.is_bool
)
ibis.config.register_option('verbose', False, validator=ibis.config.is_bool)
ibis.config.register_option('verbose_log', None)
ibis.config.register_option(
    'graphviz_repr',
    True,
    """\
Whether to render expressions as GraphViz PNGs when repr-ing in a Jupyter
notebook.
""",
    validator=ibis.config.is_bool,
)
ibis.config.register_option('default_backend', None)
with ibis.config.config_prefix('context_adjustment'):
    ibis.config.register_option(
        'time_col',
        'time',
        'Name of the timestamp col for execution with a timecontext'
        'See ibis.expr.timecontext for details.',
        validator=ibis.config.is_str,
    )
with ibis.config.config_prefix('sql'):
    ibis.config.register_option(
        'default_limit',
        10_000,
        'Number of rows to be retrieved for an unlimited table expression',
    )

try:
    __version__ = importlib_metadata.version(__name__)
except Exception:
    __version__ = importlib_metadata.version("ibis-framework")
__version__ = "3.0.0"


def __getattr__(name: str) -> BaseBackend:
@@ -99,8 +66,7 @@ def __getattr__(name: str) -> BaseBackend:

    # The first time a backend is loaded, we register its options, and we set
    # it as an attribute of `ibis`, so `__getattr__` is not called again for it
    with ibis.config.config_prefix(name):
        backend.register_options()
    backend.register_options()

    setattr(ibis, name, backend)
    return backend