| @@ -0,0 +1,358 @@ | ||
| --- | ||
| title: "Better PyPI stats with Python" | ||
| author: "Cody Peterson" | ||
| date: "2024-09-03" | ||
| image: thumbnail.png | ||
| categories: | ||
| - clickhouse | ||
| - shiny | ||
| --- | ||
|
|
||
| ***Ibis + ClickHouse + Shiny for Python = better PyPI stats.*** | ||
|
|
||
| ## Overview | ||
|
|
||
| [PyPI Stats](https://pypistats.org/about) is a great resource for Python package | ||
| download statistics from PyPI. However, it only contains 180 days of data and | ||
| lacks the more detailed analysis we might be interested in. In this post, we'll | ||
| build a dynamic Python application for better PyPI stats using | ||
| [ClickHouse](https://github.com/clickhouse/clickhouse) as our data platform, | ||
| [Ibis](https://github.com/ibis-project/ibis) as our Python data interface, and | ||
| [Shiny for Python](https://github.com/posit-dev/py-shiny) as our dashboarding | ||
| tool. | ||
|
|
||
| ::: {.callout-note title="What about ClickPy?"} | ||
| [ClickPy](https://github.com/ClickHouse/clickpy) is an existing open source and | ||
| reproducible project built on the same data with ClickHouse. The primary | ||
| difference is that ClickPy uses SQL and JavaScript whereas this project is in | ||
| Python. We also focus on different visualizations and metrics. | ||
| ::: | ||
|
|
||
| ## Prerequisites | ||
|
|
||
| Install the required dependencies: | ||
|
|
||
| ```bash | ||
| pip install 'ibis-framework[clickhouse]' plotly | ||
| ``` | ||
|
|
||
| Then run imports and setup: | ||
|
|
||
| ```{python} | ||
| import ibis | ||
| import plotly.express as px | ||
| import clickhouse_connect | ||
| px.defaults.template = "plotly_dark" | ||
| ibis.options.interactive = True | ||
| ``` | ||
|
|
||
| ## Connecting to ClickHouse | ||
|
|
||
| You can connect to the public ClickHouse playground's PyPI database: | ||
|
|
||
| ```{python} | ||
| host = "clickpy-clickhouse.clickhouse.com" | ||
| port = 443 | ||
| user = "play" | ||
| database = "pypi" | ||
| con = ibis.clickhouse.connect( | ||
| host=host, | ||
| port=port, | ||
| user=user, | ||
| database=database, | ||
| ) | ||
| con.list_tables() | ||
| ``` | ||
|
|
||
| ## Top packages by downloads | ||
|
|
||
| Let's start by looking at the most downloaded packages: | ||
|
|
||
| ```{python} | ||
| overall_t = con.table("pypi_downloads") | ||
| top_k = 10_000 | ||
| overall_t = ( | ||
| overall_t.order_by(ibis.desc("count")) | ||
| .limit(top_k) | ||
| .mutate(rank=1 + ibis.row_number().over(order_by=ibis.desc("count"))) | ||
| .rename({"downloads": "count"}) | ||
| .relocate("rank") | ||
| .order_by("rank") | ||
| ) | ||
| overall_t | ||
| ``` | ||
|
|
||
| ## Analyzing downloads for a package | ||
|
|
||
| Let's choose a package to analyze: | ||
|
|
||
| ```{python} | ||
| project = "clickhouse-connect" | ||
| ``` | ||
|
|
||
| And see where it ranks in the top downloads: | ||
|
|
||
| ```{python} | ||
| overall_t.filter(overall_t["project"] == project) | ||
| ``` | ||
|
|
||
| Let's look at downloads per day by various categories for this package: | ||
|
|
||
| ```{python} | ||
| downloads_t = con.table( | ||
| "pypi_downloads_per_day_by_version_by_installer_by_type_by_country" | ||
| ).filter(ibis._["project"] == project) | ||
| downloads_t | ||
| ``` | ||
|
|
||
| We might be interested in the day-of-week seasonality of downloads: | ||
|
|
||
| ```{python} | ||
| def day_of_week_bar(t): | ||
| t = t.mutate(day_of_week=t["date"].day_of_week.full_name()) | ||
| t = t.group_by("day_of_week").agg(downloads=ibis._["count"].sum()) | ||
| c = px.bar( | ||
| t, | ||
| x="day_of_week", | ||
| y="downloads", | ||
| category_orders={ | ||
| "day_of_week": [ | ||
| "Sunday", | ||
| "Monday", | ||
| "Tuesday", | ||
| "Wednesday", | ||
| "Thursday", | ||
| "Friday", | ||
| "Saturday", | ||
| ] | ||
| }, | ||
| ) | ||
| return c | ||
| day_of_week_bar(downloads_t) | ||
| ``` | ||
|
|
||
| Or the rolling 28-day downloads metric: | ||
|
|
||
| ```{python} | ||
| def rolling_downloads(t, days=28): | ||
| t = t.mutate( | ||
| timestamp=t["date"].cast("timestamp"), | ||
| ) | ||
| t = t.group_by("timestamp").agg(downloads=ibis._["count"].sum()) | ||
| t = t.select( | ||
| "timestamp", | ||
| rolling_downloads=ibis._["downloads"] | ||
| .sum() | ||
| .over( | ||
| ibis.window( | ||
| order_by="timestamp", | ||
| preceding=days, | ||
| following=0, | ||
| ) | ||
| ), | ||
| ).order_by("timestamp") | ||
| c = px.line( | ||
| t, | ||
| x="timestamp", | ||
| y="rolling_downloads", | ||
| ) | ||
| return c | ||
| rolling_downloads(downloads_t) | ||
| ``` | ||
|
|
||
| Or rolling 28-day downloads by version, with a few options for how to group | ||
| versions: | ||
|
|
||
| ```{python} | ||
| def rolling_downloads_by_version(t, days=28, version_style="major.minor"): | ||
| t = t.mutate( | ||
| timestamp=t["date"].cast("timestamp"), | ||
| ) | ||
| match version_style: | ||
| case "major": | ||
| t = t.mutate(version=t["version"].split(".")[0]) | ||
| case "major.minor": | ||
| t = t.mutate( | ||
| version=t["version"].split(".")[0] + "." + t["version"].split(".")[1] | ||
| ) | ||
| case _: | ||
| pass | ||
| t = t.group_by("timestamp", "version").agg(downloads=ibis._["count"].sum()) | ||
| t = t.select( | ||
| "timestamp", | ||
| "version", | ||
| rolling_downloads=ibis._["downloads"] | ||
| .sum() | ||
| .over( | ||
| ibis.window( | ||
| order_by="timestamp", | ||
| group_by="version", | ||
| preceding=days, | ||
| following=0, | ||
| ) | ||
| ), | ||
| ).order_by("timestamp") | ||
| c = px.line( | ||
| t, | ||
| x="timestamp", | ||
| y="rolling_downloads", | ||
| color="version", | ||
| category_orders={ | ||
| "version": reversed( | ||
| sorted( | ||
| t.distinct(on="version")["version"].to_pyarrow().to_pylist(), | ||
| key=lambda x: tuple(int(y) for y in x.split(".") if y.isdigit()), | ||
| ) | ||
| ) | ||
| }, | ||
| ) | ||
| return c | ||
| rolling_downloads_by_version(downloads_t) | ||
| ``` | ||
|
|
||
| Or a bar chart of downloads grouped by a category: | ||
|
|
||
| ```{python} | ||
| def group_bar(t, group_by="installer", log_y=True): | ||
| t = t.mutate(timestamp=t["date"].cast("timestamp")) | ||
| t = t.group_by(group_by).agg(downloads=ibis._["count"].sum()) | ||
| t = t.order_by(ibis.desc("downloads")) | ||
| c = px.bar( | ||
| t, | ||
| x=group_by, | ||
| y="downloads", | ||
| log_y=log_y, | ||
| ) | ||
| return c | ||
| group_bar(downloads_t) | ||
| ``` | ||
|
|
||
| ::: {.callout-tip title="More examples" collapse="true"} | ||
|
|
||
| Since we're just writing Python, we've already organized code into functions for | ||
| reuse. We can rerun our above analytics on a different package by changing the | ||
| `project` variable and adjusting our table accordingly. We'll demonstrate this | ||
| with a few more packages below. | ||
|
|
||
| Notice you could also pass in Ibis tables from different backends, not just | ||
| ClickHouse, to these functions! | ||
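For instance, here is a minimal sketch that passes a small in-memory table (executed by Ibis's default DuckDB backend) to the same helper -- the `date` and `count` column names are the only assumption the plotting functions make:

```{python}
import datetime

# hypothetical toy table, just to show the helpers are backend-agnostic
local_t = ibis.memtable(
    {
        "date": [datetime.date(2024, 1, 1), datetime.date(2024, 1, 2)],
        "count": [100, 250],
    }
)
day_of_week_bar(local_t)
```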
|
|
||
| ::: {.panel-tabset} | ||
|
|
||
| ## PyArrow | ||
|
|
||
| ```{python} | ||
| package = "pyarrow" | ||
| t = con.table( | ||
| "pypi_downloads_per_day_by_version_by_installer_by_type_by_country" | ||
| ).filter(ibis._["project"] == package) | ||
| ``` | ||
|
|
||
| ```{python} | ||
| day_of_week_bar(t) | ||
| ``` | ||
|
|
||
| ```{python} | ||
| rolling_downloads(t) | ||
| ``` | ||
|
|
||
| ```{python} | ||
| rolling_downloads_by_version(t, version_style="major") | ||
| ``` | ||
|
|
||
| ```{python} | ||
| group_bar(t, group_by="installer") | ||
| ``` | ||
|
|
||
| ## chDB | ||
|
|
||
| ```{python} | ||
| package = "chdb" | ||
| t = con.table( | ||
| "pypi_downloads_per_day_by_version_by_installer_by_type_by_country" | ||
| ).filter(ibis._["project"] == package) | ||
| ``` | ||
|
|
||
| ```{python} | ||
| day_of_week_bar(t) | ||
| ``` | ||
|
|
||
| ```{python} | ||
| rolling_downloads(t) | ||
| ``` | ||
|
|
||
| ```{python} | ||
| rolling_downloads_by_version(t, version_style="major.minor") | ||
| ``` | ||
|
|
||
| ```{python} | ||
| group_bar(t, group_by="installer") | ||
| ``` | ||
|
|
||
| ## Ibis | ||
|
|
||
| ```{python} | ||
| package = "ibis-framework" | ||
| t = con.table( | ||
| "pypi_downloads_per_day_by_version_by_installer_by_type_by_country" | ||
| ).filter(ibis._["project"] == package) | ||
| ``` | ||
|
|
||
| ```{python} | ||
| day_of_week_bar(t) | ||
| ``` | ||
|
|
||
| ```{python} | ||
| rolling_downloads(t) | ||
| ``` | ||
|
|
||
| ```{python} | ||
| rolling_downloads_by_version(t, version_style="major") | ||
| ``` | ||
|
|
||
| ```{python} | ||
| group_bar(t, group_by="installer") | ||
| ``` | ||
|
|
||
| ::: | ||
|
|
||
| ::: | ||
|
|
||
| ## Shiny for Python application | ||
|
|
||
| We can create an interactive Shiny for Python application using the code above | ||
| to serve as a dashboard for better PyPI stats: | ||
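Below is a minimal sketch of what such an app could look like (not the repository's actual code). It assumes the `con` connection and the `rolling_downloads` helper defined above, and uses the `shinywidgets` package to render the Plotly figure:

```python
from shiny import App, ui
from shinywidgets import output_widget, render_widget

app_ui = ui.page_fluid(
    ui.input_text("project", "PyPI project", value="clickhouse-connect"),
    output_widget("rolling"),  # placeholder for the Plotly figure
)

def server(input, output, session):
    @render_widget
    def rolling():
        # filter the downloads table to the selected project and reuse the
        # plotting helper defined earlier in this post
        t = con.table(
            "pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
        ).filter(ibis._["project"] == input.project())
        return rolling_downloads(t)

app = App(app_ui, server)
```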
|
|
||
| ::: {.callout-tip} | ||
| See [the GitHub repository](https://github.com/ibis-project/better-pypi-stats) | ||
| for the most up-to-date code. | ||
| ::: | ||
|
|
||
| {{< video https://youtu.be/jkdWaL8CbK4 >}} | ||
|
|
||
| ## Reproducing and contributing | ||
|
|
||
| The code is [available on | ||
| GitHub](https://github.com/ibis-project/better-pypi-stats). Feel free to open an | ||
| issue or pull request if you have any suggested improvements. |
| @@ -0,0 +1,104 @@ | ||
| --- | ||
| title: Farewell pandas, and thanks for all the fish. | ||
| author: Gil Forsyth | ||
| date: 2024-08-26 | ||
| categories: | ||
| - blog | ||
| - pandas | ||
| - community | ||
| --- | ||
|
|
||
| **TL; DR**: we are deprecating the `pandas` and `dask` backends and will be | ||
| removing them in version 10.0. | ||
|
|
||
| There is no feature gap between the `pandas` backend and our default DuckDB | ||
| backend, and DuckDB is _much_ more performant. `pandas` DataFrames will still | ||
| be available as a _format_ for getting data to and from Ibis; we just won't | ||
| support using `pandas` to execute queries. | ||
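As a quick sketch of what that boundary looks like (illustrative, not prescriptive), pandas stays at the edges while another backend does the work:

```python
import ibis
import pandas as pd

df_in = pd.DataFrame({"x": [1, 2, 3]})
t = ibis.memtable(df_in)                  # pandas in
df_out = t.mutate(y=t.x * 2).to_pandas()  # pandas out, executed by the default DuckDB backend
```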
|
|
||
| Most of the rationale below applies to the Dask backend since it has so much in | ||
| common with pandas. Dask is a great project and people should continue to use | ||
| it outside the Ibis context. | ||
|
|
||
| ## Why `pandas`? And a bit of Ibis history | ||
|
|
||
| Way back in the early days of Ibis, there was only one backend: Impala. Not | ||
| everyone used Impala (mindblowing, we know), and so it wasn't too long until the | ||
| Postgres backend was added (by the inimitable Phillip Cloud). | ||
|
|
||
| These two backends were both featureful, but there was a big problem with adoption: | ||
| Want to try out Ibis? You need to install Impala or Postgres first. | ||
|
|
||
| Not an insurmountable problem, but a LOT more work than "just `pip install | ||
| <newthing>`" -- which prompted the question, how can a prospective Ibis user | ||
| take the API for a spin without requiring a DBA or extra infrastructure beyond a | ||
| laptop? | ||
|
|
||
| The obvious answer (at the time) was to use the only in-memory DataFrame engine | ||
| around and wire up a `pandas` backend. | ||
|
|
||
| ## The agony and the agony | ||
|
|
||
| `pandas` was the best option at the time, and it allowed new users to try out | ||
| Ibis. But, it never fit well into the model of data analysis that Ibis strives | ||
| for. The `pandas` backend has more specialized code than any other backend, | ||
| because it is so fundamentally different from all the other systems Ibis works | ||
| with. | ||
|
|
||
| ### Deferred vs Eager | ||
|
|
||
| `pandas` is inherently an eager engine -- every time you hit Enter you are | ||
| computing an intermediate result. Ibis uses a deferred execution model, similar | ||
| to what nearly all SQL backends use, that enables query planning and | ||
| optimization passes. | ||
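A small sketch of the difference (any table and backend would do here):

```python
import ibis

t = ibis.table({"a": "int64", "b": "string"}, name="t")
expr = t.filter(t.a > 0).group_by("b").agg(total=t.a.sum())
# Nothing has executed yet: `expr` is an expression tree, so a backend can plan
# and compile the whole query at once...
print(ibis.to_sql(expr, dialect="duckdb"))
# ...whereas pandas would have materialized an intermediate result at each step.
```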
|
|
||
| Trying to make a `pandas` interface that behaves in a deferred way is hard. | ||
|
|
||
| One of the unfortunate effects of this mismatch is that, unlike our other | ||
| backends, the `pandas` backend is often _much_ slower than just using `pandas` | ||
| directly. | ||
|
|
||
| And to provide this suboptimal experience, we have a few thousand lines of code | ||
| that are only used in the `pandas` backend. | ||
|
|
||
| ### `NaN` vs `NULL` | ||
|
|
||
| The choice was made a long time ago to accept using `NaN` as the marker for | ||
| missing values in `pandas`. This is because NumPy has a notion of `NaN`, but a | ||
| Python `None` would lead to an `object`-dtype and poor performance. | ||
|
|
||
| Practicality beats purity, but this is a horrible decision to have to make. | ||
| Ibis _doesn't_ have to make it with any other backend, because NULL indicates a | ||
| missing value, and NaN is Not a Number. | ||
|
|
||
| Those are fundamentally different ideas and it is an ongoing headache for Ibis | ||
| to try to pretend that they aren't. | ||
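A tiny illustration of the mismatch (nothing Ibis-specific):

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, None, np.nan])
print(s.isna())  # [False, True, True] -- None and NaN are both treated as "missing"
# In a SQL engine these are distinct: NULL is an unknown value, while NaN is a
# perfectly valid floating-point number you can store and compare.
```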
|
|
||
| ### Data types | ||
|
|
||
| The new Arrow-backed types in `pandas` are a great improvement and we'll leave | ||
| it at that. | ||
|
|
||
| ## Misleading new users | ||
|
|
||
| People reach for what is familiar. When you try Ibis for the first time, we're | ||
| asking you to both a) try Ibis and b) pick a backend. We have defaults to try to | ||
| help with this, but it can be confusing at first. | ||
|
|
||
| We have many reports from new users that "Ibis is slow". What this almost | ||
| always means is that they tried the `pandas` backend (because they know | ||
| `pandas`) and they are having a less-than-great time. | ||
|
|
||
| If they tried DuckDB or Polars instead, they would have a much easier time | ||
| getting things going. | ||
|
|
||
| ## Feature parity | ||
|
|
||
| This is one of the strongest reasons to drop the `pandas` backend -- it is redundant. The | ||
| DuckDB backend can seamlessly query pandas DataFrames, supports several flavors | ||
| of UDF, and can read and write parquet, CSV, JSON, and other formats. | ||
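A rough sketch of those pieces together (illustrative only):

```python
import ibis
import pandas as pd

con = ibis.duckdb.connect()                # local, in-memory DuckDB
df = pd.DataFrame({"x": [1.0, 2.0, 3.0]})
t = con.create_table("t", df)              # query a pandas DataFrame directly

@ibis.udf.scalar.python
def double(x: float) -> float:             # one of the supported UDF flavors
    return x * 2

con.to_parquet(t.mutate(y=double(t.x)), "result.parquet")  # write Parquet
```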
|
|
||
| There is a reason DuckDB is our default backend: it's easy to install, it runs | ||
| locally, it's blazing fast, and it interacts well with the Python ecosystem. | ||
| Those are all the reasons we added `pandas` as a backend in the first place, but | ||
| with the added benefit of blazing-fast results and no type-system headaches. |
|
|
@@ -2,6 +2,7 @@ | |
| title: "Ibis: an overview" | ||
| author: | ||
| - Cody Peterson | ||
| date: "2024-07-24" | ||
| execute: | ||
| echo: true | ||
| format: | ||
|
|
||
| @@ -0,0 +1,12 @@ | ||
| /*-- scss:rules --*/ | ||
| .reveal div.sourceCode { | ||
| font-size: 2.4rem !important; | ||
| } | ||
|
|
||
| .cell-output-display { | ||
| font-size: 2.2rem !important; | ||
| display: block; | ||
| margin-left: 30%; | ||
| margin-right: 25%; | ||
| margin-top: 2.5%; | ||
| } |
| @@ -0,0 +1,312 @@ | ||
| --- | ||
| title: "Testing 20 databases on every commit" | ||
| author: Phillip Cloud | ||
| date: "2024-08-14" | ||
| execute: | ||
| echo: true | ||
| format: | ||
| revealjs: | ||
| theme: [default, custom.scss] | ||
| footer: <https://ibis-project.org/presentations/positconf2024/talk> | ||
| --- | ||
|
|
||
| # Let's all stand! | ||
|
|
||
| ## Sit if you work with… | ||
|
|
||
| ::: {.incremental} | ||
| - 0 DBs ✅ | ||
| - 1 DB 😇 | ||
| - 2 DBs 😬 | ||
| - 3+ DBs 😱 | ||
| ::: | ||
|
|
||
| ::: {.fragment} | ||
| ::: {.r-fit-text} | ||
| _I feel your pain._ | ||
| ::: | ||
| ::: | ||
|
|
||
| ## Who? | ||
|
|
||
| :::: {.columns} | ||
|
|
||
| ::: {.column width="50%"} | ||
| ### Me | ||
|
|
||
| - Phillip Cloud | ||
| - Ibis project | ||
| - Voltron Data | ||
| - Data tools for 10+ years | ||
| ::: | ||
|
|
||
| ::: {.column width="50%"} | ||
| ### Where | ||
|
|
||
| - {{< fa brands github >}} [`@cpcloud`](https://github.com/cpcloud) | ||
| - {{< fa brands youtube >}} [Phillip in the Cloud](https://www.youtube.com/@cpcloud) | ||
| - {{< fa brands twitter >}} [`@cpcloudy`](https://x.com/cpcloudy) | ||
| ::: | ||
|
|
||
| :::: | ||
|
|
||
| # Ever needed to test a complex system? | ||
|
|
||
| ## Maybe this is you | ||
|
|
||
| {fig-align="center"} | ||
|
|
||
| ## Or this | ||
|
|
||
| {fig-align="center"} | ||
|
|
||
| ## Or maybe even this | ||
|
|
||
| {fig-align="center"} | ||
|
|
||
| # A complex system: Ibis | ||
|
|
||
| {fig-align="center" width="50%" height="50%"} | ||
|
|
||
| ## What's Ibis? | ||
|
|
||
| - Python library | ||
| - Exploratory data analysis | ||
| - Data engineering | ||
| - ML preprocessing | ||
|
|
||
| ::: {.fragment} | ||
| ::: {.r-fit-text} | ||
| _dbplyr, but Python_ | ||
| ::: | ||
| ::: | ||
|
|
||
| ## One API, 20+ backends {.smaller .scrollable} | ||
|
|
||
| ```{python} | ||
| #| code-fold: true | ||
| #| echo: false | ||
| import ibis | ||
| ibis.options.interactive = True | ||
| t = ibis.examples.penguins.fetch() | ||
| t.to_parquet("penguins.parquet") | ||
| ``` | ||
|
|
||
| ::: {.panel-tabset} | ||
|
|
||
| ## DuckDB | ||
|
|
||
| ```{python} | ||
| con = ibis.connect("duckdb://") | ||
| t = con.read_parquet("penguins.parquet") | ||
| t.group_by("species", "island").agg(count=t.count()).order_by("count") | ||
| ``` | ||
|
|
||
| ## Polars | ||
|
|
||
| ```{python} | ||
| #| code-line-numbers: "1,1" | ||
| con = ibis.connect("polars://") | ||
| t = con.read_parquet("penguins.parquet") | ||
| t.group_by("species", "island").agg(count=t.count()).order_by("count") | ||
| ``` | ||
|
|
||
| ## DataFusion | ||
|
|
||
| ```{python} | ||
| #| code-line-numbers: "1,1" | ||
| con = ibis.connect("datafusion://") | ||
| t = con.read_parquet("penguins.parquet") | ||
| t.group_by("species", "island").agg(count=t.count()).order_by("count") | ||
| ``` | ||
|
|
||
| ## PySpark | ||
|
|
||
| ```{python} | ||
| #| code-line-numbers: "1,1" | ||
| con = ibis.connect("pyspark://") | ||
| t = con.read_parquet("penguins.parquet") | ||
| t.group_by("species", "island").agg(count=t.count()).order_by("count") | ||
| ``` | ||
|
|
||
| ## 16+ other DBs | ||
|
|
||
| {fig-align="center" width="100%" height="100%"} | ||
|
|
||
| ::: | ||
|
|
||
| # Why is this hard to test? | ||
|
|
||
| ## By the numbers {.smaller} | ||
|
|
||
| :::: {.columns} | ||
| ::: {.column width="50%"} | ||
| ### Backends | ||
| - **17** SQL | ||
| - **3** non-SQL | ||
| - **2** cloud | ||
| ::: | ||
|
|
||
| ::: {.column width="50%"} | ||
| ### Engines + APIs | ||
| - **9** distributed SQL | ||
| - **3** dataframe | ||
| - oldest: **~45** years 👀 | ||
| - youngest: **~2** years | ||
| ::: | ||
| :::: | ||
|
|
||
| ### Other facts | ||
|
|
||
| - Latency is variable | ||
| - Deployment models vary | ||
|
|
||
| ::: {.fragment} | ||
| ::: {.r-fit-text} | ||
| _… **Feature development**_❓ | ||
| ::: | ||
| ::: | ||
|
|
||
| ## Bit of a pickle | ||
|
|
||
|  | ||
|
|
||
| # How | ||
|
|
||
| ## High level | ||
|
|
||
| ### Goal: fast iteration | ||
|
|
||
| - fast env setup (dependency management) | ||
| - fast(ish) tests (test-running library) | ||
| - high **job** concurrency (ci/provider) | ||
| - **easy to run**: dev speed ([`just`](https://github.com/casey/just)) | ||
|
|
||
| ::: {.fragment} | ||
| ::: {.r-fit-text} | ||
| _CI must complete "quickly"_ | ||
| ::: | ||
| ::: | ||
|
|
||
| ## Tools: overview | ||
|
|
||
| - 📦 **deps**: _poetry_ | ||
| - 🖥️ **ci**: _GitHub Actions_ | ||
| - 🦁 **"big" backends**: _docker_ | ||
| - 🐱 **"small" backends**: _no special tx (duckdb, polars)_ | ||
| - 🏃 **tasks**: [`just`](https://github.com/casey/just) (e.g.: `just up postgres`) | ||
|
|
||
| ## Tools: poetry | ||
|
|
||
| - **Env setup must be _fast_**: no constraint solving | ||
| - Poetry is one way; there are others | ||
| - Get yourself a lockfile | ||
| - Downsides? | ||
|
|
||
| ::: {.fragment} | ||
| ::: {.r-fit-text} | ||
| … _Are you doing that **now**_❓ | ||
| ::: | ||
| ::: | ||
|
|
||
| ## Tools: docker | ||
|
|
||
| - Do you use it locally? | ||
| - Use health checks; "dumb" ones are fine | ||
| - Make it easy for devs to use | ||
|
|
||
| {fig-align="center"} | ||
|
|
||
| ## Tools: GitHub Actions {.smaller} | ||
|
|
||
| Pay for [the Teams plan](https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration#usage-limits) to get more concurrency | ||
| - Automate dependency updates | ||
|
|
||
| ::: {.columns} | ||
| ::: {.column width="50%"} | ||
| ### GHA limits | ||
|
|
||
|  | ||
| ::: | ||
|
|
||
| ::: {.column width="50%"} | ||
| ### Ibis CI cost | ||
|
|
||
|  | ||
| ::: | ||
| ::: | ||
|
|
||
| # How does this stack up? | ||
|
|
||
| ## Terminology {auto-animate=true} | ||
|
|
||
| ::: {.fragment} | ||
| Job | ||
| : a set of commands | ||
|
|
||
| ```yaml | ||
| my_job: | ||
| - run: pip install ibis-framework | ||
| - run: just ci-check -m ${{ matrix.backend.name }} | ||
| - run: coverage upload | ||
| ``` | ||
| ::: | ||
| ::: {.fragment} | ||
| Workflow | ||
| : A collection of jobs, one `.yml` file | ||
|
|
||
| ```yaml | ||
| name: Backends | ||
| my_job: | ||
| - run: ... | ||
| my_other_job: | ||
| - run: ... | ||
| ``` | ||
| ::: | ||
|
|
||
| ## Job metrics {auto-animate=true} | ||
|
|
||
| {fig-align="center"} | ||
|
|
||
| ::: {.fragment} | ||
| ::: {.r-fit-text} | ||
| _We've added 3 or 4 new backends since the switch_ | ||
| ::: | ||
| ::: | ||
|
|
||
| ## Workflow metrics {auto-animate=true} | ||
|
|
||
| {fig-align="center"} | ||
|
|
||
| ## Workflow metrics {auto-animate=true} | ||
|
|
||
| {fig-align="center"} | ||
|
|
||
| ## Workflow metrics {auto-animate=true} | ||
|
|
||
| {fig-align="center"} | ||
|
|
||
| - 🟢 Queues + workflows correlated | ||
| - 🟡 Queues slow + workflows fast: not enough concurrency | ||
| - 🟡 Queues fast + workflows slow: jobs doing too much | ||
| - 🔴 Queues slow + workflows slow: hard to say | ||
|
|
||
| # Summary | ||
|
|
||
| - Testing complex projects is possible | ||
| - Use docker for dev **and** prod | ||
| - Don't SAT solve in CI | ||
| - Track CI run durations, workflow metrics | ||
| - Spend time on dev ex | ||
|
|
||
| # Questions? | ||
|
|
||
| {fig-align="center"} | ||
|
|
||
| ::: {.r-fit-text} | ||
| <https://ibis-project.org> | ||
| ::: |
|
|
@@ -2,7 +2,7 @@ | |
|
|
||
| from __future__ import annotations | ||
|
|
||
| __version__ = "9.4.0" | ||
|
|
||
| import warnings | ||
| from typing import Any | ||
|
|
||
| @@ -0,0 +1,7 @@ | ||
| SELECT | ||
| [ | ||
| approx_quantiles(`t0`.`double_col`, 4 IGNORE NULLS)[1], | ||
| approx_quantiles(`t0`.`double_col`, 4 IGNORE NULLS)[2], | ||
| approx_quantiles(`t0`.`double_col`, 4 IGNORE NULLS)[3] | ||
| ] AS `qs` | ||
| FROM `functional_alltypes` AS `t0` |
| @@ -0,0 +1,3 @@ | ||
| SELECT | ||
| approx_quantiles(`t0`.`double_col`, 4 IGNORE NULLS) AS `qs` | ||
| FROM `functional_alltypes` AS `t0` |
| @@ -0,0 +1,3 @@ | ||
| SELECT | ||
| approx_quantiles(`t0`.`double_col`, 2 IGNORE NULLS)[1] AS `qs` | ||
| FROM `functional_alltypes` AS `t0` |
| @@ -0,0 +1,7 @@ | ||
| SELECT | ||
| [ | ||
| approx_quantiles(`t0`.`double_col`, 4 IGNORE NULLS)[2], | ||
| approx_quantiles(`t0`.`double_col`, 4 IGNORE NULLS)[1], | ||
| approx_quantiles(`t0`.`double_col`, 4 IGNORE NULLS)[3] | ||
| ] AS `qs` | ||
| FROM `functional_alltypes` AS `t0` |
| @@ -0,0 +1,3 @@ | ||
| SELECT | ||
| approx_quantiles(`t0`.`double_col`, 100000 IGNORE NULLS)[33333] AS `qs` | ||
| FROM `functional_alltypes` AS `t0` |
| @@ -1,3 +1,3 @@ | ||
| SELECT | ||
| CAST("t0"."string_col" AS Nullable(DateTime)) AS "Cast(string_col, timestamp)" | ||
| FROM "functional_alltypes" AS "t0" |
| @@ -1,3 +1,3 @@ | ||
| SELECT | ||
| CAST("t0"."timestamp_col" AS DateTime) AS "Cast(timestamp_col, !timestamp)" | ||
| FROM "functional_alltypes" AS "t0" |
| @@ -1,3 +1,3 @@ | ||
| SELECT | ||
| CAST("t0"."int_col" AS DateTime) AS "Cast(int_col, !timestamp)" | ||
| FROM "functional_alltypes" AS "t0" |
| @@ -1,2 +1,2 @@ | ||
| SELECT | ||
| toStartOfDay(parseDateTimeBestEffort('2009-05-17T12:34:56')) AS "TimestampTruncate(datetime.datetime(2009, 5, 17, 12, 34, 56), DAY)" |
| @@ -0,0 +1,44 @@ | ||
| import ibis | ||
|
|
||
|
|
||
| lineitem = ibis.table( | ||
| name="lineitem", | ||
| schema={ | ||
| "l_orderkey": "int32", | ||
| "l_partkey": "int32", | ||
| "l_suppkey": "int32", | ||
| "l_linenumber": "int32", | ||
| "l_quantity": "decimal(15, 2)", | ||
| "l_extendedprice": "decimal(15, 2)", | ||
| "l_discount": "decimal(15, 2)", | ||
| "l_tax": "decimal(15, 2)", | ||
| "l_returnflag": "string", | ||
| "l_linestatus": "string", | ||
| "l_shipdate": "date", | ||
| "l_commitdate": "date", | ||
| "l_receiptdate": "date", | ||
| "l_shipinstruct": "string", | ||
| "l_shipmode": "string", | ||
| "l_comment": "string", | ||
| }, | ||
| ) | ||
| lit = ibis.literal(1) | ||
| f = lineitem.filter((lineitem.l_shipdate <= ibis.literal("1998-09-02").cast("date"))) | ||
| multiply = f.l_extendedprice * ((lit - f.l_discount)) | ||
| agg = f.aggregate( | ||
| [ | ||
| f.l_quantity.sum().name("sum_qty"), | ||
| f.l_extendedprice.sum().name("sum_base_price"), | ||
| multiply.sum().name("sum_disc_price"), | ||
| ((multiply) * ((lit + f.l_tax))).sum().name("sum_charge"), | ||
| f.l_quantity.mean().name("avg_qty"), | ||
| f.l_extendedprice.mean().name("avg_price"), | ||
| f.l_discount.mean().name("avg_disc"), | ||
| f.count().name("count_order"), | ||
| ], | ||
| by=[f.l_returnflag, f.l_linestatus], | ||
| ) | ||
|
|
||
| result = agg.order_by( | ||
| agg.l_returnflag.asc(nulls_first=True), agg.l_linestatus.asc(nulls_first=True) | ||
| ) |
| @@ -0,0 +1,106 @@ | ||
| import ibis | ||
|
|
||
|
|
||
| customer = ibis.table( | ||
| name="customer", | ||
| schema={ | ||
| "c_custkey": "int64", | ||
| "c_name": "string", | ||
| "c_address": "string", | ||
| "c_nationkey": "int16", | ||
| "c_phone": "string", | ||
| "c_acctbal": "decimal", | ||
| "c_mktsegment": "string", | ||
| "c_comment": "string", | ||
| }, | ||
| ) | ||
| lit = ibis.literal(True) | ||
| orders = ibis.table( | ||
| name="orders", | ||
| schema={ | ||
| "o_orderkey": "int64", | ||
| "o_custkey": "int64", | ||
| "o_orderstatus": "string", | ||
| "o_totalprice": "decimal(12, 2)", | ||
| "o_orderdate": "date", | ||
| "o_orderpriority": "string", | ||
| "o_clerk": "string", | ||
| "o_shippriority": "int32", | ||
| "o_comment": "string", | ||
| }, | ||
| ) | ||
| lineitem = ibis.table( | ||
| name="lineitem", | ||
| schema={ | ||
| "l_orderkey": "int32", | ||
| "l_partkey": "int32", | ||
| "l_suppkey": "int32", | ||
| "l_linenumber": "int32", | ||
| "l_quantity": "decimal(15, 2)", | ||
| "l_extendedprice": "decimal(15, 2)", | ||
| "l_discount": "decimal(15, 2)", | ||
| "l_tax": "decimal(15, 2)", | ||
| "l_returnflag": "string", | ||
| "l_linestatus": "string", | ||
| "l_shipdate": "date", | ||
| "l_commitdate": "date", | ||
| "l_receiptdate": "date", | ||
| "l_shipinstruct": "string", | ||
| "l_shipmode": "string", | ||
| "l_comment": "string", | ||
| }, | ||
| ) | ||
| cast = ibis.literal("1995-03-15").cast("date") | ||
| joinchain = ( | ||
| customer.inner_join( | ||
| orders, | ||
| [(customer.c_custkey == orders.o_custkey), lit, (orders.o_orderdate < cast)], | ||
| ) | ||
| .inner_join( | ||
| lineitem, | ||
| [(orders.o_orderkey == lineitem.l_orderkey), lit, (lineitem.l_shipdate > cast)], | ||
| ) | ||
| .select( | ||
| customer.c_custkey, | ||
| customer.c_name, | ||
| customer.c_address, | ||
| customer.c_nationkey, | ||
| customer.c_phone, | ||
| customer.c_acctbal, | ||
| customer.c_mktsegment, | ||
| customer.c_comment, | ||
| orders.o_orderkey, | ||
| orders.o_custkey, | ||
| orders.o_orderstatus, | ||
| orders.o_totalprice, | ||
| orders.o_orderdate, | ||
| orders.o_orderpriority, | ||
| orders.o_clerk, | ||
| orders.o_shippriority, | ||
| orders.o_comment, | ||
| lineitem.l_orderkey, | ||
| lineitem.l_partkey, | ||
| lineitem.l_suppkey, | ||
| lineitem.l_linenumber, | ||
| lineitem.l_quantity, | ||
| lineitem.l_extendedprice, | ||
| lineitem.l_discount, | ||
| lineitem.l_tax, | ||
| lineitem.l_returnflag, | ||
| lineitem.l_linestatus, | ||
| lineitem.l_shipdate, | ||
| lineitem.l_commitdate, | ||
| lineitem.l_receiptdate, | ||
| lineitem.l_shipinstruct, | ||
| lineitem.l_shipmode, | ||
| lineitem.l_comment, | ||
| ) | ||
| ) | ||
| f = joinchain.filter((joinchain.c_mktsegment == "BUILDING")) | ||
| agg = f.aggregate( | ||
| [(f.l_extendedprice * ((1 - f.l_discount))).sum().name("revenue")], | ||
| by=[f.l_orderkey, f.o_orderdate, f.o_shippriority], | ||
| ) | ||
| s = agg.order_by(agg.revenue.desc(), agg.o_orderdate.asc(nulls_first=True)) | ||
|
|
||
| result = s.select(s.l_orderkey, s.revenue, s.o_orderdate, s.o_shippriority).limit(10) |
| @@ -0,0 +1,101 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import importlib | ||
| from contextlib import contextmanager | ||
| from pathlib import Path | ||
|
|
||
| import pytest | ||
| from pytest import param | ||
|
|
||
| import ibis | ||
| from ibis.backends.tests.tpc.conftest import compare_tpc_results | ||
| from ibis.formats.pandas import PandasData | ||
|
|
||
| tpch_catalog = { | ||
| "lineitem": { | ||
| "l_orderkey": "int32", | ||
| "l_partkey": "int32", | ||
| "l_suppkey": "int32", | ||
| "l_linenumber": "int32", | ||
| "l_quantity": "decimal(15, 2)", | ||
| "l_extendedprice": "decimal(15, 2)", | ||
| "l_discount": "decimal(15, 2)", | ||
| "l_tax": "decimal(15, 2)", | ||
| "l_returnflag": "string", | ||
| "l_linestatus": "string", | ||
| "l_shipdate": "date", | ||
| "l_commitdate": "date", | ||
| "l_receiptdate": "date", | ||
| "l_shipinstruct": "string", | ||
| "l_shipmode": "string", | ||
| "l_comment": "string", | ||
| }, | ||
| "customer": [ | ||
| ("c_custkey", "int64"), | ||
| ("c_name", "string"), | ||
| ("c_address", "string"), | ||
| ("c_nationkey", "int16"), | ||
| ("c_phone", "string"), | ||
| ("c_acctbal", "decimal"), | ||
| ("c_mktsegment", "string"), | ||
| ("c_comment", "string"), | ||
| ], | ||
| "orders": [ | ||
| ("o_orderkey", "int64"), | ||
| ("o_custkey", "int64"), | ||
| ("o_orderstatus", "string"), | ||
| ("o_totalprice", "decimal(12,2)"), | ||
| ("o_orderdate", "date"), | ||
| ("o_orderpriority", "string"), | ||
| ("o_clerk", "string"), | ||
| ("o_shippriority", "int32"), | ||
| ("o_comment", "string"), | ||
| ], | ||
| } | ||
|
|
||
| root = Path(__file__).absolute().parents[3] | ||
|
|
||
| SQL_QUERY_PATH = root / "backends" / "tests" / "tpc" / "queries" / "duckdb" / "h" | ||
|
|
||
|
|
||
| @contextmanager | ||
| def set_database(con, db): | ||
| olddb = con.current_database | ||
| con.raw_sql(f"USE {db}") | ||
| try: | ||
| yield | ||
| finally: | ||
| # restore the original database even if the body raises | ||
| con.raw_sql(f"USE {olddb}") | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "tpch_query", | ||
| [ | ||
| param(1, id="tpch01"), | ||
| param(3, id="tpch03"), | ||
| ], | ||
| ) | ||
| def test_parse_sql_tpch(tpch_query, snapshot, con, data_dir): | ||
| tpch_query_file = SQL_QUERY_PATH / f"{tpch_query:02d}.sql" | ||
| with open(tpch_query_file) as f: | ||
| sql = f.read() | ||
|
|
||
| expr = ibis.parse_sql(sql, tpch_catalog) | ||
| code = ibis.decompile(expr, format=True) | ||
| snapshot.assert_match(code, "out_tpch.py") | ||
|
|
||
| # Import just-created snapshot | ||
| SNAPSHOT_MODULE = f"ibis.backends.duckdb.tests.snapshots.test_decompile_tpch.test_parse_sql_tpch.tpch{tpch_query:02d}.out_tpch" | ||
| module = importlib.import_module(SNAPSHOT_MODULE) | ||
|
|
||
| with set_database(con, "tpch"): | ||
| # Get results from executing SQL directly on DuckDB | ||
| expected_df = con.con.execute(sql).df() | ||
| # Get results from decompiled ibis query | ||
| result_df = con.to_pandas(module.result) | ||
|
|
||
| # Then set the expected columns so we can coerce the datatypes | ||
| # of the pandas dataframe correctly | ||
| expected_df.columns = result_df.columns | ||
|
|
||
| expected_df = PandasData.convert_table(expected_df, module.result.schema()) | ||
|
|
||
| compare_tpc_results(result_df, expected_df) |