| @@ -0,0 +1,75 @@ | ||
| DROP SCHEMA IF EXISTS EXASOL CASCADE; | ||
| CREATE SCHEMA EXASOL; | ||
|
|
||
| CREATE OR REPLACE TABLE EXASOL.diamonds | ||
| ( | ||
| "carat" DOUBLE, | ||
| "cut" VARCHAR(256), | ||
| "color" VARCHAR(256), | ||
| "clarity" VARCHAR(256), | ||
| "depth" DOUBLE, | ||
| "table" DOUBLE, | ||
| "price" BIGINT, | ||
| "x" DOUBLE, | ||
| "y" DOUBLE, | ||
| "z" DOUBLE | ||
| ); | ||
|
|
||
| CREATE OR REPLACE TABLE EXASOL.batting | ||
| ( | ||
| "playerID" VARCHAR(256), | ||
| "yearID" BIGINT, | ||
| "stint" BIGINT, | ||
| "teamID" VARCHAR(256), | ||
| "logID" VARCHAR(256), | ||
| "G" BIGINT, | ||
| "AB" BIGINT, | ||
| "R" BIGINT, | ||
| "H" BIGINT, | ||
| "X2B" BIGINT, | ||
| "X3B" BIGINT, | ||
| "HR" BIGINT, | ||
| "RBI" BIGINT, | ||
| "SB" BIGINT, | ||
| "CS" BIGINT, | ||
| "BB" BIGINT, | ||
| "SO" BIGINT, | ||
| "IBB" BIGINT, | ||
| "HBP" BIGINT, | ||
| "SH" BIGINT, | ||
| "SF" BIGINT, | ||
| "GIDP" BIGINT | ||
| ); | ||
|
|
||
| CREATE OR REPLACE TABLE EXASOL.awards_players | ||
| ( | ||
| "playerId" VARCHAR(256), | ||
| "awardID" VARCHAR(256), | ||
| "yearID" VARCHAR(256), | ||
| "logID" VARCHAR(256), | ||
| "tie" VARCHAR(256), | ||
| "notest" VARCHAR(256) | ||
| ); | ||
|
|
||
| CREATE OR REPLACE TABLE EXASOL.functional_alltypes | ||
| ( | ||
| "id" INTEGER, | ||
| "bool_col" BOOLEAN, | ||
| "tinyint_col" SHORTINT, | ||
| "small_int" SMALLINT, | ||
| "int_col" INTEGER, | ||
| "bigint_col" BIGINT, | ||
| "float_col" FLOAT, | ||
| "double_col" DOUBLE PRECISION, | ||
| "date_string_col" VARCHAR(256), | ||
| "string_col" VARCHAR(256), | ||
| "timestamp_col" TIMESTAMP, | ||
| "year" INTEGER, | ||
| "month" INTEGER | ||
| ); | ||
|
|
||
|
|
||
| IMPORT INTO EXASOL.diamonds FROM LOCAL CSV FILE '/data/diamonds.csv' COLUMN SEPARATOR = ',' SKIP = 1; | ||
| IMPORT INTO EXASOL.batting FROM LOCAL CSV FILE '/data/batting.csv' COLUMN SEPARATOR = ',' SKIP = 1; | ||
| IMPORT INTO EXASOL.awards_players FROM LOCAL CSV FILE '/data/awards_players.csv' COLUMN SEPARATOR = ',' SKIP = 1; | ||
| IMPORT INTO EXASOL.functional_alltypes FROM LOCAL CSV FILE '/data/functional_alltypes.csv' COLUMN SEPARATOR = ',' SKIP = 1; |
| @@ -0,0 +1,3 @@ | ||
| FROM flink:1.18.0-scala_2.12 | ||
| # ibis-flink requires PyFlink dependency | ||
| RUN wget -nv -P $FLINK_HOME/lib/ https://repo1.maven.org/maven2/org/apache/flink/flink-python/1.18.0/flink-python-1.18.0.jar |
| @@ -0,0 +1,96 @@ | ||
| # Exasol | ||
|
|
||
| [https://www.exasol.com](https://www.exasol.com) | ||
|
|
||
| ## Install | ||
|
|
||
| Install Ibis and dependencies for the Exasol backend: | ||
|
|
||
| ::: {.panel-tabset} | ||
|
|
||
| ## `pip` | ||
|
|
||
| Install with the `exasol` extra: | ||
|
|
||
| ```{.bash} | ||
| pip install 'ibis-framework[exasol]' | ||
| ``` | ||
|
|
||
| And connect: | ||
|
|
||
| ```{.python} | ||
| import ibis | ||
| con = ibis.exasol.connect(...) # <1> | ||
| ``` | ||
|
|
||
| 1. Adjust connection parameters as needed. | ||
|
|
||
| ## `conda` | ||
|
|
||
| Install for Exasol: | ||
|
|
||
| ```{.bash} | ||
| conda install -c conda-forge ibis-exasol | ||
| ``` | ||
|
|
||
| And connect: | ||
|
|
||
| ```{.python} | ||
| import ibis | ||
| con = ibis.exasol.connect(...) # <1> | ||
| ``` | ||
|
|
||
| 1. Adjust connection parameters as needed. | ||
|
|
||
| ## `mamba` | ||
|
|
||
| Install for Exasol: | ||
|
|
||
| ```{.bash} | ||
| mamba install -c conda-forge ibis-exasol | ||
| ``` | ||
|
|
||
| And connect: | ||
|
|
||
| ```{.python} | ||
| import ibis | ||
| con = ibis.exasol.connect(...) # <1> | ||
| ``` | ||
|
|
||
| 1. Adjust connection parameters as needed. | ||
|
|
||
| ::: | ||
|
|
||
| ## Connect | ||
|
|
||
| ### `ibis.exasol.connect` | ||
|
|
||
| ```python | ||
| con = ibis.exasol.connect( | ||
| user = "username", | ||
| password = "password", | ||
| host = "localhost", | ||
| port = 8563, | ||
| schema = None, | ||
| encryption = True, | ||
| certificate_validation = True, | ||
| encoding = "en_US.UTF-8" | ||
| ) | ||
| ``` | ||
|
|
||
| ::: {.callout-note} | ||
| `ibis.exasol.connect` is a thin wrapper around [`ibis.backends.exasol.Backend.do_connect`](#ibis.backends.exasol.Backend.do_connect). | ||
| ::: | ||
|
|
||
| ### Connection Parameters | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| from _utils import render_do_connect | ||
| render_do_connect("exasol") | ||
| ``` |
| @@ -0,0 +1,49 @@ | ||
| --- | ||
| title: "Test Class Reference" | ||
| --- | ||
|
|
||
| This page provides a partial reference to the attributes, methods, properties | ||
| and class-level variables that are used to help configure a backend for the Ibis | ||
| test suite. | ||
|
|
||
| Contributors are encouraged to look over the methods and class-level variables | ||
| in `ibis/backends/tests/base.py`. | ||
|
|
||
| To add a new backend test configuration, import one of `BackendTest` or | ||
| `ServiceBackendTest` into a `conftest.py` file with the path | ||
| `ibis/backends/{backend_name}/tests/conftest.py`, then update or override the | ||
| relevant class-level variables and methods. | ||
|
|
||
| ```python | ||
| from ibis.backends.tests.base import BackendTest | ||
|
|
||
| class TestConf(BackendTest): | ||
| """Backend-specific class with information for testing.""" | ||
|
|
||
| supports_divide_by_zero = True | ||
| supports_floating_modulus = False | ||
| returned_timestamp_unit = "us" | ||
| supports_structs = True | ||
| supports_json = True | ||
| check_names = False | ||
| force_sort = True | ||
|
|
||
| @staticmethod | ||
| def connect(*args, **kwargs): | ||
| ... | ||
| ``` | ||
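For example, `connect` for a hypothetical in-process backend might simply defer to the
corresponding `ibis.*.connect` call (a hedged sketch, not the configuration of any
particular backend; real configurations often read connection details from environment
variables):

```python
import ibis

from ibis.backends.tests.base import BackendTest


class TestConf(BackendTest):
    """Hypothetical test configuration for an in-process engine."""

    supports_structs = True
    returned_timestamp_unit = "us"

    @staticmethod
    def connect(*args, **kwargs):
        # Assumption: an embedded engine, so no external service needs to be running.
        return ibis.duckdb.connect(*args, **kwargs)
```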
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| import os | ||
| paths = [ | ||
| "../reference/BackendTest.qmd", | ||
| "../reference/ServiceBackendTest.qmd", | ||
| ] | ||
| for path in filter(os.path.exists, paths): | ||
| with open(path) as f: | ||
| print(f.read()) | ||
| ``` |
| @@ -1,5 +1,6 @@ | ||
| --- | ||
| execute: | ||
| freeze: auto | ||
| --- | ||
|
|
||
| # Reference built-in functions | ||
|
|
||
| @@ -0,0 +1,117 @@ | ||
| --- | ||
| title: "dbt-ibis: Write your dbt models using Ibis" | ||
| author: "Stefan Binder" | ||
| date: "2023-11-24" | ||
| categories: | ||
| - blog | ||
| - dbt | ||
| - data engineering | ||
| --- | ||
|
|
||
| # Introduction to dbt | ||
| [dbt](https://github.com/dbt-labs/dbt-core) has revolutionized how transformations are | ||
| orchestrated and managed within modern data warehouses. Initially released in 2016, | ||
| dbt quickly gained traction within the data analytics community due to its focus on bringing software engineering best practices, such as modularity, portability, CI/CD, and documentation, to analytics code. | ||
|
|
||
| At the heart of dbt are so-called "models", which are just SQL SELECT statements | ||
| (see further below for an example). dbt removes the need to write any DDL/DML, | ||
| allowing users to focus on writing SELECT statements. Depending on how you configure it, the queries are materialized as tables, views, or custom materializations. dbt also infers dependencies between models and runs them in order. The following is a dbt model which selects from two | ||
| other models called `stg_orders` and `stg_customers`: | ||
|
|
||
| ```sql | ||
| WITH customer_orders as ( | ||
| SELECT | ||
| customer_id AS customer_id, | ||
| MIN(order_date) AS first_order, | ||
| MAX(order_date) AS most_recent_order, | ||
| COUNT(*) AS number_of_orders | ||
| FROM {{ ref('stg_orders') }} AS orders | ||
| GROUP BY | ||
| customer_id | ||
| ), customer_orders_info as ( | ||
| SELECT | ||
| customers.customer_id AS customer_id, | ||
| customers.first_name AS first_name, | ||
| customers.last_name AS last_name, | ||
| customer_orders.customer_id AS customer_id_right, | ||
| customer_orders.first_order AS first_order, | ||
| customer_orders.most_recent_order AS most_recent_order, | ||
| customer_orders.number_of_orders AS number_of_orders | ||
| FROM {{ ref('stg_customers') }} AS customers | ||
| LEFT OUTER JOIN customer_orders | ||
| ON customers.customer_id = customer_orders.customer_id | ||
| ) | ||
| SELECT | ||
| customer_id, | ||
| first_name, | ||
| last_name, | ||
| first_order, | ||
| most_recent_order, | ||
| number_of_orders | ||
| FROM customer_orders_info | ||
| ``` | ||
| dbt will make sure that the resulting table will be created after `stg_orders` | ||
| and `stg_customers`. This model is inspired by the [jaffle shop demo project by dbt Labs](https://github.com/dbt-labs/jaffle_shop) | ||
| where you can find more example queries. | ||
|
|
||
| At the end of 2022, dbt added support for [Python models](https://docs.getdbt.com/docs/build/python-models) | ||
| on specific platforms (Snowflake, Databricks, Google Cloud Platform). This can be useful | ||
| for complex transformations such as using a machine learning model and storing the results. | ||
| However, it also requires that your Python code runs in a cloud data warehouse and, often, | ||
| that data is moved into a Python process, which can be slower than leveraging the power of modern SQL engines. | ||
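For reference, a dbt Python model is a `.py` file that defines a `model(dbt, session)`
function and returns a dataframe, which dbt then materializes. The sketch below is
illustrative only; the dataframe API available inside the function depends on the
platform adapter (e.g. Snowpark on Snowflake, PySpark on Databricks):

```python
# models/customer_orders_py.py -- illustrative sketch of a dbt Python model
def model(dbt, session):
    # dbt.ref() returns the upstream model as a platform-specific dataframe
    orders = dbt.ref("stg_orders")
    # ... transform `orders` with the platform's dataframe API ...
    return orders  # the returned dataframe is materialized by dbt
```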
|
|
||
|
|
||
| # Why dbt and Ibis go great together | ||
| [dbt-ibis](https://github.com/binste/dbt-ibis) offers a lightweight and compatible alternative, | ||
| which allows you to write dbt models using Ibis. dbt-ibis transparently converts your Ibis | ||
| expressions into SQL and then hands them over to dbt. Your database does not need to have Python | ||
| support for this, as everything is executed in the same process as dbt. This means you can work in | ||
| Python with any dbt adapter that has a corresponding Ibis backend. Rewriting the above SQL model in Ibis, we get: | ||
|
|
||
| ```python | ||
| from dbt_ibis import depends_on, ref | ||
|
|
||
|
|
||
| @depends_on(ref("stg_customers"), ref("stg_orders")) | ||
| def model(customers, orders): | ||
| customer_orders = orders.group_by("customer_id").aggregate( | ||
| first_order=orders["order_date"].min(), | ||
| most_recent_order=orders["order_date"].max(), | ||
| number_of_orders=orders.count(), | ||
| ) | ||
| # Add first_name and last_name | ||
| customer_orders = customers.join(customer_orders, "customer_id", how="left") | ||
| return customer_orders.select( | ||
| "customer_id", | ||
| "first_name", | ||
| "last_name", | ||
| "first_order", | ||
| "most_recent_order", | ||
| "number_of_orders", | ||
| ) | ||
| ``` | ||
|
|
||
| Using Ibis instead of SQL for dbt models brings you many advantages: | ||
|
|
||
| * Type checks and validation before your code is executed in a database. | ||
| * More composable as you can break down complex queries into smaller pieces. | ||
| * Better reusability of code. Although dbt allows you to use [Jinja and macros](https://docs.getdbt.com/docs/build/jinja-macros), which is an improvement over plain SQL, this gets you only so far. String manipulation is inherently fragile. With dbt-ibis, you can easily share common code between models. | ||
| * Your dbt models become backend agnostic, which reduces lock-in to a specific database. Furthermore, you get the possibility of building a [multi-engine data stack](https://juhache.substack.com/p/n-engines-1-language?publication_id=1211981&post_id=137718100). For example, you could use DuckDB for small to medium workloads and Snowflake for heavy workloads and as the end-user/BI layer, leveraging its governance features. Depending on the size of your warehouse, this can result in significant cost savings. | ||
| * Unit test your code with your favorite Python testing frameworks such as pytest. | ||
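For example, shared transformation logic can be exercised with plain pytest and in-memory
tables, no warehouse required. This is a minimal sketch; `aggregate_customer_orders` is a
hypothetical helper, not part of the dbt-ibis API:

```python
# test_customer_orders.py -- hypothetical pytest sketch using an in-memory table
import ibis


def aggregate_customer_orders(orders):
    # Shared logic that a dbt-ibis model could also call
    return orders.group_by("customer_id").aggregate(
        number_of_orders=orders.count(),
        first_order=orders["order_date"].min(),
    )


def test_aggregate_customer_orders():
    orders = ibis.memtable(
        {
            "customer_id": [1, 1, 2],
            "order_date": ["2023-01-01", "2023-02-01", "2023-03-01"],
        }
    )
    result = aggregate_customer_orders(orders).to_pandas()
    assert len(result) == 2
    assert result.set_index("customer_id").loc[1, "number_of_orders"] == 2
```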
|
|
||
| In addition, you can stick to the tool (Ibis) you like, whether you're writing an | ||
| ingestion pipeline, a dbt model to transform the data in your data warehouse, or conducting an ad-hoc analysis in a Jupyter notebook. | ||
|
|
||
| Be aware that a current limitation of dbt-ibis is that you cannot connect to the database | ||
| from within your dbt models, i.e. you purely use Ibis to construct a SELECT statement. You cannot execute statements and act based on the results. | ||
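Conceptually, the compilation step is similar to calling `ibis.to_sql` on an unbound table
yourself (a sketch; dbt-ibis picks the SQL dialect based on your dbt profile, and the table
below is a stand-in for an upstream model):

```python
import ibis

# An unbound table standing in for an upstream model -- no database connection needed
stg_orders = ibis.table({"customer_id": "int64", "order_date": "date"}, name="stg_orders")

expr = stg_orders.group_by("customer_id").aggregate(
    first_order=stg_orders.order_date.min()
)
print(ibis.to_sql(expr, dialect="duckdb"))  # the SELECT statement handed to dbt
```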
|
|
||
| # Further reading | ||
| If you want to give dbt-ibis a try, head over to the [GitHub repo](https://github.com/binste/dbt-ibis/blob/main/README.md) | ||
| for more information on how to get up and running in no time! | ||
|
|
||
| For more details on the future of the integration of Ibis within dbt, you can check out | ||
| [this PR](https://github.com/dbt-labs/dbt-core/pull/5274#issuecomment-1132772028) and [this GitHub issue](https://github.com/dbt-labs/dbt-core/issues/6184) | ||
| on adding an official plugin system to dbt, | ||
| which could provide first-class support for modeling languages in general | ||
| and might allow dbt-ibis to offer an even better user experience and more features. | ||
| See also this [discussion on Ibis as a dataframe API in the dbt GitHub repo](https://github.com/dbt-labs/dbt-core/discussions/5738). |
| @@ -0,0 +1,248 @@ | ||
| --- | ||
| title: "Ibis + DuckDB geospatial: a match made on Earth" | ||
| author: Naty Clementi | ||
| date: 2023-12-07 | ||
| categories: | ||
| - blog | ||
| - duckdb | ||
| - geospatial | ||
| execute: | ||
| freeze: false | ||
| --- | ||
|
|
||
| Ibis now has support for [DuckDB geospatial functions](https://gist.github.com/ncclementi/fbc5564af709e2d7f8882821e3a8649f)! | ||
|
|
||
| This blogpost showcases some examples of the geospatial API for the DuckDB backend. The material is inspired by | ||
| the ["DuckDB: Spatial Relationships"](https://geog-414.gishub.org/book/duckdb/07_spatial_relationships.html) lesson from | ||
| [Dr. Qiusheng Wu](https://geog-414.gishub.org/book/preface/instructor.html)'s course "Spatial Data Management" from the | ||
| Department of Geography & Sustainability at the University of Tennessee, Knoxville. | ||
|
|
||
| ::: {.callout-note} | ||
| You can check Dr. Qiusheng Wu's full Spatial Data Management course material on its | ||
| [website](https://geog-414.gishub.org/index.html), and the classes are also on | ||
| [YouTube](https://www.youtube.com/watch?v=A4TOAdsXsEs&list=PLAxJ4-o7ZoPe9SkgnophygyLjTDBzIEbi). | ||
| ::: | ||
|
|
||
| ## Data | ||
|
|
||
| We are going to be working with data from New York City. The database contains multiple tables with information about | ||
| subway stations, streets, neighborhoods, census data, and homicides. The datasets in the database are in the NAD83 / UTM zone | ||
| 18N projection (EPSG:26918). | ||
|
|
||
| ```{python} | ||
| from pathlib import Path | ||
| from zipfile import ZipFile | ||
| from urllib.request import urlretrieve | ||
| # Download and unzip | ||
| url = "https://open.gishub.org/data/duckdb/nyc_data.db.zip" | ||
| zip_path = Path("nyc_data.db.zip") | ||
| db_path = Path("nyc_data.db") | ||
| if not zip_path.exists(): | ||
| urlretrieve(url, zip_path) | ||
| if not db_path.exists(): | ||
| with ZipFile(zip_path) as zip_file: | ||
| zip_file.extract("nyc_data.db") | ||
| ``` | ||
|
|
||
| ## Let's get started | ||
|
|
||
| The beauty of spatial databases is that they allow us to both store *and* compute over geometries. | ||
|
|
||
| ```{python} | ||
| import ibis | ||
| from ibis import _ | ||
| ibis.options.interactive = True | ||
| con = ibis.duckdb.connect("nyc_data.db") | ||
| con.list_tables() | ||
| ``` | ||
|
|
||
| We have multiple tables with information about New York City. Following Dr. Wu's class, we'll take a look at some | ||
| spatial relations. | ||
|
|
||
| We can start by taking a peek at the `nyc_subway_stations` table. | ||
|
|
||
| ```{python} | ||
| subway_stations = con.table("nyc_subway_stations") | ||
| subway_stations | ||
| ``` | ||
|
|
||
| Notice that the last column has a `geometry` type, and in this case it contains points that represent the location of | ||
| each subway station. Let's grab the entry for the Broad St subway station. | ||
|
|
||
| ```{python} | ||
| broad_station = subway_stations.filter(subway_stations.NAME == "Broad St") | ||
| broad_station | ||
| ``` | ||
|
|
||
| ### `geo_equals` (`ST_Equals`) | ||
|
|
||
| In DuckDB `ST_Equals` returns `True` if two geometries are topologically equal. This means that they have the same | ||
| dimension and identical coordinate values, although the order of the vertices may be different. | ||
|
|
||
| The following is a bit redundant, but we can check whether our `"Broad St"` point matches only one point in our data using | ||
| `geo_equals`: | ||
|
|
||
| ```{python} | ||
| subway_stations.filter(subway_stations.geom.geo_equals(broad_station.geom)) | ||
| ``` | ||
|
|
||
| We can also write this query without using `broad_station` as a variable, and with the help of the deferred expressions | ||
| API, also known as [the underscore API](../../how-to/analytics/chain_expressions.qmd). | ||
|
|
||
| ```{python} | ||
| subway_stations.filter(_.geom.geo_equals(_.filter(_.NAME == "Broad St").geom)) | ||
| ``` | ||
|
|
||
| ### `intersects` (`ST_Intersects`) | ||
|
|
||
| Let's locate the neighborhood of the "Broad Street" subway station using the | ||
| geospatial `intersects` function. The `intersects` function returns `True` if two geometries have any points in common. | ||
|
|
||
| ```{python} | ||
| boroughs = con.table("nyc_neighborhoods") | ||
| boroughs | ||
| ``` | ||
|
|
||
| ```{python} | ||
| boroughs.filter(boroughs.geom.intersects(broad_station.select(broad_station.geom).to_array())) | ||
| ``` | ||
|
|
||
| ### `d_within` (`ST_DWithin`) | ||
|
|
||
| We can also find the streets near (say, within 10 meters) the Broad St subway station using the `d_within` | ||
| function. The `d_within` function returns `True` if the geometries are within a given distance. | ||
|
|
||
| ```{python} | ||
| streets = con.table("nyc_streets") | ||
| streets | ||
| ``` | ||
|
|
||
| Using the deferred API, we can check which streets are within `d=10` meters of distance. | ||
|
|
||
| ```{python} | ||
| sts_near_broad = streets.filter(_.geom.d_within(broad_station.select(_.geom).to_array(), 10)) | ||
| sts_near_broad | ||
| ``` | ||
|
|
||
| ::: {.callout-note} | ||
| In the previous query, `streets` and `broad_station` are different tables. We use [`to_array()`](../../reference/expression-tables.qmd#ibis.expr.types.relations.Table.to_array) to generate a | ||
| scalar subquery from a table with a single column (whose shape is scalar). | ||
| ::: | ||
|
|
||
| To visualize the findings, we will convert the tables to GeoPandas DataFrames. | ||
|
|
||
| ```{python} | ||
| broad_station_gdf = broad_station.to_pandas() | ||
| broad_station_gdf.crs = "EPSG:26918" | ||
| sts_near_broad_gdf = sts_near_broad.to_pandas() | ||
| sts_near_broad_gdf.crs = "EPSG:26918" | ||
| streets_gdf = streets.to_pandas() | ||
| streets_gdf.crs = "EPSG:26918" | ||
| ``` | ||
|
|
||
| ```{python} | ||
| import leafmap.deckgl as leafmap # <1> | ||
| ``` | ||
|
|
||
| 1. `leafmap.deckgl` allows us to visualize multiple layers | ||
|
|
||
| ```{python} | ||
| m = leafmap.Map() | ||
| m.add_vector(broad_station_gdf, get_fill_color="blue") | ||
| m.add_vector(sts_near_broad_gdf, get_color="red", opacity=0.5) | ||
| m.add_vector(streets_gdf, get_color="grey", zoom_to_layer=False, opacity=0.3) | ||
| m | ||
| ``` | ||
|
|
||
| You can zoom in and out, and hover over the map to check on the street names. | ||
|
|
||
| ### `buffer` (`ST_Buffer`) | ||
|
|
||
| Next, we'll take a look at the homicides table and showcase some | ||
| additional functionality related to polygon handling. | ||
|
|
||
| ```{python} | ||
| homicides = con.table("nyc_homicides") | ||
| homicides | ||
| ``` | ||
|
|
||
| Let's use the `buffer` method to find homicides near our `"Broad St"` station point. | ||
|
|
||
| The `buffer` method computes a polygon or multipolygon that represents all points whose distance from a geometry is less | ||
| than or equal to a given distance. | ||
|
|
||
| ```{python} | ||
| broad_station.geom.buffer(200) | ||
| ``` | ||
|
|
||
| We can check the area using the `area` (`ST_Area`) function, and see that it is $\approx \pi r^{2} = 125664$ | ||
|
|
||
| ```{python} | ||
| broad_station.geom.buffer(200).area() | ||
| ``` | ||
|
|
||
| To find if there were any homicides in that area, we can find where the polygon resulting from adding the | ||
| 200 meters buffer to our "Broad St" station point intersects with the geometry column in our homicides table. | ||
|
|
||
| ```{python} | ||
| h_near_broad = homicides.filter(_.geom.intersects(broad_station.select(_.geom.buffer(200)).to_array())) | ||
| h_near_broad | ||
| ``` | ||
|
|
||
| It looks like there was one homicide within 200 meters from the "Broad St" station, but from this | ||
| data we can't tell the street near which it happened. However, we can check if the homicide point is within a small | ||
| distance of a street. | ||
|
|
||
| ```{python} | ||
| h_street = streets.filter(_.geom.d_within(h_near_broad.select(_.geom).to_array(), 2)) | ||
| h_street | ||
| ``` | ||
|
|
||
| Let's plot this: | ||
|
|
||
| ```{python} | ||
| broad_station_zone = broad_station.mutate(geom=broad_station.geom.buffer(200)) | ||
| broad_station_zone = broad_station_zone.to_pandas() | ||
| broad_station_zone.crs = "EPSG:26918" | ||
| h_near_broad_gdf = h_near_broad.to_pandas() | ||
| h_near_broad_gdf.crs = "EPSG:26918" | ||
| h_street_gdf = h_street.to_pandas() | ||
| h_street_gdf.crs = "EPSG:26918" | ||
| mh = leafmap.Map() | ||
| mh.add_vector(broad_station_gdf, get_fill_color="orange") | ||
| mh.add_vector(broad_station_zone, get_fill_color="orange", opacity=0.1) | ||
| mh.add_vector(h_near_broad_gdf, get_fill_color="red", opacity=0.5) | ||
| mh.add_vector(h_street_gdf, get_color="blue", opacity=0.3) | ||
| mh.add_vector(streets_gdf, get_color="grey", zoom_to_layer=False, opacity=0.2) | ||
| mh | ||
| ``` | ||
|
|
||
|
|
||
| ## Functions supported and next steps | ||
|
|
||
| At the moment, Ibis supports around thirty geospatial functions in DuckDB, and we will add some more | ||
| (see list [here](https://gist.github.com/ncclementi/fbc5564af709e2d7f8882821e3a8649f)). | ||
|
|
||
| We also support reading multiple geospatial formats via [`read_geo()`](../../backends/duckdb.qmd#ibis.backends.duckdb.Backend.read_geo). | ||
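For example (a hedged sketch; the file name here is hypothetical, and `read_geo` accepts any
vector format the DuckDB spatial extension can read):

```python
con = ibis.duckdb.connect()
zones = con.read_geo("nyc_zoning.geojson")  # hypothetical file
zones.head()
```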
|
|
||
| Here are some resources to learn more about Ibis: | ||
|
|
||
| - [Ibis Docs](https://ibis-project.org/) | ||
| - [Ibis GitHub](https://github.com/ibis-project/ibis) | ||
|
|
||
| Chat with us on Zulip: | ||
|
|
||
| - [Ibis Zulip Chat](https://ibis-project.zulipchat.com/) |
| @@ -0,0 +1,33 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import ibis | ||
| from ibis import _ | ||
|
|
||
| ibis.set_backend("datafusion") | ||
|
|
||
| expr = ( | ||
| ibis.read_parquet("/data/pypi-parquet/*.parquet") | ||
| .filter( | ||
| [ | ||
| _.path.re_search( | ||
| r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$" | ||
| ), | ||
| ~_.path.re_search(r"(^|/)test(|s|ing)"), | ||
| ~_.path.contains("/site-packages/"), | ||
| ] | ||
| ) | ||
| .group_by( | ||
| month=_.uploaded_on.truncate("M"), | ||
| ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1) | ||
| .re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++") | ||
| .re_replace("^f.*$", "Fortran") | ||
| .replace("rs", "Rust") | ||
| .replace("go", "Go") | ||
| .replace("asm", "Assembly") | ||
| .nullif(""), | ||
| ) | ||
| .aggregate(project_count=_.project_name.nunique()) | ||
| .dropna("ext") | ||
| .order_by([_.month.desc(), _.project_count.desc()]) | ||
| ) | ||
| df = expr.to_pandas() |
| @@ -0,0 +1,12 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import datafusion | ||
|
|
||
| with open("./datafusion_native.sql") as f: | ||
| query = f.read() | ||
|
|
||
| ctx = datafusion.SessionContext() | ||
| ctx.register_parquet(name="pypi", path="/data/pypi-parquet/*.parquet") | ||
| expr = ctx.sql(query) | ||
|
|
||
| df = expr.to_pandas() |
| @@ -0,0 +1,47 @@ | ||
| SELECT | ||
| month, | ||
| ext, | ||
| COUNT(DISTINCT project_name) AS project_count | ||
| FROM ( | ||
| SELECT | ||
| project_name, | ||
| DATE_TRUNC('month', uploaded_on) AS month, | ||
| NULLIF( | ||
| REPLACE( | ||
| REPLACE( | ||
| REPLACE( | ||
| REGEXP_REPLACE( | ||
| REGEXP_REPLACE( | ||
| REGEXP_MATCH(path, CONCAT('(', '\.([a-z0-9]+)$', ')'))[2], | ||
| 'cxx|cpp|cc|c|hpp|h', | ||
| 'C/C++', | ||
| 'g' | ||
| ), | ||
| '^f.*$', | ||
| 'Fortran', | ||
| 'g' | ||
| ), | ||
| 'rs', | ||
| 'Rust' | ||
| ), | ||
| 'go', | ||
| 'Go' | ||
| ), | ||
| 'asm', | ||
| 'Assembly' | ||
| ), | ||
| '' | ||
| ) AS ext | ||
| FROM pypi | ||
| WHERE COALESCE( | ||
| ARRAY_LENGTH( | ||
| REGEXP_MATCH(path, '\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$') | ||
| ) > 0, | ||
| FALSE | ||
| ) | ||
| AND NOT COALESCE(ARRAY_LENGTH(REGEXP_MATCH(path, '(^|/)test(|s|ing)')) > 0, FALSE) | ||
| AND NOT STRPOS(path, '/site-packages/') > 0 | ||
| ) | ||
| WHERE ext IS NOT NULL | ||
| GROUP BY month, ext | ||
| ORDER BY month DESC, project_count DESC |
| @@ -0,0 +1,31 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import ibis | ||
| from ibis import _ | ||
|
|
||
| expr = ( | ||
| ibis.read_parquet("/data/pypi-parquet/*.parquet") | ||
| .filter( | ||
| [ | ||
| _.path.re_search( | ||
| r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$" | ||
| ), | ||
| ~_.path.re_search(r"(^|/)test(|s|ing)"), | ||
| ~_.path.contains("/site-packages/"), | ||
| ] | ||
| ) | ||
| .group_by( | ||
| month=_.uploaded_on.truncate("M"), | ||
| ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1) | ||
| .re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++") | ||
| .re_replace("^f.*$", "Fortran") | ||
| .replace("rs", "Rust") | ||
| .replace("go", "Go") | ||
| .replace("asm", "Assembly") | ||
| .nullif(""), | ||
| ) | ||
| .aggregate(project_count=_.project_name.nunique()) | ||
| .dropna("ext") | ||
| .order_by([_.month.desc(), _.project_count.desc()]) # <1> | ||
| ) | ||
| df = expr.to_pandas() |
| @@ -0,0 +1,281 @@ | ||
| --- | ||
| title: "Ibis versus X: Performance across the ecosystem part 2" | ||
| author: "Phillip Cloud" | ||
| date: 2023-12-11 | ||
| categories: | ||
| - blog | ||
| - case study | ||
| - ecosystem | ||
| - performance | ||
| --- | ||
|
|
||
| **TL;DR**: Ibis supports both Polars and DataFusion. Both backends have | ||
| about the same runtime performance, and both lag far behind DuckDB on this workload. | ||
| There's negligible performance difference between Ibis and the backend native | ||
| APIs. | ||
|
|
||
| ## Motivation | ||
|
|
||
| This is part 2 of a series of posts showing performance across various backends | ||
| that Ibis supports. | ||
|
|
||
| Check out [part 1](../pydata-performance/) if you haven't already! | ||
|
|
||
| In this post, I'll continue with the [Polars](../../backends/polars.qmd) and | ||
| [DataFusion](../../backends/datafusion.qmd) backends. | ||
|
|
||
| I show each tool using both the Ibis API and the tool's native API. We'll see | ||
| that the performance difference between these approaches is negligible. | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| def show_file(path, language: str = "python") -> None: | ||
| with open(path) as f: | ||
| source = f.read() | ||
| print(f"```{language}\n{source}\n```") | ||
| ``` | ||
|
|
||
| ## Setup | ||
|
|
||
| I ran all of the code in this blog post on a machine with these specs. | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| import os | ||
| import platform | ||
| import shutil | ||
| import cpuinfo | ||
| import psutil | ||
| info = cpuinfo.get_cpu_info() | ||
| uname = platform.uname() | ||
| MiB = 1 << 20 | ||
| GiB = 1 << 30 | ||
| TiB = 1 << 40 | ||
| ram_gib = psutil.virtual_memory().total / GiB | ||
| disk_tib = shutil.disk_usage("/").total / TiB | ||
| lines = [ | ||
| "| Component | Specification |", | ||
| "| --------- | ------------- |", | ||
| f"| CPU | {info['brand_raw']} ({os.cpu_count()} threads) |", | ||
| f"| RAM | {ram_gib:.0f} GiB |", | ||
| f"| Disk | {disk_tib:.1f} TiB SSD |", | ||
| f"| OS | NixOS ({uname.system} {uname.release}) |", | ||
| ] | ||
| print("\n".join(lines)) | ||
| ``` | ||
|
|
||
| ### Library versions | ||
|
|
||
| Here are the versions I used to run this experiment at the time of writing. | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| import importlib | ||
| import subprocess | ||
| import sys | ||
| import pandas as pd | ||
| cmd = "git", "rev-parse", "--short", "HEAD" | ||
| proc = subprocess.run(cmd, check=True, text=True, capture_output=True) | ||
| commit = proc.stdout.strip() | ||
| link = f"https://github.com/ibis-project/ibis/tree/{commit}" | ||
| version_pair = lambda name: (name, importlib.import_module(name).__version__) | ||
| versions = pd.DataFrame( | ||
| [("Python", sys.version)] + sorted( | ||
| [ | ||
| *map(version_pair, ("pandas", "polars", "datafusion", "pyarrow")), | ||
| ("ibis", f"[`{commit}`]({link})"), | ||
| ] | ||
| ), | ||
| columns=["Dependency", "Version"], | ||
| ) | ||
| print(versions.to_markdown(index=False)) | ||
| ``` | ||
|
|
||
| ## Running the query across backends | ||
|
|
||
| Here are the different Ibis expressions for each backend as well as the same | ||
| query with native APIs, along with timed executions of the query. | ||
|
|
||
| ### DuckDB | ||
|
|
||
| First, let's run the Ibis + DuckDB version of the query from the original post: | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| show_file("./duckdb_ibis.py") | ||
| ``` | ||
|
|
||
| ```{python} | ||
| duckdb_ibis_results = %timeit -n1 -r1 -o %run duckdb_ibis.py | ||
| df.head() | ||
| ``` | ||
|
|
||
| ### DataFusion and Polars | ||
|
|
||
| ::: {.panel-tabset} | ||
|
|
||
| ## DataFusion | ||
|
|
||
| ::: {.panel-tabset} | ||
|
|
||
| ## Ibis | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| show_file("./datafusion_ibis.py") | ||
| ``` | ||
|
|
||
| ```{python} | ||
| datafusion_ibis_results = %timeit -n1 -r1 -o %run datafusion_ibis.py | ||
| df.head() | ||
| ``` | ||
|
|
||
| ## DataFusion native | ||
|
|
||
| <details open> | ||
|
|
||
| <summary>DataFusion SQL</summary> | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| show_file("./datafusion_native.sql", language="sql") | ||
| ``` | ||
|
|
||
| </details> | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| show_file("./datafusion_native.py") | ||
| ``` | ||
|
|
||
| ```{python} | ||
| datafusion_native_results = %timeit -n1 -r1 -o %run datafusion_native.py | ||
| df.head() | ||
| ``` | ||
|
|
||
| ::: | ||
|
|
||
| ## Polars | ||
|
|
||
| ::: {.panel-tabset} | ||
|
|
||
| ## Ibis | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| show_file("./polars_ibis.py") | ||
| ``` | ||
|
|
||
| ```{python} | ||
| polars_ibis_results = %timeit -n1 -r1 -o %run polars_ibis.py | ||
| df.head() | ||
| ``` | ||
|
|
||
| ## Polars native | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| show_file("./polars_native.py") | ||
| ``` | ||
|
|
||
| ```{python} | ||
| polars_native_results = %timeit -n1 -r1 -o %run polars_native.py | ||
| df.head() | ||
| ``` | ||
|
|
||
| ::: | ||
|
|
||
| ::: | ||
|
|
||
| ## Takeaways | ||
|
|
||
| **Ibis + DuckDB is the only system tested that handles this workload well out of the box** | ||
|
|
||
| * Both Polars and DataFusion are much slower than DuckDB and Dask on this | ||
| workload. | ||
| * Polars memory use fluctuates quite a bit, while DataFusion's memory profile is | ||
| similar to DuckDB's. | ||
|
|
||
| Let's recap the results with some numbers: | ||
|
|
||
| ### Numbers | ||
|
|
||
| ```{python} | ||
| #| echo: false | ||
| #| output: asis | ||
| import glob | ||
| allfiles = glob.glob("/data/pypi-parquet/*.parquet") | ||
| total_size = sum(map(os.path.getsize, allfiles)) | ||
| def make_line(name, results, file_size: int = total_size): | ||
| duration = results.best | ||
| mib = file_size / MiB | ||
| throughput = mib / duration | ||
| data = [ | ||
| name, f"{mib:,.0f} MiB", f"{duration:.0f} seconds", f"{throughput:.0f} MiB/s" | ||
| ] | ||
| row = " | ".join(data) | ||
| return f"| {row} |" | ||
| results = sorted( | ||
| [ | ||
| {"name": "Ibis + DuckDB", "results": duckdb_ibis_results}, | ||
| {"name": "Ibis + Polars", "results": polars_ibis_results}, | ||
| {"name": "Polars native API", "results": polars_native_results}, | ||
| {"name": "Ibis + DataFusion", "results": datafusion_ibis_results}, | ||
| {"name": "DataFusion native API", "results": datafusion_native_results}, | ||
| ], | ||
| key=lambda run: total_size / run["results"].best, | ||
| reverse=True, | ||
| ) | ||
| header = "| Toolset | Data size | Duration | Throughput |" | ||
| sep = "| ------------------ | --------: | -----------: | ---------: |" | ||
| rows = [header, sep] | ||
| rows.extend(make_line(**result) for result in results) | ||
| print("\n".join(rows)) | ||
| ``` | ||
|
|
||
| ::: {.callout-warning} | ||
| ## The Polars run durations were highly variable | ||
|
|
||
| I couldn't figure out how to get consistent run times. | ||
| ::: | ||
|
|
||
| ## Conclusion | ||
|
|
||
| If you're considering Polars for new code, give Ibis a try with the [DuckDB | ||
| backend](../../backends/duckdb.qmd). | ||
|
|
||
| You'll get better performance than Polars on some workloads, along with a broader | ||
| cross-backend API that helps you scale from development to production. | ||
|
|
||
| If you find that Polars has better performance than DuckDB on a particular | ||
| workload, you can always switch to the Polars backend for that workload. | ||
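Switching is a one-line change, because the expression itself stays the same. Here is a
minimal sketch that reuses fragments of the query from the scripts above:

```python
import ibis
from ibis import _

ibis.set_backend("duckdb")    # default engine
# ibis.set_backend("polars")  # or run the same expression on Polars

expr = (
    ibis.read_parquet("/data/pypi-parquet/*.parquet")
    .group_by(ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1).nullif(""))
    .aggregate(project_count=_.project_name.nunique())
)
df = expr.to_pandas()
```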
|
|
||
| Everyone wins! | ||
|
|
||
| In the next post in this series we'll cover the cloud backends: Snowflake, | ||
| BigQuery, Trino and ClickHouse. |
| @@ -0,0 +1,33 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import ibis | ||
| from ibis import _ | ||
|
|
||
| ibis.set_backend("polars") | ||
|
|
||
| expr = ( | ||
| ibis.read_parquet("/data/pypi-parquet/*.parquet") | ||
| .filter( | ||
| [ | ||
| _.path.re_search( | ||
| r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$" | ||
| ), | ||
| ~_.path.re_search(r"(^|/)test(|s|ing)"), | ||
| ~_.path.contains("/site-packages/"), | ||
| ] | ||
| ) | ||
| .group_by( | ||
| month=_.uploaded_on.truncate("M"), | ||
| ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1) | ||
| .re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++") | ||
| .re_replace("^f.*$", "Fortran") | ||
| .replace("rs", "Rust") | ||
| .replace("go", "Go") | ||
| .replace("asm", "Assembly") | ||
| .nullif(""), | ||
| ) | ||
| .aggregate(project_count=_.project_name.nunique()) | ||
| .dropna("ext") | ||
| .order_by([_.month.desc(), _.project_count.desc()]) | ||
| ) | ||
| df = expr.to_pandas(streaming=True) |
| @@ -0,0 +1,33 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import polars as pl | ||
|
|
||
| expr = ( | ||
| pl.scan_parquet("/data/pypi-parquet/*.parquet") | ||
| .filter( | ||
| [ | ||
| pl.col("path").str.contains( | ||
| r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$" | ||
| ), | ||
| ~pl.col("path").str.contains(r"(^|/)test(|s|ing)"), | ||
| ~pl.col("path").str.contains("/site-packages/", literal=True), | ||
| ] | ||
| ) | ||
| .with_columns( | ||
| month=pl.col("uploaded_on").dt.truncate("1mo"), | ||
| ext=pl.col("path") | ||
| .str.extract(pattern=r"\.([a-z0-9]+)$", group_index=1) | ||
| .str.replace_all(pattern=r"cxx|cpp|cc|c|hpp|h", value="C/C++") | ||
| .str.replace_all(pattern="^f.*$", value="Fortran") | ||
| .str.replace("rs", "Rust", literal=True) | ||
| .str.replace("go", "Go", literal=True) | ||
| .str.replace("asm", "Assembly", literal=True) | ||
| .replace({"": None}), | ||
| ) | ||
| .group_by(["month", "ext"]) | ||
| .agg(project_count=pl.col("project_name").n_unique()) | ||
| .drop_nulls(["ext"]) | ||
| .sort(["month", "project_count"], descending=True) | ||
| ) | ||
|
|
||
| df = expr.collect(streaming=True).to_pandas() |
| @@ -0,0 +1,39 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import logging | ||
|
|
||
| import dask.dataframe as dd | ||
| from dask.distributed import Client | ||
|
|
||
| if __name__ == "__main__": | ||
| client = Client(silence_logs=logging.ERROR) | ||
| df = dd.read_parquet( | ||
| "/data/pypi-parquet/*.parquet", | ||
| columns=["path", "uploaded_on", "project_name"], | ||
| split_row_groups=True, | ||
| ) | ||
| df = df[ | ||
| df.path.str.contains( | ||
| r"\.(?:asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$" | ||
| ) | ||
| & ~df.path.str.contains(r"(?:^|/)test(?:|s|ing)") | ||
| & ~df.path.str.contains("/site-packages/") | ||
| ] | ||
| print( | ||
| df.assign( | ||
| month=df.uploaded_on.dt.to_period("M").dt.to_timestamp(), | ||
| ext=df.path.str.extract(r"\.([a-z0-9]+)$", 0, expand=False) | ||
| .str.replace(r"cxx|cpp|cc|c|hpp|h", "C/C++", regex=True) | ||
| .str.replace("^f.*$", "Fortran", regex=True) | ||
| .str.replace("rs", "Rust") | ||
| .str.replace("go", "Go") | ||
| .str.replace("asm", "Assembly"), | ||
| ) | ||
| .groupby(["month", "ext"]) | ||
| .project_name.nunique() | ||
| .rename("project_count") | ||
| .compute() | ||
| .reset_index() | ||
| .sort_values(["month", "project_count"], ascending=False) | ||
| ) | ||
| client.shutdown() |
| @@ -0,0 +1,32 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import glob | ||
| import os | ||
|
|
||
| import pandas as pd | ||
|
|
||
| df = pd.read_parquet( | ||
| min(glob.glob("/data/pypi-parquet/*.parquet"), key=os.path.getsize), | ||
| columns=["path", "uploaded_on", "project_name"], | ||
| ) | ||
| df = df[ | ||
| df.path.str.contains(r"\.(?:asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$") | ||
| & ~df.path.str.contains(r"(?:(?:^|/)test(?:|s|ing)|/site-packages/)") | ||
| ] | ||
| print( | ||
| df.assign( | ||
| month=df.uploaded_on.dt.to_period("M").dt.to_timestamp(), | ||
| ext=df.path.str.extract(r"\.([a-z0-9]+)$", 0) | ||
| .iloc[:, 0] | ||
| .str.replace(r"cxx|cpp|cc|c|hpp|h", "C/C++", regex=True) | ||
| .str.replace("^f.*$", "Fortran", regex=True) | ||
| .str.replace("rs", "Rust") | ||
| .str.replace("go", "Go") | ||
| .str.replace("asm", "Assembly"), | ||
| ) | ||
| .groupby(["month", "ext"]) | ||
| .project_name.nunique() | ||
| .rename("project_count") | ||
| .reset_index() | ||
| .sort_values(["month", "project_count"], ascending=False) | ||
| ) |
| @@ -0,0 +1,41 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import ibis | ||
| from ibis import _, udf | ||
|
|
||
|
|
||
| @udf.scalar.builtin | ||
| def flatten(x: list[list[str]]) -> list[str]: # <1> | ||
| ... | ||
|
|
||
|
|
||
| expr = ( | ||
| ibis.read_parquet("/data/pypi-parquet/*.parquet") | ||
| .filter( | ||
| [ | ||
| _.path.re_search( | ||
| r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$" | ||
| ), | ||
| ~_.path.re_search(r"(^|/)test(|s|ing)"), | ||
| ~_.path.contains("/site-packages/"), | ||
| ] | ||
| ) | ||
| .group_by( | ||
| month=_.uploaded_on.truncate("M"), | ||
| ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1), | ||
| ) | ||
| .aggregate(projects=_.project_name.collect().unique()) | ||
| .order_by(_.month.desc()) | ||
| .mutate( | ||
| ext=_.ext.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++") | ||
| .re_replace("^f.*$", "Fortran") | ||
| .replace("rs", "Rust") | ||
| .replace("go", "Go") | ||
| .replace("asm", "Assembly") | ||
| .nullif(""), | ||
| ) | ||
| .group_by(["month", "ext"]) | ||
| .aggregate(project_count=flatten(_.projects.collect()).unique().length()) | ||
| .dropna("ext") | ||
| .order_by([_.month.desc(), _.project_count.desc()]) # <2> | ||
| ) |
| @@ -0,0 +1,37 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import ibis | ||
| from ibis import _, udf | ||
|
|
||
|
|
||
| @udf.scalar.builtin | ||
| def flatten(x: list[list[str]]) -> list[str]: | ||
| ... | ||
|
|
||
|
|
||
| expr = ( | ||
| ibis.read_parquet("/data/pypi-parquet/*.parquet") | ||
| .filter( | ||
| [ | ||
| _.path.re_search( | ||
| r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$" | ||
| ), | ||
| ~_.path.re_search(r"(^|/)test(|s|ing)"), | ||
| ~_.path.contains("/site-packages/"), | ||
| ] | ||
| ) | ||
| .group_by( | ||
| month=_.uploaded_on.truncate("M"), | ||
| ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1) | ||
| .re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++") | ||
| .re_replace("^f.*$", "Fortran") | ||
| .replace("rs", "Rust") | ||
| .replace("go", "Go") | ||
| .replace("asm", "Assembly") | ||
| .nullif(""), | ||
| ) | ||
| .aggregate(projects=_.project_name.collect().unique()) | ||
| .order_by(_.month.desc()) | ||
| .group_by(["month", "ext"]) | ||
| .aggregate(project_count=flatten(_.projects.collect()).unique().length()) | ||
| ) |
| @@ -0,0 +1,37 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import ibis | ||
| from ibis import _, udf | ||
|
|
||
|
|
||
| @udf.scalar.builtin | ||
| def flatten(x: list[list[str]]) -> list[str]: | ||
| ... | ||
|
|
||
|
|
||
| expr = ( | ||
| ibis.read_parquet("/data/pypi-parquet/*.parquet") | ||
| .filter( | ||
| [ | ||
| _.path.re_search( | ||
| r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$" | ||
| ), | ||
| ~_.path.re_search(r"(^|/)test(|s|ing)"), | ||
| ~_.path.contains("/site-packages/"), | ||
| ] | ||
| ) | ||
| .group_by( | ||
| month=_.uploaded_on.truncate("M"), | ||
| ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1) | ||
| .re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++") | ||
| .re_replace("^f.*$", "Fortran") | ||
| .replace("rs", "Rust") | ||
| .replace("go", "Go") | ||
| .replace("asm", "Assembly") | ||
| .nullif(""), | ||
| ) | ||
| .aggregate(projects=_.project_name.collect().unique()) | ||
| .group_by(["month", "ext"]) | ||
| .aggregate(project_count=flatten(_.projects.collect()).unique().length()) | ||
| .order_by(_.month.desc()) | ||
| ) |
| @@ -0,0 +1,30 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import ibis | ||
| from ibis import _ | ||
|
|
||
| expr = ( | ||
| ibis.read_parquet("/data/pypi-parquet/*.parquet") | ||
| .filter( | ||
| [ | ||
| _.path.re_search( | ||
| r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$" | ||
| ), | ||
| ~_.path.re_search(r"(^|/)test(|s|ing)"), | ||
| ~_.path.contains("/site-packages/"), | ||
| ] | ||
| ) | ||
| .group_by( | ||
| month=_.uploaded_on.truncate("M"), | ||
| ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1) | ||
| .re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++") | ||
| .re_replace("^f.*$", "Fortran") | ||
| .replace("rs", "Rust") | ||
| .replace("go", "Go") | ||
| .replace("asm", "Assembly") | ||
| .nullif(""), | ||
| ) | ||
| .aggregate(project_count=_.project_name.nunique()) | ||
| .dropna("ext") | ||
| .order_by([_.month.desc(), _.project_count.desc()]) # <1> | ||
| ) |
| @@ -0,0 +1,272 @@ | ||
| --- | ||
| title: Querying every file in every release on the Python Package Index (redux) | ||
| author: Gil Forsyth | ||
| date: 2023-11-15 | ||
| categories: | ||
| - blog | ||
| --- | ||
|
|
||
| Seth Larson wrote a great [blog | ||
| post](https://sethmlarson.dev/security-developer-in-residence-weekly-report-18) | ||
| on querying a PyPI dataset to look for trends in the use of memory-safe | ||
| languages in Python. | ||
|
|
||
| Check out Seth's article for more information on the dataset (and | ||
| it's a good read!). It caught our eye because it makes use of | ||
| [DuckDB](https://duckdb.org/) to clean the data for analysis. | ||
|
|
||
| That's right up our alley here in Ibis land, so let's see if we can duplicate | ||
| Seth's results (and then continue on to plot them!) | ||
|
|
||
| ## Grab the data (locations) | ||
|
|
||
| Seth showed (and then safely decomposed) a nested `curl` statement and that's | ||
| always viable -- we're in Python land so why not grab the filenames using | ||
| `urllib3`? | ||
|
|
||
| ```{python} | ||
| import urllib3 | ||
| url = "https://raw.githubusercontent.com/pypi-data/data/main/links/dataset.txt" | ||
| with urllib3.PoolManager() as http: | ||
| resp = http.request("GET", url) | ||
| parquet_files = resp.data.decode().split() | ||
| parquet_files | ||
| ``` | ||
|
|
||
| ## Grab the data | ||
|
|
||
| Now we're ready to get started with Ibis! | ||
|
|
||
| DuckDB is clever enough to grab only the parquet metadata. This means we can | ||
| use `read_parquet` to create a lazy view of the parquet files and then build up | ||
| our expression without downloading everything beforehand! | ||
|
|
||
| ```{python} | ||
| import ibis | ||
| from ibis import _ # <1> | ||
| ibis.options.interactive = True | ||
| ``` | ||
|
|
||
| 1. See https://ibis-project.org/how-to/analytics/chain_expressions.html for docs | ||
| on the deferred operator! | ||
|
|
||
| Create a DuckDB connection: | ||
|
|
||
| ```{python} | ||
| con = ibis.duckdb.connect() | ||
| ``` | ||
|
|
||
| And load up one of the files (we can run the full query after)! | ||
|
|
||
| ```{python} | ||
| pypi = con.read_parquet(parquet_files[0], table_name="pypi") | ||
| ``` | ||
|
|
||
| ```{python} | ||
| pypi.schema() | ||
| ``` | ||
|
|
||
| ## Query crafting | ||
|
|
||
| Let's break down what we're looking for. As a high-level view of the use of | ||
| compiled languages, Seth is using file extensions as an indicator that a given | ||
| filetype is used in a Python project. | ||
|
|
||
| The dataset we're using has _every file in every project_ -- what criteria should we use? | ||
|
|
||
| We can follow Seth's lead and look for things: | ||
|
|
||
| 1. A file extension that is one of: `asm`, `cc`, `cpp`, `cxx`, `h`, `hpp`, `rs`, `go`, and variants of `F90`, `f90`, etc... | ||
| That is, C, C++, Assembly, Rust, Go, and Fortran. | ||
| 2. We exclude matches where the file path is within the `site-packages/` directory. | ||
| 3. We exclude matches that are in directories used for testing. | ||
|
|
||
| ```{python} | ||
| expr = pypi.filter( | ||
| [ | ||
| _.path.re_search(r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"), | ||
| ~_.path.re_search(r"(^|/)test(|s|ing)"), | ||
| ~_.path.contains("/site-packages/"), | ||
| ] | ||
| ) | ||
| expr | ||
| ``` | ||
|
|
||
| That _could_ be right -- we can peek at the filename at the end of the `path` column to do a quick check: | ||
|
|
||
| ```{python} | ||
| expr.path.split("/")[-1] | ||
| ``` | ||
|
|
||
| Ok! Next up, we want to group the matches by: | ||
|
|
||
| 1. The month that the package / file was published | ||
| For this, we can use the `truncate` method and ask for month as our truncation window. | ||
| 2. The file extension of the file used | ||
|
|
||
| ```{python} | ||
| expr.group_by( | ||
| month=_.uploaded_on.truncate("M"), | ||
| ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1), | ||
| ).aggregate() | ||
| ``` | ||
|
|
||
| That looks promising. Now we need to grab the package names that correspond to a | ||
| given file extension in a given month and deduplicate them. And to match Seth's | ||
| results, we'll also sort by the month in descending order: | ||
|
|
||
| ```{python} | ||
| expr = ( | ||
| expr.group_by( | ||
| month=_.uploaded_on.truncate("M"), | ||
| ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1), | ||
| ) | ||
| .aggregate(projects=_.project_name.collect().unique()) | ||
| .order_by(_.month.desc()) | ||
| ) | ||
| expr | ||
| ``` | ||
|
|
||
| ## Massage and plot | ||
|
|
||
| Let's continue and see what our results look like. | ||
|
|
||
| We'll do a few things: | ||
|
|
||
| 1. Combine all of the C and C++ extensions into a single group by renaming them all. | ||
| 2. Count the number of distinct entries in each group | ||
| 3. Plot the results! | ||
|
|
||
| ```{python} | ||
| collapse_names = expr.mutate( | ||
| ext=_.ext.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++") | ||
| .re_replace("^f.*$", "Fortran") | ||
| .replace("rs", "Rust") | ||
| .replace("go", "Go") | ||
| .replace("asm", "Assembly") | ||
| .nullif(""), | ||
| ).dropna("ext") | ||
| collapse_names | ||
| ``` | ||
|
|
||
| Note that now we need to de-duplicate again, since we might've had separate | ||
| unique entries for both an `h` and `c` file extension, and we don't want to | ||
| double-count! | ||
|
|
||
| We could rewrite our original query and include the renames in the original | ||
| `group_by` (this would be the smart thing to do), but let's push on and see if | ||
| we can make this work. | ||
|
|
||
| The `projects` column is now a column of string arrays, so we want to collect | ||
| all of the arrays in each group. This will give us a "list of lists"; then we'll | ||
| `flatten` that list and call `unique().length()` as before. | ||
|
|
||
| DuckDB has a `flatten` function, but it isn't exposed in Ibis (yet!). | ||
|
|
||
| We'll use a handy bit of Ibis magic to define a `builtin` `UDF` that will map directly | ||
| onto the underlying DuckDB function (what!? See | ||
| [here](https://ibis-project.org/how-to/extending/builtin.html#duckdb) for more | ||
| info): | ||
|
|
||
| ```{python} | ||
| @ibis.udf.scalar.builtin | ||
| def flatten(x: list[list[str]]) -> list[str]: | ||
| ... | ||
| collapse_names = collapse_names.group_by(["month", "ext"]).aggregate( | ||
| projects=flatten(_.projects.collect()) | ||
| ) | ||
| collapse_names | ||
| ``` | ||
|
|
||
| We could have included the `unique().length()` in the `aggregate` call, but | ||
| sometimes it's good to check that your slightly off-kilter idea has worked (and | ||
| it has!). | ||
|
|
||
| ```{python} | ||
| collapse_names = collapse_names.select( | ||
| _.month, _.ext, project_count=_.projects.unique().length() | ||
| ) | ||
| collapse_names | ||
| ``` | ||
|
|
||
| Now that the data are tidied, we can pass our expression directly to Altair and see what it looks like! | ||
|
|
||
| ```{python} | ||
| import altair as alt | ||
| chart = ( | ||
| alt.Chart(collapse_names.to_pandas()) | ||
| .mark_line() | ||
| .encode(x="month", y="project_count", color="ext") | ||
| .properties(width=600, height=300) | ||
| ) | ||
| chart | ||
| ``` | ||
|
|
||
| That looks good, but it definitely doesn't match the plot from Seth's post: | ||
|
|
||
|  | ||
|
|
||
| Our current plot is only showing the results from a subset of the available | ||
| data. Now that our expression is complete, we can re-run on the full dataset and | ||
| compare. | ||
|
|
||
| ## The full run | ||
|
|
||
| To recap -- we pulled a lazy view of a single parquet file from the `pypi-data` | ||
| repo, filtered for all the files that contain file extensions we care about, | ||
| then grouped them all together to get counts of the various filetypes used | ||
| across projects by month. | ||
|
|
||
| Here's the entire query chained together into a single command, now running on | ||
| all of the `parquet` files we have access to: | ||
|
|
||
| ```{python} | ||
| pypi = con.read_parquet(parquet_files, table_name="pypi") | ||
| full_query = ( | ||
| pypi.filter( | ||
| [ | ||
| _.path.re_search( | ||
| r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$" | ||
| ), | ||
| ~_.path.re_search(r"(^|/)test(|s|ing)"), | ||
| ~_.path.contains("/site-packages/"), | ||
| ] | ||
| ) | ||
| .group_by( | ||
| month=_.uploaded_on.truncate("M"), | ||
| ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1), | ||
| ) | ||
| .aggregate(projects=_.project_name.collect().unique()) | ||
| .order_by(_.month.desc()) | ||
| .mutate( | ||
| ext=_.ext.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++") | ||
| .re_replace("^f.*$", "Fortran") | ||
| .replace("rs", "Rust") | ||
| .replace("go", "Go") | ||
| .replace("asm", "Assembly") | ||
| .nullif(""), | ||
| ) | ||
| .dropna("ext") | ||
| .group_by(["month", "ext"]) | ||
| .aggregate(project_count=flatten(_.projects.collect()).unique().length()) | ||
| ) | ||
| chart = ( | ||
| alt.Chart(full_query.to_pandas()) | ||
| .mark_line() | ||
| .encode(x="month", y="project_count", color="ext") | ||
| .properties(width=600, height=300) | ||
| ) | ||
| chart | ||
| ``` |