# Welcome to the Kedro-Ibis tutorial!

The sidebar on the left 👈 shows the tutorial files in the Explorer.

For convenience, they're also linked below: 👇

1. [Getting Started](./01%20-%20Getting%20Started.ipynb)
1. [Ibis and the Python Ecosystem](./02%20-%20Ibis%20and%20the%20Python%20Ecosystem.ipynb)
1. [Switching Backends](./03%20-%20Switching%20Backends.ipynb)
1. [Playing with PyPI](./04%20-%20Playing%20with%20PyPI.ipynb)

First, let's download the [nycflights13 data](https://github.com/hadley/nycflights13); we'll use this later on.

In [None]:
import concurrent.futures
import tempfile
import zipfile
from pathlib import Path

import ibis
import tqdm
import requests

# Registry of example tables: maps a table name to its metadata dict (or None).
# In this notebook the values stored are always empty dicts; only the keys
# (the Parquet file names without their suffix) are used afterwards.
Metadata = dict[str, dict[str, str] | None]


# https://github.com/ibis-project/ibis/blob/9.0.0/ibis/examples/gen_registry.py#L111-L165
def add_nycflights13_example(data_path: Path, *, metadata: Metadata) -> None:
    """Download the nycflights13 tables and store them as Parquet files.

    Every source file is fetched from the ``nycflights13-py`` GitHub
    repository and written to ``data_path`` as a zstd-compressed
    ``nycflights13_<table>.parquet`` file through a DuckDB connection opened
    with Ibis. A table whose Parquet file already exists is not downloaded
    again. The downloads run concurrently in a thread pool while a tqdm
    progress bar tracks how many tables are done.

    Args:
        data_path: Existing directory that receives the Parquet files.
        metadata: Mapping updated in place; each table gets an empty-dict
            entry keyed by its Parquet file name without the suffix.

    Raises:
        requests.RequestException: If the zipped flights download fails,
            returns an HTTP error status, or times out.
    """
    filenames = [
        "airlines.csv",
        "airports.csv",
        "flights.csv.zip",
        "planes.csv",
        "weather.csv",
    ]

    BASE_URL = (
        "https://github.com/machow/nycflights13-py/raw/master/nycflights13/data/{}"
    )

    # Connect/read timeout in seconds, so a stalled connection cannot hang a
    # worker thread forever.
    REQUEST_TIMEOUT = 60

    def convert_zipped_csv(url: str, archive_name: str, parquet_path: Path) -> None:
        """Download a zipped CSV, extract it, and convert it to Parquet."""
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()

        with tempfile.TemporaryDirectory() as tmp:
            tmp_dir = Path(tmp)
            archive = tmp_dir / archive_name
            archive.write_bytes(resp.content)

            # Extract the CSV into the temp dir; it is deleted again once the
            # Parquet file has been written.
            with zipfile.ZipFile(archive) as zf:
                zf.extractall(tmp_dir)

            # "flights.csv.zip" -> "flights.csv"
            csv_name = archive_name.removesuffix(".zip")
            con = ibis.duckdb.connect()
            con.read_csv(tmp_dir / csv_name).to_parquet(parquet_path, codec="zstd")

    def download_and_convert(filename: str, *, bar: tqdm.tqdm) -> None:
        """Make sure the Parquet file for ``filename`` exists and record it."""
        table_name = filename.split(".")[0]
        parquet_path = data_path / f"nycflights13_{table_name}.parquet"

        if not parquet_path.exists():
            url = BASE_URL.format(filename)
            if filename.endswith(".zip"):
                convert_zipped_csv(url, filename, parquet_path)
            else:
                # read_csv is handed the URL directly, so plain CSV files need
                # no scratch directory.
                con = ibis.duckdb.connect()
                con.read_csv(url).to_parquet(parquet_path, codec="zstd")

        metadata[parquet_path.with_suffix("").name] = {}
        bar.update()

    # The pool is shut down first (waiting for the workers), then the bar is
    # closed, so the bar is never updated after it has been closed.
    with (
        tqdm.tqdm(total=len(filenames)) as bar,
        concurrent.futures.ThreadPoolExecutor() as pool,
    ):
        futures = [
            pool.submit(download_and_convert, filename, bar=bar)
            for filename in filenames
        ]
        for fut in concurrent.futures.as_completed(futures):
            # result() re-raises any exception raised in the worker thread.
            fut.result()


# Parquet files are written to ./data, relative to the working directory.
data_path = Path("data")
data_path.mkdir(exist_ok=True)

# Filled in place by add_nycflights13_example, one entry per table.
metadata = {}

print("Downloading the nycflights13 data...")
add_nycflights13_example(data_path, metadata=metadata)

# Last expression of the cell: displays the names of the available tables.
list(metadata)

Let's continue by loading the data into a local PostgreSQL database!

We will do this using DuckDB—yes, you can do that!

In [None]:
# Download the two IMDb Parquet files into the current directory.
# curl flags: -O keeps the remote file name, -L follows redirects,
# -sS hides the progress meter but still prints errors.
!curl -OLsS 'https://storage.googleapis.com/ibis-tutorial-data/imdb/2024-03-22/imdb_title_ratings.parquet'
!curl -OLsS 'https://storage.googleapis.com/ibis-tutorial-data/imdb/2024-03-22/imdb_title_basics.parquet'
# Feed the SQL scripts to the psql and duckdb command-line clients on stdin.
# NOTE(review): create_imdb.sql is read from demo/ while load_imdb.sql is read
# from the working directory; confirm both paths resolve from where the
# notebook is run.
!psql < demo/create_imdb.sql
!duckdb < load_imdb.sql

And we'll confirm that our PostgreSQL database contains the tables we just loaded.

In [None]:
# Pipe verify.sql into the psql client; presumably it queries the tables loaded
# in the previous cell (confirm against the script).
!psql < verify.sql