In [None]:
import ibis
from ibis import _

# Turn on ibis interactive mode so expressions displayed at the end of a
# cell are executed and shown as results.
ibis.options.interactive = True

In [None]:
# One parquet file per table of the PyPI metadata dataset; the part before
# ".parquet" is later used as the table name.
table_stems = (
    "deps",
    "maintainers",
    "package_urls",
    "packages",
    "scorecard_checks",
    "wheels",
)
filenames = [f"{stem}.parquet" for stem in table_stems]

In [None]:
from pathlib import Path

In [None]:
# Open a DuckDB connection through ibis; no database path is passed.
con = ibis.duckdb.connect()

In [None]:
# Register every parquet file in the "pypi" folder as a table on the
# connection, named after the part of the file name before the first dot.
folder = Path("pypi")
for parquet_name in filenames:
    table_name = parquet_name.partition(".")[0]
    con.read_parquet(folder / parquet_name, table_name=table_name)

In [None]:
# Show the tables now registered on the connection.
con.list_tables()

In [None]:
# Table expression for the "deps" table registered above.
deps = con.table("deps")

# Preview it.
deps

In [None]:
# Table expression for the "maintainers" table registered above.
maintainers = con.table("maintainers")

# Preview it.
maintainers

In [None]:
# Table expression for the "packages" table registered above.
packages = con.table("packages")

# Preview it.
packages

### Total number of packages

In [None]:
# Row count of the packages table.
packages.count()

### Package counts by release day (Monday == 0)

In [None]:
# Only rows with a known upload timestamp can be assigned to a weekday.
uploaded_packages = packages.dropna("last_uploaded_at")

# Numeric day-of-week index of the last upload, exposed as column "day".
upload_day = uploaded_packages.last_uploaded_at.day_of_week.index().name("day")

# Number of packages per upload weekday.
release_days = uploaded_packages.group_by(upload_day).count()

release_days

### What maintainers have the most downloads?

In [None]:
# Pair every maintainer row with the package it maintains
# (maintainers.package_name == packages.name).
maintainer_packages = maintainers.join(packages, [("package_name", "name")])

# Sum package downloads per maintainer name.
downloads_by_maintainer = maintainer_packages.group_by("name").agg(
    downloads=_.downloads.sum()
)

# Keep the ten maintainers with the largest totals.
top_maintainers_by_downloads = (
    downloads_by_maintainer.select("name", "downloads")
    .order_by(_.downloads.desc())
    .limit(10)
)

top_maintainers_by_downloads

### What packages depend on ibis-framework

In [None]:
# Dependency rows whose target is ibis-framework ...
depends_on_ibis = deps.filter(_.dep_name == "ibis-framework")

# ... reduced to the unique names of the packages declaring them.
ibis_dependents = depends_on_ibis.select("package_name").distinct()

ibis_dependents

### What packages depend on things I maintain?

In [None]:
# Maintainer rows for the user "gforsyth", i.e. the packages they maintain.
my_packages = maintainers.filter(_.name == "gforsyth")

# Match dependency rows whose dep_name is one of those packages, then report
# each unique (maintained package, dependent package) pair.
my_dependents = (
    deps.join(my_packages, [("dep_name", "package_name")])
    .select(package="dep_name", dependent="package_name")
    .distinct()
)

my_dependents

### What packages are commonly used by `test*` extras?

In [None]:
# Dependency rows declared under an extra whose name starts with "test".
test_extra_deps = deps.filter(_.extra.startswith("test"))

# Ten dependency names with the most such rows.
top_test_deps = (
    test_extra_deps.group_by("dep_name")
    .aggregate(count=_.count())
    .order_by(ibis.desc("count"))
    .limit(10)
)

top_test_deps

### What are the top pytest extensions?

In [None]:
# Unique (package, dependency) pairs where the dependency name starts with
# "pytest-"; de-duplicating first means each package counts once per plugin.
pytest_plugin_pairs = (
    deps.filter(_.dep_name.startswith("pytest-"))
    .select("package_name", "dep_name")
    .distinct()
)

# Ten plugins with the most dependent packages.
top_pytest_extensions = (
    pytest_plugin_pairs.group_by("dep_name")
    .aggregate(count=_.count())
    .order_by(ibis.desc("count"))
    .limit(10)
)

top_pytest_extensions

In [None]:
# Same question as the previous cell, answered with a distinct count instead
# of de-duplicating the rows first.
pytest_plugin_rows = deps.filter(_.dep_name.startswith("pytest-"))
dependents_per_plugin = pytest_plugin_rows.group_by("dep_name").aggregate(
    dep_count=_.package_name.nunique()
)
dependents_per_plugin.order_by(ibis.desc("dep_count")).limit(10)

In [None]:
# Third formulation: top-k over dep_name, ranked by the number of distinct
# dependent packages.
pytest_only = deps.filter(_.dep_name.startswith("pytest-"))
pytest_only.dep_name.topk(10, by=lambda t: t.package_name.nunique())

### What packages are the most depended on

In [None]:
# Unique (package, dependency) pairs, so each package counts once per
# dependency.
unique_dep_pairs = deps.select("package_name", "dep_name").distinct()

# Ten dependency names with the most dependent packages.
most_dependents = (
    unique_dep_pairs.group_by("dep_name")
    .aggregate(dep_count=_.count())
    .order_by(_.dep_count.desc())
    .limit(10)
)

most_dependents

In [None]:
# Same result as the previous cell, using a distinct count per dependency
# instead of de-duplicating the pairs first.
dependents_per_dep = deps.group_by("dep_name").aggregate(
    dep_count=_.package_name.nunique()
)
most_dependents = dependents_per_dep.order_by(_.dep_count.desc()).limit(10)

most_dependents

### Histogram of maintainer count

In [None]:
# Step 1: number of maintainer rows per package.
maintainers_per_package = maintainers.group_by("package_name").aggregate(
    maintainers=_.count()
)

# Step 2: number of packages for each maintainer count, ordered by that count.
maintainer_counts = (
    maintainers_per_package.group_by("maintainers")
    .aggregate(count=_.count())
    .order_by("maintainers")
)

maintainer_counts

In [None]:
import altair as alt

# Show which altair version is installed.
alt.__version__

In [None]:
# Package counts go on a log-scaled y axis.
log_scaled_count = alt.Y("count", scale=alt.Scale(type="log"))

# Bar chart: number of maintainers on x, number of packages on y.
chart = alt.Chart(maintainer_counts).mark_bar().encode(
    x="maintainers", y=log_scaled_count
)

chart

Followup question - what's up with the spike at 12?

Start by looking at just the packages with 12 maintainers

Then make the repr a bit longer

Then look for patterns

In [None]:
# Maintainer rows per package.
per_package_counts = maintainers.group_by("package_name").aggregate(
    maintainers=_.count()
)

# NOTE: this rebinds `maintainer_counts`, the name the histogram cell above
# reads. Keep packages with exactly 12 maintainers whose name starts with "ftw".
maintainer_counts = per_package_counts.filter(
    [_.maintainers == 12, _.package_name.startswith("ftw")]
)

# Let interactive reprs show up to 20 rows.
ibis.options.repr.interactive.max_rows = 20
maintainer_counts.count()

### What are the most common package prefixes?

In [None]:
# Packages that have exactly 12 maintainer rows.
twelve_maintainer_packages = (
    maintainers.group_by("package_name")
    .aggregate(maintainers=_.count())
    .filter(_.maintainers == 12)
)

# Leading run of word characters in the package name (capture group 1).
name_prefix = twelve_maintainer_packages.package_name.re_extract(
    r"^(\w*)-?", 1
).name("prefix")

# Five most frequent prefixes.
common_prefixes = name_prefix.topk(5)

common_prefixes

### What packages have few downloads but lots of maintainers?

Looking for "clubs", as described by Nadia Eghbal's "Working in Public".

In [None]:
# Smallest non-zero download count in the packages table.
downloaded_packages = packages.filter(_.downloads > 0)
downloaded_packages.downloads.min()

In [None]:
import datetime

# These prefixes are all "zope" related. Zope used to be really popular, but
# downloads have waned, and its packages are so prolific that they mask
# anything else interesting, so we ignore them for now.
ignore_prefixes = ["zope", "zc", "z3c", "collective", "plone", "products"]

# Only consider packages uploaded within the last 365 days.
one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)

# One "name does not start with <prefix>" predicate per ignored prefix.
not_ignored = [~_.name.startswith(p) for p in ignore_prefixes]

# Recently uploaded packages with zero downloads, excluding ignored prefixes.
candidate_packages = packages.filter(
    [_.downloads == 0, _.last_uploaded_at > one_year_ago, *not_ignored]
)

# Maintainer rows per package.
maintainers_per_package = maintainers.group_by("package_name").aggregate(
    maintainers=_.count()
)

# Ten candidate packages with the most maintainers.
clubs = (
    maintainers_per_package.join(candidate_packages, [("package_name", "name")])
    .order_by(ibis.desc("maintainers"))
    .select("package_name", "maintainers")
    .limit(10)
)

clubs

### Find the top 20 most depended on packages that have only one maintainer

In [None]:
# Packages with exactly one maintainer, ranked by how many distinct packages
# depend on them. The question asks for the top 20, so the limit is 20
# (it was previously 10, which contradicted the heading).
bus_factor_1 = (
    # Maintainer rows per package, keeping only single-maintainer packages.
    maintainers.group_by("package_name")
    .agg(maintainer_count=_.count())
    .filter(_.maintainer_count == 1)
    # Join back to maintainers to recover that sole maintainer's name.
    .join(maintainers, "package_name")
    # Attach the number of distinct packages depending on each package.
    .join(
        (
            deps.select("package_name", "dep_name")
            .distinct()
            .group_by("dep_name")
            .agg(dep_count=_.count())
        ),
        [("package_name", "dep_name")],
    )
    .select("package_name", "name", "dep_count")
    .order_by(ibis.desc("dep_count"))
    .limit(20)
)

bus_factor_1

In [None]:
# Number of distinct dependent packages per dependency, computed by
# de-duplicating (package, dependency) pairs and then counting rows.
(
    deps.select("package_name", "dep_name")
    .distinct()
    .group_by("dep_name")
    .aggregate(dep_count=_.count())
)

In [None]:
# Distinct count of package_name per dep_name, in a single aggregation.
deps.group_by("dep_name").agg(dep_count=_.package_name.nunique())

### pypi users who have the most distinct collaborators

In [None]:
# Self-join on package: every pair of maintainer rows sharing a package.
# The right-hand maintainer name appears as "name_right".
co_maintainers = maintainers.join(maintainers, "package_name")

# Unique (user, collaborator) pairs, dropping each user paired with themselves.
collaborator_pairs = (
    co_maintainers.select("name", "name_right")
    .filter(_.name != _.name_right)
    .distinct()
)

# Ten users with the most distinct collaborators.
most_collaborators = (
    collaborator_pairs.group_by("name")
    .aggregate(n_collaborators=_.count())
    .order_by(_.n_collaborators.desc())
    .limit(10)
)

most_collaborators

In [None]:
# Alternative to the distinct-then-count formulation: count distinct
# collaborator names per user in one aggregation. A plain row count would be
# wrong here, because two users who share several packages produce one joined
# row per shared package.
(
    maintainers.join(maintainers, "package_name")
    .filter(_.name != _.name_right)
    .group_by("name")
    .agg(n_collaborators=_.name_right.nunique())
    .order_by(_.n_collaborators.desc())
)

### Finding the most popular transitive dependencies

We can do this by using `.sql` to wrap a recursive CTE, then query it like a normal Ibis table. The recursive CTE will produce a table with a row for each package `package` and each of its direct or transitive dependencies `dependency`.

In [None]:
# Recursive CTE over the `deps` table:
#   * direct_deps     - (package, dependency) rows where `extra` is NULL.
#   * transitive_deps - starts from the direct rows and repeatedly follows
#                       each reached dependency to its own direct dependencies.
# The final SELECT returns only the (package, dependency) columns.
TRANSITIVE_DEPS_SQL = """
    WITH RECURSIVE
    direct_deps(package, dependency) AS (
      SELECT
        package_name,
        dep_name
      FROM deps
      WHERE
        extra IS NULL
    ),
    transitive_deps(package, intermediate, dependency) AS (
      SELECT
        package,
        package,
        dependency
      FROM direct_deps
      UNION
      SELECT
        transitive_deps.package,
        direct_deps.package,
        direct_deps.dependency
      FROM direct_deps
      JOIN transitive_deps
        ON direct_deps.package = transitive_deps.dependency
    )
    SELECT package, dependency FROM transitive_deps
    """

# Wrap the raw SQL as an ibis table expression with an explicit schema.
transitive_deps = con.sql(
    TRANSITIVE_DEPS_SQL,
    schema={"package": "string", "dependency": "string"},
)

In [None]:
# Number of distinct packages that reach each dependency.
dependents_per_dependency = transitive_deps.group_by("dependency").aggregate(
    n_dependents=_.package.nunique()
)

# Twenty dependencies with the most dependents.
top_20_transitive_deps = dependents_per_dependency.order_by(
    _.n_dependents.desc()
).limit(20)

In [None]:
# Display the result; this is where the recursive query gets evaluated.
top_20_transitive_deps