# PyHEP WG topical meeting - boost-histogram / Hist

### Henry Schreiner (Princeton University) and Aman Goel (University of Delhi)

**March 2nd, 2022**

In [None]:
import hist
from hist import Hist

Run the code with us through Binder, altering examples and asking "what if" questions along the way :)

[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/henryiii/histogram-tutorial/main?urlpath=lab/tree/1-PyHEP-topical.ipynb)

In [None]:
import numpy as np

Let's prepare a default random number generator (requires NumPy 1.17+):

In [None]:
# Seed the generator so that "Restart Kernel -> Run All" reproduces the same
# pseudo-random samples (and therefore the same histograms and fit) every time.
# Change the seed, or drop it, to explore different samples.
rng = np.random.default_rng(seed=42)

And let's make something interesting to histogram:

In [None]:
# One flat 1D sample made of three normal components, drawn in this order:
# (mean, standard deviation, number of samples).
data = np.concatenate(
    [
        rng.normal(loc, scale, size=count)
        for loc, scale, count in (
            (0, 2, 40_000),  # wide bulk centred on zero
            (3, 0.3, 5_000),  # narrow peak at +3
            (-3, 0.3, 5_000),  # narrow peak at -3
        )
    ]
)

Now, let's make a histogram:

In [None]:
# QuickConstruct chain: one regular axis (100 bins spanning -10 to 10), Int64
# storage, then an immediate fill with the sample. The result of the whole chain
# (the filled histogram) is what gets bound to `h`.
h = Hist.new.Reg(100, -10, 10).Int64().fill(data)
# Bare last expression: display the histogram's notebook representation.
h

In [None]:
# Draw the 1D histogram; the trailing semicolon suppresses the returned object's repr.
h.plot();

In [None]:
# Complex literals inside a slice address the axis by data value instead of by
# bin number (hist's shorthand for `hist.loc(...)`), so this keeps and plots
# only the part of the axis between -4 and 4.
h[-4j:4j].plot();

In [None]:
# NumPy API
# hist.numpy.histogram(data, bins=100, histogram=Hist)

## What is a Hist object made up of?

- Axis objects
- Storage

## Defining a histogram with Hist
`Hist` is already imported at the top of the notebook, so let's go straight to defining a histogram explicitly!

### Classic definition

In [None]:
# Classic definition: build every axis object explicitly and hand them, together
# with the storage, to the Hist constructor.
# The axis names are lower-case ("s", "w") so that this histogram has exactly
# the same axis names as its QuickConstruct equivalent, which is the one that
# is later filled with `fill(s=..., w=...)`.
h = Hist(
    hist.axis.Regular(50, -5, 5, name="s", label="s [units]", flow=False),
    hist.axis.Regular(50, -5, 5, name="w", label="w [units]", flow=False),
    storage=hist.storage.Weight(),
)
print(h)

### QuickConstruct

In [None]:
# QuickConstruct: start from `Hist.new`, chain one call per axis (`Reg` takes the
# same arguments as `hist.axis.Regular` in the classic definition), and finish
# with the storage call (`Weight`), which yields the histogram.
h = (
    Hist.new.Reg(50, -5, 5, name="s", label="s [units]", flow=False)
    .Reg(50, -5, 5, name="w", label="w [units]", flow=False)
    .Weight()
)
# Plain-text summary of the (still empty) histogram.
print(h)

## Filling



In [None]:
# Two independent standard-normal samples of 50,000 values, one per axis.
s_data = rng.standard_normal(50_000)
w_data = rng.standard_normal(50_000)

# Fill by axis name: the keywords match the `name=` given to each axis.
# `threads=4` requests a multi-threaded fill using 4 threads.
h.fill(s=s_data, w=w_data, threads=4)

In [None]:
# 2D plot of the filled histogram together with the 1D projection along each
# axis; the trailing semicolon suppresses the returned object's repr.
h.plot2d_full();

In [None]:
from uncertainties import unumpy as unp


def pdf(x, a=1 / np.sqrt(2 * np.pi), x0=0, sigma=1, offset=0):
    """Gaussian model ``a * exp(-(x - x0)**2 / (2 * sigma**2)) + offset``.

    With the default parameters this is the standard normal density.

    Parameters
    ----------
    x : number or numpy.ndarray
        Point(s) at which to evaluate the model.
    a : number, optional
        Amplitude. May be a plain Python number, a NumPy scalar/array, or a
        value carrying uncertainties (``uncertainties`` package).
    x0 : number, optional
        Centre of the peak.
    sigma : number, optional
        Width of the peak.
    offset : number, optional
        Constant added to the Gaussian.

    Returns
    -------
    Same shape as ``x``; values carry uncertainties if any parameter does.
    """
    exponent = -((x - x0) ** 2) / (2 * sigma**2)

    # ``np.exp`` cannot operate on ``uncertainties`` values, which NumPy stores
    # with object dtype, so switch to ``unp.exp`` whenever the amplitude or the
    # exponent carries them. Inspecting ``np.asarray(...)`` instead of
    # ``a.dtype`` also accepts plain Python numbers and bare ``uncertainties``
    # scalars, neither of which has a ``.dtype`` attribute.
    object_dtype = np.dtype("O")
    has_uncertainties = (
        np.asarray(a).dtype == object_dtype
        or np.asarray(exponent).dtype == object_dtype
    )
    exp = unp.exp if has_uncertainties else np.exp
    return a * exp(exponent) + offset

In [None]:
# Reduce the 2D histogram to its "s" axis, then fit `pdf` to that 1D histogram
# and draw the fit together with a pull panel.
h.project("s").plot_pull(pdf);

## Playing with the Hist Object: computing manylinux compatibility

Data: manylinux-timeline consumer statistics for January 2022.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def read_data(date):
    """Load one manylinux-timeline consumer CSV and tag each row with a policy.

    Parameters
    ----------
    date : str
        Path fragment inserted into the download URL, e.g. ``"2022/01/05"``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``python_version`` kept as text,
        ``glibc_version`` reduced to the integer after the first dot,
        ``pip_version`` truncated to an integer, and an extra categorical
        ``policy`` column. ``policy`` is the lower-ranked of the two policies
        derived from the pip version and from the glibc version; it is missing
        when either value falls below the first bin edge.
    """
    policy_names = ["manylinux1", "manylinux2010", "manylinux2014", "manylinux_2_24"]

    def parse_pip(raw):
        # Drop everything from the first "b" on (beta suffix), keep the first
        # two dot-separated fields and read them as a float.
        leading_fields = raw.split("b")[0].split(".")[:2]
        return float(".".join(leading_fields)) % 2000

    def parse_glibc(raw):
        # Drop anything after a "-", then take the second dot-separated field.
        return int(raw.split("-")[0].split(".")[1])

    df = pd.read_csv(
        f"https://raw.githubusercontent.com/mayeut/manylinux-timeline/main/consumer_data/{date}.csv",
        converters={
            "python_version": str,
            "pip_version": parse_pip,
            "glibc_version": parse_glibc,
        },
    )

    # Left-closed bins: a value equal to an edge belongs to the bin it opens.
    by_pip = pd.cut(
        df.pip_version,
        [8.1, 19.0, 19.3, 20.3, np.inf],
        right=False,
        labels=policy_names,
    )
    by_glibc = pd.cut(
        df.glibc_version,
        [5, 12, 17, 24, np.inf],
        right=False,
        labels=policy_names,
    )

    # Row-wise minimum of the two category codes; a code of -1 (value outside
    # every bin) wins the minimum and becomes a missing policy.
    code_pairs = pd.concat([by_pip.cat.codes, by_glibc.cat.codes], axis=1)
    df["policy"] = pd.Categorical.from_codes(code_pairs.min(axis=1), dtype=by_pip.dtype)

    # Only after binning: truncate the pip version to its integer part.
    df["pip_version"] = df["pip_version"].astype(int)
    return df

In [None]:
# Download the file for each day 01..31 of January 2022 and stack them row-wise.
data = pd.concat([read_data(f"2022/01/{day:02}") for day in range(1, 32)], axis=0)
# Move the repeated per-file row labels into a regular column, then discard
# every row that has a missing value in any column.
data = data.reset_index().dropna()
data

In [None]:
# Build a 4D histogram directly from the DataFrame: one axis per listed column,
# with each row contributing its "num_downloads" value as the fill weight.
h = Hist.from_columns(
    data,
    ("cpu", "python_version", "pip_version", "policy"),
    weight="num_downloads",
)
# Plain-text summary of the resulting histogram.
print(h)

In [None]:
# One pie chart per Python version: share of downloads by pip major version.
python_versions = ["2.7", "3.6", "3.7", "3.8", "3.9", "3.10"]

# The projection does not depend on the Python version, so compute it once
# instead of once per loop iteration.
by_pip_version = h.project("python_version", "pip_version")

fig, axs = plt.subplots(2, 3, figsize=(12, 8))
# `axs.flat` walks the 2x3 grid row by row without copying it on every pass.
for ax, py in zip(axs.flat, python_versions):
    ph = by_pip_version[py, :]
    ph.plot_pie(ax=ax, normalize=True, autopct="%1.0f%%", pctdistance=0.8)
    # Title shows the total downloads for this Python version, in whole millions.
    ax.set_title(f"Python {py} {int(ph.sum()) // 1_000_000:,} M")

plt.tight_layout()
plt.show()

In [None]:
# One pie chart per Python version: share of downloads by manylinux policy.
python_versions = ["2.7", "3.6", "3.7", "3.8", "3.9", "3.10"]

# The projection does not depend on the Python version, so compute it once
# instead of once per loop iteration.
by_policy = h.project("python_version", "policy")

fig, axs = plt.subplots(2, 3, figsize=(12, 8))
# `axs.flat` walks the 2x3 grid row by row without copying it on every pass.
for ax, py in zip(axs.flat, python_versions):
    ph = by_policy[py, :]
    ph.plot_pie(ax=ax, normalize=True, autopct="%1.0f%%", pctdistance=0.8)
    # Title shows the total downloads for this Python version, in whole millions.
    ax.set_title(f"Python {py} {int(ph.sum()) // 1_000_000:,} M")

# Same layout pass as the pip-version figure, so titles and pies do not overlap.
plt.tight_layout()
plt.show()

## Playing with an image-like histogram

In [None]:
import PIL.Image
import PIL.ImageDraw
import PIL.ImageFilter
import PIL.ImageFont
import matplotlib.pyplot as plt

In [None]:
# 400x150 single-channel ("L") canvas, initialised to 255 everywhere.
image = PIL.Image.new("L", (400, 150), color=255)

draw = PIL.ImageDraw.Draw(image)

# use a truetype font
# "Arial Bold.ttf" is not installed everywhere (for example on a typical Linux
# container), and `truetype` raises OSError for a font it cannot open. Try a
# few common bold sans-serif font files in turn so the cell runs on more systems.
for font_file in (
    "Arial Bold.ttf",
    "arialbd.ttf",
    "DejaVuSans-Bold.ttf",
    "LiberationSans-Bold.ttf",
):
    try:
        font = PIL.ImageFont.truetype(font_file, 110)
    except OSError:
        continue
    break
else:
    # Last resort: Pillow's built-in font. The text will be much smaller than
    # with a 110 pt TrueType font, but the rest of the notebook still runs.
    font = PIL.ImageFont.load_default()

draw.text((30, 0), "PyHEP", font=font)

In [None]:
# Apply a Gaussian blur of radius 3 to the drawn image; `filter` returns a new
# image, so `image` itself is left as it was.
newimage = image.filter(PIL.ImageFilter.GaussianBlur(3))
# Bare last expression: display the blurred image.
newimage

In [None]:
# Convert the image to an array and invert it, so pixels that are still at the
# canvas value of 255 become 0.
# NOTE(review): presumably this makes the drawn text the high-valued region — confirm.
arr = 255 - np.asarray(newimage)

In [None]:
# Wrap the pixel array in a 2D histogram with 400 x 150 bins by passing it as
# the initial bin contents (`data=`). The array is transposed and its second
# axis reversed first.
# NOTE(review): assumes `arr` is (150, 400) = (rows, columns), so that `.T`
# gives (x, y) order and `[:, ::-1]` makes y increase upwards — confirm.
h = hist.Hist(
    hist.axis.Regular(400, 0, 4), hist.axis.Regular(150, 0, 1.5), data=arr.T[:, ::-1]
)

In [None]:
# Draw the image-backed 2D histogram; the trailing semicolon suppresses the repr.
h.plot();

In [None]:
# A complex step in a slice is hist's shorthand for rebinning: merge every 10
# bins along each axis into one.
h2 = h[::10j, ::10j]
# Expand the per-axis bin centres into full 2D coordinate grids.
x, y = np.broadcast_arrays(*h2.axes.centers)
fig, ax = plt.subplots(frameon=False, figsize=(7, 3.5))
ax.axis("off")
# One square marker per coarse bin; marker size is the bin content scaled so
# that the fullest bin gets size 50.
ax.scatter(x, y, 50 * h2.values() / np.max(h2.values()), marker="s", color="#808080")
plt.tight_layout()