<br><br><br><br><br>

# uproot: Python-native ROOT I/O

<br><br><br><br><br>

<br><br><br><br>

<center><img src="img/uproot-logo.png" width="40%"></center>

<br><br><br><br>

<br>

<p style="font-size: 1.25em">Reminder: uproot is a reimplementation of ROOT I/O in Python, using Numpy for speed and awkward-array to represent complex data.</p>

<br>

<center><img src="img/abstraction-layers.png" width="80%"></center>

<br>

In [None]:
# uproot has already appeared in many of the earlier examples.
# Because it re-implements ROOT I/O in Python, it exposes file contents
# through ordinary Python interfaces:

import numpy
import uproot

file = uproot.open("http://scikit-hep.org/uproot/examples/nesteddirs.root")

# Each line below shows a different way of navigating the same file:
# top-level keys, keys of a subdirectory, class names in a nested
# subdirectory, and two spellings of the lookup that reaches the tree.
print("file is a read-only dict, from object names to objects:\n")
print(f"file.keys()                  → {file.keys()}\n")
print(f"file['one'].keys()           → {file['one'].keys()}\n")
print(f"file['one']['two'].classes() → {dict(file['one']['two'].classes())}\n")
print(f"file['one']['two']['tree']   → {file['one']['two']['tree']}\n")
print(f"file['one/two/tree']         → {file['one/two/tree']}")

In [None]:
# TBranches of TTrees are also presented as dicts: open the local file,
# look up the object named "events", and list its keys.
# The name `events` is reused by the cells that follow.
events = uproot.open("data/Zmumu.root")["events"]
events.keys()

In [None]:
# Get an array with TBranch.array(): index the tree by branch name to get
# the "E1" branch, then call array() on that branch.

events["E1"].array()

In [None]:
# Or TTree.array(branchname): the request is made through the tree,
# passing the branch name as an argument.

events.array("E1")

In [None]:
# The plural form, arrays, returns a dict from branch names to arrays
# (only "E1" is requested here).

events.arrays("E1")

In [None]:
# You get the arrays you ask for: pass a list of branch names to select
# several branches in one call.

events.arrays(["E1", "px1", "py1", "pz1"])

In [None]:
# With wildcards: "p[xyz]1" stands for the names "px1", "py1", and "pz1".

events.arrays(["E1", "p[xyz]1"])

In [None]:
# These are the same wildcard patterns as matching files in UNIX:
# "p*1" selects every branch whose name starts with "p" and ends with "1".

events.arrays(["E1", "p*1"])

In [None]:
# Or with slashes, they become regular expressions: a pattern written as
# "/.../" is treated as a regex instead of a wildcard.
# (In this regex, "[0-1]" accepts either the character "0" or "1".)

events.arrays(["E1", "/p.*[0-1]/"])

In [None]:
# The "b" before each string (for bytestring) can be removed in Python 3 by
# specifying an encoding (strings in ROOT have no default encoding).
# Here the branch names are decoded as UTF-8 via the namedecode argument.

events.arrays(["E1", "px1", "py1", "pz1"], namedecode="utf-8")

In [None]:
# And we can change the container from a dict to something else by passing a
# class name; tuple is useful because it lets us assign each array.
# The four names on the left are unpacked from the tuple, one per requested
# branch.

E, px, py, pz = events.arrays(["E1", "px1", "py1", "pz1"], outputtype=tuple)

In [None]:
# outputtype=pandas.DataFrame is a synonym for TTree.pandas.df.
# The result is a DataFrame rather than a dict of arrays.

import pandas
events.arrays(["E1", "px1", "py1", "pz1"], outputtype=pandas.DataFrame)

In [None]:
# Use an explicit cache to avoid reading many times from the same file.

# NOTE(review): debug_reading presumably makes uproot report each physical
# read, so the output shows which of the calls below touch the file.
uproot.asdtype.debug_reading = True

# First request: no cache is passed.
print("asking for array...")
events.array("E1")

mycache = {}    # or maybe uproot.ArrayCache("1 GB")

# Second request: the first call that is given mycache.
print("asking for it with a cache...")
events.array("E1", cache=mycache)

# Third request: same branch, same cache.
print("asking for it again...")
events.array("E1", cache=mycache)

# Switch the debugging flag back off for the cells that follow.
uproot.asdtype.debug_reading = False

<br>

<p style="font-weight: bold; font-size: 1.875em; color: gray">Three ways to get data:</p>

<table width="100%" style="font-size: 1.25em"><tr style="background: white;">
    <td width="33%" style="vertical-align: top">
        <p style="font-weight: bold; font-size: 1.5em; margin-bottom: 0.5em">Direct</p>
        <p>Read the file and return an array.</p>
        <ul>
            <li style="margin-bottom: 0.3em"><a href="https://uproot.readthedocs.io/en/latest/ttree-handling.html#id11">TBranch.array</a></li>
            <li style="margin-bottom: 0.3em"><a href="https://uproot.readthedocs.io/en/latest/ttree-handling.html#array">TTree.array</a></li>
            <li style="margin-bottom: 0.3em"><a href="https://uproot.readthedocs.io/en/latest/ttree-handling.html#arrays">TTree.arrays</a></li>
        </ul>
    </td><td width="33%" style="vertical-align: top">
        <p style="font-weight: bold; font-size: 1.5em; margin-bottom: 0.5em">Lazy</p>
        <p>Get an object that reads on demand.</p>
        <ul>
            <li style="margin-bottom: 0.3em"><a href="https://uproot.readthedocs.io/en/latest/ttree-handling.html#id13">TBranch.lazyarray</a></li>
            <li style="margin-bottom: 0.3em"><a href="https://uproot.readthedocs.io/en/latest/ttree-handling.html#lazyarray">TTree.lazyarray</a></li>
            <li style="margin-bottom: 0.3em"><a href="https://uproot.readthedocs.io/en/latest/ttree-handling.html#lazyarrays">TTree.lazyarrays</a></li>
            <li style="margin-bottom: 0.3em"><a href="https://uproot.readthedocs.io/en/latest/opening-files.html#uproot-lazyarray-and-lazyarrays">uproot.lazyarray</a>*</li>
            <li style="margin-bottom: 0.3em"><a href="https://uproot.readthedocs.io/en/latest/opening-files.html#uproot-lazyarray-and-lazyarrays">uproot.lazyarrays</a>*</li>
        </ul>
    </td><td width="33%" style="vertical-align: top">
        <p style="font-weight: bold; font-size: 1.5em; margin-bottom: 0.5em">Iterative</p>
        <p>Read arrays in batches of entries.</p>
        <ul>
            <li style="margin-bottom: 0.3em"><a href="https://uproot.readthedocs.io/en/latest/ttree-handling.html#iterate">TTree.iterate</a></li>
            <li style="margin-bottom: 0.3em"><a href="https://uproot.readthedocs.io/en/latest/opening-files.html#uproot-iterate">uproot.iterate</a>*</li>
        </ul>
    </td>
</tr></table>

<p>*Functions marked with an asterisk create lazy arrays or iterate over sets of files, rather than a single TTree.</p>

In [None]:
# Direct: read the branch and return it as an array.

events.array("E1")

In [None]:
# Lazy: get an object that reads its chunks on demand.

# NOTE(review): debug_reading presumably makes uproot report each physical
# read, so the output shows when chunks are actually loaded.
uproot.asdtype.debug_reading = True

# Create the lazy array in steps of 500 entries and report how many chunks
# it is divided into.
print("getting lazy array...")
lazyarray = events.lazyarray("E1", entrysteps=500)
print(f"len(lazyarray.chunks) = {len(lazyarray.chunks)}")

# Printing the array, then listing each chunk's ismaterialized flag, shows
# which chunks have been loaded so far.
print("before looking at the array...")
print(f"lazyarray = {lazyarray}")
print(f"chunks read = {[x.ismaterialized for x in lazyarray.chunks]}")

print("before computing a value...")
print(f"numpy.sqrt(lazyarray) = {numpy.sqrt(lazyarray)}")

# The label promises the square, so compute the square; the original cell
# evaluated numpy.sqrt a second time under this label.
print("before computing another value...")
print(f"lazyarray**2 = {lazyarray**2}")

# Switch the debugging flag back off for the cells that follow.
uproot.asdtype.debug_reading = False

In [None]:
# Iterative: read the "E1" branch in batches (entrysteps=500); each pass
# through the loop prints the arrays for one batch.

for arrays in events.iterate("E1", entrysteps=500):
    print(arrays)

<br><br>

<p style="font-weight: bold; font-size: 1.875em; color: gray">Advantages and disadvantages of each:</p>

<table width="100%" style="font-size: 1.25em"><tr style="background: white;">
    <td width="33%" style="vertical-align: top">
        <p style="font-weight: bold; font-size: 1.5em; margin-bottom: 0.5em">Direct</p>
        <p>Simple; returns pure Numpy arrays if possible.</p>
    </td><td width="33%" style="vertical-align: top">
        <p style="font-weight: bold; font-size: 1.5em; margin-bottom: 0.5em">Lazy</p>
        <p>Transparently work on data too large to fit into memory.</p>
    </td><td width="33%" style="vertical-align: top">
        <p style="font-weight: bold; font-size: 1.5em; margin-bottom: 0.5em">Iterative</p>
        <p>Control the loading of data into and out of memory.</p>
    </td>
</tr></table>

In [None]:
# Controlling the chunk size:
# entrysteps is given first as an integer (a number of entries) and then as
# a string (a memory size). No branch names are passed to iterate() here.
# Each step is indexed with the bytestring key b"E1", and the length of
# that array is printed to show how many entries the step contained.

print("Lazy or iteration steps as a fixed number of entries:")
for arrays in events.iterate(entrysteps=500):
    print(len(arrays[b"E1"]))

print("\nLazy or iteration steps as a fixed memory footprint:")
for arrays in events.iterate(entrysteps="100 kB"):
    print(len(arrays[b"E1"]))

In [None]:
# Reading complex data: mostly simplified by the fact that C++ classes are "split"
# into TBranches, and most TBranches are simple arrays.
# `tree` (the object named "T" in Event.root) is reused by the cells that follow.

tree = uproot.open("http://scikit-hep.org/uproot/examples/Event.root")["T"]
tree.show()

# Meaning of the three columns printed by tree.show():
# branch name              streamer type, if any      uproot's interpretation

In [None]:
# In this view, class attributes are NOT special types; they're just numbers.
# entrystop=20 stops the read at entry 20, keeping the output short.

tree.array("fTemperature", entrystop=20)

In [None]:
# Fixed-width matrices are multidimensional arrays
# (the branch name itself carries the "[4][4]" dimensions),

tree.array("fMatrix[4][4]", entrystop=6)

In [None]:
# branches with multiple leaves ("leaf-list") are Numpy record arrays
# (the URL is written as two adjacent string literals, which Python joins),

uproot.open("http://scikit-hep.org/uproot/examples/"
                                    "leaflist.root")["tree"]["leaflist"].array()

In [None]:
# and anything in variable-length lists is a JaggedArray
# (here the "fTracks.fMass2" branch, limited by entrystop=6),

tree.array("fTracks.fMass2", entrystop=6)

In [None]:
# even if it's fixed-width within jagged or whatever
# (note the "[3]" in this branch name).

tree.array("fTracks.fTArray[3]", entrystop=6)

In [None]:
# There are some types that ROOT does not split because they are too complex.
# For example, *histograms* inside a TTree; the "fH" branch read below is
# such a case.

tree.array("fH", entrystop=6)

In [None]:
# Uproot can read objects like this because ROOT describes their layout in
# "streamers;" uproot reads the (most common types of) streamers and generates
# Python classes, some of which have specialized, high-level methods.

# Print the title and values of the histograms at the start of the branch
# (entrystop=3) ...
for histogram in tree.array("fH", entrystop=3):
    print(histogram.title)
    print(histogram.values)
print("\n...\n")
# ... and of the histograms at the end of the branch (entrystart=-3).
for histogram in tree.array("fH", entrystart=-3):
    print(histogram.title)
    print(histogram.values)

<br><br><br><br><br>

### Histograms

<br><br><br><br><br>

In [None]:
# As we've seen, histograms have some convenience methods.
# They're mostly for conversion to other formats, like Numpy.
# 
# Numpy "histograms" are a 2-tuple of counts and edges.
# The "hpx" object is looked up in the example file and converted with
# its numpy() method.

uproot.open("http://scikit-hep.org/uproot/examples/"
                                        "hepdata-example.root")["hpx"].numpy()

In [None]:
# Similarly for 2-dimensional histograms: the same numpy() call, applied
# to the object named "hpxpy".

uproot.open("http://scikit-hep.org/uproot/examples/hepdata-example.root")["hpxpy"].numpy()

In [None]:
# It can also be useful to turn histograms into Pandas DataFrames (note the IntervalIndex).
# The "htime" histogram from Event.root is converted with its pandas() method.

uproot.open("http://scikit-hep.org/uproot/examples/Event.root")["htime"].pandas()

In [None]:
# Or HEPData's YAML format. As Python objects, it's just a little work to make different formats.
# NOTE(review): print() is presumably used so the YAML text is shown with
# real line breaks instead of as an escaped string repr.

print(uproot.open("http://scikit-hep.org/uproot/examples/Event.root")["htime"].hepdata())

<br><br><br><br><br>

### Writing to ROOT files

<br><br><br><br><br>

In [None]:
# At the moment, only two kinds of objects can be *written* to ROOT files:
# TObjString and histograms.
# 
# To write, open a file for writing (create/recreate/update) and assign to it
# like a dict:
#
# Note that this rebinds the name `file`, which earlier referred to the
# read-only nesteddirs.root file.
# NOTE(review): uproot.ZLIB(4) presumably selects zlib compression at level 4.

file = uproot.recreate("tmp.root", compression=uproot.ZLIB(4))
file["name"] = "Some object, like a TObjString."

In [None]:
# Open the file that uproot just wrote with PyROOT and fetch the object
# stored under "name", to check that ROOT itself can read it.
import ROOT

pyroot_file = ROOT.TFile("tmp.root")
pyroot_file.Get("name")

In [None]:
# During assignment, uproot recognizes Pythonic types, such as Numpy histograms.
# Here the value assigned is the result of numpy.histogram applied to
# 10000 samples from numpy.random.normal(0, 1).

file["from_numpy"] = numpy.histogram(numpy.random.normal(0, 1, 10000))

In [None]:
# Reopen tmp.root in PyROOT, fetch the histogram written as "from_numpy",
# and draw it on a new 400x300 canvas.
pyroot_file = ROOT.TFile("tmp.root")           # refresh the PyROOT file
pyroot_hist = pyroot_file.Get("from_numpy")

# `canvas` is kept as a notebook global; a later cell calls canvas.Draw() again.
canvas = ROOT.TCanvas("canvas", "", 400, 300)
pyroot_hist.Draw("hist")
canvas.Draw()

In [None]:
# 2-dimensional Numpy histograms.
# The value assigned is the result of numpy.histogram2d applied to two
# independent sets of 10000 samples from numpy.random.normal(0, 1).

file["from_numpy2d"] = numpy.histogram2d(numpy.random.normal(0, 1, 10000), numpy.random.normal(0, 1, 10000))

In [None]:
# Reopen tmp.root in PyROOT and fetch the histogram written as "from_numpy2d".
pyroot_file = ROOT.TFile("tmp.root")           # refresh the PyROOT file
pyroot_hist = pyroot_file.Get("from_numpy2d")

# No new canvas is created here: this cell relies on the `canvas` object
# made in the earlier 1-dimensional drawing cell, so that cell must run first.
pyroot_hist.Draw()
canvas.Draw()

<br><br><br><br><br>

<p style="font-size: 1.25em">Coming soon: writing simple TTrees to ROOT files.</p>

<br><br><br><br><br>