In [1]:
import uproot as up
import pandas as pd
import numpy as np
import awkward as ak
from matplotlib import pyplot as plt
import mplhep as hep
import tempfile

In [2]:
import ROOT
# if the ROOT import fails, you can still run other sections that do not use it

# Tutorail Scope 
This tutorial aims to introduce the fundamental data operations across various frameworks. We will explore: **Pandas**, **ROOT**, ROOT **RDataFrame**, and the **uproot + Awkward** toolkit. The examples provided will cover building a data structure, filtering datasets, calculating basic statistics, as well as saving and reading from files. 

# Data preparation

Let's generate some toy data with different types. We are starting with 10000 rows. For each data type (bool, float32, float64, str) we are creating 10 columns with random values. 

In [3]:
n_rows = 10000
data_b = np.random.choice([False, True],size=(n_rows,10))
data_f = np.random.random(size=(n_rows,10)).astype(dtype=np.float32)
data_d = np.random.random(size=(n_rows,10)).astype(dtype=np.float64)
data_s = np.random.choice(["aaa","bbb","ccc"],size=(n_rows,10))

data_b[0,0], data_d[0,0], data_f[0,0], data_s[0,0]

(np.True_,
 np.float64(0.21124027427979897),
 np.float32(0.91410756),
 np.str_('ccc'))

In [4]:
# All columns in our table will be named after the type: 
# B0, B1, B2, ... - bool
# F0, F1, F1, ... - float
# D0, D1, D1, ... - double
# S0, S1, S1, ... - str
 
column_names_b = [f"B{i}" for i in range(10)]
column_names_f = [f"F{i}" for i in range(10)]
column_names_d = [f"D{i}" for i in range(10)]
column_names_s = [f"S{i}" for i in range(10)]
column_names = column_names_b + column_names_f + column_names_d + column_names_s 

# Pandas
Pandas is a powerful tool that has gained recognition outside of the data science community. Built on top of NumPy, which is partially written in C, it requires reading larger files in chunks due to its in-memory data storage approach. 

The main idea behind its core interface, the DataFrame, is vectorization. The best way to use Pandas is by applying operations to entire columns rather than processing row by row. 

### Create Data Frame

In [5]:
df = pd.DataFrame()
df[column_names_b] = data_b
df[column_names_f] = data_f
df[column_names_d] = data_d
df[column_names_s] = data_s
df[["B0", "F0", "D0", "S0"]].head()

Unnamed: 0,B0,F0,D0,S0
0,True,0.914108,0.21124,ccc
1,True,0.381422,0.003173,bbb
2,True,0.896872,0.103234,ccc
3,True,0.492902,0.75733,aaa
4,True,0.128116,0.390022,bbb


### Fliter data

In [6]:
print("Before selection. Number of rows = ", len(df))
df = df[df["B0"] + df["B1"]]
df = df[(df["F0"] > 0.1) & (df["F0"] < 0.9)]
df = df[np.abs(df["D0"] - df["D1"]) >= 0.02]
df = df[df["S0"] == "aaa"]
print("After selection. Number of rows = ", len(df))

Before selection. Number of rows =  10000
After selection. Number of rows =  1938


### Add new columns

In [7]:
df["D11"] = df["D2"] - df["D3"]
df["D12"] = df["D4"] * df["B0"].astype(float) # if B0, make D12 = D4, else D12 = 0
df["D13"] = df["D4"] * df["B0"].astype(float) + df["D5"] * (~df["B0"]).astype(float) # if B0, make D12 = D4, else D12 = D5 
df["S11"] = df["S1"].str.capitalize()
df["S12"] = df["S1"] + df["S2"]
df[["D11", "D12", "D13", "S11", "S12"]].head()

Unnamed: 0,D11,D12,D13,S11,S12
3,0.103905,0.0228,0.0228,Bbb,bbbaaa
5,0.617874,0.475685,0.475685,Ccc,cccaaa
6,0.281837,0.0,0.475534,Bbb,bbbccc
7,0.047393,0.755705,0.755705,Ccc,cccccc
11,-0.685805,0.540482,0.540482,Ccc,cccaaa


### Calculate stats

In [8]:
print("F1 mean: ", df["F1"].mean())
print("F1 max: ", df["F1"].max())
print("F1 min: ", df["F1"].min())
print("Count B0 == True: ",df["B0"].sum())

F1 mean:  0.5035985
F1 max:  0.99945223
F1 min:  0.00013765624
Count B0 == True:  1272


### Save / read

In [9]:
with tempfile.TemporaryDirectory() as tempdir:
    df.to_parquet(f"{tempdir}/temp.parquet")
    df = pd.read_parquet(f"{tempdir}/temp.parquet")
df.head()

Unnamed: 0,B0,B1,B2,B3,B4,B5,B6,B7,B8,B9,...,S5,S6,S7,S8,S9,D11,D12,D13,S11,S12
3,True,True,True,False,True,False,True,False,True,False,...,bbb,aaa,aaa,aaa,ccc,0.103905,0.0228,0.0228,Bbb,bbbaaa
5,True,True,True,False,False,True,True,False,True,False,...,ccc,bbb,aaa,aaa,bbb,0.617874,0.475685,0.475685,Ccc,cccaaa
6,False,True,True,False,True,True,False,False,False,False,...,ccc,ccc,bbb,bbb,aaa,0.281837,0.0,0.475534,Bbb,bbbccc
7,True,True,True,False,False,False,False,False,True,True,...,bbb,aaa,bbb,ccc,ccc,0.047393,0.755705,0.755705,Ccc,cccccc
11,True,True,True,True,False,True,False,True,False,False,...,ccc,ccc,ccc,aaa,aaa,-0.685805,0.540482,0.540482,Ccc,cccaaa


# ROOT
ROOT was originally designed for machines with limited cores, enabling processing of files larger than available RAM. Its architecture is heavily oriented towards C++. While it does offer Python bindings, these are essentially C++ wrappers.

In [10]:
from ROOT import TTree, TFile

### Create TTree

In [11]:
# ROOT will interpret data based on the selecten data type. Supported types:

# C : a character string terminated by the 0 character
# B : an 8 bit signed integer (Char_t); Treated as a character when in an array.
# b : an 8 bit unsigned integer (UChar_t)
# S : a 16 bit signed integer (Short_t)
# s : a 16 bit unsigned integer (UShort_t)
# I : a 32 bit signed integer (Int_t)
# i : a 32 bit unsigned integer (UInt_t)
# F : a 32 bit floating point (Float_t)
# f : a 24 bit floating point with truncated mantissa (Float16_t)
# D : a 64 bit floating point (Double_t)
# d : a 24 bit truncated floating point (Double32_t)
# L : a 64 bit signed integer (Long64_t)
# l : a 64 bit unsigned integer (ULong64_t)
# G : a long signed integer, stored as 64 bit (Long_t)
# g : a long unsigned integer, stored as 64 bit (ULong_t)
# O : [the letter o, not a zero] a boolean (bool)

In [12]:
file = TFile.Open("DataTree.root", "RECREATE")
tree = TTree("DataTree", "DataTree")

# Link the tree with python objects
data_dict = {}
for col_name in column_names_b:
    data_dict[col_name] = np.array([False],dtype=np.bool)
    tree.Branch(col_name, data_dict[col_name], f'{col_name}/O') 

for col_name in column_names_f:
    data_dict[col_name] = np.array([0.], dtype=np.float32)
    tree.Branch(col_name, data_dict[col_name], f'{col_name}/F') 

for col_name in column_names_d:
    data_dict[col_name] = np.array([0.], dtype=np.float64)
    tree.Branch(col_name, data_dict[col_name], f'{col_name}/D') 

# broken. strings are a real pain in ROOT
# for col_name in column_names_s:
    # data_dict[col_name] = np.array(["aaa"], dtype=str)
    # t.Branch(col_name, data_dict[col_name], f'{col_name}[3]/B') 

# Fill the tree
for i in range(len(data_f)):
    for j, col_name in enumerate(column_names_b):
        data_dict[col_name] = np.array([data_b[i,j]],dtype=np.bool)
    for j, col_name in enumerate(column_names_f):
        data_dict[col_name] = np.array([data_f[i,j]], dtype=np.float32)
    for j, col_name in enumerate(column_names_d):
        data_dict[col_name] = np.array([data_d[i,j]], dtype=np.float64)
    # for j, col_name in enumerate(column_names_s):
    #     data_dict[col_name] = np.array([data_s[i,j]], dtype=str)
    tree.Fill()

print(tree.GetEntries())
tree.Write()
file.Close()

10000


In [13]:
file2 = TFile.Open("DataTree.root");
tree2 = file2.Get("DataTree")
for i,x in enumerate(tree2):
    print(x.B0)
    print(x.F0)
    print(x.D0)
    # print(x.S0) # broken
    break
file2.Close()

True
0.9988673329353333
0.5210780568112082


### Fliter data

In [14]:
file3 = TFile.Open("DataTree.root","UPDATE");
tree3 = file3.Get("DataTree")
tree3_clone = tree3.CloneTree(0)
print("Before selection. Number of rows = ", tree3.GetEntries())
for i,event in enumerate(tree3):
    if not (event.B0 or event.B1): # broken
        continue
    if not ((event.F0 > 0.1) and (event.F0 < 0.9)):
        continue
    if not (abs(event.D0 - event.D1) >= 0.02):
        continue
    # 
    # no str selection. 
    #  
    tree3_clone.Fill()
print("After selection. Number of rows = ", tree3_clone.GetEntries())
tree3_clone.Write()
file3.Close()

Before selection. Number of rows =  10000
After selection. Number of rows =  5701


### Add new variables

In [15]:
file4 = TFile.Open("DataTree.root","UPDATE");
tree4 = file4.Get("DataTree")
tree4_clone = tree4.CloneTree(0)

d11 = np.array([0.],dtype=np.float64)
d12 = np.array([0.],dtype=np.float64)
d13 = np.array([0.],dtype=np.float64)

tree4_clone.Branch("D11", d11, f'D11/D') 
tree4_clone.Branch("D12", d12, f'D12/D') 
tree4_clone.Branch("D13", d13, f'D13/D') 

for i,event in enumerate(tree4):
    d11_val = event.D2 - event.D3

    d12_val = 0.
    if event.B0:
        d12_val = event.D4

    if event.B0:
        d13_val = event.D4
    else:
        d13_val = event.D5

    d11 = np.array([d11_val],dtype=np.float64)
    d12 = np.array([d12_val],dtype=np.float64)
    d13 = np.array([d13_val],dtype=np.float64)

    tree4_clone.Fill()

tree4_clone.Write()
file4.Close()

### Calculate stats

In [16]:
file5 = TFile.Open("DataTree.root");
tree5 = file5.Get("DataTree")

f1_sum: float = 0.
f1_max: float = 0.
f1_min: float = 999.
b0_true_count: int = 0

for i,x in enumerate(tree5):
    if x.B0:
        b0_true_count += 1

    f1_sum += x.F1   
    if x.F1 > f1_max:
        f1_max = x.F1
    if x.F1 < f1_min:
        f1_min = x.F1

print("F1 mean: ", f1_sum/tree5.GetEntries())
print("F1 max: ", f1_max)
print("F1 min: ", f1_min)
print("Count B0 == True: ",b0_true_count)
file5.Close()

F1 mean:  0.5040905077532873
F1 max:  0.9999719858169556
F1 min:  0.0002473511849530041
Count B0 == True:  3770


# ROOT Data Frame
RDataFrame represents a modern update to the ROOT framework, drawing inspiration from the Pandas interface. It offers capabilities for importing and exporting data to NumPy. This tool is good at handling straightforward tasks such as filtering datasets or calculating new variables.

### Create RDataTree

In [17]:
# bool data type is not suppported. Casting to int
numpy_dict = {
    column_names_b[i]: data_b[:,i].astype(np.int32) for i in range(len(column_names_b))
}
numpy_dict.update({
    column_names_f[i]: data_f[:,i] for i in range(len(column_names_f))
})
numpy_dict.update({
    column_names_d[i]: data_d[:,i] for i in range(len(column_names_d))
})
# str data type is not suppported. Skipping
rdf = ROOT.RDF.FromNumpy(numpy_dict)
rdf.Describe()

Dataframe from datasource RVecDS

Property                Value
--------                -----
Columns in total           30
Columns from defines        0
Event loops run             0
Processing slots            1

Column  Type    Origin
------  ----    ------
B0      int     Dataset
B1      int     Dataset
B2      int     Dataset
B3      int     Dataset
B4      int     Dataset
B5      int     Dataset
B6      int     Dataset
B7      int     Dataset
B8      int     Dataset
B9      int     Dataset
D0      double  Dataset
D1      double  Dataset
D2      double  Dataset
D3      double  Dataset
D4      double  Dataset
D5      double  Dataset
D6      double  Dataset
D7      double  Dataset
D8      double  Dataset
D9      double  Dataset
F0      float   Dataset
F1      float   Dataset
F2      float   Dataset
F3      float   Dataset
F4      float   Dataset
F5      float   Dataset
F6      float   Dataset
F7      float   Dataset
F8      float   Dataset
F9      float   Dataset

### Fliter data

In [18]:
print("Before selection. Number of rows = ", rdf.Count().GetValue())
rdf = rdf.Filter("B0 + B1")
rdf = rdf.Filter("(F0 > 0.1) & (F0 < 0.9)")
rdf = rdf.Define("temp_var_1", "abs(D0 - D1)")\
         .Filter("temp_var_1 >= 0.02" )
# Str not supported
print("After selection. Number of rows = ", rdf.Count().GetValue())

Before selection. Number of rows =  10000
After selection. Number of rows =  5766


### Add new columns

In [19]:
rdf = rdf.Define("D11", "D2 - D3")\
    .Define("D12", "D4 * float(B0)")\
    .Define("D13", "D4 * float(B0) + D5 * float(1 - B0)")
# Str not supported

### Calculate stats

In [20]:
print("F1 mean: ", rdf.Mean("F1").GetValue())
print("F1 max: ", rdf.Max("F1").GetValue())
print("F1 min: ", rdf.Min("F1").GetValue())
print("Count B0 == True: ",rdf.Sum("F1").GetValue())

F1 mean:  0.5037809315961159
F1 max:  0.9999741315841675
F1 min:  0.00013765624316874892
Count B0 == True:  2904.8008515832043


### Save / read

In [21]:
with tempfile.TemporaryDirectory() as tempdir:
    columns_to_save = [column for column in rdf.GetColumnNames() if column != "temp_var_1"]
    rdf.Snapshot("DataTree",f"{tempdir}/RDataFrame.root",columns_to_save)
    rdf2 = ROOT.RDataFrame("DataTree",f"{tempdir}/RDataFrame.root")
    print(rdf2.Describe())


Dataframe from TChain DataTree in file /tmp/tmpiuo12bsi/RDataFrame.root

Property                Value
--------                -----
Columns in total           33
Columns from defines        0
Event loops run             0
Processing slots            1

Column  Type            Origin
------  ----            ------
B0      Int_t           Dataset
B1      Int_t           Dataset
B2      Int_t           Dataset
B3      Int_t           Dataset
B4      Int_t           Dataset
B5      Int_t           Dataset
B6      Int_t           Dataset
B7      Int_t           Dataset
B8      Int_t           Dataset
B9      Int_t           Dataset
D0      Double_t        Dataset
D1      Double_t        Dataset
D11     Double_t        Dataset
D12     Double_t        Dataset
D13     Double_t        Dataset
D2      Double_t        Dataset
D3      Double_t        Dataset
D4      Double_t        Dataset
D5      Double_t        Dataset
D6      Double_t        Dataset
D7      Double_t        Dataset
D8      Doub

# uproot + awkward

Uproot serves as a ROOT alternative for those who need to work with ROOT files. Built on top of the Awkward array toolkit, it offers considerable flexibility. It supports columns containing arrays and advanced data types. Uproot can store ROOT histograms while adhering to a Python-centric design philosophy. It is a great compromise between the capabilities of ROOT and Pandas.

### Create Tree

In [22]:
numpy_dict = {
    column_names_b[i]: data_b[:,i] for i in range(len(column_names_b))
}
numpy_dict.update({
    column_names_f[i]: data_f[:,i] for i in range(len(column_names_f))
})
numpy_dict.update({
    column_names_d[i]: data_d[:,i] for i in range(len(column_names_d))
})
numpy_dict.update({
    column_names_s[i]: data_s[:,i] for i in range(len(column_names_s))
})

with up.recreate("UprootTree.root") as file:
    file["tree"] = numpy_dict
    file["tree"].show()

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
B0                   | bool                     | AsDtype('bool')
B1                   | bool                     | AsDtype('bool')
B2                   | bool                     | AsDtype('bool')
B3                   | bool                     | AsDtype('bool')
B4                   | bool                     | AsDtype('bool')
B5                   | bool                     | AsDtype('bool')
B6                   | bool                     | AsDtype('bool')
B7                   | bool                     | AsDtype('bool')
B8                   | bool                     | AsDtype('bool')
B9                   | bool                     | AsDtype('bool')
F0                   | float                    | AsDtype('>f4')
F1                   | float                    | AsDtype('>f4')
F2                   | float                    

### Fliter data

In [23]:
with up.open("UprootTree.root") as file:
    tree = file["tree"]
    arr = tree.arrays(library="pd")

print("Before selection. Number of rows = ", tree.num_entries)
arr = arr[arr["B0"] + arr["B1"]]
arr = arr[(arr["F0"] > 0.1) & (arr["F0"] < 0.9)]
arr = arr[np.abs(arr["D0"] - arr["D1"]) >= 0.02]
arr = arr[arr["S0"] == "aaa"]
print("After selection. Number of rows = ", len(arr))
print(arr[:5][["B0", "F0", "D0", "S0"]])

Before selection. Number of rows =  10000
After selection. Number of rows =  1938
       B0        F0        D0   S0
3    True  0.492902  0.757330  aaa
5    True  0.357263  0.192829  aaa
6   False  0.143712  0.297107  aaa
7    True  0.174429  0.665158  aaa
11   True  0.722801  0.150573  aaa


  return np.array(ak.is_none(self._data))


### Add new columns

In [24]:
arr["D11"] = arr["D2"] - arr["D3"]
arr["D12"] = arr["D4"] * arr["B0"] # if B0, make D12 = D4, else D12 = 0
arr["D13"] = arr["D4"] * arr["B0"] + arr["D5"] * (~arr["B0"]) # if B0, make D12 = D4, else D12 = D5 
arr["S11"] = arr["S1"].apply(lambda x: x.capitalize())
arr["S12"] = arr["S1"].to_numpy() + arr["S2"].to_numpy()

arr[:5][["D11", "D12", "D13", "S11", "S12"]]

Unnamed: 0,D11,D12,D13,S11,S12
3,0.103905,0.0228,0.0228,Bbb,bbbaaa
5,0.617874,0.475685,0.475685,Ccc,cccaaa
6,0.281837,0.0,0.475534,Bbb,bbbccc
7,0.047393,0.755705,0.755705,Ccc,cccccc
11,-0.685805,0.540482,0.540482,Ccc,cccaaa


### Calculate stats

In [25]:
print("F1 mean: ", arr["F1"].mean())
print("F1 max: ", arr["F1"].max())
print("F1 min: ", arr["F1"].min())
print("Count B0 == True: ",arr["B0"].sum())

F1 mean:  0.5035985
F1 max:  0.99945223
F1 min:  0.00013765624
Count B0 == True:  1272


### Save / read 

In [26]:
with tempfile.TemporaryDirectory() as tempdir:
    with up.recreate(f"{tempdir}/TempUprootTree.root") as file:
        file["tree"] = arr
    with up.open(f"{tempdir}/TempUprootTree.root") as file:
        print(file["tree"].show())

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
index                | int64_t                  | AsDtype('>i8')
B0                   | bool                     | AsDtype('bool')
B1                   | bool                     | AsDtype('bool')
B2                   | bool                     | AsDtype('bool')
B3                   | bool                     | AsDtype('bool')
B4                   | bool                     | AsDtype('bool')
B5                   | bool                     | AsDtype('bool')
B6                   | bool                     | AsDtype('bool')
B7                   | bool                     | AsDtype('bool')
B8                   | bool                     | AsDtype('bool')
B9                   | bool                     | AsDtype('bool')
F0                   | float                    | AsDtype('>f4')
F1                   | float                    