# How to rebin data in pandas

In [119]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from   typing      import Tuple, List
from numba import njit
import time

In [36]:
import matplotlib.pyplot as plt

In [37]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Problem

We have a track defined by a collection of hits, (x,y,z,e). We want to "voxelize" the track, that is:
1. bin the variables x,y,z in (nx, ny, nz) bins, so that the bin edges are
    X = [x0, x1, x2, ..., x_nx], Y = [y0, y1, y2, ..., y_ny], Z = [z0, z1, z2, ..., z_nz]
2. A voxel then is defined by (for example): [x0,x1], [y0,y1], [z0,z1]
3. In such a voxel we want to compute the statistics of energy

### Define a test DF

In [231]:
xmax = 100
ymax = 100
zmax = 100
emax = 10
n    = 100000
seed = 12345  # fixed seed so that Restart & Run All reproduces the same hits

# Toy track: n hits with positions drawn uniformly in [0, max) along each axis
# and energies drawn uniformly in [0, emax).
rng = np.random.default_rng(seed)
df = pd.DataFrame(
    {
        "x": rng.uniform(0, xmax, n),
        "y": rng.uniform(0, ymax, n),
        "z": rng.uniform(0, zmax, n),
        "e": rng.uniform(0, emax, n)
    }
)
df

Unnamed: 0,x,y,z,e
0,21.787552,49.751646,17.713432,7.172502
1,34.559598,45.672821,98.671155,2.255229
2,40.626649,40.550600,69.569321,0.834966
3,13.890610,15.801966,43.066598,8.590746
4,57.885769,22.562021,89.319732,2.533528
...,...,...,...,...
99995,50.566304,46.530560,92.806363,4.492160
99996,42.888306,12.170705,52.221305,0.927654
99997,77.153682,51.212807,99.786166,8.892469
99998,35.370879,44.984646,50.615529,6.956882


### Define bins

In [179]:
bins = 10
# bins + 1 equally spaced edges delimit `bins` intervals along each axis
xbins = np.linspace(0,xmax,bins+1)
ybins = np.linspace(0,ymax,bins+1)
zbins = np.linspace(0,zmax,bins+1)
print(f'xbins = {xbins}, with length {len(xbins)}')
print(f'ybins = {ybins}, with length {len(ybins)}')
# the z line used to interpolate ybins by mistake
print(f'zbins = {zbins}, with length {len(zbins)}')

xbins = [  0.  10.  20.  30.  40.  50.  60.  70.  80.  90. 100.], with length 11
ybins = [  0.  10.  20.  30.  40.  50.  60.  70.  80.  90. 100.], with length 11
zbins = [  0.  10.  20.  30.  40.  50.  60.  70.  80.  90. 100.], with length 11


### Logical conditions using numpy

In [136]:
# Boolean mask of the hits lying strictly inside the first (x, y) bin:
# all four edge comparisons are AND-ed together element-wise.
conditions = np.logical_and.reduce(
    (
        xbins[0] < df["x"],
        df["x"] < xbins[1],
        ybins[0] < df["y"],
        df["y"] < ybins[1],
    )
)

In [137]:
# Energies of the hits selected by the boolean mask `conditions`
e_values = df.loc[conditions, "e"]
e_values

51      8.234718
97      6.456811
195     1.025111
261     7.720286
338     3.222962
          ...   
9737    6.007275
9778    4.457896
9898    4.575119
9929    6.639476
9989    8.106396
Name: e, Length: 107, dtype: float64

In [138]:
e_values.count()

107

### Solution using loops:

In [146]:
def voxelize_with_loops_2d(df, xbins, ybins):
    """Voxelize hits in (x, y) with explicit Python loops (slow reference version).

    Parameters
    ----------
    df : DataFrame with (at least) the columns "x", "y" and "e".
    xbins, ybins : sequences of increasing bin edges along x and y.

    Returns
    -------
    DataFrame with one row per (x, y) bin, ordered with x as the outer loop
    and y as the inner loop, and the columns
    x_min, x_max, y_min, y_max, e_mean, e_std, e_count.

    Notes
    -----
    Each bin is the half-open interval (min, max], the same convention that
    ``pd.cut`` uses by default. With strict inequalities on both sides, a hit
    lying exactly on a bin edge would be assigned to no voxel at all.
    Empty voxels get e_count = 0 and NaN for e_mean / e_std.
    """
    x, y, e = df["x"], df["y"], df["e"]

    # one record per voxel; the DataFrame is built once at the end
    records = []
    for x_min, x_max in zip(xbins[:-1], xbins[1:]):
        # the x selection does not depend on the y bin: compute it once per x bin
        in_x = (x > x_min) & (x <= x_max)

        for y_min, y_max in zip(ybins[:-1], ybins[1:]):
            in_voxel = in_x & (y > y_min) & (y <= y_max)

            # statistics of the energy of the hits inside the current voxel
            e_values = e[in_voxel]
            records.append(
                (
                    x_min,
                    x_max,
                    y_min,
                    y_max,
                    e_values.mean(),
                    e_values.std(),
                    e_values.count(),
                )
            )

    columns = ["x_min", "x_max", "y_min", "y_max", "e_mean", "e_std", "e_count"]
    return pd.DataFrame(records, columns=columns)

In [147]:
# perf_counter is a monotonic, high-resolution clock meant for measuring elapsed time
start = time.perf_counter()
binned = voxelize_with_loops_2d(df, xbins, ybins)
end = time.perf_counter()
print(f" time for voxelize_with_loops_2d: = {end - start}")


 time for voxelize_with_loops_2e: = 5.367176055908203


In [148]:
binned

Unnamed: 0,x_min,x_max,y_min,y_max,e_mean,e_std,e_count
0,0.0,10.0,0.0,10.0,5.059406,2.841478,985
1,0.0,10.0,10.0,20.0,4.936199,2.914402,990
2,0.0,10.0,20.0,30.0,4.817424,2.845690,937
3,0.0,10.0,30.0,40.0,4.821171,2.942377,1010
4,0.0,10.0,40.0,50.0,5.064302,2.873315,1030
...,...,...,...,...,...,...,...
95,90.0,100.0,50.0,60.0,5.038055,2.796539,982
96,90.0,100.0,60.0,70.0,4.896574,2.928222,974
97,90.0,100.0,70.0,80.0,4.882270,2.881011,1004
98,90.0,100.0,80.0,90.0,4.638108,2.894347,1037


### Using pd_cuts

In [149]:
def voxelize_with_pd_2d(df, xbins, ybins):
    """Voxelize hits in (x, y) with pandas and describe the energy in each voxel.

    Parameters
    ----------
    df : DataFrame with (at least) the columns "x", "y" and "e".
         It is not modified: the bin tags are added to a copy.
    xbins, ybins : increasing bin edges along x and y.

    Returns
    -------
    DataFrame indexed by (xtag, ytag), the integer labels 0 .. nbins-1 of the
    x and y bins, holding the ``describe()`` statistics of "e" in each voxel.

    Notes
    -----
    ``pd.cut`` assigns each hit to the interval (min, max]; hits outside the
    edges get no tag and do not enter any group.
    """
    # assign() returns a new frame, so the caller's DataFrame keeps its columns
    tagged = df.assign(
        xtag=pd.cut(df['x'], bins=xbins, labels=range(len(xbins)-1)),
        ytag=pd.cut(df['y'], bins=ybins, labels=range(len(ybins)-1)),
    )
    # observed=False is spelled out: the tags are categorical and the implicit
    # default for categorical groupers is deprecated in recent pandas versions
    df_out = tagged.groupby(['xtag', 'ytag'], observed=False)['e'].describe()
    return df_out

In [150]:
# perf_counter is a monotonic, high-resolution clock meant for measuring elapsed time
start = time.perf_counter()
binned = voxelize_with_pd_2d(df, xbins, ybins)
end = time.perf_counter()
print(f" time for voxelize_with_pandas: = {end - start}")


 time for voxelize_with_pandas: = 0.2560999393463135


In [151]:
binned

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
xtag,ytag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,985.0,5.059406,2.841478,0.013544,2.668697,5.044367,7.547767,9.988099
0,1,990.0,4.936199,2.914402,0.000298,2.409456,4.944875,7.316973,9.997394
0,2,937.0,4.817424,2.845690,0.041075,2.316233,4.755086,7.207772,9.955396
0,3,1010.0,4.821171,2.942377,0.018728,2.173959,4.839414,7.298832,9.990803
0,4,1030.0,5.064302,2.873315,0.003235,2.539589,5.107382,7.557881,9.997211
...,...,...,...,...,...,...,...,...,...
9,5,982.0,5.038055,2.796539,0.009401,2.722820,5.012817,7.404645,9.998043
9,6,974.0,4.896574,2.928222,0.008577,2.327875,4.709792,7.460154,9.982878
9,7,1004.0,4.882270,2.881011,0.001813,2.484854,4.825793,7.282899,9.988994
9,8,1037.0,4.638108,2.894347,0.003461,2.106562,4.481783,7.163833,9.990453


#### Time difference:
- The time invested in the solution with loops grows quickly with the number of events; as (number of events, seconds): (1e+3, 0.15), (1e+4, 0.6), (1e+5, 5.4)
- The time used by the PD solution stays almost constant

### Numba

In [158]:
xbins

array([  0.,  10.,  20.,  30.,  40.,  50.,  60.,  70.,  80.,  90., 100.])

In [177]:
def _voxel_stats_2d(x, y, e, xbins, ybins):
    """Loop kernel: energy statistics of every (x, y) voxel, as a 2D float array.

    x, y, e are 1D arrays of hit coordinates and energies; xbins, ybins are
    arrays of bin edges. Row k of the result describes one voxel (x is the
    outer loop, y the inner one) with the columns
    [x_min, x_max, y_min, y_max, mean(e), std(e), number of hits].
    Voxels are the half-open intervals (min, max], as in ``pd.cut``.
    """
    n_x = len(xbins) - 1
    n_y = len(ybins) - 1
    out = np.empty((n_x * n_y, 7))

    index = 0
    for i_x in range(n_x):
        x_min = xbins[i_x]
        x_max = xbins[i_x + 1]
        # the x selection does not depend on the y bin: compute it once per x bin
        in_x = (x_min < x) & (x <= x_max)

        for i_y in range(n_y):
            y_min = ybins[i_y]
            y_max = ybins[i_y + 1]
            e_values = e[in_x & (y_min < y) & (y <= y_max)]
            n_hits = len(e_values)

            out[index, 0] = x_min
            out[index, 1] = x_max
            out[index, 2] = y_min
            out[index, 3] = y_max
            if n_hits > 0:
                out[index, 4] = e_values.mean()
                # array std, i.e. ddof=0 (pandas' Series.std defaults to ddof=1)
                out[index, 5] = e_values.std()
            else:
                # mean / std are undefined for an empty voxel
                out[index, 4] = np.nan
                out[index, 5] = np.nan
            out[index, 6] = n_hits

            index += 1

    return out


# Compiled version of the kernel; created on first use and then reused, so that
# the JIT compilation cost is paid once instead of on every call.
_voxel_stats_2d_jit = None


def voxelize_with_loops_2d_numba(df, xbins, ybins):
    """Uses numba and loops to voxelize in (x,y)

    Parameters
    ----------
    df : DataFrame with (at least) the columns "x", "y" and "e".
    xbins, ybins : arrays of increasing bin edges along x and y.

    Returns
    -------
    DataFrame with one row per (x, y) voxel and the columns
    x_min, x_max, y_min, y_max, mean, std, count (all stored as floats).
    Empty voxels have count = 0 and NaN for mean / std.
    """
    global _voxel_stats_2d_jit
    if _voxel_stats_2d_jit is None:
        # Decorating a nested function with @njit on each call would build a new
        # dispatcher, and hence recompile, every time this function is called.
        _voxel_stats_2d_jit = njit(_voxel_stats_2d)

    columns = ["x_min", "x_max", "y_min", "y_max", "mean", "std", "count"]
    out = _voxel_stats_2d_jit(df["x"].values, df["y"].values, df["e"].values, xbins, ybins)
    return pd.DataFrame(out, columns=columns)

In [178]:
# perf_counter is a monotonic, high-resolution clock meant for measuring elapsed time.
# NOTE: the first call of a numba-compiled function includes the JIT compilation time.
start = time.perf_counter()
binned = voxelize_with_loops_2d_numba(df, xbins, ybins)
end = time.perf_counter()
print(f" time for voxelize_with_loops_2d using numba: = {end - start}")


100
 time for voxelize_with_loops_2d using numba: = 0.33950209617614746


#### So, the numba solution is almost but not quite as fast as the "pandas solution"

## Voxelization 3D (using pandas)

In [180]:
def voxelize_with_pd_3d(df, xbins, ybins, zbins):
    """Voxelize hits in (x, y, z) with pandas and describe the energy in each voxel.

    Parameters
    ----------
    df : DataFrame with (at least) the columns "x", "y", "z" and "e".
         It is not modified: the bin tags are added to a copy.
    xbins, ybins, zbins : increasing bin edges along x, y and z.

    Returns
    -------
    DataFrame indexed by (xtag, ytag, ztag), the integer labels 0 .. nbins-1
    of the bins along each axis, holding the ``describe()`` statistics of "e"
    in each voxel.

    Notes
    -----
    ``pd.cut`` assigns each hit to the interval (min, max]; hits outside the
    edges get no tag and do not enter any group.
    """
    # assign() returns a new frame, so the caller's DataFrame keeps its columns
    tagged = df.assign(
        xtag=pd.cut(df['x'], bins=xbins, labels=range(len(xbins)-1)),
        ytag=pd.cut(df['y'], bins=ybins, labels=range(len(ybins)-1)),
        ztag=pd.cut(df['z'], bins=zbins, labels=range(len(zbins)-1)),
    )
    # observed=False is spelled out: the tags are categorical and the implicit
    # default for categorical groupers is deprecated in recent pandas versions
    df_out = tagged.groupby(['xtag', 'ytag', 'ztag'], observed=False)['e'].describe()
    return df_out

In [181]:
# perf_counter is a monotonic, high-resolution clock meant for measuring elapsed time
start = time.perf_counter()
binned = voxelize_with_pd_3d(df, xbins, ybins, zbins)
end = time.perf_counter()
print(f" time for voxelize_with_pandas: = {end - start}")


 time for voxelize_with_pandas: = 2.7126290798187256


In [182]:
binned

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
xtag,ytag,ztag,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,106.0,5.254240,2.741186,0.078080,3.228853,5.746651,7.482562,9.730221
0,0,1,98.0,4.919544,2.612254,0.013544,2.674111,5.000559,7.019239,9.791689
0,0,2,99.0,5.155643,3.188343,0.072231,2.173777,5.287047,7.981366,9.978383
0,0,3,112.0,5.455643,2.789679,0.204300,3.345941,5.285967,7.743948,9.906612
0,0,4,99.0,4.888663,2.892889,0.044339,2.604040,5.044367,7.402008,9.719383
...,...,...,...,...,...,...,...,...,...,...
9,9,5,99.0,4.905993,2.952150,0.065195,2.246919,4.941967,7.221152,9.995907
9,9,6,102.0,4.653193,3.009864,0.003673,1.942647,4.434963,7.429908,9.994549
9,9,7,105.0,4.991888,2.737146,0.141494,2.814848,5.402684,7.270695,9.553948
9,9,8,109.0,4.618820,2.931971,0.131190,1.735230,4.346721,6.896648,9.952275


### Formal voxelisation DF

- In the voxelisation DF we need the central position of each voxel in x,y,z (estimated below as the mean position of the hits it contains), and the sum of the energies in the voxel

In [236]:
def voxelize(df, xbins, ybins, zbins):
    """Voxelize hits in (x, y, z): one row per voxel with its position and energy.

    Parameters
    ----------
    df : DataFrame with (at least) the columns "x", "y", "z" and "e".
         It is not modified: the bin tags are added to a copy.
    xbins, ybins, zbins : increasing bin edges along x, y and z.

    Returns
    -------
    DataFrame with the columns x, y, z, e and a plain 0..N-1 index (the bin
    labels are dropped). x, y, z are the mean coordinates of the hits in the
    voxel and e is the sum of their energies. Voxels are ordered by
    (x bin, y bin, z bin); empty voxels are kept, with NaN position and e = 0.

    Notes
    -----
    ``pd.cut`` assigns each hit to the interval (min, max]; hits outside the
    edges get no tag and do not enter any voxel.
    """
    # assign() returns a new frame, so the caller's DataFrame keeps its columns
    tagged = df.assign(
        x_bins=pd.cut(df['x'], bins=xbins, labels=range(len(xbins)-1)),
        y_bins=pd.cut(df['y'], bins=ybins, labels=range(len(ybins)-1)),
        z_bins=pd.cut(df['z'], bins=zbins, labels=range(len(zbins)-1)),
    )

    # observed=False is spelled out: the tags are categorical and the implicit
    # default for categorical groupers is deprecated in recent pandas versions
    voxels = tagged.groupby(['x_bins', 'y_bins', 'z_bins'], observed=False)
    return voxels.aggregate({'x': 'mean',
                             'y': 'mean',
                             'z': 'mean',
                             'e': 'sum',
                            }).reset_index(drop=True)


In [237]:
# Voxelize the test hits on the (xbins, ybins, zbins) grid: one output row per voxel
dfv = voxelize(df, xbins, ybins, zbins)

In [238]:
dfv

Unnamed: 0,x,y,z,e
0,5.069501,5.052303,5.050050,520.180879
1,5.212960,4.949617,15.326434,588.924737
2,5.168249,4.906119,25.181948,482.797326
3,5.231744,4.953706,35.222778,477.537593
4,5.285323,4.755909,44.912928,506.324922
...,...,...,...,...
995,95.192328,95.045318,55.007796,492.579401
996,94.919689,94.715785,64.888035,488.959712
997,95.340015,95.478308,75.173536,463.274630
998,95.064271,95.158549,85.031479,441.303498
