# Process Awkward Arrays for Use with a NN
---
This notebook shows how to convert data stored in an awkward array into a format that can be used by the NN.<br>
Awkward arrays are a data structure widely used in the HEP community: a nested, variable-length array that can store such data more efficiently than a NumPy array.<br>
The data structure is described in more detail [here](https://awkward-array.org/doc/main/).

### Import Modules

In [134]:
import awkward as ak
from typing import List, Dict, Tuple
import pandas as pd
import uproot as ur
import os
import numpy as np

### Generate Sample Data
As the example data shows, the length of _el_pt_ is not the same for each event.<br>
This is a typical case in HEP data, where the number of particles in an event is not the same for each event.<br>

In [135]:
# Sample events with the fields Event, el_pt, met and eventWeight;
# el_pt holds a different number of entries in each event.
ak_array = ak.Array([[
    {"Event": event, "el_pt": el_pt, "met": met, "eventWeight": 1}
    for event, (el_pt, met) in enumerate([([10, 20, 3], 20), ([], 30), ([3], 40)])
]])

### Function to Process Awkward Arrays to Pandas Dataframe

The function ***from_ak_2_df*** takes an awkward array as input and returns a pandas dataframe.<br>
All entries can be normalized by setting `normalize` to `True`.<br>
Unimportant fields can be dropped by passing their names as a list in `remove_fields`.<br>

In [136]:
def _max_abs_scale(min_value, max_value) -> float:
    """Return ``1 / max(|min_value|, |max_value|)`` as a float.

    Falls back to ``1.0`` when there is nothing to scale by: a missing
    extremum (``None``), only NaN, or a largest magnitude of zero. Without
    this guard the division by zero would turn the column into inf/NaN.
    """
    # NOTE(review): ak.min / ak.max are expected to return None when the
    # flattened array has no entries - confirm against the awkward version used.
    if min_value is None or max_value is None:
        return 1.0
    largest = max(abs(min_value), abs(max_value))
    # "not largest > 0" is true for zero and for NaN alike.
    if not largest > 0:
        return 1.0
    return float(1 / largest)


def from_ak_2_df(ak_array: ak.Array,
                 remove_fields: List[str],
                 normalize: bool) -> Tuple[pd.DataFrame,
                                            Dict[str, float]]:
    """
    Convert awkward array to pandas dataframe.

    1. Convert awkward array to pandas dataframe.
    2. Remove fields.
    3. Iterate over all fields that hold lists.
        1. Include an extra column ``<field>_len`` for the length of the list.
        2. Get the maximum length of the list.
        3. Pad list to the same length.
        4. Convert to list after filling nones with 0.
    4. Normalize data (only if ``normalize`` is truthy).
        1. Iterate over all columns (including the ``<field>_len`` columns).
            1. List columns: take minimum and maximum from the flattened,
               unpadded awkward field.
            2. Scalar columns: take minimum and maximum from the column.
            3. Scale by ``1 / max(|min|, |max|)``; the factor is ``1.0`` when
               that magnitude is zero or undefined, so the column is left
               unchanged instead of becoming inf/NaN.
    5. Return dataframe and normalization factors with dropped index.

    Parameters:
        ak_array (ak.Array): Awkward array. The records of interest are read
            from its first entry (``ak_array.tolist()[0]``).
        remove_fields (List[str]): List of fields to remove. Every name must
            be an existing column, otherwise ``DataFrame.drop`` raises.
        normalize (bool): Normalize data.

    Returns:
        Tuple[pd.DataFrame, Dict[str, float]]: Tuple of dataframe and
        normalization factors keyed by column name. The second element is
        ``None`` when ``normalize`` is false.
    """
    # 0. Normalization factors
    norm_factors = {} if normalize else None
    # 1. Convert awkward array to pandas dataframe.
    df = pd.DataFrame(ak_array.tolist()[0])
    # 2. Remove fields.
    df = df.drop(remove_fields, axis=1)

    # 3. Iterate over all fields; remember which ones hold lists.
    list_fields = []
    # Snapshot the columns: the loop adds "<field>_len" columns as it goes.
    for field in list(df.columns):
        # The first row decides whether the column is a list column;
        # .iloc avoids a label lookup that fails on an empty frame.
        if df.empty or not isinstance(df[field].iloc[0], list):
            continue
        list_fields.append(field)
        # 1. Include an extra column for the length of the list.
        df[field + "_len"] = df[field].apply(len)
        # 2. Get the maximum length of the list.
        max_len = int(df[field + "_len"].max())
        # 3. Pad list to the same length.
        none_array = ak.pad_none(ak_array[field][0],
                                 target=max_len,
                                 clip=True)
        # 4. Convert to list after filling nones with 0.
        df[field] = ak.fill_none(none_array, 0).tolist()

    # 4. Normalize data.
    if normalize:
        for field in df.columns:
            if field in list_fields:
                # Extrema come from the original (unpadded) awkward field so
                # the padding zeros do not influence the factor.
                flat_array = ak.flatten(ak_array[field])
                factor = _max_abs_scale(ak.min(flat_array),
                                        ak.max(flat_array))
                df[field] = df[field].apply(
                    lambda values, factor=factor: [v * factor for v in values])
            else:
                factor = _max_abs_scale(df[field].min(), df[field].max())
                df[field] = df[field] * factor
            norm_factors[field] = factor
    # 5. Return dataframe and normalized dict with dropped index.
    return df.reset_index(drop=True), norm_factors

# Drop the bookkeeping columns and normalize the remaining ones.
df, norm_factor = from_ak_2_df(
    ak_array,
    remove_fields=["Event", "eventWeight"],
    normalize=True,
)

### Load all Possible Signal Regions with Acceptance and Rejection

In [143]:
def sr_acc_rej(path: str) -> int:
    """
    Count the keys of the ``ntuple`` object over all ROOT files in a directory.

    1. Check that the directory exists.
    2. Iterate over all ``*.root`` files in the directory.
        1. Open the file (closed again when the ``with`` block exits).
        2. Add the number of keys of ``ntuple`` to the total.
    3. Print and return the total.

    Parameters:
        path (str): Directory containing the ROOT files. A trailing path
            separator is optional.

    Returns:
        int: Total number of ``ntuple`` keys summed over all ROOT files
        (``0`` if the directory contains no ``*.root`` file).

    Raises:
        ValueError: If ``path`` is not an existing directory.
    """
    # 1. Check if directory exists.
    if not os.path.isdir(path):
        raise ValueError("Directory does not exist.")
    count = 0
    # 2. Iterate over all files in the directory.
    for file in os.listdir(path):
        # Only ROOT files are of interest.
        if not file.endswith(".root"):
            continue
        # os.path.join works with and without a trailing separator on path;
        # the context manager makes sure the file handle is released.
        with ur.open(os.path.join(path, file)) as root_file:
            count += len(root_file["ntuple"].keys())
    # 3. Report the total.
    print(count)
    return count
            

# Count the ntuple keys of the ROOT files found in data/100293/.
sr_acc_rej("data/100293/")

2964
