In [55]:
import ROOT
import numpy as np
import pandas as pd
import os

In [56]:
# Names of the columns that survive trim_dataframe; any other column is dropped.
feature_list = [
    # Label / weight / provenance columns
    "label", "eventWeightLumi", "PROCESS", "file_root",
    # Tau columns
    "Tau_pt", "Tau_phi", "Tau_mass", "Tau_eta", "Tau_charge", "Tau_dz", "Tau_dxy", "nTau",
    # Photon columns
    "nPhoton", "Photon_pt", "Photon_phi", "Photon_mass", "Photon_eta", "Photon_charge",
    # Muon columns
    "nMuon", "Muon_pt", "Muon_phi", "Muon_mass", "Muon_eta", "Muon_dz", "Muon_dxy", "Muon_charge",
    # Electron columns
    "nElectron", "Electron_pt", "Electron_phi", "Electron_mass", "Electron_eta",
    "Electron_dz", "Electron_dxy", "Electron_charge",
    # Remaining column
    "CaloMET_pt",
]

In [57]:
# Process folder names that are labelled as signal (label = 1) by pickle_batch.
# The list holds the "hct" and the "hut" variant, each in three production campaigns.
Signal = [
    # hct
    "TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hct-MadGraph5-pythia8_RunIISummer16",
    "TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hct_TuneCP5-MadGraph5-pythia8_RunIIAutumn18",
    "TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hct_TuneCP5-MadGraph5-pythia8_RunIIFall17",
    # hut
    "TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hut-MadGraph5-pythia8_RunIISummer16",
    "TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hut_TuneCP5-MadGraph5-pythia8_RunIIAutumn18",
    "TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hut_TuneCP5-MadGraph5-pythia8_RunIIFall17",
]

In [82]:
# Folder-name prefixes of the background processes; each prefix is matched with
# str.startswith against the folder names found in "ntuple/".
BackDir = [
    "TTTo2L2Nu",
    "TTToSemiLeptonic",
    "TTToHadronic",
    "ST_",
    "TTWJets",
    "TTZToLL",
    "WpWpJJ_",
    "WmWmJJ_",
    "WZTo3LNu",
    "WZTo2L2Q_",
]

In [59]:
def get_dataframe(fname):
    """Read the ``Events`` tree of a ROOT file into a pandas DataFrame.

    Every leaf of the tree becomes one column and every tree entry one row.
    Each cell holds ``leaf.GetValue()`` called without an index, so for a leaf
    that stores several values per entry only the first one is kept.

    Parameters
    ----------
    fname : str
        Path of the ROOT file to read.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``Events``, columns named after the leaves.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    f = ROOT.TFile.Open(fname)
    # A failed open yields a null or "zombie" file object; fail early with a
    # clear message instead of an obscure attribute error further down.
    if not f or f.IsZombie():
        raise OSError("Could not open ROOT file: " + str(fname))

    try:
        tree = f.Events
        col_names = [leaf.GetName() for leaf in tree.GetListOfLeaves()]
        data = [
            [event.GetLeaf(n).GetValue() for n in col_names]
            for event in tree
        ]
    finally:
        # Release the file handle even if reading fails; this function is
        # called once per file and would otherwise leak one handle each time.
        f.Close()

    return pd.DataFrame(data, columns=col_names)

In [60]:
def trim_dataframe(df, feature_list):
    """Return ``df`` restricted to the columns named in ``feature_list``.

    Columns of ``df`` that are not listed are dropped; names in
    ``feature_list`` that ``df`` does not have are ignored. The original
    column order is kept and the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to trim.
    feature_list : iterable of str
        Column names to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the wanted columns.
    """
    # Build the lookup set once and drop everything unwanted in a single call,
    # instead of copying the whole frame once per dropped column.
    keep = set(feature_list)
    unwanted = [col for col in df.columns if col not in keep]
    return df.drop(columns=unwanted)
    

In [61]:
def read_process(pname,verbosity = False):
    """Load every file found under a process folder into one DataFrame.

    The folder is walked recursively with ``os.walk``. Each file is read with
    ``get_dataframe`` and tagged with its file name in a ``file_root`` column;
    the concatenated result gets a ``PROCESS`` column holding ``pname``.

    Parameters
    ----------
    pname : str
        Path of the process folder.
    verbosity : bool, optional
        When true, progress messages are printed.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the frames of all files, in the order they were read.

    Raises
    ------
    ValueError
        If no file is found under ``pname``.
    """
    dfs = []
    for root, dirs, filelist in os.walk(pname):
        if verbosity: print("--->Loading ", len(filelist)," root files.")
        for file in filelist:
            if verbosity: print("--->Reading root file: ", file)
            # Build the path from the directory currently being walked, so
            # files sitting in sub-folders are opened at their real location.
            frame = get_dataframe(os.path.join(root, file))
            # Tag the frame before storing it; indexing the list with the
            # per-directory counter would hit the wrong frame as soon as a
            # second directory contributes files.
            frame["file_root"] = file
            dfs.append(frame)

    if not dfs:
        # pd.concat raises a less helpful ValueError on an empty list.
        raise ValueError("No files found under process folder: " + str(pname))

    if verbosity: print("--->Concatenating dataframe")
    df = pd.concat(dfs)
    df["PROCESS"] = pname
    return df

In [62]:
def label_dataframe(df,label):
    """Add a ``label`` column as the first column of ``df`` and return ``df``.

    The frame is modified in place; every row receives the value ``label``.
    ``DataFrame.insert`` raises ``ValueError`` if a ``label`` column already
    exists.
    """
    df.insert(0, "label", label)
    return df

In [67]:
def pickle_batch(list_, jar_name):
    """Load, trim and label a batch of process folders, then pickle the result.

    For every process name in ``list_`` the folder ``"ntuple/" + process`` is
    read with ``read_process``, reduced to the columns of the module-level
    ``feature_list`` and labelled 1 if the name appears in the module-level
    ``Signal`` list, 0 otherwise. All frames are concatenated, written to
    ``jar_name`` with ``DataFrame.to_pickle`` and returned.

    Parameters
    ----------
    list_ : iterable of str
        Process folder names, relative to ``ntuple/``.
    jar_name : str
        Path of the pickle file to write.

    Returns
    -------
    pandas.DataFrame
        The concatenated, labelled frame that was pickled.
    """
    signal_names = set(Signal)
    labelled = []
    for process in list_:
        print(">Reading process ", process)
        # Read the whole process folder and keep only the wanted columns.
        trimmed = trim_dataframe(
            read_process("ntuple/"+ process, verbosity = True),
            feature_list,
        )
        # 1 marks an actual signal sample, 0 marks background.
        flag = 1 if process in signal_names else 0
        labelled.append(label_dataframe(trimmed, flag))

    merged = pd.concat(labelled)
    merged.to_pickle(jar_name)
    return merged
        

In [70]:
# Build the labelled signal sample, pickle it, and keep the merged frame in `df`.
df = pickle_batch(list_ = Signal, jar_name = "Signal.pkle")

>Reading process  TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hct-MadGraph5-pythia8_RunIISummer16
--->Loading  3  root files.
--->Reading root file:  D717F12F-B185-3348-9E3E-0204E2751363_Skim.root
--->Reading root file:  DE799445-30AD-E040-B4E2-3C8F90216D52_Skim.root
--->Reading root file:  6319226B-F5F4-1B45-950E-36BE5993AA49_Skim.root
--->Concatenating dataframe
>Reading process  TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hct_TuneCP5-MadGraph5-pythia8_RunIIAutumn18
--->Loading  18  root files.
--->Reading root file:  3C10F556-3AF7-A34F-8C4F-59D2C4AAA91D_Skim.root
--->Reading root file:  763253A5-C104-5C49-9170-67E33BED184B_Skim.root
--->Reading root file:  CA746C0B-DDDB-B742-B319-8D1786DFF0D7_Skim.root
--->Reading root file:  20CBA33E-EFD4-C04E-8607-D9C619D13A73_Skim.root
--->Reading root file:  C91D9FD7-C174-4B47-B8DD-1FBEB591538B_Skim.root
--->Reading root file:  297EA7BC-735B-A748-A2B5-62DF2DED29F1_Skim.root
--->Reading root file:  D09EEE5C-81D3-AA4B-8699-A3AB2E8174A1_Skim.root
---

In [None]:
# Pickle one batch per background prefix. The first entry of BackDir is skipped
# here (presumably it was processed separately - TODO confirm); output files are
# numbered from 1, so "BGround<k>.pkle" corresponds to BackDir[k].
for nbatch, root in enumerate(BackDir[1:]):
    flist = []
    for folder in os.listdir("ntuple/"):
        if folder.startswith(root):
            flist.append(folder)

    if not flist:
        # pickle_batch would fail inside pd.concat on an empty list and abort
        # every remaining batch; skip this prefix instead.
        print(">>>No folders found for prefix ", root, ", skipping")
        continue

    print(">>>Loading ", len(flist), " folders")
    name = "BGround"+str(nbatch+1)+".pkle"
    pickle_batch(flist, name)

>>>Loading  3  folders
>Reading process  TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8_RunIIAutumn18
--->Loading  192  root files.
--->Reading root file:  B7B7DF12-53D0-A34A-B23C-445BFE4908BF_Skim.root
--->Reading root file:  55232831-1123-D743-83DC-D634F7B6FE9A_Skim.root


In [16]:
# Quick preview loader: reads at most MAX_FILES files per walked directory from
# the first folder in "ntuple" whose name starts with `root`, and stores the
# concatenated frame in `dataf`.
root = "TTToSemiLeptonic"
MAX_FILES = 6  # the original cut-off `if j > 5: break` let files 0..5 through
i = -1
dataf = []
for dire in os.listdir("ntuple"):
    if not dire.startswith(root):
        continue

    print("Loading ",dire,"..")
    s = dire
    i += 1

    this = []
    # The walk variable is named `dirpath` so it no longer overwrites the
    # `root` prefix that the startswith test above relies on.
    for dirpath, dirs, filelist in os.walk("ntuple/"+s):
        for file in filelist[:MAX_FILES]:
            print("loading file ",file)
            # Open the file where os.walk found it and tag the frame before
            # storing it, so the tag always lands on the frame just read.
            frame = get_dataframe(os.path.join(dirpath, file))
            frame["file_root"] = file
            this.append(frame)
    dataf.append(pd.concat(this))

    dataf[i]["PROCESS"] = s
    # Only the first matching folder is loaded. The former `if i > 10: break`
    # could never matter because this unconditional break followed it.
    break

Loading  TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8_RunIIAutumn18 ..
loading file  B7B7DF12-53D0-A34A-B23C-445BFE4908BF_Skim.root
loading file  55232831-1123-D743-83DC-D634F7B6FE9A_Skim.root
loading file  B9B5E1A9-9AA2-304C-BE6B-DC901DC512B1_Skim.root
loading file  AC522E75-80D3-CB4F-992E-0A9A0D862BAA_Skim.root
loading file  8C813243-1DAE-6142-96B9-825B583BA0E6_Skim.root
loading file  E7C19C16-E48F-8448-9733-5AF7064E3045_Skim.root
