In [1]:
import ROOT
import numpy as np
import pandas as pd
import os
import operator

Welcome to JupyROOT 6.20/04


In [2]:
new_flist = ['nElectron', 'Electron_pt0', 'Electron_mass0', 'Electron_charge0',
 'Electron_phi0', 'Electron_eta0', 'Electron_pt1', 'Electron_mass1', 'Electron_charge1', 'Electron_phi1',
 'Electron_eta1', 'Electron_pt2', 'Electron_mass2', 'Electron_charge2', 'Electron_phi2', 'Electron_eta2',
 'nMuon', 'Muon_pt0', 'Muon_mass0', 'Muon_charge0',
 'Muon_phi0', 'Muon_eta0', 'Muon_pt1', 'Muon_mass1', 'Muon_charge1', 'Muon_phi1', 'Muon_eta1', 'Muon_pt2', 'Muon_mass2', 'Muon_charge2',
 'Muon_phi2', 'Muon_eta2', 'nJet', 'Jet_btagDeepB0', 'Jet_btagDeepC0', 'Jet_eta0', 'Jet_phi0', 'Jet_mass0', 'Jet_nConstituents0', 'Jet_btagDeepB1',
 'Jet_btagDeepC1', 'Jet_eta1', 'Jet_phi1', 'Jet_mass1', 'Jet_nConstituents1', 'Jet_btagDeepB2',
 'Jet_btagDeepC2', 'Jet_eta2', 'Jet_phi2', 'Jet_mass2', 'Jet_nConstituents2']

In [3]:
category = ["Electron","Muon","Jet"]
counters = ["nElectron","nMuon","nJet"]
particle_feat = ["_pt","_mass","_charge","_phi","_eta"]
jet_feat = ["_btagDeepB","_btagDeepC","_eta","_phi","_mass","_nConstituents"]
feat = [particle_feat,particle_feat,jet_feat]

In [4]:
Signal = ["TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hct-MadGraph5-pythia8_RunIISummer16",
"TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hct_TuneCP5-MadGraph5-pythia8_RunIIAutumn18",
"TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hct_TuneCP5-MadGraph5-pythia8_RunIIFall17",
"TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hut-MadGraph5-pythia8_RunIISummer16",
"TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hut_TuneCP5-MadGraph5-pythia8_RunIIAutumn18",
"TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hut_TuneCP5-MadGraph5-pythia8_RunIIFall17"]

In [5]:
BackDir = ["TTTo2L2Nu", "TTToSemiLeptonic", "TTToHadronic", "ST_", "TTWJets",
"TTZToLL", "WpWpJJ_", "WmWmJJ_", "WZTo3LNu", "WZTo2L2Q_"]

In [6]:
def get_dataframe(fname):
    f = ROOT.TFile.Open(fname)
    tree = f.Events
    col_names = [leaf.GetName() for leaf in tree.GetListOfLeaves()]
    data = []
    
    df = pd.DataFrame(np.zeros((tree.GetEntriesFast(),len(new_flist))),columns = new_flist)
    #cycling over all events
    for e,event in enumerate(tree):
        #getting electron muon and jet
        for c,cat in enumerate(category):
            n_obj = operator.attrgetter(counters[c])(event)
            df.iloc[e][counters[c]] = n_obj

            for j in range(len(feat[c])):
                for i in range(n_obj):
                    #maybe implement a better criterion, now getting the first 3 (high pT)
                    if i<3:
                        data = operator.attrgetter(cat+feat[c][j])(event)[i]
                        df.iloc[e][cat+feat[c][j]+str(i)] = data            
    return df

In [7]:
def trim_dataframe(df, feature_list):
    for col in df:
        if col not in set(feature_list):
            df = df.drop(columns = col)
    return df        
    

In [8]:
def read_process(pname,verbosity = False):
    dfs = []
    for root, dirs, filelist in os.walk(pname):
        if verbosity == True: print("--->Loading ", len(filelist)," root files.")
        for j,file in enumerate(filelist):
            if verbosity == True: print("--->Reading root file: ", file)
            dfs.append( get_dataframe(pname+"/"+file) )
            dfs[j]["file_root"] = file
            
    if verbosity == True: print("--->Concatenating dataframe")        
    df = pd.concat(dfs)    
    df["PROCESS"] = pname
    return df

In [9]:
def label_dataframe(df,label):
    df.insert(loc = 0, column = "label", value = label)
    return df

In [10]:
def pickle_batch(list_, jar_name):
    df_list = []
    for process in list_:
        #Load an entire process folder
        print(">Reading process ", process)
        df = read_process("ntuple/"+ process, verbosity = True)
        
        #Select important features to keep
        #df = trim_dataframe(df, new_flist)
        
        if process in set(Signal):
            label = 1 #Actual signal
        else:    
            label = 0 #Background
        df_list.append( label_dataframe(df,label))    
    df = pd.concat( df_list )
    df.to_pickle(jar_name)
    return df
        

In [None]:
for nbatch, root in enumerate(BackDir):
    flist = []
    for folder in os.listdir("ntuple/"):
        if folder.startswith(root):
            flist.append(folder)
            
    print(">>>Loading ", len(flist), " folders")    
    name = "BGround"+str(nbatch)+".pkle"
    pickle_batch(flist, name)

>>>Loading  24  folders
>Reading process  TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_eta_hct_TuneCP5-MadGraph5-pythia8_RunIIAutumn18
--->Loading  18  root files.
--->Reading root file:  3C10F556-3AF7-A34F-8C4F-59D2C4AAA91D_Skim.root
--->Reading root file:  763253A5-C104-5C49-9170-67E33BED184B_Skim.root
--->Reading root file:  CA746C0B-DDDB-B742-B319-8D1786DFF0D7_Skim.root
--->Reading root file:  20CBA33E-EFD4-C04E-8607-D9C619D13A73_Skim.root
--->Reading root file:  C91D9FD7-C174-4B47-B8DD-1FBEB591538B_Skim.root
--->Reading root file:  297EA7BC-735B-A748-A2B5-62DF2DED29F1_Skim.root
--->Reading root file:  D09EEE5C-81D3-AA4B-8699-A3AB2E8174A1_Skim.root
--->Reading root file:  5BEF19C0-5B0A-374C-BDD1-50787D51B640_Skim.root
--->Reading root file:  C6FA504E-CF85-DB4E-9766-51F839EC7234_Skim.root
--->Reading root file:  F04D0C06-16A8-6C41-9212-718AC05E51F5_Skim.root
--->Reading root file:  DD1B3C84-F1CD-2B4D-883D-048B4B4DE118_Skim.root
--->Reading root file:  62434923-F330-4747-B69D-7DB798729A2A_

--->Reading root file:  1930F644-A4DF-9441-BCDE-48B2D6045607_Skim.root
--->Reading root file:  DE610AC6-52C8-F243-B726-266E986C67C7_Skim.root
--->Reading root file:  5BB4B096-AC3F-BD49-B599-D43E0176890F_Skim.root
--->Reading root file:  99E411C5-8086-3C41-B5E0-8356B93A62AE_Skim.root
--->Reading root file:  E5D51928-D702-3B4E-93FF-10B011657478_Skim.root
--->Reading root file:  1BE985E2-0F6A-7B4D-98D8-A5CC8BDF64C0_Skim.root
--->Reading root file:  C329E2BB-0C74-A640-9F7B-DFC5505DA4A9_Skim.root
--->Reading root file:  0010502D-08FD-9A45-9B8F-A2FB501C776D_Skim.root
--->Reading root file:  4ADB829B-0293-0D48-8AEA-31AAFD1936B8_Skim.root
--->Reading root file:  54EC1465-9EDA-7B40-8042-1FD34081497A_Skim.root
--->Reading root file:  DB097816-5864-3640-A472-37E4518131AD_Skim.root
--->Reading root file:  FB37F4B8-4878-AC41-80AD-1AC7BCC96FBF_Skim.root
--->Reading root file:  7027E474-2CF4-354C-928D-26A03AC64602_Skim.root
--->Reading root file:  A1ACAB6F-3CE4-8E4B-A148-5CFB78AAB153_Skim.root
--->Re

--->Reading root file:  2492799E-1231-0547-873A-1F0E3453197D_Skim.root
--->Reading root file:  AB14F817-A8BD-0945-9F5E-72F0A36D9785_Skim.root
--->Reading root file:  CC8D6BF6-DD88-9445-9BC0-BF8DB9314F96_Skim.root
--->Reading root file:  81B67216-C5C6-D842-8F70-1D3BBCA1AAB6_Skim.root
--->Reading root file:  40B0A77E-DA40-1944-9E7E-8236A3EC4ADF_Skim.root
--->Reading root file:  CF992060-8A3C-5B4F-A221-319E76CC9C8D_Skim.root
--->Reading root file:  90F9EA31-1D94-154B-A39A-9372670FF89F_Skim.root
--->Reading root file:  2A4BD54D-CEEC-1444-A101-352DAAC7D7A3_Skim.root
--->Reading root file:  F6DADCB8-AE7C-A940-986F-CDE4F589FFC6_Skim.root
--->Reading root file:  E6739223-A20C-BB46-80F0-0A84480140A8_Skim.root
--->Reading root file:  F545ECA5-2CD7-B84E-B3A5-8E7C829DAF6C_Skim.root
--->Reading root file:  D54BCEC9-5DF3-094E-9758-BFD29A8DB105_Skim.root
--->Reading root file:  B6B14F03-3FD7-5641-82D3-13F18BAA4078_Skim.root
--->Reading root file:  6B5E5907-F3E1-AC46-8D55-21A5AB61CA5E_Skim.root
--->Re

--->Reading root file:  F00212E2-4470-1048-8E82-A66286C297F3_Skim.root
--->Reading root file:  8CCF49DE-1F75-A24C-9D95-A36BE0BDB6EB_Skim.root
--->Reading root file:  4CE55E1E-C33F-2944-8FC0-FFBFFCA365E0_Skim.root
--->Reading root file:  B27DF075-966F-3D4C-BAD2-BB936E5D6FA5_Skim.root
--->Reading root file:  FC64FA73-4E17-4946-AF88-3E1207455146_Skim.root
--->Reading root file:  6D53D36C-C911-914E-AC3F-2A867B9900C7_Skim.root
--->Reading root file:  CE1C4E0B-7DDB-4042-8C2F-9038628CA06E_Skim.root
--->Reading root file:  A54662B3-B2A1-B546-838D-745FA250468C_Skim.root
--->Reading root file:  70CBDD06-AFC8-EC40-A4BA-115141AEC3E2_Skim.root
--->Reading root file:  CBEC84EB-AF6B-BD43-93CF-437838E0E10F_Skim.root
--->Reading root file:  4407B8DA-3296-F04D-89C4-3A877386F1D7_Skim.root
--->Reading root file:  958C7DF3-01B6-604E-A714-EFA3DB20FE66_Skim.root
--->Reading root file:  54DE2FDB-0E5D-5144-950A-BA8C87D95FD2_Skim.root
--->Reading root file:  C199FA5D-586D-4C40-9242-2EC8CCE7CFFC_Skim.root
--->Re

--->Reading root file:  BF7429BF-12AF-1D4B-996A-8E074D7E7178_Skim.root
--->Reading root file:  D5F26319-BDC8-4149-897E-4FA930BB0C08_Skim.root
--->Reading root file:  7584D0B8-296B-C646-9602-039F7543C8A2_Skim.root
--->Reading root file:  7CD3C963-67B8-F243-B273-07B5D9FA58F9_Skim.root
--->Reading root file:  0E189825-DD1F-6049-803F-BBF24485FC25_Skim.root
--->Reading root file:  63F0272D-A567-404B-A9A2-4A7FFB524746_Skim.root
--->Reading root file:  33F6EAE6-E16E-D042-A7BD-849C3714902A_Skim.root
--->Reading root file:  75783488-77B4-A447-A3C3-3137B53400D9_Skim.root
--->Reading root file:  562CEFA8-C676-A445-A1E3-01A28456EBFE_Skim.root
--->Reading root file:  BE2B4615-BC9B-114B-995C-562C7D445EB1_Skim.root
--->Reading root file:  5264B5B8-B84A-FF42-97E6-4150DE63C7E2_Skim.root
--->Reading root file:  5CECC4A2-10CB-834E-A32C-C6599A936D70_Skim.root
--->Reading root file:  C4673F07-3B63-BA49-8F8C-BE5F41F465DF_Skim.root
--->Reading root file:  F04F31A3-A3A5-6A49-A540-594B76BDAFDC_Skim.root
--->Re

--->Reading root file:  7E4B98C8-DF23-A943-89B7-1DBAFC13B277_Skim.root
--->Reading root file:  6E6BF561-C72B-CA45-BE70-EC3A8F901178_Skim.root
--->Reading root file:  37014656-8350-9942-A491-C00397F1915E_Skim.root
--->Reading root file:  143960D3-320D-1A46-A9F2-3B55C351D369_Skim.root
--->Reading root file:  3D0A3F31-751E-5A47-8170-8957435C829A_Skim.root
--->Reading root file:  0F81F47A-ACDC-4C4B-9F52-41FA5AE49FE7_Skim.root
--->Reading root file:  AFA9B0BC-8923-574F-8392-FC63A5BB6837_Skim.root
--->Reading root file:  6250F7BC-699F-FC4B-A1DC-188DDE1B6397_Skim.root
--->Reading root file:  AF4B3D74-6F7F-4F4E-8456-FFE69CD362E4_Skim.root
--->Reading root file:  E6A2921B-4D01-2E45-9B61-2B55E456CE4E_Skim.root
--->Reading root file:  0BE92330-7DC4-E348-A5C0-2BCA591980B3_Skim.root
--->Reading root file:  8B949119-B857-BB4A-AE3D-E2969B8745B7_Skim.root
--->Reading root file:  D85F5394-A50A-CB45-AE37-704C69448F9F_Skim.root
--->Reading root file:  55505099-CA92-1549-89FC-3A7044623D4D_Skim.root
--->Re

--->Reading root file:  51A86D59-A769-404E-8FC5-14A21315B0C4_Skim.root
--->Reading root file:  BD764467-0BED-394C-A057-C170805CA59C_Skim.root
--->Reading root file:  29DEDDBF-E5A8-3343-8B48-24740047367C_Skim.root
--->Reading root file:  E9DF2D40-E744-2646-AC5F-A9EC2724CD20_Skim.root
--->Reading root file:  14733CB0-7C28-1240-A039-375720E35707_Skim.root
--->Reading root file:  94B2989F-DF04-B849-9AC2-D163D67160C2_Skim.root
--->Reading root file:  640CFBEC-5759-BC41-BD35-45FD08FD169F_Skim.root
--->Reading root file:  C2CCD574-5408-544C-B9F1-A494D4A1D136_Skim.root
--->Reading root file:  F610062C-8A86-9F4E-906E-BC2E320DC45A_Skim.root
--->Reading root file:  6C52B95A-30C6-B241-8830-6E285363FA6D_Skim.root
--->Reading root file:  BF05A371-93F2-6342-8CB2-B87C84B0778E_Skim.root
--->Reading root file:  8F0E3466-6862-054E-A6A6-9D9747C90F29_Skim.root
--->Reading root file:  D24FB895-32DE-4047-A89A-D56957900ED1_Skim.root
--->Reading root file:  1E4F26CC-C137-6946-AAB7-A4CD2F05358F_Skim.root
--->Re

--->Reading root file:  7E330CC2-206A-B84B-B0B2-A088110C8771_Skim.root
--->Reading root file:  F799ACEF-DBE5-1D4A-BC46-6B47B475C552_Skim.root
--->Reading root file:  C488408B-8DD2-734D-A575-FE5EA0FCF469_Skim.root
--->Reading root file:  84D28F19-A605-B942-B5B3-17EEF9E75754_Skim.root
--->Reading root file:  220B7AFF-9AAD-904A-A8B5-63C440A66D25_Skim.root
--->Reading root file:  3E73FDAF-6419-424E-A8F9-3D6660270BD4_Skim.root
--->Reading root file:  10C1C880-2AFA-1047-AA6A-6A6CCE3D9B13_Skim.root
--->Reading root file:  C66868D7-F7F8-6944-8247-43853C828C14_Skim.root
--->Reading root file:  ADA92E91-AA50-5448-B668-B4C5D55A18FA_Skim.root
--->Reading root file:  EDC409F3-D465-2C42-867E-1955A7608E38_Skim.root
--->Reading root file:  7A68AB85-867F-8C4A-A41C-F68A91E22BC6_Skim.root
--->Reading root file:  E42B72F8-467C-D94F-983B-472FA8B11F07_Skim.root
--->Reading root file:  2015B5CF-943C-8142-9D22-66800301F80A_Skim.root
--->Reading root file:  EBD5F1A4-5B73-3D4E-91FB-4F3E0208C3B7_Skim.root
--->Re

--->Reading root file:  A32CE2B6-C604-4745-A664-F6E84577C464_Skim.root
--->Reading root file:  B6EA20D3-C8EE-AC4E-B01C-575BDF23B0AB_Skim.root
--->Reading root file:  AFD0DA5C-87C8-064D-BB35-D7C4D52B313E_Skim.root
--->Reading root file:  3C153EB4-2647-EC44-8024-8491E60B4B9A_Skim.root
--->Reading root file:  425DE6DD-7CD9-264D-A69A-0DF9DA412101_Skim.root
--->Reading root file:  7BB450CF-9944-6D41-939A-E72CCECF332A_Skim.root
--->Reading root file:  D5643C25-43A2-064F-A45F-DBF60E3DBEE1_Skim.root
--->Reading root file:  3ADFF492-4C9E-3B4C-BCC8-2A243F43A518_Skim.root
--->Reading root file:  704F135D-F191-4840-B6F0-1256065051B7_Skim.root
--->Reading root file:  FFFF1B8B-1831-344C-B9C8-B7CC5ABEB626_Skim.root
--->Reading root file:  2E173B27-FDCB-BB4D-9101-7540D3CC81FA_Skim.root
--->Reading root file:  9C546DE0-298A-E64A-A904-1A7DDA00D1B6_Skim.root
--->Reading root file:  A75C4489-72D2-954C-A2E8-30A2E0372226_Skim.root
--->Reading root file:  6A823231-8D90-8C49-85F2-2A3D863DC7FB_Skim.root
--->Re

--->Reading root file:  D7FFBB9D-8FF8-A046-AD67-DA85A34CE5B4_Skim.root
--->Reading root file:  18B40B74-346E-9C4B-9584-725FECEE4007_Skim.root
--->Reading root file:  3872B9B9-AE79-3841-ACA6-6509DDA625AC_Skim.root
--->Reading root file:  9129999F-0AB6-1A40-9DA8-268018BDB072_Skim.root
--->Reading root file:  A5BB5207-755E-5D4E-ADF4-AAB398A0B781_Skim.root
--->Reading root file:  8607ACC2-9CC5-2B45-BAE0-BBF246AA5743_Skim.root
--->Reading root file:  3DD95880-7D14-344A-936D-EB4AE471E398_Skim.root
--->Reading root file:  26F3DAD5-4ED0-EB4D-A4B7-E5415B7DB159_Skim.root
--->Reading root file:  6D5FC985-D0B3-4F40-94A7-B26A053D7153_Skim.root
--->Reading root file:  D2A33A0A-BAF9-E34D-AE75-8AB231DBD5AD_Skim.root
--->Reading root file:  2F96FE17-BFFC-7F49-A808-C14730AD0DB9_Skim.root
--->Reading root file:  43A44AE8-31DC-BE46-A660-390A95419BDE_Skim.root
--->Reading root file:  CACE28A9-85B7-BB45-BF5C-34157E60588B_Skim.root
--->Reading root file:  F7144BCD-645B-444D-A996-991AE948599A_Skim.root
--->Re

In [11]:
ls = []
for i in range(1,10):
    ls.append(pd.read_pickle("BGround"+str(i)+".pkle"))

In [12]:
bg = pd.concat(ls)

In [13]:
bg

Unnamed: 0,label,nElectron,Electron_pt0,Electron_mass0,Electron_charge0,Electron_phi0,Electron_eta0,Electron_pt1,Electron_mass1,Electron_charge1,...,Jet_mass1,Jet_nConstituents1,Jet_btagDeepB2,Jet_btagDeepC2,Jet_eta2,Jet_phi2,Jet_mass2,Jet_nConstituents2,file_root,PROCESS
0,0,1.0,8.036550,-0.003635,-1.0,2.350586,-1.045898,0.000000,0.000000,0.0,...,14.671875,20.0,0.011337,0.069458,-0.236023,3.094727,14.656250,35.0,B7B7DF12-53D0-A34A-B23C-445BFE4908BF_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
1,0,1.0,5.839515,-0.007023,1.0,-1.845703,2.005371,0.000000,0.000000,0.0,...,10.265625,30.0,0.036713,0.124329,0.240326,2.766602,10.445312,26.0,B7B7DF12-53D0-A34A-B23C-445BFE4908BF_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
2,0,2.0,8.819495,-0.003479,1.0,-0.636719,1.189697,6.159093,0.001418,1.0,...,8.085938,22.0,0.076233,0.172363,0.567627,0.722534,7.949219,17.0,B7B7DF12-53D0-A34A-B23C-445BFE4908BF_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
3,0,1.0,17.129496,-0.010803,1.0,1.968506,-1.388916,0.000000,0.000000,0.0,...,17.312500,27.0,0.982910,0.015617,-1.441162,2.018066,13.007812,30.0,B7B7DF12-53D0-A34A-B23C-445BFE4908BF_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
4,0,2.0,13.363698,-0.003376,1.0,0.293152,-0.238525,8.615698,-0.002766,-1.0,...,7.703125,18.0,0.032013,0.355469,-0.215179,-0.917969,6.457031,14.0,B7B7DF12-53D0-A34A-B23C-445BFE4908BF_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,0,2.0,3.360288,-0.001740,-1.0,-2.854980,1.695312,5.508017,0.003296,1.0,...,7.277344,14.0,0.142944,0.081421,0.939087,1.876221,6.156250,5.0,783BB937-A7F1-2643-90B5-6F7752F35200_Skim.root,ntuple/WZTo2L2Q_13TeV_amcatnloFXFX_madspin_pyt...
806,0,1.0,18.189581,-0.024292,1.0,-1.886230,-2.202148,0.000000,0.000000,0.0,...,4.867188,4.0,0.164429,0.087341,2.007812,-2.877441,5.203125,5.0,783BB937-A7F1-2643-90B5-6F7752F35200_Skim.root,ntuple/WZTo2L2Q_13TeV_amcatnloFXFX_madspin_pyt...
807,0,1.0,32.030087,0.012032,1.0,0.378540,-0.891968,0.000000,0.000000,0.0,...,14.625000,24.0,0.229370,0.446045,0.165558,1.205322,8.812500,20.0,783BB937-A7F1-2643-90B5-6F7752F35200_Skim.root,ntuple/WZTo2L2Q_13TeV_amcatnloFXFX_madspin_pyt...
808,0,1.0,63.227791,0.040649,1.0,-0.527710,1.892090,0.000000,0.000000,0.0,...,9.828125,14.0,0.318604,0.108459,1.776367,1.668701,9.171875,8.0,783BB937-A7F1-2643-90B5-6F7752F35200_Skim.root,ntuple/WZTo2L2Q_13TeV_amcatnloFXFX_madspin_pyt...


In [14]:
signal = pd.read_pickle("Signal.pkle")

In [None]:
signal["eventWeightLumi"].hist(bins = 100)

In [15]:
bgsample = bg.sample(frac=0.02)

In [16]:
bgsample

Unnamed: 0,label,nElectron,Electron_pt0,Electron_mass0,Electron_charge0,Electron_phi0,Electron_eta0,Electron_pt1,Electron_mass1,Electron_charge1,...,Jet_mass1,Jet_nConstituents1,Jet_btagDeepB2,Jet_btagDeepC2,Jet_eta2,Jet_phi2,Jet_mass2,Jet_nConstituents2,file_root,PROCESS
8037,0,1.0,47.741756,-0.012627,1.0,-1.153564,-0.154144,0.000000,0.000000,0.0,...,7.453125,10.0,0.404785,0.193481,-1.047363,1.386963,8.054688,16.0,A71D8AA8-1F6A-DB4C-B71B-4D9714F3B8FB_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
11555,0,1.0,35.368259,-0.012077,-1.0,1.599121,-0.821533,0.000000,0.000000,0.0,...,16.312500,26.0,0.698242,0.291992,-0.249634,-2.117188,13.179688,18.0,90F9EA31-1D94-154B-A39A-9372670FF89F_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
6633,0,1.0,27.421120,-0.010963,-1.0,-2.010254,1.016846,0.000000,0.000000,0.0,...,8.914062,21.0,0.031616,0.094910,0.644775,-2.788086,12.820312,31.0,F11E8229-DF57-A84C-AC4B-DE6C3CB97A20_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_PSweights_13Te...
2531,0,2.0,8.904668,-0.003775,1.0,0.719604,0.178894,5.155866,0.001488,-1.0,...,13.578125,29.0,0.016159,0.071228,1.741211,0.142792,14.898438,22.0,3050E63D-BE0F-D047-8166-ACD50A0F4BE8_Skim.root,ntuple/TTZToLLNuNu_M-10_TuneCP5_13TeV-amcatnlo...
25321,0,5.0,50.530613,0.015503,1.0,2.421875,-0.086090,9.336543,0.003168,1.0,...,12.765625,29.0,0.038910,0.105042,-0.056244,1.466553,8.710938,11.0,96F276D6-1C38-9F4F-93CE-503B860AB550_Skim.root,ntuple/TTZToLLNuNu_M-10_TuneCUETP8M1_13TeV-amc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,0,1.0,33.980400,0.004299,-1.0,-0.521484,0.274292,0.000000,0.000000,0.0,...,10.820312,16.0,0.025192,0.107910,0.272705,-0.520020,2.890625,6.0,FC7B4051-7739-6446-93F7-733607440377_Skim.root,ntuple/WmWmJJ_EWK_TuneCP5_13TeV-powheg-pythia8...
7749,0,1.0,8.779903,-0.007412,-1.0,2.909668,-1.873535,0.000000,0.000000,0.0,...,22.078125,41.0,0.056824,0.133911,-0.875366,2.679688,15.984375,33.0,2B101AD6-3A12-D247-8924-032CFF7AE82F_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
7736,0,1.0,18.391096,0.010368,1.0,-0.379822,0.994263,0.000000,0.000000,0.0,...,10.335938,23.0,0.093628,0.162109,0.989746,-0.385071,10.867188,32.0,8CC8B22B-6E75-F648-97DA-0ACB3AB5DB98_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
10715,0,3.0,87.648651,0.027557,-1.0,-0.910278,0.578979,21.911482,0.026108,1.0,...,9.656250,11.0,0.026733,0.083008,0.576172,-0.916870,6.277344,8.0,1AAA906F-C31A-5F42-95D9-CC5CA9E69E09_Skim.root,ntuple/TTZToLLNuNu_M-10_TuneCP5_13TeV-amcatnlo...


In [17]:
dataset = pd.concat([bgsample,signal])

In [18]:
dataset

Unnamed: 0,label,nElectron,Electron_pt0,Electron_mass0,Electron_charge0,Electron_phi0,Electron_eta0,Electron_pt1,Electron_mass1,Electron_charge1,...,Jet_mass1,Jet_nConstituents1,Jet_btagDeepB2,Jet_btagDeepC2,Jet_eta2,Jet_phi2,Jet_mass2,Jet_nConstituents2,file_root,PROCESS
8037,0,1.0,47.741756,-0.012627,1.0,-1.153564,-0.154144,0.000000,0.000000,0.0,...,7.453125,10.0,0.404785,0.193481,-1.047363,1.386963,8.054688,16.0,A71D8AA8-1F6A-DB4C-B71B-4D9714F3B8FB_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
11555,0,1.0,35.368259,-0.012077,-1.0,1.599121,-0.821533,0.000000,0.000000,0.0,...,16.312500,26.0,0.698242,0.291992,-0.249634,-2.117188,13.179688,18.0,90F9EA31-1D94-154B-A39A-9372670FF89F_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_13TeV-powheg-p...
6633,0,1.0,27.421120,-0.010963,-1.0,-2.010254,1.016846,0.000000,0.000000,0.0,...,8.914062,21.0,0.031616,0.094910,0.644775,-2.788086,12.820312,31.0,F11E8229-DF57-A84C-AC4B-DE6C3CB97A20_Skim.root,ntuple/TTToSemiLeptonic_TuneCP5_PSweights_13Te...
2531,0,2.0,8.904668,-0.003775,1.0,0.719604,0.178894,5.155866,0.001488,-1.0,...,13.578125,29.0,0.016159,0.071228,1.741211,0.142792,14.898438,22.0,3050E63D-BE0F-D047-8166-ACD50A0F4BE8_Skim.root,ntuple/TTZToLLNuNu_M-10_TuneCP5_13TeV-amcatnlo...
25321,0,5.0,50.530613,0.015503,1.0,2.421875,-0.086090,9.336543,0.003168,1.0,...,12.765625,29.0,0.038910,0.105042,-0.056244,1.466553,8.710938,11.0,96F276D6-1C38-9F4F-93CE-503B860AB550_Skim.root,ntuple/TTZToLLNuNu_M-10_TuneCUETP8M1_13TeV-amc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,1,2.0,8.373197,0.004055,-1.0,0.910767,2.014648,7.116570,0.001748,-1.0,...,6.148438,7.0,0.037018,0.156860,0.170837,-2.887695,6.210938,15.0,DBCD3319-98A0-0543-AC2F-627B6DA85023_Skim.root,ntuple/TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_...
2014,1,1.0,17.869263,-0.006237,1.0,-3.060059,-0.624268,0.000000,0.000000,0.0,...,25.343750,27.0,0.017075,0.090698,1.096680,-1.412354,11.156250,31.0,DBCD3319-98A0-0543-AC2F-627B6DA85023_Skim.root,ntuple/TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_...
2015,1,1.0,22.707615,-0.004124,-1.0,-2.412598,1.024658,0.000000,0.000000,0.0,...,2.156250,3.0,0.997070,0.002398,0.216644,1.414795,5.539062,22.0,DBCD3319-98A0-0543-AC2F-627B6DA85023_Skim.root,ntuple/TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_...
2016,1,3.0,22.325428,-0.027130,1.0,3.131348,2.067383,13.363412,0.000673,-1.0,...,13.796875,35.0,-2.000000,-1.000000,2.938477,-2.416504,10.875000,14.0,DBCD3319-98A0-0543-AC2F-627B6DA85023_Skim.root,ntuple/TT_FCNC-aTtoHJ_Tleptonic_HToWWZZtautau_...


In [19]:
sample = dataset.sample(frac = 1)

In [20]:
c = 0
for i in sample["label"]:
    if i == 1:
        c+=1

In [21]:
c

165883

In [22]:
sample = sample.drop(columns = ["PROCESS","file_root"])

In [None]:
sample["ev"]

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [24]:
train, test = train_test_split(sample, test_size=0.2)

In [42]:
c = 0
for i in test["label"]:
    if i == 1:
        c+=1

In [43]:
c

33299

In [25]:
mms = MinMaxScaler()

X_train = pd.DataFrame(mms.fit_transform(train.iloc[:,1:]),columns = train.columns[1:])
X_test  = pd.DataFrame(mms.fit_transform(test.iloc[:,1:]),columns = test.columns[1:])

In [45]:
len(X_train)

329799

In [26]:
X_train.insert(loc=0,column="label",value=train["label"].values)
X_test.insert(loc=0,column="label",value=test["label"].values)

In [27]:
X_train.to_csv("CMStrainBalance.csv")
X_test.to_csv("CMStestBalance.csv")

In [48]:
train = pd.read_csv("CMStrainBalance.csv")

In [49]:
train.describe()

Unnamed: 0.1,Unnamed: 0,label,CaloMET_pt,nElectron,Electron_dxy,Electron_dz,Electron_eta,Electron_mass,Electron_phi,Electron_pt,...,Photon_charge,nTau,Tau_dxy,Tau_dz,Tau_eta,Tau_mass,Tau_phi,Tau_pt,Tau_charge,eventWeightLumi
count,329799.0,329799.0,329799.0,329799.0,329799.0,329799.0,329799.0,329799.0,329799.0,329799.0,...,329799.0,329799.0,329799.0,329799.0,329799.0,329799.0,329799.0,329799.0,329799.0,329799.0
mean,164899.0,0.402015,0.046412,0.078711,0.48762,0.377069,0.535762,0.454471,0.497045,0.032372,...,0.0,0.295639,0.789415,0.491295,0.503398,0.006718,0.500291,0.036124,0.639881,0.095547
std,95204.915052,0.490306,0.031051,0.121114,0.006301,0.007948,0.212949,0.022885,0.288395,0.035083,...,0.0,0.117903,0.002507,0.005635,0.218434,0.023753,0.289121,0.029126,0.480036,0.011385
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,82449.5,0.0,0.025931,0.0,0.487529,0.37703,0.370979,0.447151,0.246969,0.010197,...,0.0,0.25,0.789221,0.491283,0.339968,0.0,0.249301,0.017457,0.0,0.0876
50%,164899.0,0.0,0.040617,0.0,0.487595,0.377049,0.535676,0.454326,0.495422,0.021517,...,0.0,0.25,0.789425,0.491321,0.503016,0.0,0.501059,0.028323,1.0,0.093668
75%,247348.5,1.0,0.059277,0.166667,0.487662,0.377069,0.700677,0.46171,0.746114,0.040972,...,0.0,0.375,0.78963,0.491358,0.666771,0.0,0.750622,0.045484,1.0,0.098376
max,329798.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [45]:
pd.read_csv("CMStest.csv").to_pickle("CMS_test.pkle")

In [107]:
test = pd.read_csv("CMStest.csv",index_col=0)

In [111]:
features = list(test.columns)
features.remove("label")