-
Notifications
You must be signed in to change notification settings - Fork 0
/
ntuple_processor.py
354 lines (288 loc) · 10.2 KB
/
ntuple_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
# Kai Chang - Caltech CMS-CERN 2016
#
# Program takes the root file created from the reco-ntuplizer for the HGC
# and produces appropriate numpy arrays for the trees inside.
# This is the hardcore version of the former root_processor.py
# Has to run on cmsenv.
# Then formats them for autoencoding.
#
#
# Blocks (BLOCKS) of code taken from RootProcessor.py
# https://github.com/bencbartlett/tVertexing/blob/master/RootProcessor.py
# =============================================================================
'''
Usage (in cmsenv): ntuple_processor.py <[file or directory]> <[options]>
See parser for more details.
'''
import argparse
import ROOT as rt
import numpy as np
import os, sys, platform
import glob
import sys
sys.path.insert(0, './utils')
import wafer
from FastProgressBar import progressbar
from copy import deepcopy
# Argument parser
parser = argparse.ArgumentParser(description =
'''Converts HGCROIAnalysis-format .root files into RecHit numpy arrays.
If converting all files in a directory using -f, make sure all files are HGCROI-formatted.''',\
epilog = '''>> Example: python RootProcessor.py -f ~/public/testfile.root -o ~/public/data
NOTE: This program must be run while cmsenv is active.
''')
parser.add_argument("-d", "--directory", \
help="Process all files in this directory. Default is current path.", action="store")
parser.add_argument("-o", "--outdir", \
help="Directory to save output to. Defaults to Data/.", action="store")
parser.add_argument("-t", "--thickdir", \
help="location of thickness separation array.", action="store")
args = parser.parse_args()
def process(f, outdir):
'''
Usage: process(fileName, writeDirectory)
Takes a HGCROI-formatted .root file and converts it to a list of structured numpy arrays.
The format is as follows:
List of events
For each event, array for various data structures (RecHits, Clusters, etc.).
Unfortunately, there is no way to do recursive recarrays, so we use the following
organisation method:
eventArray[0]=event
eventArray[1]=rechits
For each data structure array, recarray of properties
For example:
(Index 0 = Event 0)
(Index 2 = RecHits)
'x' -> [1.23, 4.25, ...]
'y' -> [5.24, 6.42, ...]
...
'clusterID' -> [1, 2, 1, 1, ...]
(Index 1 = Clusters)
'centerX' -> [3.21, 2.56, ...]
...
...
(Index 1 = Event 1)
(Index 0 = RecHits)
...
(Index 1 = Clusters)
...
...
...
So, for example, to access the array of x positions in the RecHits of the 7th event,
you would use:
xpos = Data[7][2]['x']
'''
# initialization
print "Processing file: " + str(f) + "..."
outArray = [] # produced array
fIn = rt.TFile.Open(f) # open .root file
tree = fIn.Get('ana/hgc') # find tree
n_entries = tree.GetEntries() # get total # of entries
print 'number of events: ', n_entries
# storing entries
for i in xrange(0, n_entries):
tree.GetEntry(i)
eventArray = []
names = ""
# rechits
rechits = True
r_layer = []
r_wafer = []
r_cell = []
r_x = []
r_y = []
r_z = []
r_eta = []
r_phi = []
r_energy = []
r_time = []
r_thickness = []
r_isHalf = []
r_flags = []
r_cluster2d = []
if rechits:
for hit in tree.rechits:
r_layer.append(hit.layer)
r_wafer.append(hit.wafer)
r_cell.append(hit.cell)
r_x.append(hit.x)
r_y.append(hit.y)
r_z.append(hit.z)
r_eta.append(hit.eta)
r_phi.append(hit.phi)
r_energy.append(hit.energy)
r_time.append(hit.time)
r_thickness.append(hit.thickness)
r_isHalf.append(hit.isHalf)
r_flags.append(hit.flags)
r_cluster2d.append(hit.cluster2d)
rechits_array = np.core.records.fromarrays([r_layer, r_wafer, r_cell, r_x, r_y, r_z, \
r_eta, r_phi, r_energy, r_time, r_thickness, r_isHalf, r_flags, r_cluster2d], \
names='layer, wafer, cell, x, y, z, eta, phi, energy, time, thickness, isHalf, flags, cluster2d')
eventArray.append(rechits_array)
names += 'rechits'
else:
eventArray.append([])
# combine arrays into single event and append to outArray
outArray.append(eventArray)
# save final result to file
filename = str(f[:-5]) + '.npy' # replaces .root with .npy
print filename
filename = filename.split("/")[-1] # removes directory prefixes
filepath = outdir+filename
print 'writing file ' + os.path.abspath(filepath) + '...'
np.save(filepath, outArray) # saves array into .npy file
print 'Processing complete. \n'
def create_feeding_arrays(files, t_list, outdir):
''' Merge data into three numpy datasets sorted by wafer thickness.
Currently only using layer 10 data.
Parameters
----------
t_list : Thickness list
outdir : Location where dae-ready npy file will be located
fname : name of dae npy file
'''
# creating thickness arrays formatted for autoencoding
t100 = []
t200 = []
t300 = []
# creating arrays formatted for displaying which wafers are on which samples
t100_wafer = []
t200_wafer = []
t300_wafer = []
# creating organized array for picture reconstruction of entire layer
outArray = []
# creating wafers with appropriate cell count
# l_array = [[] for i in range(0, 28)] # 28 layers in HGCal
w_array = [[] for i in range(0, 516)] # 516 total wafers in layer 10 of HGCal
c1_array = [0 for i in range(240)] # thinner wafers, nominally 256
c23_array = [0 for i in range(133)] # thicker wafers, nominally 128, position info
for idx, t in np.ndenumerate(t_list):
print t
if idx[0] == 0:
c_array = c1_array
empty_array = [0 for i in xrange(240)]
else:
c_array = c23_array
empty_array = [0 for i in xrange(133)]
for i in t:
w_array[i] = deepcopy(c_array)
# IndexError wafers
weird_wf = []
for fi in files:
f = np.load(fi)
for event in f:
layer = deepcopy(w_array) # create a clean layer 10 for each event
# layer_wafer = []
for hit in event[0]:
if hit['layer'] == 10:
try:
layer[hit['wafer']][hit['cell']] = hit['energy'] # both index at 0
except IndexError:
# print 'Index (w: %i,c: %i), (t: %i,h: %i) out of range' %(hit['wafer'], hit['cell'], hit['thickness'],hit['isHalf'])
weird_wf.append(hit['wafer'])
# outArray.append(layer) #append layer 10 instance to outArray
# outArray.append(layer_wafer)
for indx, w in enumerate(layer):
if w != empty_array:
if indx in t_list[0]:
t100.append(w)
t100_wafer.append(indx)
elif indx in t_list[1]:
t200.append(w)
t200_wafer.append(indx)
elif indx in t_list[2]:
t300.append(w)
t300_wafer.append(indx)
else:
pass
print 'completed processing %s.' %fi
weird_wf = set(weird_wf)
print 'writing autoencoding formatted file'
fname = str(files[0][:-5])
# save 100um dataset to file
filename = fname + 't100.npy' # replaces .root with .npy
filename = filename.split("/")[-1] # removes directory prefixes
filepath = outdir+filename
print 'writing file ' + os.path.abspath(filepath) + '...'
np.save(filepath, t100) # saves array into .npy file
# save 200um dataset to file
filename = fname + 't200.npy' # replaces .root with .npy
filename = filename.split("/")[-1] # removes directory prefixes
filepath = outdir+filename
print 'writing file ' + os.path.abspath(filepath) + '...'
np.save(filepath, t200) # saves array into .npy file
# save 300um dataset to file
filename = fname + 't300.npy' # replaces .root with .npy
filename = filename.split("/")[-1] # removes directory prefixes
filepath = outdir+filename
print 'writing file ' + os.path.abspath(filepath) + '...'
np.save(filepath, t300) # saves array into .npy file
print 'writing additional information files'
# save the weird wafers
filepath = outdir + 'weird_wafer.npy'
print 'writing file ' + os.path.abspath(filepath) + '...'
np.save(filepath, weird_wf)
# save 100um dataset position to file
filename = fname + 't100_src.npy' # replaces .root with .npy
filename = filename.split("/")[-1] # removes directory prefixes
filepath = outdir+filename
print 'writing file ' + os.path.abspath(filepath) + '...'
np.save(filepath, t100_wafer) # saves array into .npy file
# save 200um dataset positions to file
filename = fname + 't200_src.npy' # replaces .root with .npy
filename = filename.split("/")[-1] # removes directory prefixes
filepath = outdir+filename
print 'writing file ' + os.path.abspath(filepath) + '...'
np.save(filepath, t200_wafer) # saves array into .npy file
# save 300um dataset positions to file
filename = fname + 't300_src.npy' # replaces .root with .npy
filename = filename.split("/")[-1] # removes directory prefixes
filepath = outdir+filename
print 'writing file ' + os.path.abspath(filepath) + '...'
np.save(filepath, t300_wafer) # saves array into .npy file
# saving entire layer
# filename = fname + 'layer10.npy' # replaces .root with .npy
# filename = filename.split("/")[-1] # removes directory prefixes
# filepath = outdir+filename
# print 'writing file ' + os.path.abspath(filepath) + '...'
# np.save(filepath, outArray) # saves array into .npy file
# print 'Process complete.'
if __name__ == "__main__":
if args.outdir == None:
if not os.path.exists("data"):
os.makedirs("data")
outdir = os.getcwd() + '/data/'
else:
outdir = args.outdir
if args.thickdir == None:
t_list = wafer.genThickness()
else:
t_list = np.load(thickdir)
if args.directory == None:
directory = os.getcwd()
else:
directory = args.directory
os.chdir(directory)
files = [f for f in os.listdir(directory) if f.endswith('.root')]
for file in files:
try:
process(file, outdir)
except TypeError:
raise
# this will store each simulation (RECO) as a .npy where the np to tensor program will loop through each .npy
# and save them into an array for that .npy, and then merge all these arrays so we have independent data for
# the same layer/wafer but different simulation.
# move back to appropriate folder
os.chdir(outdir)
if not os.path.exists("ae_fmt"):
os.makedirs("ae_fmt")
outdir = os.getcwd() + '/ae_fmt/'
files = [f for f in os.listdir('./pdg/') if f.endswith('.npy')]
files = [f for f in files if 'pdg12' in f]
os.chdir('./pdg/')
try:
create_feeding_arrays(files, t_list, outdir)
except TypeError:
raise
raw_input("Operation completed\nPress enter to return to the terminal.")