/
WorkBalance.C
278 lines (232 loc) · 8 KB
/
WorkBalance.C
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
//* This file is part of the MOOSE framework
//* https://www.mooseframework.org
//*
//* All rights reserved, see COPYRIGHT for full restrictions
//* https://github.com/idaholab/moose/blob/master/COPYRIGHT
//*
//* Licensed under LGPL 2.1, please see LICENSE for details
//* https://www.gnu.org/licenses/lgpl-2.1.html
#include "WorkBalance.h"
// MOOSE includes
#include "MooseVariable.h"
#include "ThreadedElementLoopBase.h"
#include "ThreadedNodeLoop.h"
#include "libmesh/quadrature.h"
#include <numeric>
registerMooseObject("MooseApp", WorkBalance);
template <>
InputParameters
validParams<WorkBalance>()
{
  InputParameters wb_params = validParams<GeneralVectorPostprocessor>();

  wb_params.addClassDescription("Computes several metrics for workload balance per processor");

  // The enum values mirror MOOSE's system ordering: the nonlinear system is
  // always system 0 and the auxiliary system is system 1; -1 selects both.
  MooseEnum system_selection("ALL=-1 NL AUX", "ALL");
  wb_params.addParam<MooseEnum>(
      "system",
      system_selection,
      "The system(s) to retrieve the number of DOFs from (NL, AUX, ALL). Default == ALL");

  wb_params.addParam<bool>("sync_to_all_procs",
                           false,
                           "Whether or not to sync the vectors to all processors. By default we only "
                           "sync them to processor 0 so they can be written out. Setting this to "
                           "true will use more communication, but is necessary if you expect these "
                           "vectors to be available on all processors");

  return wb_params;
}
WorkBalance::WorkBalance(const InputParameters & parameters)
  : GeneralVectorPostprocessor(parameters),
    _system(getParam<MooseEnum>("system")),
    _sync_to_all_procs(getParam<bool>("sync_to_all_procs")),
    // Per-processor tallies: accumulated in execute(), reset in initialize()
    _local_num_elems(0),
    _local_num_nodes(0),
    _local_num_dofs(0),
    _local_num_partition_sides(0),
    _local_partition_surface_area(0),
    // Output vectors: after finalize() gathers results, each holds one entry
    // per processor (on processor 0, or everywhere if sync_to_all_procs)
    _pid(declareVector("pid")),
    _num_elems(declareVector("num_elems")),
    _num_nodes(declareVector("num_nodes")),
    _num_dofs(declareVector("num_dofs")),
    _num_partition_sides(declareVector("num_partition_sides")),
    _partition_surface_area(declareVector("partition_surface_area"))
{
}
void
WorkBalance::initialize()
{
  // Reset every per-invocation tally so execute() accumulates from zero.
  _local_num_elems = _local_num_nodes = _local_num_dofs = _local_num_partition_sides = 0;
  _local_partition_surface_area = 0;
}
namespace
{
// Helper threaded loop over local active elements. Tallies the number of
// elements, the number of element DOFs (for the requested system or all
// systems), and the number/surface area of sides shared with other processors.
class WBElementLoop : public ThreadedElementLoopBase<ConstElemRange>
{
public:
  WBElementLoop(MooseMesh & mesh, int system)
    : ThreadedElementLoopBase(mesh),
      _system(system),
      _local_num_elems(0),
      _local_num_dofs(0),
      _local_num_partition_sides(0),
      _local_partition_surface_area(0),
      _this_pid(_mesh.processor_id()) // Get this once because it is expensive
  {
  }

  // Splitting constructor used by Threads::parallel_reduce
  WBElementLoop(WBElementLoop & x, Threads::split split)
    : ThreadedElementLoopBase(x, split),
      _system(x._system),
      _local_num_elems(0),
      _local_num_dofs(0),
      _local_num_partition_sides(0),
      _local_partition_surface_area(0),
      _this_pid(x._this_pid)
  {
  }

  virtual ~WBElementLoop() = default;

  // Reset the tallies before each traversal
  virtual void pre() override
  {
    _local_num_elems = 0;
    _local_num_dofs = 0;
    _local_num_partition_sides = 0;
    _local_partition_surface_area = 0;
  }

  virtual void onElement(const Elem * elem) override
  {
    _local_num_elems++;

    // Find out how many dofs there are on this element
    if (_system == WorkBalance::ALL) // All systems
    {
      auto n_sys = elem->n_systems();
      for (decltype(n_sys) sys = 0; sys < n_sys; sys++)
      {
        auto n_vars = elem->n_vars(sys);
        for (decltype(n_vars) var = 0; var < n_vars; var++)
          _local_num_dofs += elem->n_dofs(sys, var);
      }
    }
    else // Particular system
    {
      auto n_vars = elem->n_vars(static_cast<unsigned int>(_system));
      for (decltype(n_vars) var = 0; var < n_vars; var++)
        _local_num_dofs += elem->n_dofs(static_cast<unsigned int>(_system), var);
    }
  }

  virtual void onInternalSide(const Elem * elem, unsigned int side) override
  {
    // Count sides whose neighbor lives on another processor.
    // Note: neighbor_ptr() is the non-deprecated libMesh accessor
    // (Elem::neighbor() is deprecated).
    if (elem->neighbor_ptr(side)->processor_id() != _this_pid)
    {
      _local_num_partition_sides++;

      // Build the side so we can compute its volume (i.e. its surface area)
      auto side_elem = elem->build_side(side);
      _local_partition_surface_area += side_elem->volume();
    }
  }

  // Combine the tallies from a split-off copy back into this one
  void join(const WBElementLoop & y)
  {
    _local_num_elems += y._local_num_elems;
    _local_num_dofs += y._local_num_dofs;
    _local_num_partition_sides += y._local_num_partition_sides;
    _local_partition_surface_area += y._local_partition_surface_area;
  }

  int _system;

  dof_id_type _local_num_elems;
  dof_id_type _local_num_dofs;
  dof_id_type _local_num_partition_sides;
  Real _local_partition_surface_area;

  processor_id_type _this_pid;
};
// Helper threaded loop over local nodes. Tallies the number of nodes and the
// number of nodal DOFs (for the requested system or all systems).
class WBNodeLoop : public ThreadedNodeLoop<ConstNodeRange, ConstNodeRange::const_iterator>
{
public:
  WBNodeLoop(FEProblemBase & fe_problem, int system)
    : ThreadedNodeLoop<ConstNodeRange, ConstNodeRange::const_iterator>(fe_problem),
      _system(system),
      _local_num_nodes(0),
      _local_num_dofs(0)
  {
  }

  // Splitting constructor used by Threads::parallel_reduce
  WBNodeLoop(WBNodeLoop & x, Threads::split split)
    : ThreadedNodeLoop<ConstNodeRange, ConstNodeRange::const_iterator>(x, split),
      _system(x._system),
      _local_num_nodes(0),
      _local_num_dofs(0)
  {
  }

  // 'override' added so the compiler verifies this matches the base virtual
  virtual void onNode(ConstNodeRange::const_iterator & node_it) override
  {
    auto & node = *(*node_it);

    _local_num_nodes++;

    // Find out how many dofs there are on this node
    if (_system == WorkBalance::ALL) // All systems
    {
      auto n_sys = node.n_systems();
      for (decltype(n_sys) sys = 0; sys < n_sys; sys++)
      {
        auto n_vars = node.n_vars(sys);
        for (decltype(n_vars) var = 0; var < n_vars; var++)
          _local_num_dofs += node.n_dofs(sys, var);
      }
    }
    else // Particular system
    {
      auto n_vars = node.n_vars(static_cast<unsigned int>(_system));
      for (decltype(n_vars) var = 0; var < n_vars; var++)
        _local_num_dofs += node.n_dofs(static_cast<unsigned int>(_system), var);
    }
  }

  // Combine the tallies from a split-off copy back into this one
  void join(WBNodeLoop & y)
  {
    _local_num_nodes += y._local_num_nodes;
    _local_num_dofs += y._local_num_dofs;
  }

  int _system;

  dof_id_type _local_num_nodes;
  dof_id_type _local_num_dofs;
};
} // End of blank namespace
void
WorkBalance::execute()
{
auto & mesh = _fe_problem.mesh();
// Get all of the Elem info first
auto wb_el = WBElementLoop(mesh, _system);
Threads::parallel_reduce(*mesh.getActiveLocalElementRange(), wb_el);
_local_num_elems = wb_el._local_num_elems;
_local_num_dofs = wb_el._local_num_dofs;
_local_num_partition_sides = wb_el._local_num_partition_sides;
_local_partition_surface_area = wb_el._local_partition_surface_area;
// Now Node info
auto wb_nl = WBNodeLoop(_fe_problem, _system);
Threads::parallel_reduce(*mesh.getLocalNodeRange(), wb_nl);
_local_num_nodes = wb_nl._local_num_nodes;
_local_num_dofs += wb_nl._local_num_dofs;
}
void
WorkBalance::finalize()
{
  if (_sync_to_all_procs)
  {
    // Replicate the per-processor values on every rank (more communication,
    // but the vectors become usable everywhere)
    _communicator.allgather(static_cast<Real>(_local_num_elems), _num_elems);
    _communicator.allgather(static_cast<Real>(_local_num_nodes), _num_nodes);
    _communicator.allgather(static_cast<Real>(_local_num_dofs), _num_dofs);
    _communicator.allgather(static_cast<Real>(_local_num_partition_sides), _num_partition_sides);
    _communicator.allgather(_local_partition_surface_area, _partition_surface_area);
  }
  else
  {
    // Only processor 0 needs the values (e.g. for writing output)
    _communicator.gather(0, static_cast<Real>(_local_num_elems), _num_elems);
    _communicator.gather(0, static_cast<Real>(_local_num_nodes), _num_nodes);
    _communicator.gather(0, static_cast<Real>(_local_num_dofs), _num_dofs);
    _communicator.gather(0, static_cast<Real>(_local_num_partition_sides), _num_partition_sides);
    _communicator.gather(0, _local_partition_surface_area, _partition_surface_area);
  }

  // Label each row with its processor id - this just makes plotting easier
  _pid.resize(_num_elems.size());
  std::iota(_pid.begin(), _pid.end(), 0);
}