// mpi_message.fbs
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
// Modifications copyright (C) 2017 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
namespace horovod.common.wire;
// Data types a tensor may have. Stored as a single byte; every value is
// assigned explicitly.
enum MPIDataType : byte {
  HOROVOD_UINT8   = 0,
  HOROVOD_INT8    = 1,
  HOROVOD_UINT16  = 2,
  HOROVOD_INT16   = 3,
  HOROVOD_INT32   = 4,
  HOROVOD_INT64   = 5,
  HOROVOD_FLOAT16 = 6,
  HOROVOD_FLOAT32 = 7,
  HOROVOD_FLOAT64 = 8,
  HOROVOD_BOOL    = 9
}
// An MPIRequest is a message sent from a rank greater than zero to the
// coordinator (rank zero), informing the coordinator of an operation that
// the rank wants to do and the tensor that it wants to apply the operation to.
// Operation a rank asks the coordinator to schedule. Stored as a single byte.
enum MPIRequestType : byte {
  ALLREDUCE = 0,
  ALLGATHER = 1,
  BROADCAST = 2
}
// One operation request for one tensor.
//
// Field ids are spelled out and match the declaration order, so the layout is
// the same as when ids are assigned implicitly.
table MPIRequest {
  // Rank issuing the request. Needed to give results a consistent ordering,
  // e.g. allgather outputs are sorted by rank.
  request_rank:int (id: 0);

  // Which operation is requested.
  request_type:MPIRequestType (id: 1);

  // Data type of the tensor.
  tensor_type:MPIDataType (id: 2);

  // Name of the tensor the operation applies to.
  tensor_name:string (id: 3);

  // Root rank; required by the broadcast operation.
  root_rank:int (id: 4);

  // Device on which the request is made.
  device:int (id: 5);

  // Tensor shape as a plain vector of dimension sizes. A TensorShapeProto is
  // deliberately not used, because linking directly to TensorFlow protos
  // causes issues.
  tensor_shape:[long] (id: 6);
}
// A batch of requests together with a shutdown flag.
// Field ids are explicit and follow the declaration order.
table MPIRequestList {
  // The requests in this batch.
  requests:[MPIRequest] (id: 0);

  // Set when the worker is requesting a shutdown.
  shutdown:bool (id: 1);
}
// An MPIResponse is a message sent from the coordinator (rank zero) to a rank
// greater than zero, informing the rank that an operation should be performed
// now. If the requested operation would result in an error (for example, due
// to a type or shape mismatch), then the MPIResponse can contain an error and
// an error message instead.
// Kind of response: one of the three operations, or ERROR. Stored as a single
// byte.
enum MPIResponseType : byte {
  ALLREDUCE = 0,
  ALLGATHER = 1,
  BROADCAST = 2,
  ERROR     = 3
}
// A single response: either an operation to perform or an error.
//
// Field ids are spelled out and match the declaration order, so the layout is
// the same as when ids are assigned implicitly.
table MPIResponse {
  // Operation to perform, or ERROR.
  response_type:MPIResponseType (id: 0);

  // Tensors covered by this response. More than one entry means this is a
  // fused operation, and the remaining fields are the same for all entries.
  tensor_names:[string] (id: 1);

  // Empty unless response_type is ERROR.
  error_message:string (id: 2);

  // Devices participating in this operation.
  devices:[int] (id: 3);

  // Empty unless response_type is ALLGATHER. Holds the dimension-zero size of
  // each input matrix, indexed by rank.
  tensor_sizes:[long] (id: 4);
}
// A batch of responses together with a shutdown flag.
// Field ids are explicit and follow the declaration order.
table MPIResponseList {
  // The responses in this batch.
  responses:[MPIResponse] (id: 0);

  // Set when the worker is requested to shut down.
  shutdown:bool (id: 1);
}