/
ddl_operations.cc
127 lines (106 loc) · 4.61 KB
/
ddl_operations.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
// Modifications copyright (C) 2019 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "ddl_operations.h"
#include "../logging.h"
namespace horovod {
namespace common {
// Translates a Horovod tensor dtype into its DDL equivalent.
// DDL can only reduce float32 and float16 tensors; every other dtype
// is rejected with an exception naming the offending type.
//
// Throws: std::logic_error if the dtype is unsupported by DDL.
DDL_Type GetDDLDataType(const std::shared_ptr<Tensor> tensor) {
  const auto dtype = tensor->dtype();
  if (dtype == HOROVOD_FLOAT32) {
    return DDL_TYPE_FLOAT;
  }
  if (dtype == HOROVOD_FLOAT16) {
    return DDL_TYPE_HALF;
  }
  throw std::logic_error("Type " + DataType_Name(dtype) +
                         " is not supported in DDL mode.");
}
// Constructs a DDL-backed allreduce operation. GPU stream/event handling
// is delegated to the GPUAllreduce base; a non-owning pointer to the
// shared DDL context is retained (its device id is filled in by DDLInit).
DDLAllreduce::DDLAllreduce(DDLContext* ddl_context,
                           GPUContext* gpu_context,
                           HorovodGlobalState* global_state)
    : GPUAllreduce(gpu_context, global_state),
      ddl_context_(ddl_context) {}
// Runs a sum-allreduce over `entries` via IBM DDL.
//
// Flow: (1) stage inputs — gather multiple entries into the fusion buffer,
// or for a single entry reduce in its own output buffer; (2) optional
// prescale; (3) wait for staging to finish on the GPU queue; (4) call
// ddl_allreduce in place on buffer_data; (5) optional postscale;
// (6) scatter results back out of the fusion buffer.
//
// Returns: the Status produced by FinalizeGPUQueue.
// Throws: std::logic_error if the entry's device differs from the device
//   recorded at DDLInit, or if ddl_allreduce fails.
Status DDLAllreduce::Execute(std::vector<TensorTableEntry>& entries, const Response& response) {
  auto& first_entry = entries[0];
  gpu_op_context_.InitGPU(entries);
  gpu_op_context_.InitGPUQueue(entries, response);
  auto& timeline = global_state_->timeline;
  // DDL binds exactly one GPU to this process (pinned in DDLInit); a
  // different device id on the entry is a configuration error.
  if (ddl_context_->ddl_local_device_id != first_entry.device) {
    throw std::logic_error("DDL does not support more than one GPU device per process.");
  }
  const void* fused_input_data;
  void* buffer_data;
  size_t buffer_len;
  // Copy memory into the fusion buffer.
  if (entries.size() > 1) {
    MemcpyInFusionBuffer(entries, fused_input_data, buffer_data, buffer_len);
    if (timeline.Initialized()) {
      gpu_context_->RecordEvent(gpu_op_context_.event_queue, MEMCPY_IN_FUSION_BUFFER, *gpu_op_context_.stream);
    }
  } else {
    // Single entry: no fusion needed; reduce straight into the entry's
    // output buffer.
    fused_input_data = first_entry.tensor->data();
    buffer_data = (void*) first_entry.output->data();
    buffer_len = (size_t) first_entry.output->size();
  }
  // Element count for scaling and the allreduce call; assumes all fused
  // entries share first_entry's dtype — TODO confirm fusion guarantees this.
  int64_t num_elements = buffer_len / DataType_Size(first_entry.tensor->dtype());
  if (response.prescale_factor() != 1.0) {
    // Execute prescaling op
    ScaleBuffer(response.prescale_factor(), entries, fused_input_data, buffer_data, num_elements);
    fused_input_data = buffer_data; // for unfused, scale is done out of place
  }
  // Do allreduce.
  if (entries.size() == 1) {
    // Copy input buffer content to output buffer
    // because DDL only supports in-place allreduce
    gpu_context_->MemcpyAsyncD2D(buffer_data, fused_input_data, buffer_len, *gpu_op_context_.stream);
    gpu_context_->RecordEvent(gpu_op_context_.event_queue, MEMCPY_IN_FUSION_BUFFER, *gpu_op_context_.stream);
  }
  // Synchronize.
  // Drain all queued staging copies/scaling first — ddl_allreduce is called
  // from here (presumably a blocking host-side API — confirm against DDL
  // docs) and must observe buffer_data fully populated.
  gpu_context_->WaitForEvents(gpu_op_context_.event_queue, entries, timeline);
  DDL_Type ddl_data_type = GetDDLDataType(first_entry.tensor);
  auto ddl_result = ddl_allreduce(buffer_data, (size_t) num_elements, ddl_data_type,
                                  DDL_OP_SUM);
  if (ddl_result != DDL_SUCCESS) {
    throw std::logic_error("ddl_allreduce failed.");
  }
  if (response.postscale_factor() != 1.0) {
    // Execute postscaling op (in place: source and destination are both
    // the reduced buffer)
    ScaleBuffer(response.postscale_factor(), entries, buffer_data, buffer_data, num_elements);
  }
  // Copy memory out of the fusion buffer.
  if (entries.size() > 1) {
    MemcpyOutFusionBuffer(buffer_data, entries);
    if (timeline.Initialized()) {
      gpu_context_->RecordEvent(gpu_op_context_.event_queue, MEMCPY_OUT_FUSION_BUFFER, *gpu_op_context_.stream);
    }
  }
  return gpu_op_context_.FinalizeGPUQueue(entries);
}
// One-time DDL initialization for this process. Reads the DDL_OPTIONS
// environment variable, hands it to ddl_init, and records the current GPU
// device id in the context so Execute() can enforce the one-GPU-per-process
// restriction.
//
// Throws: std::logic_error if DDL_OPTIONS is unset or ddl_init fails.
void DDLAllreduce::DDLInit(DDLContext* ddl_context, GPUContext* gpu_context) {
  LOG(WARNING) << "DDL backend has been deprecated. Please, start using the NCCL backend by "
                  "building Horovod with 'HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL'.";
  const char* options = std::getenv("DDL_OPTIONS");
  if (options == nullptr) {
    throw std::logic_error("DDL_OPTIONS env variable needs to be set to use DDL.");
  }
  if (ddl_init(options) != DDL_SUCCESS) {
    throw std::logic_error("ddl_init failed.");
  }
  // Pin this process to the device that was current at init time.
  ddl_context->ddl_local_device_id = gpu_context->GetDevice();
}
} // namespace common
} // namespace horovod