Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9d4c8e2
add test for skip first epoch sampler
Mar 8, 2022
4482acf
Support API set_dump for more type of ops
Feb 17, 2022
d5eb509
fix ut compile in debug mode
Mar 15, 2022
af92029
PyNative RunOp Async
Mar 11, 2022
6f17f3f
Add dist excecution mode class declaration.
Mar 14, 2022
31af126
modify_format
Mar 15, 2022
a14ca9c
fix pclint_plus
Mar 15, 2022
059c939
!31269 Fix ut debug mode compile
it-is-a-robot Mar 15, 2022
5716ad8
!29707 PyNative RunOp Async
it-is-a-robot Mar 15, 2022
32c5e89
!29707 PyNative RunOp Async
it-is-a-robot Mar 15, 2022
b7f90e2
implement Conv2dBackward and log of pool
xiaoyuxing010818 Mar 10, 2022
5331c87
!31311 modify format
it-is-a-robot Mar 15, 2022
4f28588
!31263 Add dist excecution mode class declaration.
it-is-a-robot Mar 15, 2022
2f0824b
!31263 Add dist excecution mode class declaration.
it-is-a-robot Mar 15, 2022
f0e448d
!31308 fix pclint plus
it-is-a-robot Mar 15, 2022
a7b2ef7
!31308 fix pclint plus
it-is-a-robot Mar 15, 2022
3d16a4d
Fix bug in terminating a failed pipeline
h-farahat Mar 15, 2022
12d8906
!30253 Support more ops for dump flag
it-is-a-robot Mar 15, 2022
8714988
!30253 Support more ops for dump flag
it-is-a-robot Mar 15, 2022
9694fce
!31092 implement Conv2dBackward and remove index cache in maxpool for…
it-is-a-robot Mar 15, 2022
916628e
!31092 implement Conv2dBackward and remove index cache in maxpool for…
it-is-a-robot Mar 15, 2022
272d25b
!31137 Test SkipFirstEpochSampler
it-is-a-robot Mar 15, 2022
5f75c70
!31332 Fix bug in terminating a failed pipeline
it-is-a-robot Mar 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions docs/api/api_python/mindspore/mindspore.set_dump.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@ mindspore.set_dump

.. Note::
- 此API只在Ascend后端的图模式有效。
- 当target是一个Cell且enabled设置为True时,Cell实例及其子Cell实例的Primitive将递归启用Dump。如果算子不是Cell实例的成员,则不会为该算子启用Dump(例如,在construct方法中直接使用的 `functional 算子 <https://www.mindspore.cn/docs/api/zh-CN/master/api_python/mindspore.ops.html#functional>`_ )。要使此API生效,请在Cell的__init__方法中使用self.some_op = SomeOp()的写法。
- 使用set_dump(Cell, True)后,Cell正向计算中的算子会被Dump,大多数反向计算(梯度运算产生的计算)不会被Dump。然而,由于图的优化,一些反向计算的数据仍然会被Dump。可以忽略文件名中包含“Gradients”的反向计算数据。
- 此API只支持训练开始前调用。如果在训练过程中调用这个API,可能不会有效果。
- 使用set_dump(Cell, True)后,Cell正向计算和反向计算(梯度运算产生的计算)中的算子会被Dump。
- 对于 `nn.SoftMaxCrossEntropyWithLogits 层 <https://www.mindspore.cn/docs/api/zh-CN/master/api_python/nn/mindspore.nn.SoftmaxCrossEntropyWithLogits.html#mindspore.nn.SoftmaxCrossEntropyWithLogits>`_ ,正向计算和反向计算使用同一组算子。因此,只能看到反向计算中的Dump数据。请注意,当使用sparse=True和reduce=“mean”初始化时,nn.SoftmaxCrossEntropyWithLogits层也将在内部使用这些算子。

**参数:**
Expand Down
2 changes: 1 addition & 1 deletion docs/api/api_python/nn/mindspore.nn.Cell.rst
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@
初始化并替换Cell中所有的parameter的值。

.. note::
在调用`init_parameters_data`后,`trainable_params()` 或其他相似的接口可能返回不同的参数对象,不要保存这些结果。
在调用 `init_parameters_data` 后,`trainable_params()` 或其他相似的接口可能返回不同的参数对象,不要保存这些结果。

**参数:**

Expand Down
6 changes: 3 additions & 3 deletions mindspore/ccsrc/backend/common/session/session_basic.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
#endif
#include "backend/common/session/session_factory.h"
#include "backend/common/session/pynative_task_manager.h"
#include "runtime/pynative/op_lazy_builder.h"
#include "runtime/pynative/op_executor.h"
#ifdef ENABLE_DEBUGGER
#include "debug/tensor_load.h"
#include "debug/debugger/proto_exporter.h"
Expand Down Expand Up @@ -2770,8 +2770,8 @@ void SessionBasic::AddGradAddrToBucket(const GraphId &graph_id, const std::vecto
auto &free_bucket = bucket_list[free_bucket_index];
free_bucket->AddGradTensor(tensor);
if (free_bucket->full()) {
// Delete this when session is moved to MindRT.
runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks();
// AllReduce need to wait for the kernel execution of bprop to complete.
runtime::OpExecutor::GetInstance().Wait();
MS_LOG(INFO) << "bucket is full";
free_bucket->Launch();
free_bucket_index = ++free_bucket_iter->second;
Expand Down
2 changes: 1 addition & 1 deletion mindspore/ccsrc/backend/common/somas/somas.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class Somas {
~Somas() = default;

bool Allocate(const session::KernelGraph *graph);
size_t GetTotalMemSize() { return mem_offset_; }
// Returns the total planned memory size: the accumulated mem_offset_ (presumably finalized by Allocate() — confirm).
size_t GetTotalMemSize() const { return mem_offset_; }
void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; }
uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;
Expand Down
213 changes: 116 additions & 97 deletions mindspore/ccsrc/backend/graph_compiler/backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include "include/common/utils/parallel_context.h"
#include "backend/graph_compiler/transform.h"
#include "backend/common/session/session_factory.h"
#include "runtime/pynative/op_lazy_builder.h"
#include "runtime/pynative/op_executor.h"
#include "backend/common/optimizer/helper.h"
#include "pipeline/pynative/pynative_execute.h"
#include "pipeline/jit/action.h"
Expand Down Expand Up @@ -373,18 +373,6 @@ bool EnablePyNativeSyncRunning() {
MS_EXCEPTION_IF_NULL(ms_context);
return ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE);
}

bool NeedDisableLazyBuild(bool need_erase, bool cache_hit, const OpRunInfo &op_run_info) {
// Disable lazy build when:
// 1. Execute Dynamic shape operator. The output shape depends on the calculation result of the operator.
// 2. Cache hit and there are no tasks in Queue. For example Non-first iteration.
// 3. Not in nn.Cell construct.
// 4. Operator to process dataset.
// 5. Graph mode.
// 6. set PYNATIVE_SYNCHRONIZE in context.
return need_erase || cache_hit || !op_run_info.lazy_build || OpInBlackList(op_run_info) ||
GetExecutionMode() == kGraphMode || EnablePyNativeSyncRunning();
}
} // namespace

VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args, const std::string &target) {
Expand Down Expand Up @@ -873,10 +861,10 @@ void PushTupleTensor(const VectorRef &args, const std::vector<AnfNodePtr> &param

void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs,
const std::vector<std::vector<tensor::TensorPtr>> &inputs, VectorRef *outputs) {
SyncLazyTasks();
WaitTaskFinish();
MS_EXCEPTION_IF_NULL(graph_compiler_);
auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance();
op_lazy_builder.Register([this]() { LazyExecuteTaskCallback(); });
auto &op_executor = runtime::OpExecutor::GetInstance();
op_executor.Register([this]() { BatchBuildCallback(); });
for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
const auto &graph = graphs[graph_index];
MS_EXCEPTION_IF_NULL(graph);
Expand Down Expand Up @@ -910,11 +898,11 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs

RunOp(&op_run_info, &op_outputs);
} else {
SyncLazyTasks();
WaitTaskFinish();
RunControlOperator(graph_compiler_, graph, kernel, op_output_map, parameter_index, inputs[graph_index],
&input_tensor_info, &op_outputs);
// Execute remaining lazy tasks before PyNative hook exit.
SyncLazyTasks();
WaitTaskFinish();
}

graph_compiler_->UpdateRefCount(input_tensor_info.input_kernel, &cnode_ref_count, &op_output_map);
Expand All @@ -927,7 +915,7 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs
graph_compiler_->AddGradAddrToBucket(graph->graph_id(), graph_output_info.graph_output_tensors);
}
}
SyncLazyTasks();
WaitTaskFinish();
// Clear bucket resources every step
if (graph->is_bprop()) {
graph_compiler_->ClearAllBucket(graph->graph_id());
Expand Down Expand Up @@ -961,7 +949,8 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
const auto &graph_compiler_info = *(graph_iter->second);
const auto &origin_parameters = graph_compiler_info.origin_parameters_order_;

SyncLazyTasks();
// For pynative and graph mix execution.
WaitTaskFinish();

// Transform args to input tensors.
// Input tensors of the graph.
Expand Down Expand Up @@ -1163,9 +1152,9 @@ void MindRTBackend::SetDebuggerInit() {
}
#endif

void MindRTBackend::SyncLazyTasks() const { runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks(); }
// Waits until the OpExecutor singleton has finished its pending op tasks (delegates to OpExecutor::Wait).
void MindRTBackend::WaitTaskFinish() const { runtime::OpExecutor::GetInstance().Wait(); }

void MindRTBackend::ClearOpBuilderResource() const { runtime::OpLazyBuilder::GetInstance().Reset(); }
// Resets the OpExecutor singleton, discarding its queued state (delegates to OpExecutor::Reset).
void MindRTBackend::ClearOpExecutorResource() const { runtime::OpExecutor::GetInstance().Reset(); }

void MindRTBackend::SyncStream() {
const auto &device_context =
Expand Down Expand Up @@ -1326,7 +1315,7 @@ void MindRTBackend::ReleaseForwardOutput(const std::vector<TensorPtr> &input_ten
}
}

void MindRTBackend::CompileSingleOpGraphs(const std::vector<std::shared_ptr<runtime::OpTask>> &build_tasks) {
void MindRTBackend::CompileSingleOpGraphs(const std::vector<std::shared_ptr<runtime::OpBuildTask>> &build_tasks) {
if (build_tasks.empty()) {
return;
}
Expand Down Expand Up @@ -1357,9 +1346,24 @@ void MindRTBackend::CompileSingleOpGraphs(const std::vector<std::shared_ptr<runt
}
}

void MindRTBackend::LazyExecuteTaskCallback() {
auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance();
if (op_lazy_builder.QueueEmpty()) {
// Callback invoked for a queued op run task: executes the single-op graph and
// releases its graph/input device addresses. The task's PyNative infer flag is
// applied for the duration of the call and the previous value restored afterwards.
void MindRTBackend::OpRunCallback(const std::shared_ptr<runtime::OpTaskContext> &context) {
  MS_LOG(DEBUG) << "OpRunCallback start";
  auto global_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(global_context);

  // Save the current infer flag, then apply the flag captured when the task was created.
  const auto saved_infer_flag = global_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
  global_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, context->is_pynative_infer());

  const auto &graph = context->graph();
  auto device_context = context->device_context();
  const auto &op_run_info = context->op_run_info();
  runtime::RunSingleOpGraph(graph, GetTensorWithoutValueMask(op_run_info), device_context,
                            op_run_info.is_dynamic_shape);
  ClearGraphDeviceAddress(graph, device_context, op_run_info.is_gradient_out);
  ClearInputDeviceAddress(graph, device_context);

  // Restore the PyNative infer flag saved above.
  global_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, saved_infer_flag);
  MS_LOG(DEBUG) << "OpRunCallback end";
}

void MindRTBackend::BatchBuildCallback() {
auto &op_executor = runtime::OpExecutor::GetInstance();
if (op_executor.BuildQueueEmpty()) {
return;
}

Expand All @@ -1369,50 +1373,76 @@ void MindRTBackend::LazyExecuteTaskCallback() {
MS_EXCEPTION_IF_NULL(ms_context);
auto infer_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);

CompileSingleOpGraphs(op_lazy_builder.GetOpBuildTasks());
op_lazy_builder.ClearOpBuildTasks();

// Run op one by one
auto &op_run_tasks = op_lazy_builder.GetOpRunTasks();
while (!op_run_tasks.empty()) {
auto &op_run_task = op_run_tasks.front();
const auto &context = op_run_task->context();
ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, context->is_pynative_infer());
auto tensor_without_value_mask = GetTensorWithoutValueMask(context->op_run_info());
runtime::RunSingleOpGraph(context->graph(), tensor_without_value_mask, context->device_context(),
context->op_run_info().is_dynamic_shape);
ReleaseForwardOutput(context->op_run_info().input_tensors);
ClearGraphDeviceAddress(context->graph(), context->device_context(), context->op_run_info().is_gradient_out);
ClearInputDeviceAddress(context->graph(), context->device_context());
op_lazy_builder.PopOpRunTask();
}
CompileSingleOpGraphs(op_executor.GetOpBuildTasks());
op_executor.ClearOpBuildTasks();

ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, infer_flag);
MS_LOG(DEBUG) << "End";
} catch (const py::type_error &ex) {
op_lazy_builder.Reset();
op_executor.Reset();
throw py::type_error(ex);
} catch (const py::value_error &ex) {
op_lazy_builder.Reset();
op_executor.Reset();
throw py::value_error(ex);
} catch (const py::index_error &ex) {
op_lazy_builder.Reset();
op_executor.Reset();
throw py::index_error(ex);
} catch (const py::name_error &ex) {
op_lazy_builder.Reset();
op_executor.Reset();
throw py::name_error(ex);
} catch (const std::exception &ex) {
op_lazy_builder.Reset();
op_executor.Reset();
throw(std::runtime_error(ex.what()));
} catch (...) {
op_lazy_builder.Reset();
op_executor.Reset();
std::string exName(abi::__cxa_current_exception_type()->name());
MS_LOG(EXCEPTION) << "Error occurred when execute task in queue. Exception name: " << exName;
}
}

void MindRTBackend::RunOpInternal(bool single_op_cache_hit, GraphCompilerInfo *graph_compiler_info,
OpRunInfo *op_run_info, VectorRef *outputs) {
// Enqueues an asynchronous build/run task pair for a single-op graph on the
// OpExecutor and publishes the graph's output tensors immediately; the run task
// is gated on the build task's future. Blocks only when the build queue is full.
void MindRTBackend::DispatchOpTask(bool single_op_cache_hit, VectorRef *outputs, GraphCompilerInfo *graph_compiler_info,
                                   OpRunInfo *op_run_info) {
  MS_EXCEPTION_IF_NULL(graph_compiler_info);
  if (graph_compiler_info->graphs_.empty()) {
    MS_LOG(EXCEPTION) << "No graph found, op:" << graph_compiler_info->name_;
  }
  const auto &graph = graph_compiler_info->graphs_.front();
  MS_EXCEPTION_IF_NULL(graph);
  const auto &output_nodes = graph_compiler_->GetGraphOutputNodes(graph->graph_id());

  auto device_context = graph_compiler_info->device_contexts_.front();
  runtime::UpdateDeviceAddress(graph, GetTensorWithoutValueMask(*op_run_info), device_context);
  // Publish output tensors to the caller before the task actually runs.
  UpdateOutput(output_nodes, outputs);

  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  const auto infer_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
  auto task_context = std::make_shared<runtime::OpTaskContext>(graph_compiler_info, graph, output_nodes, *op_run_info,
                                                               device_context, infer_flag);

  // The run task waits on this future; it is satisfied either by the build task
  // or immediately when the compiled graph was already cached.
  std::promise<bool> build_promise;
  auto build_future = build_promise.get_future();

  auto &op_executor = runtime::OpExecutor::GetInstance();
  if (single_op_cache_hit) {
    build_promise.set_value(true);
  } else {
    op_executor.PushOpBuildTask(std::make_shared<runtime::OpBuildTask>(task_context, std::move(build_promise)));
  }
  op_executor.PushOpRunTask(std::make_shared<runtime::OpRunTask>(
    task_context, [this](const std::shared_ptr<runtime::OpTaskContext> &ctx) { OpRunCallback(ctx); },
    std::move(build_future)));

  // Re-register the batch-build callback (needed in heterogeneous scenarios — confirm).
  op_executor.Register([this]() { BatchBuildCallback(); });
  if (op_executor.BuildQueueFull()) {
    WaitTaskFinish();
  }
}

void MindRTBackend::RunOpImpl(bool single_op_cache_hit, GraphCompilerInfo *graph_compiler_info, OpRunInfo *op_run_info,
VectorRef *outputs) {
MS_EXCEPTION_IF_NULL(op_run_info);
MS_EXCEPTION_IF_NULL(graph_compiler_info);
// Fetch outputs.
Expand All @@ -1423,51 +1453,36 @@ void MindRTBackend::RunOpInternal(bool single_op_cache_hit, GraphCompilerInfo *g
MS_EXCEPTION_IF_NULL(outputs);

auto device_context = graph_compiler_info->device_contexts_.front();
auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance();

bool lazy_build_disabled = NeedDisableLazyBuild(graph_compiler_info->need_erase_,
(single_op_cache_hit && op_lazy_builder.QueueEmpty()), *op_run_info);

auto tensor_without_value_mask = GetTensorWithoutValueMask(*op_run_info);

if (lazy_build_disabled) {
if (!op_lazy_builder.QueueEmpty()) {
op_lazy_builder.ExecuteRemainingTasks();
}
if (!single_op_cache_hit) {
CompileSingleOpGraph(graph, device_context, graph_compiler_info);
}
auto &op_executor = runtime::OpExecutor::GetInstance();

bool async_exec_disabled = graph_compiler_info->need_erase_ || !op_run_info->lazy_build ||
OpInBlackList(*op_run_info) || GetExecutionMode() == kGraphMode ||
EnablePyNativeSyncRunning();
if (!async_exec_disabled) {
MS_LOG(DEBUG) << "Async exec enabled, op:" << op_run_info->op_name;
DispatchOpTask(single_op_cache_hit, outputs, graph_compiler_info, op_run_info);
return;
}

runtime::UpdateDeviceAddress(graph, tensor_without_value_mask, device_context);
runtime::RunSingleOpGraph(graph, tensor_without_value_mask, device_context, op_run_info->is_dynamic_shape);
ReleaseForwardOutput(op_run_info->input_tensors);
UpdateOutput(output_nodes, outputs);
ClearGraphDeviceAddress(graph, device_context, op_run_info->is_gradient_out);
ClearInputDeviceAddress(graph, device_context);
if (op_run_info->is_dynamic_shape) {
UpdateOutputAbstract(graph, op_run_info);
}
if (graph_compiler_info->need_erase_) {
EraseSingleOpCache(graph_compiler_info->name_, graph);
}
} else {
runtime::UpdateDeviceAddress(graph, tensor_without_value_mask, device_context);
UpdateOutput(output_nodes, outputs);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto infer_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
auto run_op_context =
std::make_shared<runtime::OpLazyBuilderContext>(graph_compiler_info, graph, output_nodes, *op_run_info,
graph_compiler_info->device_contexts_.front(), infer_flag);
if (!single_op_cache_hit) {
op_lazy_builder.PushOpBuildTask(std::make_shared<runtime::OpBuildTask>(run_op_context));
}
op_lazy_builder.PushOpRunTask(std::make_shared<runtime::OpRunTask>(run_op_context));
// Callbacks need to be re-registered in heterogeneous scenarios.
op_lazy_builder.Register([this]() { LazyExecuteTaskCallback(); });
if (op_lazy_builder.QueueFull()) {
op_lazy_builder.ExecuteRemainingTasks();
}
MS_LOG(DEBUG) << "Async exec disabled, op:" << op_run_info->op_name;
if (!op_executor.BuildQueueEmpty()) {
WaitTaskFinish();
}
if (!single_op_cache_hit) {
CompileSingleOpGraph(graph, device_context, graph_compiler_info);
}
auto tensors_without_value_mask = GetTensorWithoutValueMask(*op_run_info);
runtime::UpdateDeviceAddress(graph, tensors_without_value_mask, device_context);
runtime::RunSingleOpGraph(graph, tensors_without_value_mask, device_context, op_run_info->is_dynamic_shape);
ReleaseForwardOutput(op_run_info->input_tensors);
UpdateOutput(output_nodes, outputs);
ClearGraphDeviceAddress(graph, device_context, op_run_info->is_gradient_out);
ClearInputDeviceAddress(graph, device_context);
if (op_run_info->is_dynamic_shape) {
UpdateOutputAbstract(graph, op_run_info);
}
if (graph_compiler_info->need_erase_) {
EraseSingleOpCache(graph_compiler_info->name_, graph);
}
}

Expand All @@ -1483,6 +1498,10 @@ void MindRTBackend::RunOp(OpRunInfo *op_run_info, VectorRef *outputs) {
bool single_op_cache_hit = true;
auto graph_id = graph_compiler_->CompileGraph(*op_run_info, &single_op_cache_hit, device_context);
std::string actor_info = std::to_string(graph_id) + "_" + op_run_info->op_name;
if (runtime::OpExecutor::GetInstance().ActorInQueue(actor_info)) {
WaitTaskFinish();
}

GraphCompilerInfo *graph_compiler_info_ptr;
if (single_op_cache_hit) {
auto iter = actor_to_graph_compiler_info_.find(actor_info);
Expand All @@ -1506,7 +1525,7 @@ void MindRTBackend::RunOp(OpRunInfo *op_run_info, VectorRef *outputs) {
}
}

RunOpInternal(single_op_cache_hit, graph_compiler_info_ptr, op_run_info, outputs);
RunOpImpl(single_op_cache_hit, graph_compiler_info_ptr, op_run_info, outputs);
}

void MindRTBackend::CompileSingleOpGraph(const KernelGraphPtr &graph, const DeviceContext *device_context,
Expand All @@ -1532,7 +1551,7 @@ void MindRTBackend::UpdateOutput(const std::vector<session::KernelWithIndex> &ou
}
auto output_tensor = CreateOutputTensor(item_with_index.first, item_with_index.second);
MS_EXCEPTION_IF_NULL(output_tensor);
output_tensor->set_lazy_callback([]() { runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks(); });
output_tensor->set_lazy_callback([]() { runtime::OpExecutor::GetInstance().Wait(); });
outputs->emplace_back(output_tensor);
}
}
Expand Down
Loading