Skip to content

Commit

Permalink
Move error handling to DALI core. (NVIDIA#867)
Browse files Browse the repository at this point in the history
* Move error_handling and cuda_utils to dali_core.
* Fix hang when something other than `std::exception` is thrown.
* Add CUDA error handling.
* Move dynlink_cuda to core.
* Add dynlink_cuda static library.

Signed-off-by: Michal Zientkiewicz <michalz@nvidia.com>
  • Loading branch information
mzient authored and haoxintong committed Jul 16, 2019
1 parent 48122f6 commit fd67c17
Show file tree
Hide file tree
Showing 122 changed files with 490 additions and 345 deletions.
7 changes: 5 additions & 2 deletions cmake/lint.cmake
Expand Up @@ -11,13 +11,16 @@ file(GLOB_RECURSE LINT_INC "${DALI_INC_DIR}/*.h" "${DALI_INC_DIR}/*.cuh" "${DALI

# nvdecoder
list(REMOVE_ITEM LINT_SRC
${DALI_SRC_DIR}/util/dynlink_cuda.h
${DALI_SRC_DIR}/util/dynlink_cuda.cc
${DALI_SRC_DIR}/core/dynlink_cuda.cc
${DALI_SRC_DIR}/pipeline/operators/reader/nvdecoder/dynlink_nvcuvid.h
${DALI_SRC_DIR}/pipeline/operators/reader/nvdecoder/dynlink_cuviddec.h
${DALI_SRC_DIR}/pipeline/operators/reader/nvdecoder/dynlink_nvcuvid.cc
)

list(REMOVE_ITEM LINT_INC
${DALI_INC_DIR}/core/dynlink_cuda.h
)

# cuTT
list(REMOVE_ITEM LINT_SRC
${DALI_SRC_DIR}/pipeline/operators/transpose/cutt/cutt.h
Expand Down
1 change: 1 addition & 0 deletions dali/CMakeLists.txt
Expand Up @@ -120,6 +120,7 @@ if (BUILD_TEST)
"${dali_kernel_test_lib}"
"${DALI_LIBS}"
"${dali_lib}"
"${dali_core_lib}"
"gtest")

set_target_properties(${test_main_bin} PROPERTIES
Expand Down
4 changes: 4 additions & 0 deletions dali/core/CMakeLists.txt
Expand Up @@ -15,6 +15,10 @@
# Get all the source files
collect_headers(DALI_INST_HDRS PARENT_SCOPE)
collect_sources(DALI_CORE_SRCS)
list(REMOVE_ITEM DALI_CORE_SRCS dynlink_cuda.cc)
collect_test_sources(DALI_TEST_SRCS PARENT_SCOPE)

cuda_add_library(dynlink_cuda STATIC dynlink_cuda.cc)

cuda_add_library(${dali_core_lib} STATIC ${DALI_CORE_SRCS})
target_link_libraries(${dali_core_lib} dynlink_cuda)
6 changes: 3 additions & 3 deletions dali/util/dynlink_cuda.cc → dali/core/dynlink_cuda.cc
Expand Up @@ -21,7 +21,7 @@
//#define CUDA_INIT_OPENGL

#include <stdio.h>
#include "dali/util/dynlink_cuda.h"
#include "dali/core/dynlink_cuda.h"

tcuInit *_cuInit;
tcuDriverGetVersion *cuDriverGetVersion;
Expand Down Expand Up @@ -176,7 +176,7 @@ tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;

tcuProfilerStop *cuProfilerStop;
#endif
#endif

#ifdef CUDA_INIT_D3D9
// D3D9/CUDA interop (CUDA 1.x compatible API). These functions
Expand Down Expand Up @@ -416,7 +416,7 @@ CUresult cuInit(unsigned int Flags, int cudaVersion)
}

#endif

// These could be _v2 interfaces
if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000)
{
Expand Down
4 changes: 2 additions & 2 deletions dali/error_handling.cc → dali/core/error_handling.cc
Expand Up @@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dali/error_handling.h"
#include "dali/core/error_handling.h"

namespace dali {

Expand Down Expand Up @@ -41,7 +41,7 @@ void DALIAppendToLastError(string error_str) {
// Reports a fatal problem by throwing an exception whose message has the form
// "[<file>:<lineNumb>] <pComment>". This function does not return normally.
//
// file     - name of the source file in which the problem was detected
// lineNumb - line number within that file
// pComment - human-readable description appended after the location prefix
void DALIReportFatalProblem(const char *file, int lineNumb, const char *pComment) {
dali::string line = std::to_string(lineNumb);
// Prefix the description with the source location in square brackets.
dali::string error_str = "[" + dali::string(file) + ":" + line + "] " + pComment;
// NOTE(review): two consecutive throw statements - as written only the first one is
// reachable. This looks like the removed and the added line of a diff shown together;
// confirm which exception type is intended.
throw std::runtime_error(error_str);
throw DALIException(error_str);
}


Expand Down
79 changes: 79 additions & 0 deletions dali/core/error_test.cc
@@ -0,0 +1,79 @@
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include <cstring>
#include <string>
#include "dali/core/dynlink_cuda.h"
#include "dali/core/cuda_utils.h"
#include "dali/core/error_handling.h"

namespace dali {

// Verifies that a failing DALI_ENFORCE throws a DALIException whose what() text contains
// both the file/line captured next to the check and the user-supplied message.
TEST(Error, EnforceFailed) {
  std::string file_and_line;
  std::string message = "Test message";
  try {
    // The two statements below must stay on the same source line: the test expects the
    // value captured from FILE_AND_LINE to appear verbatim in the exception message.
    file_and_line = FILE_AND_LINE; DALI_ENFORCE(!"Always fail", message);
    FAIL() << "Exception was expected";
  } catch (DALIException &e) {
    std::string msg = e.what();
    EXPECT_NE(msg.find(file_and_line), std::string::npos)
      << "File/line spec not found in error `what()`, which is:\n" << msg;
    EXPECT_NE(msg.find(message), std::string::npos)
      << "Original message not found in error `what()`, which is:\n" << msg;
  } catch (...) {
    // Any other exception type means the enforce machinery threw the wrong thing.
    FAIL() << "Expected DALIException, got other exception";
  }
}

// Verifies that CUDA_CALL converts a failing runtime API call into a CUDAError that is
// flagged as a runtime (not driver) error, carries the original error code and mentions
// the error name in its message - and that the runtime's last-error state is cleared.
TEST(Error, CudaError) {
  try {
    // -2 is used as a device ordinal that is expected to be rejected by the runtime.
    CUDA_CALL(cudaSetDevice(-2));
    FAIL() << "Exception was expected";
  } catch (CUDAError &e) {
    EXPECT_TRUE(e.is_rt_api());
    EXPECT_FALSE(e.is_drv_api());
    EXPECT_EQ(e.rt_error(), cudaErrorInvalidDevice);
    EXPECT_NE(strstr(e.what(), "cudaErrorInvalidDevice"), nullptr)
      << "Error name `cudaErrorInvalidDevice` should have appeared in the exception message\n";
  } catch (...) {
    FAIL() << "Expected CUDAError, got other exception";
  }
  // cudaGetLastError reads and resets the error state; after the handled failure above
  // it is expected to report success already.
  EXPECT_EQ(cudaGetLastError(), cudaSuccess) << "Last error not cleared!";
}

// Verifies that CUDA_CALL converts a failing driver API call into a CUDAError that is
// flagged as a driver (not runtime) error and names the error code in its message, and
// that a valid driver call made through CUDA_CALL does not throw.
TEST(Error, CudaAlloc_Drv) {
  ASSERT_TRUE(cuInitChecked());
  char name[64];
  try {
    // -2 is not a valid device handle, so this call is expected to fail.
    CUDA_CALL(cuDeviceGetName(name, sizeof(name), -2));
    // Without this, the test would pass silently if no exception were thrown at all.
    FAIL() << "Exception was expected";
  } catch (CUDAError &e) {
    EXPECT_TRUE(e.is_drv_api());
    EXPECT_FALSE(e.is_rt_api());
    EXPECT_EQ(e.drv_error(), CUDA_ERROR_INVALID_DEVICE);
    EXPECT_NE(strstr(e.what(), "CUDA_ERROR_INVALID_DEVICE"), nullptr)
      << "Error name `CUDA_ERROR_INVALID_DEVICE` should have appeared in the exception message\n";
  } catch (...) {
    FAIL() << "Expected CUDAError, got other exception";
  }
  CUdevice device;
  // Stop here if no handle could be obtained - `device` would otherwise be used
  // uninitialized in the call below.
  ASSERT_EQ(cuDeviceGet(&device, 0), CUDA_SUCCESS) << "Could not obtain a handle to device 0";
  EXPECT_NO_THROW(CUDA_CALL(cuDeviceGetName(name, sizeof(name), device)));
}

// Verifies that an allocation failure reported by cudaMalloc is surfaced by CUDA_CALL
// as CUDABadAlloc. The requested size (2^62 bytes) is chosen so that it cannot be satisfied.
TEST(Error, CudaAlloc) {
  const size_t huge_size = 1LL << 62;
  void *ptr = nullptr;
  EXPECT_THROW(CUDA_CALL(cudaMalloc(&ptr, huge_size)), CUDABadAlloc);
}

} // namespace dali
2 changes: 1 addition & 1 deletion dali/image/image.h
Expand Up @@ -24,7 +24,7 @@
#include <utility>
#include <functional>
#include "dali/core/common.h"
#include "dali/error_handling.h"
#include "dali/core/error_handling.h"
#include "dali/pipeline/operators/operator.h"
#include "dali/util/crop_window.h"

Expand Down
2 changes: 1 addition & 1 deletion dali/image/jpeg_mem.cc
Expand Up @@ -26,7 +26,7 @@ limitations under the License.
#include <string>
#include <utility>
#include "dali/image/jpeg_handle.h"
#include "dali/error_handling.h"
#include "dali/core/error_handling.h"

namespace dali {
namespace jpeg {
Expand Down
2 changes: 1 addition & 1 deletion dali/image/transform.h
Expand Up @@ -18,7 +18,7 @@
#include <string>
#include <utility>
#include "dali/core/common.h"
#include "dali/error_handling.h"
#include "dali/core/error_handling.h"
#include "dali/pipeline/data/tensor.h"

namespace dali {
Expand Down
2 changes: 1 addition & 1 deletion dali/kernels/alloc.cc
Expand Up @@ -16,7 +16,7 @@
#include <cassert>
#include "dali/kernels/alloc.h"
#include "dali/core/static_switch.h"
#include "dali/core/gpu_utils.h"
#include "dali/core/device_guard.h"

namespace dali {
namespace kernels {
Expand Down
2 changes: 1 addition & 1 deletion dali/pipeline/data/allocator.h
Expand Up @@ -15,7 +15,7 @@
#ifndef DALI_PIPELINE_DATA_ALLOCATOR_H_
#define DALI_PIPELINE_DATA_ALLOCATOR_H_

#include "dali/util/cuda_utils.h"
#include "dali/core/cuda_utils.h"
#include "dali/pipeline/operators/operator_factory.h"

namespace dali {
Expand Down
2 changes: 1 addition & 1 deletion dali/pipeline/data/backend.h
Expand Up @@ -18,7 +18,7 @@
#include <cuda_runtime_api.h>
#include <memory>

#include "dali/error_handling.h"
#include "dali/core/error_handling.h"
#include "dali/pipeline/data/allocator.h"

namespace dali {
Expand Down
2 changes: 1 addition & 1 deletion dali/pipeline/data/buffer.h
Expand Up @@ -24,7 +24,7 @@
#include <vector>

#include "dali/core/common.h"
#include "dali/error_handling.h"
#include "dali/core/error_handling.h"
#include "dali/pipeline/data/types.h"
#include "dali/core/util.h"

Expand Down
2 changes: 1 addition & 1 deletion dali/pipeline/data/tensor.h
Expand Up @@ -23,7 +23,7 @@
#include <vector>

#include "dali/core/common.h"
#include "dali/error_handling.h"
#include "dali/core/error_handling.h"
#include "dali/pipeline/data/backend.h"
#include "dali/pipeline/data/buffer.h"
#include "dali/pipeline/data/tensor_list.h"
Expand Down
4 changes: 2 additions & 2 deletions dali/pipeline/data/types.h
Expand Up @@ -28,8 +28,8 @@
#include <unordered_map>

#include "dali/core/common.h"
#include "dali/util/cuda_utils.h"
#include "dali/error_handling.h"
#include "dali/core/cuda_utils.h"
#include "dali/core/error_handling.h"

// Workaround missing "is_trivially_copyable" in libstdc++ for g++ < 5.0.
// We have to first include some standard library headers, so to have __GLIBCXX__ symbol,
Expand Down
2 changes: 1 addition & 1 deletion dali/pipeline/data/view_test.cc
Expand Up @@ -15,7 +15,7 @@
#include <gtest/gtest.h>
#include "dali/pipeline/data/views.h"

#define EXPECT_ENFORCE_FAIL(statement) EXPECT_THROW(statement, std::runtime_error)
#define EXPECT_ENFORCE_FAIL(statement) EXPECT_THROW(statement, DALIException)

namespace dali {

Expand Down
10 changes: 7 additions & 3 deletions dali/pipeline/executor/async_pipelined_executor.h
Expand Up @@ -17,7 +17,7 @@

#include <string>
#include "dali/core/common.h"
#include "dali/error_handling.h"
#include "dali/core/error_handling.h"
#include "dali/pipeline/executor/pipelined_executor.h"
#include "dali/pipeline/util/worker_thread.h"

Expand Down Expand Up @@ -80,13 +80,17 @@ class DLL_PUBLIC AsyncPipelinedExecutor : public PipelinedExecutor {
CheckForErrors();
try {
PipelinedExecutor::Outputs(ws);
} catch (std::runtime_error &e) {
} catch (std::exception &e) {
exec_error_ = true;
mixed_work_cv_.notify_all();
gpu_work_cv_.notify_all();
SignalStop();
throw std::runtime_error(std::string(e.what()));
throw;
} catch (...) {
exec_error_ = true;
mixed_work_cv_.notify_all();
gpu_work_cv_.notify_all();
SignalStop();
throw std::runtime_error("Unknown critical error in pipeline");
}
}
Expand Down
8 changes: 5 additions & 3 deletions dali/pipeline/executor/async_separated_pipelined_executor.h
Expand Up @@ -17,7 +17,7 @@

#include <string>
#include "dali/core/common.h"
#include "dali/error_handling.h"
#include "dali/core/error_handling.h"
#include "dali/pipeline/executor/pipelined_executor.h"
#include "dali/pipeline/util/worker_thread.h"

Expand Down Expand Up @@ -82,11 +82,13 @@ class DLL_PUBLIC AsyncSeparatedPipelinedExecutor : public SeparatedPipelinedExec
CheckForErrors();
try {
SeparatedPipelinedExecutor::Outputs(ws);
} catch (std::runtime_error &e) {
} catch (std::exception &e) {
exec_error_ = true;
SignalStop();
throw std::runtime_error(std::string(e.what()));
throw;
} catch (...) {
exec_error_ = true;
SignalStop();
throw std::runtime_error("Unknown critical error in pipeline");
}
}
Expand Down

0 comments on commit fd67c17

Please sign in to comment.