diff --git a/CHANGELOG b/CHANGELOG index beb655822..c57b12ffb 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,7 @@ * Added set_stream and get_stream to bifrost.device to help control which CUDA stream is used * Added bifrost.device.ExternalStream as a context manager to help with mixing Bifrost and cupy/pycuda * Fixed a problem calling bifrost.reduce on a slice of an array + * Added support for a new 'mapped' ring space which is backed by a file on disk 0.10.0 * Switched over to an autotools-based build system diff --git a/codecov.yml b/codecov.yml index 6c5021390..699f6bcf5 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,3 +1,5 @@ # https://docs.codecov.com/docs/fixing-paths fixes: - "/home/docker/actions-runner/_work/_tool/Python/3.*/x64/lib/python3.*/site-packages/::python/" +ignore: + - "**/libbifrost_generated.py" diff --git a/configure b/configure index 0ad981380..57c941994 100755 --- a/configure +++ b/configure @@ -709,6 +709,7 @@ PYBUILDFLAGS PYTHON3 PYTHON HAVE_PYTHON +MAPPED_RING_DIR HAVE_MAP_CACHE HAVE_CUDA_DEBUG enable_native_arch @@ -862,6 +863,7 @@ enable_debug enable_trace enable_native_arch enable_cuda_debug +with_mapped_ring_dir enable_map_cache enable_python with_python @@ -1568,6 +1570,9 @@ Optional Packages: --with-alignment=N default memory alignment in bytes (default=4096) --with-logging-dir=DIR directory for Bifrost proclog logging (default=autodetect) + --with-mapped-ring-dir=... + directory to store mapped ring files in + (default=/tmp/bifrost_mapped) --with-python=[PATH] absolute path to python executable --with-pybuild-flags build flags for python (default='') --with-pyinstall-flags install flags for python (default='') @@ -22075,6 +22080,16 @@ then : NVCCFLAGS="$NVCCFLAGS -G" fi +# Check whether --with-mapped_ring_dir was given. +if test ${with_mapped_ring_dir+y} +then : + withval=$with_mapped_ring_dir; +else $as_nop + mapped_ring_dir=/tmp/bifrost_mapped +fi + +MAPPED_RING_DIR=$mapped_ring_dir + # Check whether --enable-map_cache was given. if test ${enable_map_cache+y} then : @@ -27057,6 +27072,9 @@ fi { printf "%s\n" "$as_me:${as_lineno-$LINENO}: memory alignment: $ALIGNMENT" >&5 printf "%s\n" "$as_me: memory alignment: $ALIGNMENT" >&6;} +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: mapped ring directory: $mapped_ring_dir" >&5 +printf "%s\n" "$as_me: mapped ring directory: $mapped_ring_dir" >&6;} + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: logging directory: $HAVE_TMPFS" >&5 printf "%s\n" "$as_me: logging directory: $HAVE_TMPFS" >&6;} diff --git a/configure.ac b/configure.ac index 9aebfd263..77818d245 100644 --- a/configure.ac +++ b/configure.ac @@ -206,6 +206,13 @@ AS_IF([test x$enable_cuda_debug != xno], [AC_SUBST([HAVE_CUDA_DEBUG], [1]) NVCCFLAGS="$NVCCFLAGS -G"]) +AC_ARG_WITH([mapped_ring_dir], + [AS_HELP_STRING([--with-mapped-ring-dir=...], + [directory to store mapped ring files in (default=/tmp/bifrost_mapped)])], + [], + [mapped_ring_dir=/tmp/bifrost_mapped]) +AC_SUBST([MAPPED_RING_DIR], [$mapped_ring_dir]) + AC_ARG_ENABLE([map_cache], [AS_HELP_STRING([--disable-map-cache], [disable caching bifrost.map kernels (default=no)])], @@ -375,6 +382,8 @@ AS_IF([test x$HAVE_PYTHON = x1], [AC_MSG_NOTICE(python bindings: no)]) AC_MSG_NOTICE(memory alignment: $ALIGNMENT) + +AC_MSG_NOTICE(mapped ring directory: $mapped_ring_dir) AC_MSG_NOTICE(logging directory: $HAVE_TMPFS) diff --git a/python/bifrost/Space.py b/python/bifrost/Space.py index 3945e8599..1edb669a0 100644 --- a/python/bifrost/Space.py +++ b/python/bifrost/Space.py @@ -32,12 +32,14 @@ SPACEMAP_TO_STR = {_bf.BF_SPACE_AUTO: 'auto', _bf.BF_SPACE_SYSTEM: 'system', + _bf.BF_SPACE_MAPPED: 'mapped', _bf.BF_SPACE_CUDA: 'cuda', _bf.BF_SPACE_CUDA_HOST: 'cuda_host', _bf.BF_SPACE_CUDA_MANAGED: 'cuda_managed'} SPACEMAP_FROM_STR = {'auto': _bf.BF_SPACE_AUTO, 'system': _bf.BF_SPACE_SYSTEM, + 'mapped': _bf.BF_SPACE_MAPPED, 'cuda': _bf.BF_SPACE_CUDA, 'cuda_host': _bf.BF_SPACE_CUDA_HOST, 'cuda_managed': _bf.BF_SPACE_CUDA_MANAGED} @@ -45,7 +47,7 @@ class Space(object): def __init__(self, s): if isinstance(s, str): - if s not in set(['auto', 'system', + if s not in set(['auto', 'system', 'mapped', 'cuda', 'cuda_host', 'cuda_managed']): raise ValueError('Invalid space: %s' % s) self._space = s diff --git a/python/bifrost/libbifrost.py b/python/bifrost/libbifrost.py index 08eda69c1..5d0c7dbc0 100644 --- a/python/bifrost/libbifrost.py +++ b/python/bifrost/libbifrost.py @@ -181,6 +181,7 @@ def _get(func, *args): STRING2SPACE = {'auto': _bf.BF_SPACE_AUTO, 'system': _bf.BF_SPACE_SYSTEM, + 'mapped': _bf.BF_SPACE_MAPPED, 'cuda': _bf.BF_SPACE_CUDA, 'cuda_host': _bf.BF_SPACE_CUDA_HOST, 'cuda_managed': _bf.BF_SPACE_CUDA_MANAGED} @@ -192,6 +193,7 @@ def _string2space(s): SPACE2STRING = {_bf.BF_SPACE_AUTO: 'auto', _bf.BF_SPACE_SYSTEM: 'system', + _bf.BF_SPACE_MAPPED: 'mapped', _bf.BF_SPACE_CUDA: 'cuda', _bf.BF_SPACE_CUDA_HOST: 'cuda_host', _bf.BF_SPACE_CUDA_MANAGED: 'cuda_managed'} diff --git a/python/bifrost/memory.py b/python/bifrost/memory.py index dc884e00b..472ddc541 100644 --- a/python/bifrost/memory.py +++ b/python/bifrost/memory.py @@ -41,10 +41,19 @@ def space_accessible(space, from_spaces): from_spaces = set(from_spaces) if space in from_spaces: return True + elif space in ('system', 'mapped'): + return 'system' in from_spaces \ + or 'mapped' in from_spaces \ + or 'cuda_host' in from_spaces \ + or 'cuda_managed' in from_spaces elif space == 'cuda_host': - return 'system' in from_spaces + return 'system' in from_spaces \ + or 'mapped' in from_spaces \ + or 'cuda_managed' in from_spaces elif space == 'cuda_managed': - return 'system' in from_spaces or 'cuda' in from_spaces + return 'system' in from_spaces \ + or 'mapped' in from_spaces \ + or 'cuda' in from_spaces else: return False diff --git a/python/bifrost/pipeline.py b/python/bifrost/pipeline.py index 4f16ab2f6..7f782bd93 100644 --- a/python/bifrost/pipeline.py +++ b/python/bifrost/pipeline.py @@ -190,6 +190,7 @@ def dot_graph(self, parent_graph=None): for oring in block.orings: space_colors = { 'system': 'orange', + 'mapped': 'goldenrod', 'cuda': 'limegreen', 'cuda_host': 'deepskyblue' } diff --git a/python/bifrost/version/__main__.py b/python/bifrost/version/__main__.py index d352987d6..8c6a4501c 100644 --- a/python/bifrost/version/__main__.py +++ b/python/bifrost/version/__main__.py @@ -28,6 +28,7 @@ from __future__ import print_function +import os import argparse from bifrost import __version__, __copyright__, __license__ @@ -49,6 +50,9 @@ def _yes_no(value): print(" NUMA support %s" % _yes_no(BF_NUMA_ENABLED)) print(" Hardware locality support: %s" % _yes_no(BF_HWLOC_ENABLED)) print(" Mellanox messaging accelerator (VMA) support: %s" % _yes_no(BF_VMA_ENABLED)) + print(" Mapped ring directory: %s" % BF_MAPPED_RING_DIR) + if os.getenv('BIFROST_MAPPED_DIR') is not None: + print("Mapped ring directory: override - %s" % os.getenv('BIFROST_MAPPED_DIR')) print(" Logging directory: %s" % BF_PROCLOG_DIR) print(" Debugging: %s" % _yes_no(BF_DEBUG_ENABLED)) print(" CUDA support: %s" % _yes_no(BF_CUDA_ENABLED)) diff --git a/src/bifrost/config.h.in b/src/bifrost/config.h.in index 5e58f1c95..4960c8d1c 100644 --- a/src/bifrost/config.h.in +++ b/src/bifrost/config.h.in @@ -66,6 +66,9 @@ extern "C" { #define BF_TRACE_ENABLED @HAVE_TRACE@ #define BF_CUDA_DEBUG_ENABLED @HAVE_CUDA_DEBUG@ +// Mapped ring directory +#define BF_MAPPED_RING_DIR "@MAPPED_RING_DIR@" + // Logging directory #define BF_PROCLOG_DIR "@HAVE_TMPFS@" diff --git a/src/bifrost/memory.h b/src/bifrost/memory.h index 48624ae3c..d2ff7b5a7 100644 --- a/src/bifrost/memory.h +++ b/src/bifrost/memory.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, The Bifrost Authors. All rights reserved. + * Copyright (c) 2016-2019, The Bifrost Authors. All rights reserved. * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -43,9 +43,10 @@ extern "C" { typedef enum BFspace_ { BF_SPACE_AUTO = 0, BF_SPACE_SYSTEM = 1, // aligned_alloc - BF_SPACE_CUDA = 2, // cudaMalloc - BF_SPACE_CUDA_HOST = 3, // cudaHostAlloc - BF_SPACE_CUDA_MANAGED = 4 // cudaMallocManaged + BF_SPACE_MAPPED = 2, // mmapped to a file + BF_SPACE_CUDA = 3, // cudaMalloc + BF_SPACE_CUDA_HOST = 4, // cudaHostAlloc + BF_SPACE_CUDA_MANAGED = 5 // cudaMallocManaged } BFspace; BFstatus bfMalloc(void** ptr, BFsize size, BFspace space); diff --git a/src/fileutils.hpp b/src/fileutils.hpp index 7664a1cfb..5c938e709 100644 --- a/src/fileutils.hpp +++ b/src/fileutils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, The Bifrost Authors. All rights reserved. + * Copyright (c) 2016-2022, The Bifrost Authors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/src/memory.cpp b/src/memory.cpp index 415847802..1df86b4df 100644 --- a/src/memory.cpp +++ b/src/memory.cpp @@ -1,5 +1,5 @@ -/* - * Copyright (c) 2016, The Bifrost Authors. All rights reserved. +/* + * Copyright (c) 2016-2020, The Bifrost Authors. All rights reserved. * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,20 +32,195 @@ #include "utils.hpp" #include "cuda.hpp" #include "trace.hpp" +#include "fileutils.hpp" #include // For posix_memalign #include // For memcpy #include +#include +#include // For flock +#include // For fstat +#include // For getpid +#include // For opendir, readdir, closedir +#include +#include +#include +#include #define BF_IS_POW2(x) (x) && !((x) & ((x) - 1)) static_assert(BF_IS_POW2(BF_ALIGNMENT), "BF_ALIGNMENT must be a power of 2"); #undef BF_IS_POW2 //static_assert(BF_ALIGNMENT >= 8, "BF_ALIGNMENT must be >= 8"); +#if defined(__APPLE__) && __APPLE__ + +// Based on information from: +// https://hg.mozilla.org/mozilla-central/file/3d846420a907/xpcom/glue/FileUtils.cpp#l61 + +int posix_fallocate(int fd, off_t offset, off_t len) { + fstore_t flags; + flags.fst_flags = F_ALLOCATECONTIG | F_ALLOCATEALL; + flags.fst_posmode = F_PEOFPOSMODE; + flags.fst_offset = offset; + flags.fst_length = len; + + if( fcntl(fd, F_PREALLOCATE, &flags) == -1 ) { + // Try again but this time don't request a contiguous file + flags.fst_flags = F_ALLOCATEALL; + if( fcntl(fd, F_PREALLOCATE, &flags) == -1 ) { + return -1; + } + } + + return ftruncate(fd, len); +} + +#endif + +class MappedMgr { + const char* base_mapped_dir = ((std::getenv("BIFROST_MAPPED_DIR") != NULL) \ + ? std::getenv("BIFROST_MAPPED_DIR") \ + : BF_MAPPED_RING_DIR); + std::string _mapped_dir; + std::map _filenames; + std::map _fds; + std::map _lengths; + + void try_base_mapped_dir_cleanup() { + // Do this with a file lock to avoid interference from other processes + LockFile lock(std::string(base_mapped_dir) + ".lock"); + DIR* dp; + // Remove pid dirs for which a corresponding process does not exist + if( (dp = opendir(base_mapped_dir)) ) { + struct dirent* ep; + while( (ep = readdir(dp)) ) { + pid_t pid = atoi(ep->d_name); + if( pid && !process_exists(pid) ) { + remove_files_recursively(std::string(base_mapped_dir) + "/" + + std::to_string(pid)); + } + } + closedir(dp); + } + // Remove the base_logdir if it's empty + try { remove_dir(base_mapped_dir); } + catch( std::exception const& ) {} + } + void cleanup(std::string filename, int fd) { + if( fd >= 0 ) { + ::close(fd); + } + try { remove_file(filename); } + catch( std::exception const& ) {} + } + MappedMgr() + : _mapped_dir(std::string(base_mapped_dir) + "/" + std::to_string(getpid())) { + this->try_base_mapped_dir_cleanup(); + make_dir(base_mapped_dir, 0777); + make_dir(_mapped_dir); + } + ~MappedMgr() { + while(! _filenames.empty() ) { + auto x = _filenames.begin(); + this->free(x->first); + } + try { + remove_files_recursively(_mapped_dir); + this->try_base_mapped_dir_cleanup(); + } catch( std::exception const& ) {} + } +public: + MappedMgr(MappedMgr& ) = delete; + MappedMgr& operator=(MappedMgr& ) = delete; + static MappedMgr& get() { + static MappedMgr mm; + return mm; + } + inline bool is_mapped(void* data) const { + if( _filenames.count(data) == 0 ) { + return false; + } else { + return true; + } + } + int alloc(void** data, BFsize size) { + // Create + char tempname[256]; + strcpy(tempname, _mapped_dir.c_str()); + strcat(tempname, "/mmapXXXXXX"); + int fd = ::mkstemp(tempname); + std::string filename = std::string(tempname); + if( fd < 0 ) { + this->cleanup(filename, fd); + return 1; + } + + // Allocate + int status = ::posix_fallocate(fd, 0, size); + if( status != 0 ) { + this->cleanup(filename, fd); + return 2; + } + + // MMap + *data = ::mmap(nullptr, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if( *data == MAP_FAILED ) { + this->cleanup(filename, fd); + return 3; + } + + // Advise the kernel of how we'll use it + ::madvise(*data, size, MADV_SEQUENTIAL); + + // Save and return + _filenames[*data] = filename; + _fds[*data] = fd; + _lengths[*data] = size; + return 0; + } + int sync(void* data) { + if( !this->is_mapped(data) ) { + return -1; + } + + return ::msync(data, _lengths[data], MS_ASYNC|MS_INVALIDATE); + } + void* memcpy(void* dest, void* src, BFsize count) { + ::memcpy(dest, src, count); + if( this->is_mapped(dest) ) { + this->sync(dest); + } + return dest; + } + void* memset(void* dest, int ch, BFsize count) { + ::memset(dest, ch, count); + if( this->is_mapped(dest) ) { + this->sync(dest); + } + return dest; + } + int free(void* data) { + if( !this->is_mapped(data) ) { + return -1; + } + + ::munmap(data, _lengths[data]); + this->cleanup(_filenames[data], _fds[data]); + _filenames.erase(data); + _fds.erase(data); + _lengths.erase(data); + return 0; + } +}; + BFstatus bfGetSpace(const void* ptr, BFspace* space) { BF_ASSERT(ptr, BF_STATUS_INVALID_POINTER); -#if !defined BF_CUDA_ENABLED || !BF_CUDA_ENABLED - *space = BF_SPACE_SYSTEM; +#if !defined(BF_CUDA_ENABLED) || !BF_CUDA_ENABLED + if( MappedMgr::get().is_mapped((void*) ptr) ) { + *space = BF_SPACE_MAPPED; + } else { + *space = BF_SPACE_SYSTEM; + } #else cudaPointerAttributes ptr_attrs; cudaError_t ret = cudaPointerGetAttributes(&ptr_attrs, ptr); @@ -57,7 +232,11 @@ BFstatus bfGetSpace(const void* ptr, BFspace* space) { // up in cuda-memcheck? // Note: cudaPointerGetAttributes only works for memory allocated with // CUDA API functions, so if it fails we just assume sysmem. - *space = BF_SPACE_SYSTEM; + if( MappedMgr::get().is_mapped((void*) ptr) ) { + *space = BF_SPACE_MAPPED; + } else { + *space = BF_SPACE_SYSTEM; + } // WAR to avoid the ignored failure showing up later cudaGetLastError(); #if defined(CUDA_VERSION) && CUDA_VERSION >= 10000 @@ -95,10 +274,10 @@ const char* bfGetSpaceString(BFspace space) { // coding all of these values twice (one in memory.h for the // enum, once here)? - switch( space ) { case BF_SPACE_AUTO: return "auto"; case BF_SPACE_SYSTEM: return "system"; + case BF_SPACE_MAPPED: return "mapped"; case BF_SPACE_CUDA: return "cuda"; case BF_SPACE_CUDA_HOST: return "cuda_host"; case BF_SPACE_CUDA_MANAGED: return "cuda_managed"; @@ -118,7 +297,12 @@ BFstatus bfMalloc(void** ptr, BFsize size, BFspace space) { //printf("bfMalloc --> %p\n", data); break; } -#if defined BF_CUDA_ENABLED && BF_CUDA_ENABLED + case BF_SPACE_MAPPED: { + int err = MappedMgr::get().alloc((void**)&data, size); + BF_ASSERT(!err, BF_STATUS_MEM_ALLOC_FAILED); + break; + } +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA: { BF_CHECK_CUDA(cudaMalloc((void**)&data, size), BF_STATUS_MEM_ALLOC_FAILED); @@ -150,7 +334,8 @@ BFstatus bfFree(void* ptr, BFspace space) { } switch( space ) { case BF_SPACE_SYSTEM: ::free(ptr); break; -#if defined BF_CUDA_ENABLED && BF_CUDA_ENABLED + case BF_SPACE_MAPPED: MappedMgr::get().free(ptr); break; +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA: cudaFree(ptr); break; case BF_SPACE_CUDA_HOST: cudaFreeHost(ptr); break; case BF_SPACE_CUDA_MANAGED: cudaFree(ptr); break; @@ -167,30 +352,39 @@ BFstatus bfMemcpy(void* dst, if( count ) { BF_ASSERT(dst, BF_STATUS_INVALID_POINTER); BF_ASSERT(src, BF_STATUS_INVALID_POINTER); -#if !defined BF_CUDA_ENABLED || !BF_CUDA_ENABLED - ::memcpy(dst, src, count); -#else // Note: Explicitly dispatching to ::memcpy was found to be much faster // than using cudaMemcpyDefault. if( src_space == BF_SPACE_AUTO ) bfGetSpace(src, &src_space); if( dst_space == BF_SPACE_AUTO ) bfGetSpace(dst, &dst_space); +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED cudaMemcpyKind kind = cudaMemcpyDefault; +#endif switch( src_space ) { +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA_HOST: // fall-through +#endif + case BF_SPACE_MAPPED: // fall-through case BF_SPACE_SYSTEM: { switch( dst_space ) { +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA_HOST: // fall-through +#endif + case BF_SPACE_MAPPED: // fall-through case BF_SPACE_SYSTEM: ::memcpy(dst, src, count); return BF_STATUS_SUCCESS; +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA: kind = cudaMemcpyHostToDevice; break; // Is this the right thing to do? case BF_SPACE_CUDA_MANAGED: kind = cudaMemcpyDefault; break; +#endif default: BF_FAIL("Valid bfMemcpy dst space", BF_STATUS_INVALID_ARGUMENT); } break; } +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA: { switch( dst_space ) { case BF_SPACE_CUDA_HOST: // fall-through + case BF_SPACE_MAPPED: // fall-through # TODO: Is this a good idea? case BF_SPACE_SYSTEM: kind = cudaMemcpyDeviceToHost; break; case BF_SPACE_CUDA: kind = cudaMemcpyDeviceToDevice; break; case BF_SPACE_CUDA_MANAGED: kind = cudaMemcpyDefault; break; @@ -200,8 +394,10 @@ BFstatus bfMemcpy(void* dst, } // Is this the right thing to do? case BF_SPACE_CUDA_MANAGED: kind = cudaMemcpyDefault; break; +#endif default: BF_FAIL("Valid bfMemcpy src space", BF_STATUS_INVALID_ARGUMENT); } +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED BF_TRACE_STREAM(g_cuda_stream); BF_CHECK_CUDA(cudaMemcpyAsync(dst, src, count, kind, g_cuda_stream), BF_STATUS_MEM_OP_FAILED); @@ -235,30 +431,39 @@ BFstatus bfMemcpy2D(void* dst, if( width && height ) { BF_ASSERT(dst, BF_STATUS_INVALID_POINTER); BF_ASSERT(src, BF_STATUS_INVALID_POINTER); -#if !defined BF_CUDA_ENABLED || !BF_CUDA_ENABLED - memcpy2D(dst, dst_stride, src, src_stride, width, height); -#else // Note: Explicitly dispatching to ::memcpy was found to be much faster // than using cudaMemcpyDefault. if( src_space == BF_SPACE_AUTO ) bfGetSpace(src, &src_space); if( dst_space == BF_SPACE_AUTO ) bfGetSpace(dst, &dst_space); +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED cudaMemcpyKind kind = cudaMemcpyDefault; +#endif switch( src_space ) { +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA_HOST: // fall-through +#endif + case BF_SPACE_MAPPED: // fall-through case BF_SPACE_SYSTEM: { switch( dst_space ) { +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA_HOST: // fall-through +#endif + case BF_SPACE_MAPPED: // fall-through case BF_SPACE_SYSTEM: memcpy2D(dst, dst_stride, src, src_stride, width, height); return BF_STATUS_SUCCESS; +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA: kind = cudaMemcpyHostToDevice; break; // TODO: Is this the right thing to do? case BF_SPACE_CUDA_MANAGED: kind = cudaMemcpyDefault; break; +#endif default: BF_FAIL("Valid bfMemcpy2D dst space", BF_STATUS_INVALID_ARGUMENT); } break; } +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA: { switch( dst_space ) { case BF_SPACE_CUDA_HOST: // fall-through + case BF_SPACE_MAPPED: // fall-through # TODO: Is this a good idea? case BF_SPACE_SYSTEM: kind = cudaMemcpyDeviceToHost; break; case BF_SPACE_CUDA: kind = cudaMemcpyDeviceToDevice; break; // TODO: Is this the right thing to do? @@ -269,8 +474,10 @@ BFstatus bfMemcpy2D(void* dst, } // Is this the right thing to do? case BF_SPACE_CUDA_MANAGED: kind = cudaMemcpyDefault; break; +#endif default: BF_FAIL("Valid bfMemcpy2D src space", BF_STATUS_INVALID_ARGUMENT); } +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED BF_TRACE_STREAM(g_cuda_stream); BF_CHECK_CUDA(cudaMemcpy2DAsync(dst, dst_stride, src, src_stride, @@ -292,8 +499,9 @@ BFstatus bfMemset(void* ptr, bfGetSpace(ptr, &space); } switch( space ) { + case BF_SPACE_MAPPED: // fall-through case BF_SPACE_SYSTEM: ::memset(ptr, value, count); break; -#if defined BF_CUDA_ENABLED && BF_CUDA_ENABLED +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA_HOST: ::memset(ptr, value, count); break; case BF_SPACE_CUDA: // Fall-through case BF_SPACE_CUDA_MANAGED: { @@ -329,10 +537,11 @@ BFstatus bfMemset2D(void* ptr, bfGetSpace(ptr, &space); } switch( space ) { + case BF_SPACE_MAPPED: // fall-through case BF_SPACE_SYSTEM: memset2D(ptr, stride, value, width, height); break; -#if defined BF_CUDA_ENABLED && BF_CUDA_ENABLED +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED case BF_SPACE_CUDA_HOST: memset2D(ptr, stride, value, width, height); break; - case BF_SPACE_CUDA: // Fall-through + case BF_SPACE_CUDA: // fall-through case BF_SPACE_CUDA_MANAGED: { BF_TRACE_STREAM(g_cuda_stream); BF_CHECK_CUDA(cudaMemset2DAsync(ptr, stride, value, width, height, g_cuda_stream), diff --git a/src/ring_impl.cpp b/src/ring_impl.cpp index e0471b7ab..8d89c5e90 100644 --- a/src/ring_impl.cpp +++ b/src/ring_impl.cpp @@ -1,5 +1,5 @@ -/* - * Copyright (c) 2016, The Bifrost Authors. All rights reserved. +/* + * Copyright (c) 2016-2019, The Bifrost Authors. All rights reserved. * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -95,14 +95,16 @@ BFring_impl::BFring_impl(const char* name, BFspace space) _nread_open(0), _nwrite_open(0), _nrealloc_pending(0), _core(-1), _size_log(std::string("rings/")+name) { -#if defined BF_CUDA_ENABLED && BF_CUDA_ENABLED +#if defined(BF_CUDA_ENABLED) && BF_CUDA_ENABLED BF_ASSERT_EXCEPTION(space==BF_SPACE_SYSTEM || + space==BF_SPACE_MAPPED || space==BF_SPACE_CUDA || space==BF_SPACE_CUDA_HOST || space==BF_SPACE_CUDA_MANAGED, BF_STATUS_INVALID_ARGUMENT); #else - BF_ASSERT_EXCEPTION(space==BF_SPACE_SYSTEM, + BF_ASSERT_EXCEPTION(space==BF_SPACE_SYSTEM || + space==BF_SPACE_MAPPED, BF_STATUS_INVALID_ARGUMENT); #endif diff --git a/src/utils.hpp b/src/utils.hpp index db9b0d03b..84d56530f 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, The Bifrost Authors. All rights reserved. + * Copyright (c) 2016-2019, The Bifrost Authors. All rights reserved. * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -209,11 +209,17 @@ byteswap(T value, T* result) { } inline BFbool space_accessible_from(BFspace space, BFspace from) { -#if !defined BF_CUDA_ENABLED || !BF_CUDA_ENABLED - return space == BF_SPACE_SYSTEM; +#if !defined(BF_CUDA_ENABLED) || !BF_CUDA_ENABLED + return ( (space == BF_SPACE_SYSTEM) + || (space == BF_SPACE_MAPPED) ); #else switch( from ) { +#if !defined(BF_CUDA_ENABLED) || !BF_CUDA_ENABLED + case BF_CUDA_HOST: // fall-through +#endif + case BF_SPACE_MAPPED: // fall-through case BF_SPACE_SYSTEM: return (space == BF_SPACE_SYSTEM || + space == BF_SPACE_MAPPED || space == BF_SPACE_CUDA_HOST || space == BF_SPACE_CUDA_MANAGED); case BF_SPACE_CUDA: return (space == BF_SPACE_CUDA || diff --git a/test/test_pipeline.py b/test/test_pipeline.py index f40d30cac..b67cef804 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -116,14 +116,14 @@ def setUp(self): # ring buffer at least a few times over in order to properly # test things. self.fil_file = "./data/2chan16bitNoDM.fil" - def test_cuda_copy(self): + def test_cuda_copy(self, space='system'): def check_sequence(seq): pass def check_data(ispan): pass gulp_nframe = 101 with bf.Pipeline() as pipeline: - data = read_sigproc([self.fil_file], gulp_nframe) + data = read_sigproc([self.fil_file], gulp_nframe, space=space) for _ in range(10): data = copy(data, space='cuda') data = copy(data, space='cuda_host') @@ -132,6 +132,8 @@ def check_data(ispan): pipeline.run() self.assertEqual(ref['idata'].dtype, 'uint16') self.assertEqual(ref['idata'].shape, (29, 1, 2)) + def test_cuda_copy_mapped(self): + self.test_cuda_copy(space='mapped') def test_fdmt(self): gulp_nframe = 101 # TODO: Check handling of multiple pols (not currently supported?) diff --git a/test/test_pipeline_cpu.py b/test/test_pipeline_cpu.py index 6e1886014..c8e454ddf 100644 --- a/test/test_pipeline_cpu.py +++ b/test/test_pipeline_cpu.py @@ -70,7 +70,7 @@ def setUp(self): # ring buffer at least a few times over in order to properly # test things. self.fil_file = "./data/2chan16bitNoDM.fil" - def test_read_sigproc(self): + def test_read_sigproc(self, space='system'): gulp_nframe = 101 def check_sequence(seq): tensor = seq.header['_tensor'] @@ -84,11 +84,13 @@ def check_data(ispan, ospan): self.assertEqual(ispan.data.shape, (ispan.nframe,1,2)) self.assertEqual(ospan.data.shape, (ospan.nframe,1,2)) with bf.Pipeline() as pipeline: - data = read_sigproc([self.fil_file], gulp_nframe) - data = CallbackBlock(data, check_sequence, check_data) + data = read_sigproc([self.fil_file], gulp_nframe, space=space) + data = CallbackBlock(data, check_sequence, check_data, space='system') pipeline.run() + def test_read_sigproc_mapped(self): + self.test_read_sigproc(space='mapped') def run_test_simple_copy(self, guarantee, test_views=False, - gulp_nframe_inc=0): + gulp_nframe_inc=0, space='system'): def check_sequence(seq): hdr = seq.header tensor = hdr['_tensor'] @@ -101,7 +103,7 @@ def check_data(ispan, ospan): pass gulp_nframe = 101 with bf.Pipeline() as pipeline: - data = read_sigproc([self.fil_file], gulp_nframe) + data = read_sigproc([self.fil_file], gulp_nframe, space=space) if test_views: data = bf.views.split_axis(data, 'freq', 2, 'fine_freq') data = bf.views.merge_axes(data, 'freq', 'fine_freq') @@ -122,29 +124,44 @@ def check_data(ispan, ospan): for i in range(20): if gulp_nframe_inc != 0: data = copy(data, guarantee=guarantee, - gulp_nframe=gulp_nframe+i*gulp_nframe_inc) + gulp_nframe=gulp_nframe+i*gulp_nframe_inc, + space=space) else: - data = copy(data, guarantee=guarantee) + data = copy(data, guarantee=guarantee, space=space) data = copy(data, guarantee=guarantee, gulp_nframe=gulp_nframe) ref = {} - data = CallbackBlock(data, check_sequence, check_data, data_ref=ref) + data = CallbackBlock(data, check_sequence, check_data, data_ref=ref, space='system') pipeline.run() self.assertEqual(ref['odata'].dtype, 'uint16') self.assertEqual(ref['odata'].shape, (29, 1, 2)) def test_simple_copy(self): self.run_test_simple_copy(guarantee=True) + def test_simple_copy_mapped(self): + self.run_test_simple_copy(guarantee=True, space='mapped') def test_simple_copy_unguaranteed(self): self.run_test_simple_copy(guarantee=False) + def test_ssimple_copy_unguaranteed_mapped(self): + self.run_test_simple_copy(guarantee=False, space='mapped') def test_simple_copy_mixed_gulp_nframe(self): self.run_test_simple_copy(guarantee=True, gulp_nframe_inc=1) self.run_test_simple_copy(guarantee=True, gulp_nframe_inc=3) + def test_simple_copy_mixed_gulp_nframe_mapped(self): + self.run_test_simple_copy(guarantee=True, gulp_nframe_inc=1, space='mapped') + self.run_test_simple_copy(guarantee=True, gulp_nframe_inc=3, space='mapped') def test_simple_copy_mixed_gulp_nframe_unguaranteed(self): self.run_test_simple_copy(guarantee=False, gulp_nframe_inc=1) self.run_test_simple_copy(guarantee=False, gulp_nframe_inc=3) + def test_simple_copy_mixed_gulp_nframe_unguaranteed_mapped(self): + self.run_test_simple_copy(guarantee=False, gulp_nframe_inc=1, space='mapped') + self.run_test_simple_copy(guarantee=False, gulp_nframe_inc=3, space='mapped') def test_simple_views(self): self.run_test_simple_copy(guarantee=True, test_views=True) + def test_simple_views_mapped(self): + self.run_test_simple_copy(guarantee=True, test_views=True, space='mapped') def test_simple_views_unguaranteed(self): self.run_test_simple_copy(guarantee=False, test_views=True) + def test_simple_views_unguaranteed_mapped(self): + self.run_test_simple_copy(guarantee=False, test_views=True, space='mapped') def test_block_chainer(self): with bf.Pipeline() as pipeline: bc = bf.BlockChainer() diff --git a/test/test_serialize.py b/test/test_serialize.py index 4a60b7a95..2b774274c 100644 --- a/test/test_serialize.py +++ b/test/test_serialize.py @@ -77,10 +77,10 @@ def setUp(self): self.data = self.data[hdr_size:] self.basename = os.path.basename(self.fil_file) self.gulp_nframe = 101 - def run_test_serialize_with_name_no_ringlets(self, gulp_nframe_inc=0): + def run_test_serialize_with_name_no_ringlets(self, gulp_nframe_inc=0, space='system'): with TemporaryDirectory() as temp_path: with bf.Pipeline() as pipeline: - data = read_sigproc([self.fil_file], self.gulp_nframe, core=0) + data = read_sigproc([self.fil_file], self.gulp_nframe, core=0, space=space) for i in range(5): if gulp_nframe_inc != 0: data = copy(data, @@ -105,10 +105,14 @@ def test_serialize_with_name_no_ringlets(self): self.run_test_serialize_with_name_no_ringlets() self.run_test_serialize_with_name_no_ringlets(gulp_nframe_inc=1) self.run_test_serialize_with_name_no_ringlets(gulp_nframe_inc=3) - def test_serialize_with_time_tag_no_ringlets(self): + def test_serialize_with_name_no_ringlets_mapped(self): + self.run_test_serialize_with_name_no_ringlets(space='mapped') + self.run_test_serialize_with_name_no_ringlets(gulp_nframe_inc=1, space='mapped') + self.run_test_serialize_with_name_no_ringlets(gulp_nframe_inc=3, space='mapped') + def test_serialize_with_time_tag_no_ringlets(self, space='system'): with TemporaryDirectory() as temp_path: with bf.Pipeline() as pipeline: - data = read_sigproc([self.fil_file], self.gulp_nframe) + data = read_sigproc([self.fil_file], self.gulp_nframe, space=space) # Custom view sets sequence name to '', which causes SerializeBlock # to use the time_tag instead. data = bf.views.custom(data, lambda hdr: rename_sequence(hdr, '')) @@ -124,10 +128,12 @@ def test_serialize_with_time_tag_no_ringlets(self): with open(datpath, 'rb') as f: data = f.read() self.assertEqual(data, self.data) - def test_serialize_with_name_and_ringlets(self): + def test_serialize_with_time_tag_no_ringlets_mapped(self): + self.test_serialize_with_time_tag_no_ringlets(space='mapped') + def test_serialize_with_name_and_ringlets(self, space='system'): with TemporaryDirectory() as temp_path: with bf.Pipeline() as pipeline: - data = read_sigproc([self.fil_file], self.gulp_nframe) + data = read_sigproc([self.fil_file], self.gulp_nframe, space=space) # Transpose so that freq becomes a ringlet dimension # TODO: Test multiple ringlet dimensions (e.g., freq + pol) once # SerializeBlock supports it. @@ -147,10 +153,12 @@ def test_serialize_with_name_and_ringlets(self): self.data_size // 2) self.assertEqual(os.path.getsize(datpath1), self.data_size // 2) - def test_deserialize_no_ringlets(self): + def test_serialize_with_name_and_ringlets_mapped(self): + self.test_serialize_with_name_and_ringlets(space='mapped') + def test_deserialize_no_ringlets(self, space='system'): with TemporaryDirectory() as temp_path: with bf.Pipeline() as pipeline: - data = read_sigproc([self.fil_file], self.gulp_nframe) + data = read_sigproc([self.fil_file], self.gulp_nframe, space=space) serialize(data, temp_path) pipeline.run() @@ -169,10 +177,12 @@ def test_deserialize_no_ringlets(self): data = f.read() self.assertEqual(len(data), len(self.data)) self.assertEqual(data, self.data) - def test_deserialize_with_ringlets(self): + def test_deserialize_no_ringlets_mapped(self): + self.test_deserialize_no_ringlets(space='mapped') + def test_deserialize_with_ringlets(self, space='system'): with TemporaryDirectory() as temp_path: with bf.Pipeline() as pipeline: - data = read_sigproc([self.fil_file], self.gulp_nframe) + data = read_sigproc([self.fil_file], self.gulp_nframe, space=space) data = transpose(data, ['freq', 'time', 'pol']) serialize(data, temp_path) pipeline.run() @@ -198,3 +208,5 @@ def test_deserialize_with_ringlets(self): self.data_size // 2) self.assertEqual(os.path.getsize(datpath1), self.data_size // 2) + def test_deserialize_with_ringlets_mapped(self): + self.test_deserialize_with_ringlets(space='mapped')