diff --git a/CMakeLists.txt b/CMakeLists.txt index a642b385d..b714ee0ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,6 +100,11 @@ if(CUDA_ENABLE) list(APPEND DEFAULT_CUDA_ARCH "70") endif() endif() + # add Turing support for CUDA >= 10.0 + if(NOT CUDA_VERSION VERSION_LESS 10.0) + list(APPEND DEFAULT_CUDA_ARCH "75") + endif() + set(CUDA_ARCH "${DEFAULT_CUDA_ARCH}" CACHE STRING "Set GPU architecture (semicolon separated list, e.g. '-DCUDA_ARCH=20;35;60')") # generate comma separated list with architectures @@ -186,7 +191,10 @@ if(CUDA_ENABLE) endif() if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" AND - (CUDA_VERSION VERSION_EQUAL 9.0 OR CUDA_VERSION VERSION_EQUAL 9.1 OR CUDA_VERSION VERSION_EQUAL 9.2) + (CUDA_VERSION VERSION_EQUAL 9.0 OR + CUDA_VERSION VERSION_EQUAL 9.1 OR + CUDA_VERSION VERSION_EQUAL 9.2 OR + CUDA_VERSION VERSION_EQUAL 10.0) ) # workaround find_package(CUDA) is using the wrong path to the CXX host compiler # overwrite the CUDA host compiler variable with the used CXX MSVC @@ -435,6 +443,11 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") endif() endif() +if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "-Wl,-z,noexecstack ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "-Wl,-z,noexecstack ${CMAKE_C_FLAGS}") +endif() + # activate static libgcc and libstdc++ linking if(CMAKE_LINK_STATIC) set(BUILD_SHARED_LIBRARIES OFF) @@ -445,6 +458,27 @@ if(CMAKE_LINK_STATIC) endif() endif() +if(CMAKE_C_COMPILER_ID MATCHES "MSVC") + # asm optimized monero v8 code + enable_language(ASM_MASM) + set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" PROPERTY ASM_MASM) + add_library(xmr-stak-asm + STATIC + "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" + ) +else() + # asm optimized monero v8 code + enable_language(ASM) + set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY CPP) + set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") + add_library(xmr-stak-asm + STATIC + "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" + ) +endif() + +set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C) + # compile C files file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c") @@ -456,7 +490,7 @@ set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99) if(MICROHTTPD_ENABLE) target_link_libraries(xmr-stak-c ${MHTD}) endif() -target_link_libraries(xmr-stak-c ${LIBS}) +target_link_libraries(xmr-stak-c ${LIBS} xmr-stak-asm) # compile generic backend files file(GLOB BACKEND_CPP @@ -472,7 +506,7 @@ add_library(xmr-stak-backend STATIC ${BACKEND_CPP} ) -target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS}) +target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm) # compile CUDA backend if(CUDA_FOUND) @@ -499,7 +533,7 @@ if(CUDA_FOUND) ) endif() target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES}) - target_link_libraries(xmrstak_cuda_backend xmr-stak-backend) + target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm) endif() # compile AMD backend @@ -512,7 +546,7 @@ if(OpenCL_FOUND) ${OPENCLSRCFILES} ) target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} ) - target_link_libraries(xmrstak_opencl_backend xmr-stak-backend) + target_link_libraries(xmrstak_opencl_backend xmr-stak-backend xmr-stak-asm) endif() # compile final binary @@ -528,7 +562,7 @@ endif() set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}") set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}") -target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend) +target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm) ################################################################################ # Install diff --git a/README.md b/README.md index 887bc5cf3..046a930e1 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ###### fireice-uk's and psychocrypt's # XMR-Stak - Cryptonight All-in-One Mining Software -XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA gpus and can be used to mine the crypto currencys Monero, Aeon and many more Cryptonight coins. +XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins. ## HTML reports @@ -28,7 +28,7 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV - supports algorithm cryptonight for Monero (XMR) and cryptonight-light (AEON) - easy to use - guided start (no need to edit a config file for the first start) - - auto configuration for each backend + - auto-configuration for each backend - open source software (GPLv3) - TLS support - [HTML statistics](doc/usage.md#html-and-json-api-report-configuraton) @@ -45,10 +45,14 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this - [Haven](https://havenprotocol.com) - [Intense](https://intensecoin.com) - [Masari](https://getmasari.org) -- [Ryo](https://ryo-currency.com) +- [QRL](https://theqrl.org) +- **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo** - [TurtleCoin](https://turtlecoin.lol) -If your prefered coin is not listed, you can chose one of the following algorithms: +Ryo currency is a way for us to implement the ideas that we were unable to in +Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details. + +If your prefered coin is not listed, you can choose one of the following algorithms: - 1MiB scratchpad memory - cryptonight_lite @@ -59,11 +63,12 @@ If your prefered coin is not listed, you can chose one of the following algorith - cryptonight_masari - cryptonight_v7 - cryptonight_v7_stellite + - cryptonight_v8 - 4MiB scratchpad memory - cryptonight_haven - cryptonight_heavy -Please note, this list is not complete, and is not an endorsement. +Please note, this list is not complete and is not an endorsement. ## Download @@ -71,7 +76,7 @@ You can find the latest releases and precompiled binaries on GitHub under [Relea ## Default Developer Donation -By default the miner will donate 2% of the hashpower (2 minute in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries. +By default, the miner will donate 2% of the hashpower (2 minutes in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries. If you want to donate directly to support further development, here is my wallet diff --git a/doc/FAQ.md b/doc/FAQ.md index 2fccdd8a4..f744e3d24 100644 --- a/doc/FAQ.md +++ b/doc/FAQ.md @@ -9,8 +9,8 @@ * [Virus Protection Alert](#virus-protection-alert) * [Change Currency to Mine](#change-currency-to-mine) * [How can I mine Monero](#how-can-i-mine-monero) -* [Why is Monero named monero7](#why-is-monero-named-monero7) * [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed) +* [Internal compiler error: Killed (program cc1plus)](#internal-compiler-error) ## "Obtaining SeLockMemoryPrivilege failed." @@ -44,20 +44,35 @@ Download and install this [runtime package](https://go.microsoft.com/fwlink/?Lin ## Error: MEMORY ALLOC FAILED: mmap failed -On Linux you will need to configure large page support and increase your ulimit -l. +On Linux you will need to configure large page support and increase your memlock limit (`ulimit -l`). -To set large page support, add the following lines to `/etc/sysctl.conf` (`/etc/sysctl.d/xmr-stak.conf` for [Arch Linux](https://www.archlinux.org/news/deprecation-of-etcsysctlconf/) and its derivatives): +Never put settings directly into `/etc/sysctl.conf` or `/etc/security/limits.conf` as those are system defaults and can be replaced in upgrades, and custom settings in that file are deprecated in all distros since at least wheezy/trusty (has been illegal in RedHat based distros for longer than that), and will be even more deprecated with systemd (it no longer even reads sysctl.conf, ONLY sysctl.d files, for example - there is a link to the old `/etc/sysctl.conf` for backward compatibility but that can go away at any time). Also adding to `/etc/rc.local` is extra incorrect, systemd does not even use that file anymore (once the sysvinit compatibility layer is gone, rc.local will no longer work). + +To check current settings, run `/sbin/sysctl vm.nr_hugepages ; ulimit -l` as whatever user you will run `xmr-stak` as (example shows bad/low sample defaults): + + $ /sbin/sysctl vm.nr_hugepages ; ulimit -l + vm.nr_hugepages = 0 + 16 + +To set large page support, add the following lines to `/etc/sysctl.d/60-hugepages.conf`: vm.nr_hugepages=128 -To increase the ulimit, add following lines to `/etc/security/limits.conf`: +You WILL need to run `sudo sysctl --system` for these settings to take effect on your system (or reboot). In some cases (many threads, very large CPU, etc) you may need more than 128 (try 256 if there are still complaints from thread inits) - * soft memlock 262144 - * hard memlock 262144 +To increase the memlock (ulimit -l), add following lines to `/etc/security/limits.d/60-memlock.conf`: + + * - memlock 262144 + root - memlock 262144 You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session). +Recheck after completing these steps to validate: + + $ /sbin/sysctl vm.nr_hugepages ; ulimit -l + vm.nr_hugepages = 128 + 262144 -You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. +You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. Also running as root does not properly get around the `ulimit -l` being large enough (and limits `*` does not apply to `root` either, it must be specified explicitly). ## Illegal Instruction @@ -72,18 +87,18 @@ If your antivirus software flags **xmr-stak**, it will likely move it to its qua If the miner is compiled for Monero and Aeon than you can change - the value `currency` in the config *or* - - start the miner with the [command line option](usage.md) `--currency monero7` or `--currency aeon7` + - start the miner with the [command line option](usage.md) `--currency monero` or `--currency aeon7` - run `xmr-stak --help` to see all supported currencies and algorithms ## How can I mine Monero -Set the value `currency` in `pools.txt` to `monero7`. - -## Why is Monero named monero7 - -To avoid configuration conflicts after the hard fork of Monero to the new POW with our old naming schema where all cryptonight currencies was selected by choosing `monero` as currency we decided to switch to the name `monero7`. +Set the value `currency` in `pools.txt` to `monero`. ## Which currency must be chosen if my fork coin is not listed If your coin you want to mine is not listed please check the documentation of the coin and try to find out if `cryptonight` or `cryptonight-lite` is the used algorithm. Select one of these generic coin algorithms. + +## Internal compiler error + +Seeing `g++: internal compiler error: Killed (program cc1plus)` is probably related to not enough RAM to compile. 1 Gb RAM should be enough (it is on clean Ubuntu 16.04). diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md index 072402ff7..79e036ef7 100644 --- a/doc/compile_Linux.md +++ b/doc/compile_Linux.md @@ -4,7 +4,7 @@ ### AMD APP SDK 3.0 (only needed to use AMD GPUs) -- download and install the latest version from https://www.dropbox.com/sh/mpg882ekirnsfa7/AADWz5X-TgVdsmWt0QwMgTWLa/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2?dl=0 +- download and install the latest version from http://debian.nullivex.com/amd/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2 (see https://github.com/fireice-uk/xmr-stak/issues/1511#issuecomment-385120692) (do not wonder why it is a link to a dropbox but AMD has removed the SDK downloads, see https://community.amd.com/thread/228059) ### Cuda 8.0+ (only needed to use NVIDIA GPUs) @@ -105,6 +105,8 @@ In that case you can force CUDA to use an older compiler in the following way: cmake -DCUDA_HOST_COMPILER=/usr/bin/gcc-5 .. ``` +- You need 1 Gb RAM to compile (a bit less might be enough, 512 Mb isn't). + ### To do a generic and static build for a system without gcc 5.1+ ``` cmake -DCMAKE_LINK_STATIC=ON -DXMR-STAK_COMPILE=generic . diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md index 802d5c5ab..add5fbfd0 100644 --- a/doc/compile_Windows.md +++ b/doc/compile_Windows.md @@ -32,8 +32,8 @@ ### AMD APP SDK 3.0 (only needed for AMD GPUs) -- Download and install the latest version from https://www.dropbox.com/s/gq8vqhelq0m6gj4/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe - (do not wonder why it is a link to a dropbox but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855) +- Download and install the latest version from http://amd-dev.wpengine.netdna-cdn.com/app-sdk/installers/APPSDKInstaller/3.0.130.135-GA/full/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe + (do not wonder why it is a link to a netdna-cdn.com but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855) ### Dependencies OpenSSL/Hwloc and Microhttpd - For CUDA 8*: diff --git a/doc/tuning.md b/doc/tuning.md index 6bf036e9f..2673d68d9 100644 --- a/doc/tuning.md +++ b/doc/tuning.md @@ -9,6 +9,7 @@ * [AMD Backend](#amd-backend) * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize) * [Add more GPUs](#add-more-gpus) + * [Two Threads per GPU](two-threads-per-gpu) * [disable comp_mode](#disable-comp_mode) * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern) * [Increase Memory Pool](#increase-memory-pool) @@ -55,10 +56,10 @@ To add a new GPU you need to add a new config set to `gpu_threads_conf`. "gpu_threads_conf" : [ { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, - "affine_to_cpu" : false, "sync_mode" : 3, + "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1, }, { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, - "affine_to_cpu" : false, "sync_mode" : 3, + "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1, }, ], ``` @@ -82,11 +83,37 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to ``` "gpu_threads_conf" : [ - { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true + { + "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, - { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true + { + "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + }, +], + +"platform_index" : 0, +``` + +### Two Threads per GPU + +Some GPUs like AMD Vega can mine faster if two threads are using the same GPU. +Use the auto generated config as base and repeat the config entry for a GPU. +If the attribute `index` is used twice than two threads will use one GPU. +Take care that the required memory usage on the GPU will also double. +Therefore adjust your intensity by hand. + +``` +"gpu_threads_conf" : +[ + { + "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + }, + { + "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, ], diff --git a/doc/usage.md b/doc/usage.md index 886c1b319..a371f0e67 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -5,6 +5,7 @@ * [Usage on Windows](#usage-on-windows) * [Usage on Linux](#usage-on-linux) * [Command Line Options](#command-line-options) +* [Use different backends](#use-different-backends) * [HTML and JSON API report configuraton](#html-and-json-api-report-configuraton) ## Configurations @@ -34,6 +35,33 @@ Note: If the pool is ignoring the option `rig_id` in `pools.txt` to name your wo The miner allow to overwrite some of the settings via command line options. Run `xmr-stak --help` to show all available command line options. +## Use Different Backends + +On linux and OSX please add `./` before the binary name `xmr-stak`. + +### CPU Only: +``` +xmr-stak --noAMD --noNVIDIA +``` + +### NVIDIA/AMD Only: + +The miner will automatically detect if CUDA (for NVIDIA GPUs) or OpenCL (for AMD GPUs) is available. + +``` +xmr-stak --noCPU +``` +**CUDA** is currently not supported. I am currently try to get some performance out it. + +### NVIDIA via OpenCL + +It is possible to use the OpenCl backend which is originally created for AMD GPUs with NVIDIA GPus. +Some NVIDIA GPUs can reach better performance with this backend. + +``` +xmr-stak --openCLVendor NVIDIA --noNVIDIA +``` + ## Docker image usage You can run the Docker image the following way: diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 87721ac8f..7c7aff788 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -17,6 +17,7 @@ #include "xmrstak/jconf.hpp" #include "xmrstak/picosha2/picosha2.hpp" #include "xmrstak/params.hpp" +#include "xmrstak/version.hpp" #include #include @@ -375,6 +376,13 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } + std::vector openCLDriverVer(1024); + if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx ); + return ERR_OCL_API; + } + xmrstak_algo miner_algo[2] = { ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(), ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() @@ -388,11 +396,31 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ int threadMemMask = cn_select_mask(miner_algo[ii]); int hashIterations = cn_select_iter(miner_algo[ii]); - char options[512]; - snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d", - hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0, - int_port(hashMemSize), int(miner_algo[ii])); + size_t mem_chunk_exp = 1u << ctx->memChunk; + size_t strided_index = ctx->stridedIndex; + /* Adjust the config settings to a valid combination + * this is required if the dev pool is mining monero + * but the user tuned there settings for another currency + */ + if(miner_algo[ii] == cryptonight_monero_v8) + { + if(ctx->memChunk < 2) + mem_chunk_exp = 1u << 2; + if(strided_index == 1) + strided_index = 0; + } + + std::string options; + options += " -DITERATIONS=" + std::to_string(hashIterations); + options += " -DMASK=" + std::to_string(threadMemMask); + options += " -DWORKSIZE=" + std::to_string(ctx->workSize); + options += " -DSTRIDED_INDEX=" + std::to_string(strided_index); + options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp); + options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u); + options += " -DMEMORY=" + std::to_string(hashMemSize); + options += " -DALGO=" + std::to_string(miner_algo[ii]); + options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); + /* create a hash for the compile time cache * used data: * - source code @@ -402,6 +430,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ std::string src_str(source_code); src_str += options; src_str += devNameVec.data(); + src_str += get_version_str(); + src_str += openCLDriverVer.data(); + std::string hash_hex_str; picosha2::hash256_hex_string(src_str, hash_hex_str); @@ -418,7 +449,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options, NULL, NULL); + ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options.c_str(), NULL, NULL); if(ret != CL_SUCCESS) { size_t len; @@ -594,27 +625,6 @@ const char* const attributeNames[] = { #define NELEMS(x) (sizeof(x) / sizeof((x)[0])) -void PrintDeviceInfo(cl_device_id device) -{ - char queryBuffer[1024]; - int queryInt; - cl_int clError; - clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL); - printf(" CL_DEVICE_NAME: %s\n", queryBuffer); - queryBuffer[0] = '\0'; - clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL); - printf(" CL_DEVICE_VENDOR: %s\n", queryBuffer); - queryBuffer[0] = '\0'; - clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL); - printf(" CL_DRIVER_VERSION: %s\n", queryBuffer); - queryBuffer[0] = '\0'; - clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL); - printf(" CL_DEVICE_VERSION: %s\n", queryBuffer); - queryBuffer[0] = '\0'; - clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL); - printf(" CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt); -} - uint32_t getNumPlatforms() { cl_uint num_platforms = 0; @@ -885,6 +895,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) //char* source_code = LoadTextFile(sSourcePath); + const char *fastIntMathV2CL = + #include "./opencl/fast_int_math_v2.cl" + ; const char *cryptonightCL = #include "./opencl/cryptonight.cl" ; @@ -905,6 +918,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) ; std::string source_code(cryptonightCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); @@ -914,13 +928,20 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + for(int i = 0; i < num_gpus; ++i) { + const std::string backendName = xmrstak::params::inst().openCLVendor; if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) { size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; ctx[i].rawIntensity = reduced_intensity; - const std::string backendName = xmrstak::params::inst().openCLVendor; printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity)); } diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 5ab80b82a..63c5029d7 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -27,6 +27,7 @@ struct GpuContext size_t workSize; int stridedIndex; int memChunk; + int unroll = 0; bool isNVIDIA = false; int compMode; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 002472d3a..9c9bcd08e 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -35,8 +35,8 @@ R"===( inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2) { uint2 result; - result.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2)); - result.s1 = (uint) (((((long)src0.s1) << 32) | (long)src1.s1) >> (src2)); + result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2)); + result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2)); return result; } #endif @@ -78,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width) } #endif +//#include "opencl/fast_int_math_v2.cl" +XMRSTAK_INCLUDE_FAST_INT_MATH_V2 //#include "opencl/wolf-aes.cl" XMRSTAK_INCLUDE_WOLF_AES //#include "opencl/wolf-skein.cl" @@ -416,6 +418,9 @@ void AESExpandKey256(uint *keybuf) } } +)===" +R"===( + #define MEM_CHUNK (1<> 8)); + /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA) + * handle get_global_id and get_global_offset as signed long long int and add + * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1` + * (even if it is correct casted to unsigned on the host) + */ + ((uint *)State)[10] |= (((uint)get_global_id(0) >> 8)); for(int i = 11; i < 25; ++i) State[i] = 0x00UL; @@ -551,7 +561,15 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, } mem_fence(CLK_GLOBAL_MEM_FENCE); } - + +// cryptonight_monero_v8 && NVIDIA +#if(ALGO==11 && defined(__NV_CL_C_VERSION)) +# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) +# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) +#else +# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)]) +#endif + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 @@ -560,9 +578,29 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ) { - ulong a[2], b[2]; + ulong a[2]; + +// cryptonight_monero_v8 +#if(ALGO==11) + ulong b[4]; + uint4 b_x[2]; +// NVIDIA +# ifdef __NV_CL_C_VERSION + __local uint16 scratchpad_line_buf[WORKSIZE]; + __local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0); +# endif +#else + ulong b[2]; + uint4 b_x[1]; +#endif __local uint AES0[256], AES1[256], AES2[256], AES3[256]; +// cryptonight_monero_v8 +#if(ALGO==11) + __local uint RCP[256]; + uint2 division_result; + uint sqrt_result; +#endif const ulong gIdx = getIdx(); for(int i = get_local_id(0); i < 256; i += WORKSIZE) @@ -572,6 +610,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states AES1[i] = rotate(tmp, 8U); AES2[i] = rotate(tmp, 16U); AES3[i] = rotate(tmp, 24U); +// cryptonight_monero_v8 +#if(ALGO==11) + RCP[i] = RCP_C[i]; +#endif } barrier(CLK_LOCAL_MEM_FENCE); @@ -579,7 +621,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) uint2 tweak1_2; #endif - uint4 b_x; + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) @@ -599,13 +641,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states a[1] = states[1] ^ states[5]; b[1] = states[3] ^ states[7]; - b_x = ((uint4 *)b)[0]; + b_x[0] = ((uint4 *)b)[0]; + +// cryptonight_monero_v8 +#if(ALGO==11) + a[1] = states[1] ^ states[5]; + b[2] = states[8] ^ states[10]; + b[3] = states[9] ^ states[11]; + b_x[1] = ((uint4 *)b)[1]; + division_result = as_uint2(states[12]); + sqrt_result = as_uint2(states[13]).s0; +#endif // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) tweak1_2 = as_uint2(input[4]); tweak1_2.s0 >>= 24; tweak1_2.s0 |= tweak1_2.s1 << 8; - tweak1_2.s1 = get_global_id(0); + tweak1_2.s1 = (uint)get_global_id(0); tweak1_2 ^= as_uint2(states[24]); #endif } @@ -617,39 +669,102 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states if(gIdx < Threads) #endif { - ulong idx0 = a[0]; + ulong idx0 = a[0] & MASK; - #pragma unroll 8 + #pragma unroll CN_UNROLL for(int i = 0; i < ITERATIONS; ++i) { ulong c[2]; +// cryptonight_monero_v8 && NVIDIA +#if(ALGO==11 && defined(__NV_CL_C_VERSION)) + ulong idxS = idx0 & 0x30; + *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; +#endif - ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)]; + ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); // cryptonight_bittube2 #if(ALGO == 10) ((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); #else ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif - b_x ^= ((uint4 *)c)[0]; + +// cryptonight_monero_v8 +#if(ALGO==11) + { + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif + // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) uint table = 0x75310U; + b_x[0] ^= ((uint4 *)c)[0]; // cryptonight_stellite # if(ALGO == 7) - uint index = ((b_x.s2 >> 27) & 12) | ((b_x.s2 >> 23) & 2); + uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2); +# else + uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2); +# endif + b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; + SCRATCHPAD_CHUNK(0) = b_x[0]; + idx0 = c[0] & MASK; +// cryptonight_monero_v8 +#elif(ALGO==11) + SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; +# ifdef __NV_CL_C_VERSION + // flush shuffled data + SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; + idx0 = c[0] & MASK; + idxS = idx0 & 0x30; + *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; # else - uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2); + idx0 = c[0] & MASK; # endif - b_x.s2 ^= ((table >> index) & 0x30U) << 24; +#else + b_x[0] ^= ((uint4 *)c)[0]; + SCRATCHPAD_CHUNK(0) = b_x[0]; + idx0 = c[0] & MASK; #endif - Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x; - uint4 tmp; - tmp = Scratchpad[IDX((c[0] & MASK) >> 4)]; - - a[1] += c[0] * as_ulong2(tmp).s0; - a[0] += mul_hi(c[0], as_ulong2(tmp).s0); + tmp = SCRATCHPAD_CHUNK(0); +// cryptonight_monero_v8 +#if(ALGO==11) + // Use division and square root results from the _previous_ iteration to hide the latency + tmp.s0 ^= division_result.s0; + tmp.s1 ^= division_result.s1 ^ sqrt_result; + // Most and least significant bits in the divisor are set to 1 + // to make sure we don't divide by a small or even number, + // so there are no shortcuts for such cases + const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL; + // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 + // We drop the highest bit to fit both quotient and remainder in 32 bits + division_result = fast_div_v2(RCP, c[1], d); + // Use division_result as an input for the square root to prevent parallel implementation in hardware + sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result)); +#endif + ulong2 result_mul; + result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0); + result_mul.s1 = c[0] * as_ulong2(tmp).s0; +// cryptonight_monero_v8 +#if(ALGO==11) + { + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul; + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + result_mul ^= chunk2; + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif + a[1] += result_mul.s1; + a[0] += result_mul.s0; // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) @@ -658,44 +773,55 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states # if(ALGO == 6 || ALGO == 10) uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; # else ((uint2 *)&(a[1]))[0] ^= tweak1_2; - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= tweak1_2; # endif #else - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; #endif ((uint4 *)a)[0] ^= tmp; - idx0 = a[0]; - b_x = ((uint4 *)c)[0]; +// cryptonight_monero_v8 +#if (ALGO == 11) +# if defined(__NV_CL_C_VERSION) + // flush shuffled data + SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; +# endif + b_x[1] = b_x[0]; +#endif + b_x[0] = ((uint4 *)c)[0]; + idx0 = a[0] & MASK; // cryptonight_heavy || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 10) - long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; + long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; - idx0 = d ^ q; -#endif + *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; + idx0 = (d ^ q) & MASK; // cryptonight_haven -#if (ALGO == 9) - long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; +#elif (ALGO == 9) + long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; - idx0 = (~d) ^ q; + *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; + idx0 = ((~d) ^ q) & MASK; #endif + } } mem_fence(CLK_GLOBAL_MEM_FENCE); } +)===" +R"===( + __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads) { @@ -918,7 +1044,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u { ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = BranchBuf[idx] + get_global_offset(0); + output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); } } mem_fence(CLK_GLOBAL_MEM_FENCE); @@ -994,7 +1120,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint { ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = BranchBuf[idx] + get_global_offset(0); + output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); } } } @@ -1072,7 +1198,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u { ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = BranchBuf[idx] + get_global_offset(0); + output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); } } } @@ -1095,7 +1221,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global #pragma unroll 4 for(uint i = 0; i < 4; ++i) { - ulong H[8], M[8]; + volatile ulong H[8], M[8]; if(i < 3) { @@ -1133,7 +1259,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global { ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = BranchBuf[idx] + get_global_offset(0); + output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); } } } diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl new file mode 100644 index 000000000..607806b7a --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -0,0 +1,108 @@ +R"===( +/* + * @author SChernykh + */ +static const __constant uint RCP_C[256] = +{ + 0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu, + 0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au, + 0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu, + 0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u, + 0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u, + 0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u, + 0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u, + 0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u, + 0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u, + 0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u, + 0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u, + 0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u, + 0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au, + 0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u, + 0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u, + 0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu, + 0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u, + 0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u, + 0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u, + 0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u, + 0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu, + 0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u, + 0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u, + 0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u, + 0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u, + 0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu, + 0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau, + 0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u, + 0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u, + 0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu, + 0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u, + 0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u, +}; + +inline uint get_reciprocal(const __local uchar *RCP, uint a) +{ + const uint index1 = (a & 0x7F000000U) >> 21; + const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768; + + const uint r1 = *(const __local uint*)(RCP + index1); + + uint r2_0 = *(const __local uint*)(RCP + index1 + 4); + if (index2 > 0) r2_0 >>= 16; + const int r2 = r2_0 & 0xFFFFU; + + const uint r = r1 - (uint)(mul24(r2, index2) >> 6); + + const ulong lo0 = (ulong)(r) * a; + ulong lo = lo0 + ((ulong)(a) << 32); + + a >>= 1; + const bool b = (a >= lo) || (lo >= lo0); + lo = a - lo; + + const ulong k = mul_hi(as_uint2(lo).s0, r) + ((ulong)(r) * as_uint2(lo).s1) + lo; + return as_uint2(k).s1 + (b ? r : 0); +} + +inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) +{ + const uint r = get_reciprocal((const __local uchar *)RCP, b); + const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a; + + ulong q; + ((uint*)&q)[0] = as_uint2(k).s1;; + ((uint*)&q)[1] = (k < a) ? 1 : 0; + + const long tmp = a - q * b; + const bool overshoot = (tmp < 0); + const bool undershoot = (tmp >= b); + + return (uint2)( + as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U), + as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U) + ); +} + +inline uint fast_sqrt_v2(const ulong n1) +{ + float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); + + float x1 = native_rsqrt(x); + x = native_sqrt(x); + + // The following line does x1 *= 4294967296.0f; + x1 = as_float(as_uint(x1) + (32U << 23)); + + const uint x0 = as_uint(x) - (158U << 23); + const long delta0 = n1 - (((long)(x0) * x0) << 18); + const float delta = convert_float_rte(as_int2(delta0).s1) * x1; + + uint result = (x0 << 10) + convert_int_rte(delta); + const uint s = result >> 1; + const uint b = result & 1; + + const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; + if ((long)(x2 + b) > 0) --result; + if ((long)(x2 + 0x100000000UL + s) < 0) ++result; + + return result; +} +)===" diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index d6acec971..c5b331c87 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -127,6 +127,24 @@ class autoAdjust minFreeMem = 512u * byteToMiB; } + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + + // set strided index to default + ctx.stridedIndex = 1; + + // nvidia performance is very bad if the scratchpad is not contiguous + if(ctx.isNVIDIA) + ctx.stridedIndex = 0; + + // use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2` + if(useCryptonight_v8) + ctx.stridedIndex = 2; + // increase all intensity limits by two for aeon if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) maxThreads *= 2u; @@ -153,8 +171,8 @@ class autoAdjust // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : " + (ctx.isNVIDIA ? "0" : "1") + ", \"mem_chunk\" : 2,\n" - " \"comp_mode\" : true\n" + + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" + " \"unroll\" : 8, \"comp_mode\" : true\n" + " },\n"; } else diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 28855f070..421e0ed4b 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -1,4 +1,5 @@ -R"===( +R"===(// generated by XMRSTAK_VERSION + /* * GPU configuration. You should play around with intensity and worksize as the fastest settings will vary. * index - GPU index number usually starts from 0 @@ -9,17 +10,20 @@ R"===( * 2 = chunked memory, chunk size is controlled by 'mem_chunk' * required: intensity must be a multiple of worksize * 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks + * (for cryptonight_v8 and monero it is equal to strided_index = 0) * 0 or false = use a contiguous block of memory per thread * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk * this value is only used if 'strided_index' == 2 * element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte) + * unroll - allow to control how often the POW main loop is unrolled; valid range [1;128) - for most OpenCL implementations it must be a power of two. * comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows * to use a intensity which is not the multiple of the worksize. * If you set false and the intensity is not multiple of the worksize the miner can crash: * in this case set the intensity to a multiple of the worksize or activate comp_mode. * "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + * "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, * ], * If you do not wish to mine with your AMD GPU(s) then use: * "gpu_threads_conf" : @@ -34,5 +38,4 @@ GPUCONFIG * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel. */ "platform_index" : PLATFORMINDEX, - )===" diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 0f39ff2b9..152f8add4 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -106,17 +106,18 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); stridedIndex = GetObjectMember(oThdConf, "strided_index"); memChunk = GetObjectMember(oThdConf, "mem_chunk"); + unroll = GetObjectMember(oThdConf, "unroll"); compMode = GetObjectMember(oThdConf, "comp_mode"); if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr || - stridedIndex == nullptr || compMode == nullptr) + stridedIndex == nullptr || unroll == nullptr || compMode == nullptr) return false; if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) @@ -142,13 +143,20 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) return false; } + if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18 ) + { + printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18"); + return false; + } + cfg.memChunk = (int)memChunk->GetInt64(); - if(!idx->IsUint64() || cfg.memChunk > 18 ) + if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 || (int)unroll->GetInt64() == 0) { - printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18"); + printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and not zero"); return false; } + cfg.unroll = (int)unroll->GetInt64(); if(!compMode->IsBool()) return false; diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index 580b69fe7..b852c5940 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -28,6 +28,7 @@ class jconf long long cpu_aff; int stridedIndex; int memChunk; + int unroll; bool compMode; }; diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index f7b47249e..5e70f25a6 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -99,6 +99,7 @@ bool minethd::init_gpus() vGpuData[i].stridedIndex = cfg.stridedIndex; vGpuData[i].memChunk = cfg.memChunk; vGpuData[i].compMode = cfg.compMode; + vGpuData[i].unroll = cfg.unroll; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; @@ -173,6 +174,11 @@ void minethd::work_main() cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); + if(cpu_ctx == nullptr) + { + printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory, miner will be stopped."); + win_exit(1); + } // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); @@ -235,7 +241,7 @@ void minethd::work_main() if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo) break; } - + cl_uint results[0x100]; memset(results,0,sizeof(cl_uint)*(0x100)); @@ -252,7 +258,7 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = results[i]; - hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx); + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx); if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp index 3142117c5..32e66ec87 100644 --- a/xmrstak/backend/amd/minethd.hpp +++ b/xmrstak/backend/amd/minethd.hpp @@ -24,14 +24,14 @@ class minethd : public iBackend static bool init_gpus(); private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg); void work_main(); uint64_t iJobNo; - + miner_work oWork; std::promise order_fix; diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp index 525413fd5..92bb01506 100644 --- a/xmrstak/backend/backendConnector.cpp +++ b/xmrstak/backend/backendConnector.cpp @@ -63,10 +63,35 @@ std::vector* BackendConnector::thread_starter(miner_work& pWork) #ifndef CONF_NO_CUDA if(params::inst().useNVIDIA) { - plugin nvidiaplugin("NVIDIA", "xmrstak_cuda_backend"); - std::vector* nvidiaThreads = nvidiaplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst()); - pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads)); - if(nvidiaThreads->size() == 0) + plugin nvidiaplugin; + std::vector* nvidiaThreads; + std::vector libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"}; + size_t numWorkers = 0u; + + for( const auto & name : libNames) + { + printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str()); + nvidiaplugin.load("NVIDIA", name); + std::vector* nvidiaThreads = nvidiaplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst()); + if(nvidiaThreads != nullptr) + { + pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads)); + numWorkers = nvidiaThreads->size(); + delete nvidiaThreads; + } + else + { + // remove the plugin if we have found no GPUs + nvidiaplugin.unload(); + } + // we found at leat one working GPU + if(numWorkers != 0) + { + printer::inst()->print_msg(L0, "NVIDIA: use library '%s'", name.c_str()); + break; + } + } + if(numWorkers == 0) printer::inst()->print_msg(L0, "WARNING: backend NVIDIA disabled."); } #endif @@ -75,10 +100,17 @@ std::vector* BackendConnector::thread_starter(miner_work& pWork) if(params::inst().useAMD) { const std::string backendName = xmrstak::params::inst().openCLVendor; - plugin amdplugin(backendName, "xmrstak_opencl_backend"); + plugin amdplugin; + amdplugin.load(backendName, "xmrstak_opencl_backend"); std::vector* amdThreads = amdplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst()); - pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads)); - if(amdThreads->size() == 0) + size_t numWorkers = 0u; + if(amdThreads != nullptr) + { + pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads)); + numWorkers = amdThreads->size(); + delete amdThreads; + } + if(numWorkers == 0) printer::inst()->print_msg(L0, "WARNING: backend %s (OpenCL) disabled.", backendName.c_str()); } #endif diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index 57dbef053..e7f3e9148 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -7,6 +7,7 @@ #include "xmrstak/misc/configEditor.hpp" #include "xmrstak/params.hpp" #include "xmrstak/backend/cryptonight.hpp" +#include "xmrstak/backend/cpu/cpuType.hpp" #include #ifdef _WIN32 @@ -20,14 +21,6 @@ namespace xmrstak { namespace cpu { -// Mask bits between h and l and return the value -// This enables us to put in values exactly like in the manual -// For example EBX[31:22] is get_masked(cpu_info[1], 31, 22) -inline int32_t get_masked(int32_t val, int32_t h, int32_t l) -{ - val &= (0x7FFFFFFF >> (31-(h-l))) << l; - return val >> l; -} class autoAdjust { @@ -58,7 +51,7 @@ class autoAdjust if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048)) printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size); - conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); + conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n"); printer::inst()->print_msg(L0, "Autoconf FAILED. Create config for a single thread. Please try to add new ones until the hashrate slows down."); } else @@ -82,7 +75,7 @@ class autoAdjust conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); conf += std::to_string(aff_id); conf += std::string(" },\n"); @@ -143,7 +136,8 @@ class autoAdjust L3KB_size = get_masked(cpu_info[3], 31, 18) * 512; ::jconf::cpuid(1, 0, cpu_info); - if(get_masked(cpu_info[0], 11, 8) < 0x17) //0x17h is Zen + + if(getModel().family < 0x17) //0x17h is Zen old_amd = true; return true; diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index 01d2280d8..b61582588 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -70,7 +70,7 @@ class autoAdjust { conf += std::string(" { \"low_power_mode\" : "); conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); conf += std::to_string(id & 0x7FFFFFF); conf += std::string(" },\n"); } @@ -78,7 +78,8 @@ class autoAdjust catch(const std::runtime_error& err) { // \todo add fallback to default auto adjust - conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n"); + conf += std::string(" { \"low_power_mode\" : false"); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n"); printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what()); } diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index 2fc9a47ec..1a64860e4 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -1,4 +1,5 @@ -R"===( +R"===(// generated by XMRSTAK_VERSION + /* * Thread configuration for each thread. Make sure it matches the number above. * low_power_mode - This can either be a boolean (true or false), or a number between 1 to 5. When set to true, @@ -7,10 +8,16 @@ R"===( * the maximum performance. When set to a number N greater than 1, this mode will increase the * cache usage and single thread performance by N times. * - * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make + * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make * things slower. * - * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading + * asm - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, off, intel_avx, amd_avx] + * - auto: xmr-stak will automatically detect the asm type (default) + * - off: disable the usage of optimized assembler + * - intel_avx: supports Intel cpus with avx instructions e.g. Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx + * - amd_avx: supports AMD cpus with avx instructions e.g. AMD Ryzen 1xxx and 2xxx series + * + * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading * systems it is better to assign threads to physical cores. On Windows this usually means selecting * even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 * physical core CPU you should select cpu numbers 0-3. @@ -21,8 +28,8 @@ R"===( * A filled out configuration should look like this: * "cpu_threads_conf" : * [ - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 }, - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 }, * ], * If you do not wish to mine with your CPU(s) then use: * "cpu_threads_conf" : @@ -33,5 +40,4 @@ R"===( [ CPUCONFIG ], - )===" diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp new file mode 100644 index 000000000..5959b75cc --- /dev/null +++ b/xmrstak/backend/cpu/cpuType.cpp @@ -0,0 +1,79 @@ + +#include "xmrstak/backend/cpu/cpuType.hpp" + +#include +#include +#include + +#ifdef _WIN32 +#define strcasecmp _stricmp +#include +#else +#include +#endif + +namespace xmrstak +{ +namespace cpu +{ + void cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) + { + std::memset(val, 0, sizeof(int32_t)*4); + + #ifdef _WIN32 + __cpuidex(val, eax, ecx); + #else + __cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]); + #endif + } + + int32_t get_masked(int32_t val, int32_t h, int32_t l) + { + val &= (0x7FFFFFFF >> (31-(h-l))) << l; + return val >> l; + } + + bool has_feature(int32_t val, int32_t bit) + { + int32_t mask = 1 << bit; + return (val & mask) != 0u; + + } + + Model getModel() + { + int32_t cpu_info[4]; + char cpustr[13] = {0}; + + cpuid(0, 0, cpu_info); + std::memcpy(cpustr, &cpu_info[1], 4); + std::memcpy(cpustr+4, &cpu_info[3], 4); + std::memcpy(cpustr+8, &cpu_info[2], 4); + + Model result; + + cpuid(1, 0, cpu_info); + + result.family = get_masked(cpu_info[0], 12, 8); + result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4; + result.type_name = cpustr; + + // feature bits https://en.wikipedia.org/wiki/CPUID + // sse2 + result.sse2 = has_feature(cpu_info[3], 26); + // aes-ni + result.aes = has_feature(cpu_info[2], 25); + // avx + result.avx = has_feature(cpu_info[2], 28); + + if(strcmp(cpustr, "AuthenticAMD") == 0) + { + if(result.family == 0xF) + result.family += get_masked(cpu_info[0], 28, 20); + } + + return result; + } + +} // namespace cpu +} // namespace xmrstak diff --git a/xmrstak/backend/cpu/cpuType.hpp b/xmrstak/backend/cpu/cpuType.hpp new file mode 100644 index 000000000..7f6bfaf51 --- /dev/null +++ b/xmrstak/backend/cpu/cpuType.hpp @@ -0,0 +1,32 @@ +#pragma once + +#include +#include + + +namespace xmrstak +{ +namespace cpu +{ + struct Model + { + uint32_t family = 0u; + uint32_t model = 0u; + bool aes = false; + bool sse2 = false; + bool avx = false; + std::string type_name = "unknown"; + }; + + Model getModel(); + + /** Mask bits between h and l and return the value + * + * This enables us to put in values exactly like in the manual + * For example EBX[30:22] is get_masked(cpu_info[1], 31, 22) + */ + int32_t get_masked(int32_t val, int32_t h, int32_t l); + + +} // namespace cpu +} // namespace xmrstak diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc new file mode 100644 index 000000000..79adab671 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 524288 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + +ALIGN 16 +main_loop_double_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_sandybridge +div_fix_1_ret_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_sandybridge +div_fix_2_ret_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_sandybridge +sqrt_fix_1_ret_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_sandybridge +sqrt_fix_2_ret_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_sandybridge_endp + +div_fix_1_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_sandybridge + +div_fix_2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_sandybridge + +sqrt_fix_1_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_sandybridge + +sqrt_fix_2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_sandybridge + +cnv2_double_mainloop_asm_sandybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc new file mode 100644 index 000000000..ad8f18233 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 524288 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movd xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movd xmm7, rax + + mov eax, 1 + shl rax, 52 + movd xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movd xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movd xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movd xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movd xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movd xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN 64 +main_loop_double_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movd xmm0, r11 + movd xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movd r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movd xmm0, rbp + movd xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movd rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movd rdx, xmm5 + shl rdx, 32 + movd rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movd xmm0, rdx + xor rdx, [r11+r13] + movd xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movd r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movd r11, xmm0 + psrldq xmm1, 8 + movd r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movd rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movd rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movd r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_sandybridge +div_fix_1_ret_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_sandybridge +div_fix_2_ret_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movd r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_sandybridge +sqrt_fix_1_ret_sandybridge: + + movd r9, xmm10 + psrldq xmm1, 8 + movd r8, xmm1 + test r8, 524287 + je sqrt_fix_2_sandybridge +sqrt_fix_2_ret_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movd xmm0, rax + movd xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_sandybridge_endp + +div_fix_1_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_sandybridge + +div_fix_2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_sandybridge + +sqrt_fix_1_sandybridge: + movd r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movd xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_sandybridge + +sqrt_fix_2_sandybridge: + psrldq xmm3, 8 + movd r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movd xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_sandybridge + +cnv2_double_mainloop_asm_sandybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S new file mode 100644 index 000000000..c0a3d0b41 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S @@ -0,0 +1,37 @@ +#define ALIGN .align +.intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn +.section .text +#endif +.global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm) +.global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm) +.global FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm) + +ALIGN 8 +FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cryptonight_v8_main_loop_ivybridge_linux.inc" + add rsp, 48 + ret 0 + +ALIGN 8 +FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cryptonight_v8_main_loop_ryzen_linux.inc" + add rsp, 48 + ret 0 + +ALIGN 16 +FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include "cryptonight_v8_double_main_loop_sandybridge_linux.inc" + add rsp, 48 + ret 0 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm new file mode 100644 index 000000000..1f3d2e15c --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm @@ -0,0 +1,25 @@ +_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cryptonight_v8_mainloop_ivybridge_asm +PUBLIC cryptonight_v8_mainloop_ryzen_asm +PUBLIC cryptonight_v8_double_mainloop_sandybridge_asm + +ALIGN 8 +cryptonight_v8_mainloop_ivybridge_asm PROC + INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc + ret 0 +cryptonight_v8_mainloop_ivybridge_asm ENDP + +ALIGN 8 +cryptonight_v8_mainloop_ryzen_asm PROC + INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc + ret 0 +cryptonight_v8_mainloop_ryzen_asm ENDP + +ALIGN 8 +cryptonight_v8_double_mainloop_sandybridge_asm PROC + INCLUDE cryptonight_v8_double_main_loop_sandybridge_win64.inc + ret 0 +cryptonight_v8_double_mainloop_sandybridge_asm ENDP + +_TEXT_CNV8_MAINLOOP ENDS +END diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc new file mode 100644 index 000000000..cbe43b0d3 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc @@ -0,0 +1,187 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 524288 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN 8 +main_loop_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] + movq rdx, xmm3 + test edx, 524287 + je sqrt_fixup_ivybridge + psrlq xmm3, 19 + psubq xmm3, XMMWORD PTR [rsp+16] +sqrt_fixup_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne main_loop_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_main_loop_ivybridge_endp + +sqrt_fixup_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp sqrt_fixup_ivybridge_ret + +cnv2_main_loop_ivybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc new file mode 100755 index 000000000..8d49c5db7 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc @@ -0,0 +1,187 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 524288 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movd xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movd xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN 8 +main_loop_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movd xmm0, r11 + movd xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movd rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movd rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movd rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movd xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] + movd rdx, xmm3 + test edx, 524287 + je sqrt_fixup_ivybridge + psrlq xmm3, 19 + psubq xmm3, XMMWORD PTR [rsp+16] +sqrt_fixup_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movd xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movd xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne main_loop_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_main_loop_ivybridge_endp + +sqrt_fixup_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movd xmm3, rdx + jmp sqrt_fixup_ivybridge_ret + +cnv2_main_loop_ivybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc new file mode 100644 index 000000000..cd8b43477 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc @@ -0,0 +1,179 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 8 +main_loop_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_ryzen + shr rdi, 19 + +sqrt_fixup_ryzen_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne main_loop_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_ryzen_endp + +sqrt_fixup_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_ryzen_ret + +cnv2_main_loop_ryzen_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc new file mode 100755 index 000000000..d103cc2ee --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc @@ -0,0 +1,179 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movd xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 8 +main_loop_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movd xmm0, r11 + movd xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movd r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movd rax, xmm0 + + div r9 + movd xmm0, rax + movd xmm1, rdx + punpckldq xmm0, xmm1 + movd r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movd rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_ryzen + shr rdi, 19 + +sqrt_fixup_ryzen_ret: + mov rax, rsi + mul r14 + movd xmm1, rax + movd xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne main_loop_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_ryzen_endp + +sqrt_fixup_ryzen: + movd r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_ryzen_ret + +cnv2_main_loop_ryzen_endp: diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 9f70bcfa7..2b1741764 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -19,6 +19,8 @@ #include "xmrstak/backend/cryptonight.hpp" #include #include +#include +#include #ifdef __GNUC__ #include @@ -151,15 +153,15 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7) { - __m128i tmp0 = x0; - x0 = _mm_xor_si128(x0, x1); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_xor_si128(x2, x3); - x3 = _mm_xor_si128(x3, x4); - x4 = _mm_xor_si128(x4, x5); - x5 = _mm_xor_si128(x5, x6); - x6 = _mm_xor_si128(x6, x7); - x7 = _mm_xor_si128(x7, tmp0); + __m128i tmp0 = x0; + x0 = _mm_xor_si128(x0, x1); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_xor_si128(x2, x3); + x3 = _mm_xor_si128(x3, x4); + x4 = _mm_xor_si128(x4, x5); + x5 = _mm_xor_si128(x5, x6); + x6 = _mm_xor_si128(x6, x7); + x7 = _mm_xor_si128(x7, tmp0); } template @@ -422,6 +424,29 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } +inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0) +{ + __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); + x = _mm_sqrt_sd(_mm_setzero_pd(), x); + uint64_t r = static_cast(_mm_cvtsi128_si64(_mm_castpd_si128(x))); + + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); + +#ifdef __INTEL_COMPILER + _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned __int64*)&x2), r, 0, (unsigned __int64*)&r); +#elif defined(_MSC_VER) || (__GNUC__ >= 7) + _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); +#else + // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence + // Fallback to simpler code + if (x2 < n0) ++r; +#endif + return r; +} + inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) { alignas(16) uint32_t k[4]; @@ -467,712 +492,511 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) } -template -void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0) +/** optimal type for sqrt + * + * Depending on the number of hashes calculated the optimal type for the sqrt value will be selected. + * + * @tparam N number of hashes per thread + */ +template +struct GetOptimalSqrtType { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); - - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) - { - memset(output, 0, 32); - return; - } - - keccak((const uint8_t *)input, len, ctx0->hash_state, 200); + using type = __m128i; +}; - uint64_t monero_const; - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - { - monero_const = *reinterpret_cast(reinterpret_cast(input) + 35); - monero_const ^= *(reinterpret_cast(ctx0->hash_state) + 24); - } - - // Optim - 99% time boundary - cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); - - uint8_t* l0 = ctx0->long_state; - uint64_t* h0 = (uint64_t*)ctx0->hash_state; - - uint64_t al0 = h0[0] ^ h0[4]; - uint64_t ah0 = h0[1] ^ h0[5]; - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - - uint64_t idx0 = h0[0] ^ h0[4]; - - // Optim - 90% time boundary - for(size_t i = 0; i < ITERATIONS; i++) - { - __m128i cx; - cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]); +template<> +struct GetOptimalSqrtType<1u> +{ + using type = uint64_t; +}; +template +using GetOptimalSqrtType_t = typename GetOptimalSqrtType::type; + +/** assign a value and convert if necessary + * + * @param output output type + * @param input value which is assigned to output + * @{ + */ +inline void assign(__m128i& output, const uint64_t input) +{ + output = _mm_cvtsi64_si128(input); +} - if (ALGO == cryptonight_bittube2) - { - cx = aes_round_bittube2(cx, _mm_set_epi64x(ah0, al0)); - } - else - { - if(SOFT_AES) - cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); - else - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); - } +inline void assign(uint64_t& output, const uint64_t input) +{ + output = input; +} - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - else - _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); +inline void assign(uint64_t& output, const __m128i& input) +{ + output = _mm_cvtsi128_si64(input); +} +/** @} */ - idx0 = _mm_cvtsi128_si64(cx); +inline void set_float_rounding_mode() +{ +#ifdef _MSC_VER + _control87(RC_DOWN, MCW_RC); +#else + std::fesetround(FE_DOWNWARD); +#endif +} - if(PREFETCH) - _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0); - bx0 = cx; +#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*)&l0[idx0 & MASK])[0]; - ch = ((uint64_t*)&l0[idx0 & MASK])[1]; +#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ + hi ^= ((uint64_t*)&chunk2)[0]; \ + lo ^= ((uint64_t*)&chunk2)[1]; \ + const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } - lo = _umul128(idx0, cl, &hi); +#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ + if(ALGO == cryptonight_monero_v8) \ + { \ + uint64_t sqrt_result_tmp; \ + assign(sqrt_result_tmp, sqrt_result); \ + /* Use division and square root results from the _previous_ iteration to hide the latency */ \ + const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \ + const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \ + /* Most and least significant bits in the divisor are set to 1 \ + * to make sure we don't divide by a small or even number, \ + * so there are no shortcuts for such cases \ + * \ + * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \ + * We drop the highest bit to fit both quotient and remainder in 32 bits \ + */ \ + /* Compiler will optimize it to a single div instruction */ \ + const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + const uint64_t division_result = static_cast(cx_s / d) + ((cx_s % d) << 32); \ + division_result_xmm = _mm_cvtsi64_si128(static_cast(division_result)); \ + /* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \ + assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \ + } - al0 += hi; - ((uint64_t*)&l0[idx0 & MASK])[0] = al0; - al0 ^= cl; - if(PREFETCH) - _mm_prefetch((const char*)&l0[al0 & MASK], _MM_HINT_T0); - ah0 += lo; +#define CN_INIT_SINGLE \ + if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ + { \ + memset(output, 0, 32 * N); \ + return; \ + } - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const ^ ((uint64_t*)&l0[idx0 & MASK])[0]; - else - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const; - } - else - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0; - ah0 ^= ch; +#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \ + keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ + uint64_t monero_const; \ + if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ + { \ + monero_const = *reinterpret_cast(reinterpret_cast(input) + len * n + 35); \ + monero_const ^= *(reinterpret_cast(ctx[n]->hash_state) + 24); \ + } \ + /* Optim - 99% time boundary */ \ + cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state); \ + \ + __m128i ax0; \ + uint64_t idx0; \ + __m128i bx0; \ + uint8_t* l0 = ctx[n]->long_state; \ + /* BEGIN cryptonight_monero_v8 variables */ \ + __m128i bx1; \ + __m128i division_result_xmm; \ + GetOptimalSqrtType_t sqrt_result; \ + /* END cryptonight_monero_v8 variables */ \ + { \ + uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ + idx0 = h0[0] ^ h0[4]; \ + ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ + bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ + assign(sqrt_result, h0[13]); \ + set_float_rounding_mode(); \ + } \ + } \ + __m128i *ptr0 - idx0 = al0; +#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ + __m128i cx; \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + cx = _mm_load_si128(ptr0); \ + if (ALGO == cryptonight_bittube2) \ + { \ + cx = aes_round_bittube2(cx, ax0); \ + } \ + else \ + { \ + if(SOFT_AES) \ + cx = soft_aesenc(cx, ax0); \ + else \ + cx = _mm_aesenc_si128(cx, ax0); \ + } \ + CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) - { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); +#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ + if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ + cryptonight_monero_tweak((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \ + else \ + _mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \ + idx0 = _mm_cvtsi128_si64(cx); \ + \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ + if(ALGO != cryptonight_monero_v8) \ + bx0 = cx + +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \ + uint64_t lo, cl, ch; \ + uint64_t al0 = _mm_cvtsi128_si64(ax0); \ + uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ + cl = ((uint64_t*)ptr0)[0]; \ + ch = ((uint64_t*)ptr0)[1]; \ + CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ + { \ + uint64_t hi; \ + lo = _umul128(idx0, cl, &hi); \ + CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ + ah0 += lo; \ + al0 += hi; \ + } \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = bx0; \ + bx0 = cx; \ + } \ + ((uint64_t*)ptr0)[0] = al0; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr0, _MM_HINT_T0) - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = d ^ q; - } - else if(ALGO == cryptonight_haven) - { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); +#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ + if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ + { \ + if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \ + ((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \ + else \ + ((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \ + } \ + else \ + ((uint64_t*)ptr0)[1] = ah0; \ + al0 ^= cl; \ + ah0 ^= ch; \ + ax0 = _mm_set_epi64x(ah0, al0); \ + idx0 = al0; - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = (~d) ^ q; - } +#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ + if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \ + { \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + int64_t u = ((int64_t*)ptr0)[0]; \ + int32_t d = ((int32_t*)ptr0)[2]; \ + int64_t q = u / (d | 0x5); \ + \ + ((int64_t*)ptr0)[0] = u ^ q; \ + idx0 = d ^ q; \ + } \ + else if(ALGO == cryptonight_haven) \ + { \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + int64_t u = ((int64_t*)ptr0)[0]; \ + int32_t d = ((int32_t*)ptr0)[2]; \ + int64_t q = u / (d | 0x5); \ + \ + ((int64_t*)ptr0)[0] = u ^ q; \ + idx0 = (~d) ^ q; \ } - // Optim - 90% time boundary - cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); - - // Optim - 99% time boundary +#define CN_FINALIZE(n) \ + /* Optim - 90% time boundary */ \ + cn_implode_scratchpad((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state); \ + /* Optim - 99% time boundary */ \ + keccakf((uint64_t*)ctx[n]->hash_state, 24); \ + extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n) - keccakf((uint64_t*)ctx0->hash_state, 24); - extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); -} +//! defer the evaluation of an macro +#ifndef _MSC_VER +# define CN_DEFER(...) __VA_ARGS__ +#else +# define CN_EMPTY(...) +# define CN_DEFER(...) __VA_ARGS__ CN_EMPTY() +#endif -// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon -// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output -// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) -template -void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +//! execute the macro f with the passed arguments +#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__) + +/** add append n to all arguments and keeps n as first argument + * + * @param n number which is appended to the arguments (expect the first argument n) + * + * @code{.cpp} + * CN_ENUM_2(1, foo, bar) + * // is transformed to + * 1, foo1, bar1 + * @endcode + */ +#define CN_ENUM_0(n, ...) n +#define CN_ENUM_1(n, x1) n, x1 ## n +#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n +#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n +#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n +#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n +#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n +#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n +#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n +#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n +#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n +#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n +#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n +#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n +#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n +#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n + +/** repeat a macro call multiple times + * + * @param n number of arguments followed after f + * @param f name of the macro which should be executed + * @param ... n parameter which name will get appended by a unique number + * + * @code{.cpp} + * REPEAT_2(2, f, foo, bar) + * // is transformed to + * f(0, foo0, bar); f(1, foo1, bar1) + * @endcode + */ +#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)) +#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)) +#define REPEAT_3(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)) +#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)) +#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__)) + +template< size_t N> +struct Cryptonight_hash; + +template< > +struct Cryptonight_hash<1> { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + static constexpr size_t N = 1; - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - memset(output, 0, 64); - return; - } - - keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); - keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200); - - uint64_t monero_const_0, monero_const_1; - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - { - monero_const_0 = *reinterpret_cast(reinterpret_cast(input) + 35); - monero_const_0 ^= *(reinterpret_cast(ctx[0]->hash_state) + 24); - monero_const_1 = *reinterpret_cast(reinterpret_cast(input) + len + 35); - monero_const_1 ^= *(reinterpret_cast(ctx[1]->hash_state) + 24); - } - - // Optim - 99% time boundary - cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); - cn_explode_scratchpad((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state); + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; + CN_INIT_SINGLE; + REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); - uint64_t axl0 = h0[0] ^ h0[4]; - uint64_t axh0 = h0[1] ^ h0[5]; - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - uint64_t axl1 = h1[0] ^ h1[4]; - uint64_t axh1 = h1[1] ^ h1[5]; - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - - uint64_t idx0 = h0[0] ^ h0[4]; - uint64_t idx1 = h1[0] ^ h1[4]; - - // Optim - 90% time boundary - for (size_t i = 0; i < ITERATIONS; i++) - { - __m128i cx; - cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]); - - if (ALGO == cryptonight_bittube2) - { - cx = aes_round_bittube2(cx, _mm_set_epi64x(axh0, axl0)); - } - else + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) { - if(SOFT_AES) - cx = soft_aesenc(cx, _mm_set_epi64x(axh0, axl0)); - else - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh0, axl0)); + REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - else - _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); + REPEAT_1(0, CN_FINALIZE); + } +}; - idx0 = _mm_cvtsi128_si64(cx); - bx0 = cx; +template< > +struct Cryptonight_hash<2> +{ + static constexpr size_t N = 2; - if(PREFETCH) - _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0); + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - cx = _mm_load_si128((__m128i *)&l1[idx1 & MASK]); + CN_INIT_SINGLE; + REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); - if (ALGO == cryptonight_bittube2) - { - cx = aes_round_bittube2(cx, _mm_set_epi64x(axh1, axl1)); - } - else + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) { - if(SOFT_AES) - cx = soft_aesenc(cx, _mm_set_epi64x(axh1, axl1)); - else - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh1, axl1)); + REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx)); - else - _mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx)); - - idx1 = _mm_cvtsi128_si64(cx); - bx1 = cx; - - if(PREFETCH) - _mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0); - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*)&l0[idx0 & MASK])[0]; - ch = ((uint64_t*)&l0[idx0 & MASK])[1]; - - lo = _umul128(idx0, cl, &hi); + REPEAT_2(0, CN_FINALIZE); + } +}; - axl0 += hi; - axh0 += lo; - ((uint64_t*)&l0[idx0 & MASK])[0] = axl0; +template< > +struct Cryptonight_hash<3> +{ + static constexpr size_t N = 3; - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) - ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0 ^ ((uint64_t*)&l0[idx0 & MASK])[0]; - else - ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0; - } else - ((uint64_t*)&l0[idx0 & MASK])[1] = axh0; + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - axh0 ^= ch; - axl0 ^= cl; - idx0 = axl0; + CN_INIT_SINGLE; + REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = d ^ q; + REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } - else if(ALGO == cryptonight_haven) - { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = (~d) ^ q; - } - - if(PREFETCH) - _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0); - - cl = ((uint64_t*)&l1[idx1 & MASK])[0]; - ch = ((uint64_t*)&l1[idx1 & MASK])[1]; - lo = _umul128(idx1, cl, &hi); + REPEAT_3(0, CN_FINALIZE); + } +}; - axl1 += hi; - axh1 += lo; - ((uint64_t*)&l1[idx1 & MASK])[0] = axl1; +template< > +struct Cryptonight_hash<4> +{ + static constexpr size_t N = 4; - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) { - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) - ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1 ^ ((uint64_t*)&l1[idx1 & MASK])[0]; - else - ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1; - } else - ((uint64_t*)&l1[idx1 & MASK])[1] = axh1; + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - axh1 ^= ch; - axl1 ^= cl; - idx1 = axl1; + CN_INIT_SINGLE; + REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) { - int64_t n = ((int64_t*)&l1[idx1 & MASK])[0]; - int32_t d = ((int32_t*)&l1[idx1 & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; - idx1 = d ^ q; + REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } - else if(ALGO == cryptonight_haven) - { - int64_t n = ((int64_t*)&l1[idx1 & MASK])[0]; - int32_t d = ((int32_t*)&l1[idx1 & MASK])[2]; - int64_t q = n / (d | 0x5); - ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; - idx1 = (~d) ^ q; - } - - if(PREFETCH) - _mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0); + REPEAT_4(0, CN_FINALIZE); } +}; - // Optim - 90% time boundary - cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); - cn_implode_scratchpad((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state); - - // Optim - 99% time boundary - - keccakf((uint64_t*)ctx[0]->hash_state, 24); - extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); - keccakf((uint64_t*)ctx[1]->hash_state, 24); - extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32); -} +template< > +struct Cryptonight_hash<5> +{ + static constexpr size_t N = 5; -#define CN_STEP1(a, b, c, l, ptr, idx) \ - ptr = (__m128i *)&l[idx & MASK]; \ - if(PREFETCH) \ - _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ - c = _mm_load_si128(ptr); - -#define CN_STEP2(a, b, c, l, ptr, idx) \ - if (ALGO == cryptonight_bittube2) \ - { \ - c = aes_round_bittube2(c, a); \ - } \ - else \ - { \ - if(SOFT_AES) \ - c = soft_aesenc(c, a); \ - else \ - c = _mm_aesenc_si128(c, a); \ - } \ - b = _mm_xor_si128(b, c); \ - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - cryptonight_monero_tweak((uint64_t*)ptr, b); \ - else \ - _mm_store_si128(ptr, b);\ - -#define CN_STEP3(a, b, c, l, ptr, idx) \ - idx = _mm_cvtsi128_si64(c); \ - ptr = (__m128i *)&l[idx & MASK]; \ - if(PREFETCH) \ - _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ - b = _mm_load_si128(ptr); - -#define CN_STEP4(a, b, c, l, mc, ptr, idx) \ - lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \ - a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \ - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - { \ - _mm_store_si128(ptr, _mm_xor_si128(a, mc)); \ - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \ - ((uint64_t*)ptr)[1] ^= ((uint64_t*)ptr)[0];\ - } \ - else \ - _mm_store_si128(ptr, a);\ - a = _mm_xor_si128(a, b); \ - idx = _mm_cvtsi128_si64(a); \ - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \ - { \ - int64_t n = ((int64_t*)&l[idx & MASK])[0]; \ - int32_t d = ((int32_t*)&l[idx & MASK])[2]; \ - int64_t q = n / (d | 0x5); \ - ((int64_t*)&l[idx & MASK])[0] = n ^ q; \ - idx = d ^ q; \ - } \ - else if(ALGO == cryptonight_haven) \ - { \ - int64_t n = ((int64_t*)&l[idx & MASK])[0]; \ - int32_t d = ((int32_t*)&l[idx & MASK])[2]; \ - int64_t q = n / (d | 0x5); \ - ((int64_t*)&l[idx & MASK])[0] = n ^ q; \ - idx = (~d) ^ q; \ - } + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); -#define CONST_INIT(ctx, n) \ - __m128i mc##n = _mm_set_epi64x(*reinterpret_cast(reinterpret_cast(input) + n * len + 35) ^ \ - *(reinterpret_cast((ctx)->hash_state) + 24), 0); + CN_INIT_SINGLE; + REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); -// This lovelier creation will do 3 cn hashes at a time. -template -void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) -{ - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) + { + REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); + } - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) - { - memset(output, 0, 32 * 3); - return; + REPEAT_5(0, CN_FINALIZE); } +}; - for (size_t i = 0; i < 3; i++) - { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); - } +extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); - CONST_INIT(ctx[0], 0); - CONST_INIT(ctx[1], 1); - CONST_INIT(ctx[2], 2); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; - uint8_t* l2 = ctx[2]->long_state; - uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; - - __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i cx0 = _mm_set_epi64x(0, 0); - __m128i cx1 = _mm_set_epi64x(0, 0); - __m128i cx2 = _mm_set_epi64x(0, 0); - - uint64_t idx0, idx1, idx2; - idx0 = _mm_cvtsi128_si64(ax0); - idx1 = _mm_cvtsi128_si64(ax1); - idx2 = _mm_cvtsi128_si64(ax2); - - for (size_t i = 0; i < ITERATIONS/2; i++) - { - uint64_t hi, lo; - __m128i *ptr0, *ptr1, *ptr2; - - // EVEN ROUND - CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); - - CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); - - CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); - - CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); - - // ODD ROUND - CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); - - CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); - - CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); - - CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); - } - for (size_t i = 0; i < 3; i++) - { - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); - keccakf((uint64_t*)ctx[i]->hash_state, 24); - extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); - } -} +template< size_t N, size_t asm_version> +struct Cryptonight_hash_asm; -// This even lovelier creation will do 4 cn hashes at a time. -template -void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +template +struct Cryptonight_hash_asm<1, asm_version> { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + static constexpr size_t N = 1; - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - memset(output, 0, 32 * 4); - return; - } + constexpr size_t MEM = cn_select_memory(); - for (size_t i = 0; i < 4; i++) - { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); - } + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); - CONST_INIT(ctx[0], 0); - CONST_INIT(ctx[1], 1); - CONST_INIT(ctx[2], 2); - CONST_INIT(ctx[3], 3); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; - uint8_t* l2 = ctx[2]->long_state; - uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; - uint8_t* l3 = ctx[3]->long_state; - uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; - - __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - __m128i cx0 = _mm_set_epi64x(0, 0); - __m128i cx1 = _mm_set_epi64x(0, 0); - __m128i cx2 = _mm_set_epi64x(0, 0); - __m128i cx3 = _mm_set_epi64x(0, 0); - - uint64_t idx0, idx1, idx2, idx3; - idx0 = _mm_cvtsi128_si64(ax0); - idx1 = _mm_cvtsi128_si64(ax1); - idx2 = _mm_cvtsi128_si64(ax2); - idx3 = _mm_cvtsi128_si64(ax3); - - for (size_t i = 0; i < ITERATIONS/2; i++) - { - uint64_t hi, lo; - __m128i *ptr0, *ptr1, *ptr2, *ptr3; - - // EVEN ROUND - CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); - - CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); - - CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); - - CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3); - - // ODD ROUND - CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); - - CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); - - CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); - - CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3); - } + if(asm_version == 0) + cryptonight_v8_mainloop_ivybridge_asm(ctx[0]); + else if(asm_version == 1) + cryptonight_v8_mainloop_ryzen_asm(ctx[0]); - for (size_t i = 0; i < 4; i++) - { - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); - keccakf((uint64_t*)ctx[i]->hash_state, 24); - extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); } -} +}; -// This most lovely creation will do 5 cn hashes at a time. -template -void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +// double hash only for intel +template< > +struct Cryptonight_hash_asm<2, 0> { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + static constexpr size_t N = 2; - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - memset(output, 0, 32 * 5); - return; - } + constexpr size_t MEM = cn_select_memory(); - for (size_t i = 0; i < 5; i++) - { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); - } + for(size_t i = 0; i < N; ++i) + { + keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + /* Optim - 99% time boundary */ + cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + } - CONST_INIT(ctx[0], 0); - CONST_INIT(ctx[1], 1); - CONST_INIT(ctx[2], 2); - CONST_INIT(ctx[3], 3); - CONST_INIT(ctx[4], 4); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; - uint8_t* l2 = ctx[2]->long_state; - uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; - uint8_t* l3 = ctx[3]->long_state; - uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; - uint8_t* l4 = ctx[4]->long_state; - uint64_t* h4 = (uint64_t*)ctx[4]->hash_state; - - __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - __m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]); - __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); - __m128i cx0 = _mm_set_epi64x(0, 0); - __m128i cx1 = _mm_set_epi64x(0, 0); - __m128i cx2 = _mm_set_epi64x(0, 0); - __m128i cx3 = _mm_set_epi64x(0, 0); - __m128i cx4 = _mm_set_epi64x(0, 0); - - uint64_t idx0, idx1, idx2, idx3, idx4; - idx0 = _mm_cvtsi128_si64(ax0); - idx1 = _mm_cvtsi128_si64(ax1); - idx2 = _mm_cvtsi128_si64(ax2); - idx3 = _mm_cvtsi128_si64(ax3); - idx4 = _mm_cvtsi128_si64(ax4); - - for (size_t i = 0; i < ITERATIONS/2; i++) - { - uint64_t hi, lo; - __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4; - - // EVEN ROUND - CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); - CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4); - - CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); - CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4); - - CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); - CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4); - - CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3); - CN_STEP4(ax4, bx4, cx4, l4, mc4, ptr4, idx4); - - // ODD ROUND - CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); - CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4); - - CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); - CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4); - - CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); - CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4); - - CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3); - CN_STEP4(ax4, cx4, bx4, l4, mc4, ptr4, idx4); - } + cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); - for (size_t i = 0; i < 5; i++) - { - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); - keccakf((uint64_t*)ctx[i]->hash_state, 24); - extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + for(size_t i = 0; i < N; ++i) + { + /* Optim - 90% time boundary */ + cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + /* Optim - 99% time boundary */ + keccakf((uint64_t*)ctx[i]->hash_state, 24); + extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + } } -} +}; diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp index a478c9b2a..a7e4696a8 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp @@ -216,6 +216,8 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al ptr->long_state = (uint8_t*)_mm_malloc(hashMemSize, hashMemSize); ptr->ctx_info[0] = 0; ptr->ctx_info[1] = 0; + if(ptr->long_state == NULL) + printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte",std::to_string(hashMemSize).c_str()); return ptr; } @@ -243,25 +245,25 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al return ptr; } #else - +//http://man7.org/linux/man-pages/man2/mmap.2.html #if defined(__APPLE__) - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, + ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); #elif defined(__FreeBSD__) - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, + ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0); #elif defined(__OpenBSD__) - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, + ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); #else - ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0); + ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); #endif if (ptr->long_state == MAP_FAILED) { _mm_free(ptr); - msg->warning = "mmap failed"; + msg->warning = "mmap failed, check attribute 'use_slow_memory' in 'config.txt'"; return NULL; } diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp index 49da7ae2d..a14be1732 100644 --- a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -108,12 +108,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *mode, *no_prefetch, *aff; + const Value *mode, *no_prefetch, *aff, *asm_version; mode = GetObjectMember(oThdConf, "low_power_mode"); no_prefetch = GetObjectMember(oThdConf, "no_prefetch"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); + asm_version = GetObjectMember(oThdConf, "asm"); - if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) + if(mode == nullptr || no_prefetch == nullptr || aff == nullptr || asm_version == nullptr) return false; if(!mode->IsBool() && !mode->IsNumber()) @@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) else cfg.iCpuAff = -1; + if(!asm_version->IsString()) + return false; + cfg.asm_version_str = asm_version->GetString(); + return true; } diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp index be855036e..4ec9165d5 100644 --- a/xmrstak/backend/cpu/jconf.hpp +++ b/xmrstak/backend/cpu/jconf.hpp @@ -24,6 +24,7 @@ class jconf struct thd_cfg { int iMultiway; bool bNoPrefetch; + std::string asm_version_str; long long iCpuAff; }; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 2e7169ef7..a9f18d1b0 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -27,6 +27,7 @@ #include "xmrstak/backend/iBackend.hpp" #include "xmrstak/backend/globalStates.hpp" #include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/backend/cpu/cpuType.hpp" #include "xmrstak/params.hpp" #include "jconf.hpp" @@ -104,7 +105,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id #endif } -minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity) +minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version) { this->backendType = iBackend::CPU; oWork = pWork; @@ -113,6 +114,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, iJobNo = 0; bNoPrefetch = no_prefetch; this->affinity = affinity; + asm_version_str = asm_version; std::unique_lock lck(thd_aff_set); std::future order_guard = order_fix.get_future(); @@ -224,6 +226,7 @@ bool minethd::self_test() { if ((ctx[i] = minethd_alloc_ctx()) == nullptr) { + printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory."); for (int j = 0; j < i; j++) cryptonight_free_ctx(ctx[j]); return false; @@ -232,92 +235,174 @@ bool minethd::self_test() bool bResult = true; - if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight) - { - unsigned char out[32 * MAX_N]; - cn_hash_fun hashf; - cn_hash_fun_multi hashf_multi; - - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx[0]); - bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx[0]); - bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - - hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); - bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - - hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); - hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); - bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - - hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx); - bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; - - hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx); - bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; - - hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx); - bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0; - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_aeon) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_ipbc) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_stellite) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_masari) - { - } - else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_bittube2) + unsigned char out[32 * MAX_N]; + cn_hash_fun hashf; + cn_hash_fun hashf_multi; + + xmrstak_algo algo = xmrstak_algo::invalid_algo; + + for(int algo_idx = 0; algo_idx < 2; ++algo_idx) { - unsigned char out[32 * MAX_N]; - cn_hash_fun hashf; + if(algo_idx == 0) + algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(); + else + algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); + + if(algo == cryptonight) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf("This is a test", 14, out, ctx); + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); + hashf("This is a test", 14, out, ctx); + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + + hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); + bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + + hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); + bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + + hashf_multi = func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx); + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; + + hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx); + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; + + hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx); + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0; + } + else if(algo == cryptonight_lite) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite); + bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; + } + else if(algo == cryptonight_monero) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; + } + else if(algo == cryptonight_monero_v8) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult &= memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; + } + else if(algo == cryptonight_aeon) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; + } + else if(algo == cryptonight_ipbc) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0; + } + else if(algo == cryptonight_stellite) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; + } + else if(algo == cryptonight_masari) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; + } + else if(algo == cryptonight_heavy) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; + } + else if(algo == cryptonight_haven) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; + } + else if(algo == cryptonight_bittube2) + { + unsigned char out[32 * MAX_N]; + cn_hash_fun hashf; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2); - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2); + hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx); + bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; - hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx[0]); - bResult = memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; + hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx); + bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0; - hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx[0]); - bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0; + hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx); + bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0; + } - hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx[0]); - bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0; + if(!bResult) + printer::inst()->print_msg(L0, + "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations."); } + for (int i = 0; i < MAX_N; i++) cryptonight_free_ctx(ctx[i]); - if(!bResult) - printer::inst()->print_msg(L0, - "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations."); - return bResult; } @@ -359,15 +444,41 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work else printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway); - minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff); + minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str); pvThreads.push_back(thd); } return pvThreads; } -minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +/** get the supported asm name + * + * @return asm type based on the number of hashes per thread the internal + * evaluated cpu type + */ +static std::string getAsmName(const uint32_t num_hashes) { + std::string asm_type = "off"; + if(num_hashes != 0) + { + auto cpu_model = getModel(); + + if(cpu_model.avx && cpu_model.aes) + { + if(cpu_model.type_name.find("Intel") != std::string::npos) + asm_type = "intel_avx"; + else if(cpu_model.type_name.find("AMD") != std::string::npos && num_hashes == 1) + asm_type = "amd_avx"; + } + } + return asm_type; +} + +template +minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str) +{ + static_assert(N >= 1, "number of threads must be >= 1" ); + // We have two independent flag bits in the functions // therefore we will build a binary digit and select the // function as a two digit binary @@ -405,388 +516,119 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmr case cryptonight_bittube2: algv = 9; break; + case cryptonight_monero_v8: + algv = 10; + break; default: algv = 2; break; } static const cn_hash_fun func_table[] = { - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash }; std::bitset<2> digit; digit.set(0, !bHaveAes); digit.set(1, !bNoPrefetch); - return func_table[ algv << 2 | digit.to_ulong() ]; -} - -void minethd::work_main() -{ - if(affinity >= 0) //-1 means no affinity - bindMemoryToNUMANode(affinity); - - order_fix.set_value(); - std::unique_lock lck(thd_aff_set); - lck.release(); - std::this_thread::yield(); - - cryptonight_ctx* ctx; - uint64_t iCount = 0; - uint64_t* piHashVal; - uint32_t* piNonce; - job_result result; - - // start with root algorithm and switch later if fork version is reached - auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); - ctx = minethd_alloc_ctx(); + auto selected_function = func_table[ algv << 2 | digit.to_ulong() ]; - piHashVal = (uint64_t*)(result.bResult + 24); - piNonce = (uint32_t*)(oWork.bWorkBlob + 39); - result.iThreadId = iThreadNo; - uint8_t version = 0; - size_t lastPoolId = 0; - - while (bQuit == 0) + // check for asm optimized version for cryptonight_v8 + if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes) { - if (oWork.bStall) - { - /* We are stalled here because the executor didn't find a job for us yet, - * either because of network latency, or a socket problem. Since we are - * raison d'etre of this software it us sensible to just wait until we have something - */ - - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - - globalStates::inst().consume_work(oWork, iJobNo); - continue; - } - - size_t nonce_ctr = 0; - constexpr size_t nonce_chunk = 4096; // Needs to be a power of 2 - - assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); - memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID)); - - if(oWork.bNiceHash) - result.iNonce = *piNonce; - - uint8_t new_version = oWork.getVersion(); - if(new_version != version || oWork.iPoolId != lastPoolId) - { - coinDescription coinDesc = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(oWork.iPoolId); - if(new_version >= coinDesc.GetMiningForkVersion()) - { - miner_algo = coinDesc.GetMiningAlgo(); - hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); - } - else - { - miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); - } - result.algorithm = miner_algo; - lastPoolId = oWork.iPoolId; - version = new_version; - } + std::string selected_asm = asm_version_str; + if(selected_asm == "auto") + selected_asm = cpu::getAsmName(N); - while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + if(selected_asm != "off") { - if ((iCount++ & 0xF) == 0) //Store stats every 16 hashes + if(selected_asm == "intel_avx") { - uint64_t iStamp = get_timestamp_ms(); - iHashCount.store(iCount, std::memory_order_relaxed); - iTimestamp.store(iStamp, std::memory_order_relaxed); + // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + if(N == 1) + selected_function = Cryptonight_hash_asm<1u, 0u>::template hash; + else if(N == 2) + selected_function = Cryptonight_hash_asm<2u, 0u>::template hash; } - - if((nonce_ctr++ & (nonce_chunk-1)) == 0) + // supports only 1 thread per hash + if(N == 1 && selected_asm == "amd_avx") { - globalStates::inst().calc_start_nonce(result.iNonce, oWork.bNiceHash, nonce_chunk); - // check if the job is still valid, there is a small posibility that the job is switched - if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo) - break; + // AMD Ryzen (1xxx and 2xxx series) + selected_function = Cryptonight_hash_asm<1u, 1u>::template hash; } - - *piNonce = result.iNonce; - - hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); - - if (*piHashVal < oWork.iTarget) - executor::inst()->push_event(ex_event(result, oWork.iPoolId)); - result.iNonce++; - - std::this_thread::yield(); + if(asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx")) + printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str()); + else if(selected_asm != "intel_avx" && selected_asm != "amd_avx") // unknown asm type + printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str()); } - - globalStates::inst().consume_work(oWork, iJobNo); } - - cryptonight_free_ctx(ctx); + + return selected_function; } -minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) { - // We have two independent flag bits in the functions - // therefore we will build a binary digit and select the - // function as a two digit binary - - uint8_t algv; - switch(algo) - { - case cryptonight: - algv = 2; - break; - case cryptonight_lite: - algv = 1; - break; - case cryptonight_monero: - algv = 0; - break; - case cryptonight_heavy: - algv = 3; - break; - case cryptonight_aeon: - algv = 4; - break; - case cryptonight_ipbc: - algv = 5; - break; - case cryptonight_stellite: - algv = 6; - break; - case cryptonight_masari: - algv = 7; - break; - case cryptonight_haven: - algv = 8; - break; - case cryptonight_bittube2: - algv = 9; - break; - default: - algv = 2; - break; - } - - static const cn_hash_fun_multi func_table[] = { - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash - }; - - std::bitset<2> digit; - digit.set(0, !bHaveAes); - digit.set(1, !bNoPrefetch); + return func_multi_selector<1>(bHaveAes, bNoPrefetch, algo); +} - return func_table[algv << 4 | (N-2) << 2 | digit.to_ulong()]; +void minethd::work_main() +{ + multiway_work_main<1u>(); } void minethd::double_work_main() @@ -843,6 +685,13 @@ void minethd::multiway_work_main() for (size_t i = 0; i < N; i++) { ctx[i] = minethd_alloc_ctx(); + if(ctx[i] == nullptr) + { + printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory."); + for (int j = 0; j < i; j++) + cryptonight_free_ctx(ctx[j]); + win_exit(1); + } piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24); piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr; } @@ -854,7 +703,7 @@ void minethd::multiway_work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun_multi hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + cn_hash_fun hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); uint8_t version = 0; size_t lastPoolId = 0; @@ -889,12 +738,12 @@ void minethd::multiway_work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } lastPoolId = oWork.iPoolId; version = new_version; diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 2d40ce314..eb77749f6 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -22,7 +22,7 @@ class minethd : public iBackend static std::vector thread_starter(uint32_t threadOffset, miner_work& pWork); static bool self_test(); - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo); static bool thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id); @@ -30,10 +30,11 @@ class minethd : public iBackend static cryptonight_ctx* minethd_alloc_ctx(); private: - typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**); - static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo); - minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity); + template + static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "off"); + + minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version); template void multiway_work_main(); @@ -59,6 +60,7 @@ class minethd : public iBackend bool bQuit; bool bNoPrefetch; + std::string asm_version_str = "off"; }; } // namespace cpu diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index b6f656138..6b1afa928 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -16,6 +16,7 @@ enum xmrstak_algo cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks + cryptonight_monero_v8 = 11 }; // define aeon settings @@ -45,6 +46,9 @@ inline constexpr size_t cn_select_memory() { return CRYPTONIGH template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } +template<> +inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } + template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; } @@ -72,6 +76,7 @@ inline size_t cn_select_memory(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight_masari: case cryptonight: return CRYPTONIGHT_MEMORY; @@ -100,6 +105,9 @@ inline constexpr uint32_t cn_select_mask() { return CRYPTONIGH template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } +template<> +inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } + template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; } @@ -127,6 +135,7 @@ inline size_t cn_select_mask(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight_masari: case cryptonight: return CRYPTONIGHT_MASK; @@ -155,6 +164,9 @@ inline constexpr uint32_t cn_select_iter() { return CRYPTONIGH template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } +template<> +inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; } @@ -182,6 +194,7 @@ inline size_t cn_select_iter(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight: return CRYPTONIGHT_ITER; case cryptonight_ipbc: diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp index 3bd7d0eea..4eeed3c4b 100644 --- a/xmrstak/backend/globalStates.cpp +++ b/xmrstak/backend/globalStates.cpp @@ -39,7 +39,7 @@ void globalStates::consume_work( miner_work& threadWork, uint64_t& currentJobId) threadWork = oGlobalWork; currentJobId = iGlobalJobNo.load(std::memory_order_relaxed); - + jobLock.UnLock(); } @@ -51,7 +51,7 @@ void globalStates::switch_work(miner_work& pWork, pool_data& dat) * To avoid duplicated shared this must be done before the nonce is exchanged. */ iGlobalJobNo++; - + size_t xid = dat.pool_id; dat.pool_id = pool_id; pool_id = xid; @@ -62,7 +62,7 @@ void globalStates::switch_work(miner_work& pWork, pool_data& dat) */ dat.iSavedNonce = iGlobalNonce.exchange(dat.iSavedNonce, std::memory_order_relaxed); oGlobalWork = pWork; - + jobLock.UnLock(); } diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp index 12468093c..2755e03d2 100644 --- a/xmrstak/backend/nvidia/autoAdjust.hpp +++ b/xmrstak/backend/nvidia/autoAdjust.hpp @@ -96,6 +96,7 @@ class autoAdjust " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" + + " \"mem_mode\" : 1,\n" + " },\n"; } } diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl index 2aa68dc46..8a5982b57 100644 --- a/xmrstak/backend/nvidia/config.tpl +++ b/xmrstak/backend/nvidia/config.tpl @@ -1,4 +1,5 @@ -R"===( +R"===(// generated by XMRSTAK_VERSION + /* * GPU configuration. You should play around with threads and blocks as the fastest settings will vary. * index - GPU index number usually starts from 0. @@ -15,6 +16,9 @@ R"===( * 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu * 2 = cudaDeviceScheduleYield * 3 = cudaDeviceScheduleBlockingSync (default) + * mem_mode - select the memory access pattern (this option has only a meaning for cryptonight_v8 and monero) + * 0 = 64bit memory loads + * 1 = 256bit memory loads * * On the first run the miner will look at your system and suggest a basic configuration that will work, * you can try to tweak it from there to get the best performance. @@ -23,7 +27,7 @@ R"===( * "gpu_threads_conf" : * [ * { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, - * "affine_to_cpu" : false, "sync_mode" : 3, + * "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1 * }, * ], * If you do not wish to mine with your nVidia GPU(s) then use: @@ -35,5 +39,4 @@ R"===( [ GPUCONFIG ], - )===" diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp index c9d4f194c..6c443343b 100644 --- a/xmrstak/backend/nvidia/jconf.cpp +++ b/xmrstak/backend/nvidia/jconf.cpp @@ -123,7 +123,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode; + const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode, *memMode; gid = GetObjectMember(oThdConf, "index"); blocks = GetObjectMember(oThdConf, "blocks"); threads = GetObjectMember(oThdConf, "threads"); @@ -131,9 +131,11 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) bsleep = GetObjectMember(oThdConf, "bsleep"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); syncMode = GetObjectMember(oThdConf, "sync_mode"); + memMode = GetObjectMember(oThdConf, "mem_mode"); if(gid == nullptr || blocks == nullptr || threads == nullptr || - bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr) + bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr || + memMode == nullptr) { return false; } @@ -158,16 +160,25 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) if(!syncMode->IsNumber() || syncMode->GetInt() < 0 || syncMode->GetInt() > 3) { - printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. ( range: 0 <= sync_mode < 4.)"); + printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or not a number. ( range: 0 <= sync_mode < 4.)"); return false; } + + if(!memMode->IsNumber() || memMode->GetInt() < 0 || memMode->GetInt() > 1) + { + printer::inst()->print_msg(L0, "Error NVIDIA: mem_mode out of range or not a number. (range: 0 or 1)"); + return false; + } + + cfg.id = gid->GetInt(); cfg.blocks = blocks->GetInt(); cfg.threads = threads->GetInt(); cfg.bfactor = bfactor->GetInt(); cfg.bsleep = bsleep->GetInt(); cfg.syncMode = syncMode->GetInt(); - + cfg.memMode = memMode->GetInt(); + if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt(); else diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp index b4ebaa035..40b72f880 100644 --- a/xmrstak/backend/nvidia/jconf.hpp +++ b/xmrstak/backend/nvidia/jconf.hpp @@ -29,6 +29,7 @@ class jconf bool bNoPrefetch; int32_t cpu_aff; int syncMode; + int memMode; long long iCpuAff; }; diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 88a1acc32..6460628de 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -78,6 +78,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) ctx.device_bfactor = (int)cfg.bfactor; ctx.device_bsleep = (int)cfg.bsleep; ctx.syncMode = cfg.syncMode; + ctx.memMode = cfg.memMode; this->affinity = cfg.cpu_aff; std::future numa_guard = numa_promise.get_future(); @@ -144,6 +145,8 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor { std::vector* pvThreads = new std::vector(); + auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); + if(!configEditor::file_exist(params::inst().configFileNVIDIA)) { autoAdjust adjust; @@ -162,6 +165,10 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor std::cout<<"WARNING: NVIDIA no device found"<GetGPUThreadCount(); pvThreads->reserve(n); @@ -275,7 +282,7 @@ void minethd::work_main() if((round_ctr++ & 0xF) == 0) { globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, h_per_round * 16); - // check if the job is still valid, there is a small posibility that the job is switched + // check if the job is still valid, there is a small possibility that the job is switched if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo) break; } @@ -300,7 +307,7 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = foundNonce[i]; - hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx); + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx); if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp index d4ae03864..389356842 100644 --- a/xmrstak/backend/nvidia/minethd.hpp +++ b/xmrstak/backend/nvidia/minethd.hpp @@ -28,7 +28,7 @@ class minethd : public iBackend static bool self_test(); private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg); void start_mining(); diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp index d588641b4..8fda8d401 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -16,6 +16,7 @@ typedef struct { int device_bfactor; int device_bsleep; int syncMode; + bool memMode; uint32_t *d_input; uint32_t inputlen; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 6c6475150..7742e740e 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -5,8 +5,11 @@ #include #include #include +#include #include "xmrstak/jconf.hpp" +#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" + #ifdef _WIN32 #include @@ -67,9 +70,9 @@ typedef uint64_t IndexType; typedef int IndexType; #endif -__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi ) +__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi ) { - *product_hi = __umul64hi( multiplier, multiplicand ); + product_hi = __umul64hi( multiplier, multiplicand ); return (multiplier * multiplicand ); } @@ -194,11 +197,289 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_ #endif } +template +__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2) +{ + uint64_t tmp; + ((uint32_t*)&tmp)[0] = shuffle(ptr, sub, val, src); + ((uint32_t*)&tmp)[1] = shuffle(ptr, sub, val, src2); + return tmp; +} + +struct u64 : public uint2 +{ + + __forceinline__ __device__ u64(){} + + __forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1) + { + uint2::x = x0; + uint2::y = x1; + } + + __forceinline__ __device__ operator uint64_t() const + { + return *((uint64_t*)this); + } + + __forceinline__ __device__ u64( const uint64_t x0) + { + ((uint64_t*)&this->x)[0] = x0; + } + + __forceinline__ __device__ u64 operator^=(const u64& other) + { + uint2::x ^= other.x; + uint2::y ^= other.y; + + return *this; + } + + __forceinline__ __device__ u64 operator+(const u64& other) const + { + u64 tmp; + ((uint64_t*)&tmp.x)[0] = ((uint64_t*)&(this->x))[0] + ((uint64_t*)&(other.x))[0]; + + return tmp; + } + + __forceinline__ __device__ u64 operator+=(const uint64_t& other) + { + return ((uint64_t*)&this->x)[0] += other; + } + + __forceinline__ __device__ void print(int i) const + { + if(i<2) + printf("gpu: %lu\n", ((uint64_t*)&this->x)[0]); + } +}; + +/** cryptonight with two threads per hash + * + * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory + * else if `1` 256bit operations will be used + */ +template +#ifdef XMR_STAK_THREADS +__launch_bounds__( XMR_STAK_THREADS * 2 ) +#endif +__global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, + uint32_t startNonce, uint32_t * __restrict__ d_input ) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init( sharedMemory ); + +#if( __CUDA_ARCH__ < 300 ) + extern __shared__ uint64_t externShared[]; + // 8 x 64bit values + volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8); + volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE); +#else + extern __shared__ uint64_t chunkMem[]; + volatile uint32_t* sPtr = NULL; + // 8 x 64bit values + volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem + (threadIdx.x >> 1) * 8); + +#endif + + __syncthreads( ); + + const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = tid >> 1; + const uint32_t sub = tid & 1; + + if ( thread >= threads ) + return; + + uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY]; + + uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub]; + uint64_t bx0; + uint32_t idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); + + uint64_t* ptr0; + + uint64_t bx1; + uint32_t sqrt_result; + uint64_t division_result; + if(ALGO == cryptonight_monero_v8) + { + bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub]; + bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub]; + + division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0]; + sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0]; + } + else + bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub]; + + const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + const int start = partidx * batchsize; + const int end = start + batchsize; + + for(int i = start; i < end; ++i) + { + ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; + + if(MEM_MODE == 0) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + myChunks[x + sub] = ptr0[ x + sub ]; + } + } + else + ((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub]; + + uint32_t idx1 = (idx0 & 0x30) >> 3; + + const u64 cx = myChunks[ idx1 + sub ]; + const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ]; + + u64 cx_aes = ax0 ^ u64( + t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ), + t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) ) + ); + + if(ALGO == cryptonight_monero_v8) + { + + const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ]; + const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; + const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ]; +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif + myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; + myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; + myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + } + + myChunks[ idx1 + sub ] = cx_aes ^ bx0; + if(MEM_MODE == 0) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + ptr0[ x + sub ] = myChunks[x + sub]; + } + } + else + ((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub]; + + idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); + idx1 = (idx0 & 0x30) >> 3; + ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; + + if(MEM_MODE == 0) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + myChunks[x + sub] = ptr0[ x + sub ]; + } + } + else + ((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub]; + + if(ALGO != cryptonight_monero_v8) + bx0 = cx_aes; + + uint64_t cx_mul; + ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); + ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); + + if(ALGO == cryptonight_monero_v8 && sub == 1) + { + // Use division and square root results from the _previous_ iteration to hide the latency + ((uint32_t*)&division_result)[1] ^= sqrt_result; + + ((uint64_t*)myChunks)[ idx1 ] ^= division_result; + + const uint32_t dd = (static_cast(cx_mul) + (sqrt_result << 1)) | 0x80000001UL; + division_result = fast_div_v2(cx_aes, dd); + + // Use division_result as an input for the square root to prevent parallel implementation in hardware + sqrt_result = fast_sqrt_v2(cx_mul + division_result); + } +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif + uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ]; + + { + uint64_t cl = ((uint64_t*)myChunks)[ idx1 ]; + // sub 0 -> hi, sub 1 -> lo + uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl; + if(ALGO == cryptonight_monero_v8) + { + const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res; + uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; + res ^= ((uint64_t*)&chunk2)[0]; + const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ]; +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif + myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; + myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; + myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + } + ax0 += res; + } + if(ALGO == cryptonight_monero_v8) + { + bx1 = bx0; + bx0 = cx_aes; + } + myChunks[ idx1 + sub ] = ax0; + if(MEM_MODE == 0) + { + #pragma unroll 4 + for(int x = 0; x < 8; x += 2) + { + ptr0[ x + sub ] = myChunks[x + sub]; + } + } + else + ((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub]; + ax0 ^= c; + idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); + } + + if ( bfactor > 0 ) + { + ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0; + if(ALGO == cryptonight_monero_v8) + { + ((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0; + ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1; + + if(sub == 1) + { + // must be valid only for `sub == 1` + ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0] = division_result; + (d_ctx_b + thread * 12 + 4 * 2 + 2)[0] = sqrt_result; + } + } + else + ((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0; + } +} + template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 4 ) #endif -__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, +__global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input ) { __shared__ uint32_t sharedMemory[1024]; @@ -448,11 +729,12 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 ); } -template +template void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) { dim3 grid( ctx->device_blocks ); dim3 block( ctx->device_threads ); + dim3 block2( ctx->device_threads << 2 ); dim3 block4( ctx->device_threads << 2 ); dim3 block8( ctx->device_threads << 3 ); @@ -483,25 +765,53 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) for ( int i = 0; i < partcount; i++ ) { - CUDA_CHECK_MSG_KERNEL( - ctx->device_id, - "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", - cryptonight_core_gpu_phase2<<< - grid, - block4, - block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( - ctx->device_blocks*ctx->device_threads, - ctx->device_bfactor, - i, - ctx->d_long_state, - ctx->d_ctx_a, - ctx->d_ctx_b, - ctx->d_ctx_state, - nonce, - ctx->d_input - ) - ); + if(ALGO == cryptonight_monero_v8) + { + // two threads per block + CUDA_CHECK_MSG_KERNEL( + ctx->device_id, + "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", + cryptonight_core_gpu_phase2_double<<< + grid, + block2, + sizeof(uint64_t) * block2.x * 8 + + // shuffle memory for fermi gpus + block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( + ctx->device_blocks*ctx->device_threads, + ctx->device_bfactor, + i, + ctx->d_long_state, + ctx->d_ctx_a, + ctx->d_ctx_b, + ctx->d_ctx_state, + nonce, + ctx->d_input + ) + ); + } + else + { + CUDA_CHECK_MSG_KERNEL( + ctx->device_id, + "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", + cryptonight_core_gpu_phase2_quad<<< + grid, + block4, + block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( + ctx->device_blocks*ctx->device_threads, + ctx->device_bfactor, + i, + ctx->d_long_state, + ctx->d_ctx_a, + ctx->d_ctx_b, + ctx->d_ctx_state, + nonce, + ctx->d_input + ) + ); + } if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); } @@ -529,46 +839,48 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce) { + typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce); + + if(miner_algo == invalid_algo) return; + + static const cuda_hash_fn func_table[] = { + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - if(miner_algo == cryptonight_monero) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_heavy) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_lite) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_aeon) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_ipbc) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_stellite) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_masari) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_haven) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } - else if(miner_algo == cryptonight_bittube2) - { - cryptonight_core_gpu_hash(ctx, startNonce); - } + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash + }; + + std::bitset<1> digit; + digit.set(0, ctx->memMode == 1); + cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ]; + selected_function(ctx, startNonce); } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 3b049ace8..433e175dd 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -142,7 +142,19 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a ); XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b ); memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); - memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); + if(ALGO == cryptonight_monero_v8) + { + memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 ); + // bx1 + XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b ); + memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 ); + // division_result + memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 ); + // sqrt_result + memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 ); + } + else + memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 ); memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 ); @@ -271,13 +283,9 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) break; }; - const int gpuArch = ctx->device_arch[0] * 10 + ctx->device_arch[1]; - /* Disable L1 cache for GPUs before Volta. - * L1 speed is increased and latency reduced with Volta. - */ - if(gpuArch < 70) - CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); + // prefer shared memory over L1 cache + CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); size_t hashMemSize = std::max( cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), @@ -298,6 +306,12 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) // create a double buffer for the state to exchange the mixed state to phase1 CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize)); } + else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || + cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) + { + // bx1 (16byte), division_result (8byte) and sqrt_result (8byte) + ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize; + } else ctx->d_ctx_state2 = ctx->d_ctx_state; @@ -340,6 +354,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } + if(miner_algo == cryptonight_monero_v8) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } else { /* pass two times d_ctx_state because the second state is used later in phase1, @@ -396,7 +415,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, } CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost )); - CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost )); + CUDA_CHECK_MSG( + ctx->device_id, + "\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**", + cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ) + ); /* There is only a 32bit limit for the counter on the device side * therefore this value can be greater than 10, in that case limit rescount @@ -450,19 +473,22 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) if(version < CUDART_VERSION) { - printf("Driver does not support CUDA %d.%d API! Update your nVidia driver!\n", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10); + printf("WARNING: Driver supports CUDA %d.%d but this was compiled for CUDA %d.%d API! Update your nVidia driver or compile with older CUDA!\n", + version / 1000, (version % 1000 / 10), + CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10); return 1; } int GPU_N; if(cuda_get_devicecount(&GPU_N) == 0) { + printf("WARNING: CUDA claims zero devices?\n"); return 1; } if(ctx->device_id >= GPU_N) { - printf("Invalid device ID!\n"); + printf("WARNING: Invalid device ID '%i'!\n", ctx->device_id); return 1; } @@ -483,6 +509,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->name = std::string(props.name); + printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"... ", + version / 1000, (version % 1000 / 10), + CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10, + ctx->device_id, gpuArch, ctx->device_name); + std::vector arch; #define XMRSTAK_PP_TOSTRING1(str) #str #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str) @@ -496,13 +527,14 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) while ( ss >> tmpArch ) arch.push_back( tmpArch ); + #define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n" if(gpuArch >= 20 && gpuArch < 30) { // compiled binary must support sm_20 for fermi std::vector::iterator it = std::find(arch.begin(), arch.end(), 20); if(it == arch.end()) { - printf("WARNING: NVIDIA GPU %d: miner not compiled for CUDA architecture %d.\n", ctx->device_id, gpuArch); + printf(MSG_CUDA_NO_ARCH); return 5; } } @@ -520,7 +552,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) minSupportedArch = arch[i]; if(minSupportedArch < 30 || gpuArch < minSupportedArch) { - printf("WARNING: NVIDIA GPU %d: miner not compiled for CUDA architecture %d.\n", ctx->device_id, gpuArch); + printf(MSG_CUDA_NO_ARCH); return 5; } } @@ -529,8 +561,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) if(ctx->device_blocks == -1) { /* good values based of my experience - * - 3 * SMX count >=sm_30 - * - 2 * SMX count for =sm_30 + * - 2 * SMX count for device_blocks = props.multiProcessorCount * ( props.major < 3 ? 2 : 3 ); @@ -582,18 +614,19 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) int* tmp; cudaError_t err; + #define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n" // a device must be selected to get the right memory usage later on err = cudaSetDevice(ctx->device_id); if(err != cudaSuccess) { - printf("WARNING: NVIDIA GPU %d: cannot be selected.\n", ctx->device_id); + printf(MSG_CUDA_FUNC_FAIL, "cudaSetDevice"); return 2; } // trigger that a context on the gpu will be allocated err = cudaMalloc(&tmp, 256); if(err != cudaSuccess) { - printf("WARNING: NVIDIA GPU %d: context cannot be created.\n", ctx->device_id); + printf(MSG_CUDA_FUNC_FAIL, "cudaMalloc"); return 3; } @@ -626,9 +659,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) size_t usedMem = totalMemory - freeMemory; if(usedMem >= maxMemUsage) { - printf("WARNING: NVIDIA GPU %d: already %s MiB memory in use, skip GPU.\n", - ctx->device_id, - std::to_string(usedMem/byteToMiB).c_str()); + printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem/byteToMiB).c_str()); return 4; } else @@ -660,7 +691,27 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->device_threads = 64; } + // check if cryptonight_monero_v8 is selected for the user pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8; + + // overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0 + if(useCryptonight_v8 && gpuArch >= 50) + { + // 4 based on my test maybe it must be adjusted later + size_t threads = 4; + // 8 is chosen by checking the occupancy calculator + size_t blockOptimal = 8 * ctx->device_mpcount; + + if(blockOptimal * threads * hashMemSize < limitedMemory) + { + ctx->device_threads = threads; + ctx->device_blocks = blockOptimal; + } + } } + printf("device init succeeded\n"); return 0; } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp new file mode 100644 index 000000000..796b7adda --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp @@ -0,0 +1,61 @@ +#pragma once + +#include + +__device__ __forceinline__ uint32_t get_reciprocal(uint32_t a) +{ + const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23)); + const float a_lo = __uint2float_rn(a & 0xFF); + + float r; + asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi)); + const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23)); + + const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f)); + return (__float_as_uint(r) << 9) - __float2int_rn(h * r_scaled); +} + +__device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b) +{ + const uint32_t r = get_reciprocal(b); + const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a; + + uint32_t q[2]; + q[0] = ((uint32_t*)&k)[1]; + + int64_t tmp = a - (uint64_t)(q[0]) * b; + ((int32_t*)(&tmp))[1] -= (k < a) ? b : 0; + + const bool overshoot = ((int32_t*)(&tmp))[1] < 0; + const bool undershoot = tmp >= b; + + q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U); + q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? b : 0U); + + return *((uint64_t*)(q)); +} + +__device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1) +{ + float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23)); + float x1; + asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x)); + asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x)); + + // The following line does x1 *= 4294967296.0f; + x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23)); + + const uint32_t x0 = __float_as_uint(x) - (158U << 23); + const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18); + const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1; + + uint32_t result = (x0 << 10) + __float2int_rn(delta); + const uint32_t s = result >> 1; + const uint32_t b = result & 1; + + const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1; + const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0; + const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 1 : 0; + result += (overshoot+undershoot); + return result; +} diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp index 1811af224..5c7dfe16a 100644 --- a/xmrstak/backend/plugin.hpp +++ b/xmrstak/backend/plugin.hpp @@ -27,8 +27,11 @@ namespace xmrstak struct plugin { - plugin(const std::string backendName, const std::string libName) : fn_startBackend(nullptr), m_backendName(backendName) + plugin() = default; + + void load(const std::string backendName, const std::string libName) { + m_backendName = backendName; #ifdef WIN32 libBackend = LoadLibrary(TEXT((libName + ".dll").c_str())); if(!libBackend) @@ -81,32 +84,36 @@ struct plugin if(fn_startBackend == nullptr) { std::vector* pvThreads = new std::vector(); - std::cerr << "WARNING: " << m_backendName << " Backend disabled"<< std::endl; return pvThreads; } return fn_startBackend(threadOffset, pWork, env); } + void unload() + { + if(libBackend) + { +#ifdef WIN32 + FreeLibrary(libBackend); +#else + dlclose(libBackend); +#endif + } + fn_startBackend = nullptr; + } + std::string m_backendName; typedef std::vector* (*startBackend_t)(uint32_t threadOffset, miner_work& pWork, environment& env); - startBackend_t fn_startBackend; + startBackend_t fn_startBackend = nullptr; #ifdef WIN32 HINSTANCE libBackend; #else - void *libBackend; -#endif - -/* \todo add unload to destructor and change usage of plugin that libs kept open until the miner ends -#ifdef WIN32 - FreeLibrary(libBackend); -#else - dlclose(libBackend); + void *libBackend = nullptr; #endif - * */ }; } // namespace xmrstak diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp index ae39d2505..40fb9d948 100644 --- a/xmrstak/cli/cli-miner.cpp +++ b/xmrstak/cli/cli-miner.cpp @@ -749,6 +749,7 @@ int main(int argc, char *argv[]) if (!BackendConnector::self_test()) { + printer::inst()->print_msg(L0, "Self test not passed!"); win_exit(); return 1; } @@ -782,11 +783,24 @@ int main(int argc, char *argv[]) char buffer[64]; snprintf(buffer, sizeof(buffer), "\nConfigurable dev donation level is set to %.1f%%\n\n", fDevDonationLevel * 100.0); printer::inst()->print_str(buffer); + printer::inst()->print_str("-------------------------------------------------------------------\n"); printer::inst()->print_str("You can use following keys to display reports:\n"); printer::inst()->print_str("'h' - hashrate\n"); printer::inst()->print_str("'r' - results\n"); printer::inst()->print_str("'c' - connection\n"); printer::inst()->print_str("-------------------------------------------------------------------\n"); + printer::inst()->print_str("Upcoming xmr-stak-gui is sponsored by:\n"); + printer::inst()->print_str(" ##### ______ ____\n"); + printer::inst()->print_str(" ## ## | ___ \\ / _ \\\n"); + printer::inst()->print_str("# _ #| |_/ /_ _ ___ | / \\/ _ _ _ _ _ _ ___ _ __ ___ _ _\n"); + printer::inst()->print_str("# |_| #| /| | | | / _ \\ | | | | | || '_|| '_|/ _ \\| '_ \\ / __|| | | |\n"); + printer::inst()->print_str("# #| |\\ \\| |_| || (_) || \\_/\\| |_| || | | | | __/| | | || (__ | |_| |\n"); + printer::inst()->print_str(" ## ## \\_| \\_|\\__, | \\___/ \\____/ \\__,_||_| |_| \\___||_| |_| \\___| \\__, |\n"); + printer::inst()->print_str(" ##### __/ | __/ |\n"); + printer::inst()->print_str(" |___/ https://ryo-currency.com |___/\n\n"); + printer::inst()->print_str("This currency is a way for us to implement the ideas that we were unable to in\n"); + printer::inst()->print_str("Monero. See https://github.com/fireice-uk/cryptonote-speedup-demo for details.\n"); + printer::inst()->print_str("-------------------------------------------------------------------\n"); printer::inst()->print_msg(L0, "Mining coin: %s", jconf::inst()->GetMiningCoin().c_str()); if(params::inst().benchmark_block_version >= 0) diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl index 14330a829..73ae054c2 100644 --- a/xmrstak/config.tpl +++ b/xmrstak/config.tpl @@ -1,4 +1,5 @@ -R"===( +R"===(// generated by XMRSTAK_VERSION + /* * Network timeouts. * Because of the way this client is written it doesn't need to constantly talk (keep-alive) to the server to make @@ -58,40 +59,48 @@ R"===( * Large pages need a properly set up OS. It can be difficult if you are not used to systems administration, * but the performance results are worth the trouble - you will get around 20% boost. Slow memory mode is * meant as a backup, you won't get stellar results there. If you are running into trouble, especially - * on Windows, please read the common issues in the README. - * - * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows. - * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN - * - * 1. On the Start menu, click Run. In the Open box, type gpedit.msc. - * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings. - * 3. Expand Security Settings, and then expand Local Policies. - * 4. Select the User Rights Assignment folder. - * 5. The policies will be displayed in the details pane. - * 6. In the pane, double-click Lock pages in memory. - * 7. In the Local Security Setting – Lock pages in memory dialog box, click Add User or Group. - * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on - * 9. Reboot for change to take effect. - * - * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need - * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory. - * - * On Linux you will need to configure large page support "sudo sysctl -w vm.nr_hugepages=128" and increase your - * ulimit -l. To do do this you need to add following lines to /etc/security/limits.conf - "* soft memlock 262144" - * and "* hard memlock 262144". You can also do it Windows-style and simply run-as-root, but this is NOT - * recommended for security reasons. - * - * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a - * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between - * locked and unlocked memory. If that is your setup see option "no_mlck". - */ - -/* + * on Windows, please read the common issues in the README and FAQ. + * + * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows.---WINDOWS + * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN---WINDOWS + *---WINDOWS + * 1. On the Start menu, click Run. In the Open box, type gpedit.msc.---WINDOWS + * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings.---WINDOWS + * 3. Expand Security Settings, and then expand Local Policies.---WINDOWS + * 4. Select the User Rights Assignment folder.---WINDOWS + * 5. The policies will be displayed in the details pane.---WINDOWS + * 6. In the pane, double-click Lock pages in memory.---WINDOWS + * 7. In the Local Security Setting – Lock pages in memory dialog box, click Add User or Group.---WINDOWS + * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on---WINDOWS + * 9. Reboot for change to take effect.---WINDOWS + *---WINDOWS + * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need---WINDOWS + * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory.---WINDOWS + * On Linux you will need to configure large page support and increase your memlock limit (ulimit -l).---LINUX + *---LINUX + * To set large page support, add the following to "/etc/sysctl.d/60-hugepages.conf":---LINUX + * vm.nr_hugepages=128---LINUX + * You WILL need to run "sudo sysctl --system" for these settings to take effect on your system (or reboot).---LINUX + * In some cases (many threads, very large CPU, etc) you may need more than 128---LINUX + * (try 256 if there are still complaints from thread inits)---LINUX + *---LINUX + * To increase the memlock (ulimit -l), add following lines to /etc/security/limits.d/60-memlock.conf:---LINUX + * * - memlock 262144---LINUX + * root - memlock 262144---LINUX + * You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session).---LINUX + *---LINUX + * Check with "/sbin/sysctl vm.nr_hugepages ; ulimit -l" to validate---LINUX + *---LINUX + * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a---LINUX + * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between---LINUX + * locked and unlocked memory. If that is your setup see option "no_mlck".---LINUX + * + * * use_slow_memory defines our behaviour with regards to large pages. There are three possible options here: * always - Don't even try to use large pages. Always use slow memory. * warn - We will try to use large pages, but fall back to slow memory if that fails. - * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory. - * It will never use slow memory, but it won't attempt to mlock + * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory.---LINUX + * It will never use slow memory, but it won't attempt to mlock---LINUX * never - If we fail to allocate large pages we will print an error and exit. */ "use_slow_memory" : "warn", @@ -149,6 +158,4 @@ R"===( * This setting will only be needed in 2020's. No need to worry about it now. */ "prefer_ipv4" : true, - )===" - diff --git a/xmrstak/cpputil/read_write_lock.h b/xmrstak/cpputil/read_write_lock.h index 9139dfd22..51f42a2e6 100644 --- a/xmrstak/cpputil/read_write_lock.h +++ b/xmrstak/cpputil/read_write_lock.h @@ -61,7 +61,7 @@ class RWLock std::unique_lock lck(mtx_); if (status_ == -1) { status_ = 0; - } + } else { status_ -= 1; @@ -72,7 +72,7 @@ class RWLock { write_cv_.notify_one(); } - } + } else { read_cv_.notify_all(); diff --git a/xmrstak/donate-level.hpp b/xmrstak/donate-level.hpp index 71b79628a..0f851f8f8 100644 --- a/xmrstak/donate-level.hpp +++ b/xmrstak/donate-level.hpp @@ -1,18 +1,19 @@ #pragma once /* - * Dev donation. - * Percentage of your hashing power that you want to donate to the developer, can be 0.0 if you don't want to do that. - * Example of how it works for the default setting of 2.0: - * You miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2.0 minute. - * Switching is instant, and only happens after a successful connection, so you never loose any hashes. + * DEV DONATION SETTING + * This setting is a percentage of your hashing power that the miner donates to the developers of this app. + * It can be 0.0 if you don't want to help the developers. The default setting of 2.0 means that + * the miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2.0 minutes. + * Switching pools is instant and it only happens after a successful connection, so you don't lose any hash time. * - * If you plan on changing this setting to 0.0 please consider making a one off donation to our wallets: + * If you plan on changing this setting to 0.0, please consider making a one time donation to our wallets: * fireice-uk: * 4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk * psychocrypt: * 43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU * + * Thank you for your support. */ constexpr double fDevDonationLevel = 2.0 / 100.0; diff --git a/xmrstak/http/httpd.cpp b/xmrstak/http/httpd.cpp index dba7d7cdb..ed9abc2bc 100644 --- a/xmrstak/http/httpd.cpp +++ b/xmrstak/http/httpd.cpp @@ -71,17 +71,17 @@ int httpd::req_handler(void * cls, if (username == NULL) { rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT); - ret = MHD_queue_auth_fail_response(connection, sHttpAuthRelam, sHttpAuthOpaque, rsp, MHD_NO); + ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, MHD_NO); MHD_destroy_response(rsp); return ret; } free(username); - ret = MHD_digest_auth_check(connection, sHttpAuthRelam, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300); + ret = MHD_digest_auth_check(connection, sHttpAuthRealm, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300); if (ret == MHD_INVALID_NONCE || ret == MHD_NO) { rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT); - ret = MHD_queue_auth_fail_response(connection, sHttpAuthRelam, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO); + ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO); MHD_destroy_response(rsp); return ret; } diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp index d6ee66e8d..93e217519 100644 --- a/xmrstak/http/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -113,7 +113,7 @@ extern const char sHtmlCssFile [] = size_t sHtmlCssSize = sizeof(sHtmlCssFile) - 1; -extern const char sHttpAuthRelam[] = "XMR-Stak-Miner"; +extern const char sHttpAuthRealm[] = "XMR-Stak-Miner"; extern const char sHttpAuthOpaque[] = "6c071f0df539e234cadbcd79164af7a594e23ab42bccb834df796aead6ce96e4"; extern const char sHtmlAccessDenied[] = diff --git a/xmrstak/http/webdesign.hpp b/xmrstak/http/webdesign.hpp index 48adfea98..bcbe5ae1d 100644 --- a/xmrstak/http/webdesign.hpp +++ b/xmrstak/http/webdesign.hpp @@ -7,7 +7,7 @@ extern size_t sHtmlCssSize; extern const char sHtmlAccessDenied[]; extern size_t sHtmlAccessDeniedSize; -extern const char sHttpAuthRelam[]; +extern const char sHttpAuthRealm[]; extern const char sHttpAuthOpaque[]; extern const char sHtmlCommonHeader[]; diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 354388849..b608c0028 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -87,27 +87,29 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); xmrstak::coin_selection coins[] = { // name, userpool, devpool, default_pool_suggestion - { "aeon7", {cryptonight_aeon, cryptonight_lite, 7u}, {cryptonight_aeon, cryptonight_lite, 7u}, "mine.aeon-pool.com:5555" }, - { "bbscoin", {cryptonight_monero, cryptonight, 3u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "bittube", {cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"}, - { "cryptonight", {cryptonight_monero, cryptonight, 255u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "cryptonight_bittube2",{cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr}, - { "cryptonight_masari", {cryptonight_monero, cryptonight_masari, 255u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr }, - { "cryptonight_haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "cryptonight_heavy", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "cryptonight_lite", {cryptonight_aeon, cryptonight_lite, 255u}, {cryptonight_aeon, cryptonight_lite, 7u}, nullptr }, - { "cryptonight_lite_v7", {cryptonight_lite, cryptonight_aeon, 255u}, {cryptonight_aeon, cryptonight_lite, 7u}, nullptr }, - { "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 255u}, nullptr }, - { "cryptonight_v7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "cryptonight_v7_stellite", {cryptonight_monero, cryptonight_stellite, 255u}, {cryptonight_monero, cryptonight_monero, 255u}, nullptr }, - { "graft", {cryptonight_monero, cryptonight, 8u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "haven", {cryptonight_haven, cryptonight_heavy, 3u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "intense", {cryptonight_monero, cryptonight, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "masari", {cryptonight_masari, cryptonight_monero, 7u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr }, - { "monero7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" }, - { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "stellite", {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, - { "turtlecoin", {cryptonight_lite, cryptonight_aeon, 255u}, {cryptonight_aeon, cryptonight_lite, 7u}, nullptr } + { "aeon7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, "mine.aeon-pool.com:5555" }, + { "bbscoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, + { "bittube", {cryptonight_heavy, cryptonight_bittube2, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"}, + { "cryptonight", {cryptonight_monero_v8, cryptonight, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, + { "cryptonight_bittube2",{cryptonight_heavy, cryptonight_bittube2, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr}, + { "cryptonight_masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr }, + { "cryptonight_haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, + { "cryptonight_heavy", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, + { "cryptonight_lite", {cryptonight_aeon, cryptonight_lite, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, + { "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, + { "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, + { "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr }, + { "cryptonight_v8", {cryptonight_monero, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr }, + { "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, + { "graft", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr }, + { "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, + { "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr }, + { "masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr }, + { "monero", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" }, + { "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr }, + { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, + { "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, + { "turtlecoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr } }; constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0])); @@ -323,13 +325,6 @@ bool jconf::IsOnAlgoList(std::string& needle) { std::transform(needle.begin(), needle.end(), needle.begin(), ::tolower); - if(needle == "monero") - { - printer::inst()->print_msg(L0, "You entered Monero as coin name. Monero will hard-fork the PoW.\nThis means it will stop being compatible with other cryptonight coins.\n" - "Please use 'monero7' (support auto switch to new POW) if you want to mine Monero, \nor name the coin that you want to mine."); - return false; - } - for(size_t i=0; i < coin_algo_size; i++) { if(needle == coins[i].coin_name) @@ -614,13 +609,6 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools) for(size_t i=0; i < coin_algo_size; i++) { - if(ctmp == "monero") - { - printer::inst()->print_msg(L0, "You entered Monero as coin name. Monero will hard-fork the PoW.\nThis means it will stop being compatible with other cryptonight coins.\n" - "Please use monero7 (support auto switch to new POW) if you want to mine Monero, or name the coin that you want to mine."); - return false; - } - if(ctmp == coins[i].coin_name) { currentCoin = coins[i]; diff --git a/xmrstak/misc/configEditor.hpp b/xmrstak/misc/configEditor.hpp index d95ea6b72..3f79df44c 100644 --- a/xmrstak/misc/configEditor.hpp +++ b/xmrstak/misc/configEditor.hpp @@ -6,6 +6,7 @@ #include #include +#include "../version.hpp" namespace xmrstak { @@ -42,6 +43,24 @@ struct configEditor void write(const std::string filename) { + // endmarks: for filtering full lines inside the template string + // Platform marks are done globally here + // "---WINDOWS" endmark keeps lines when compiled for Windows + // "---LINUX" endmark keeps lines when compiled for Linux (and anything not-windows) +#if defined(_WIN32) || defined(__WIN32__) || defined(WIN32) || defined(__WINDOWS__) + // windows: + // completely drop lines with endmark-linux + replace(".*---LINUX\n", ""); + // strip off windows endmarks, keep the lines + replace("---WINDOWS\n", "\n"); +#else + // not-windows: + // completely drop lines with endmark-windows + replace(".*---WINDOWS\n", ""); + // strip off linux endmarks, keep the lines + replace("---LINUX\n", "\n"); +#endif + replace("XMRSTAK_VERSION", get_version_str()); std::ofstream out(filename); out << m_fileContent; out.close(); diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index 11d0f6df0..4fcce1f97 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -560,7 +560,7 @@ void executor::ex_main() else pools.emplace_front(0, "donate.xmr-stak.net:5555", "", "", "", 0.0, true, false, "", true); break; - + case cryptonight_monero_v8: case cryptonight_monero: if(dev_tls) pools.emplace_front(0, "donate.xmr-stak.net:8800", "", "", "", 0.0, true, true, "", false); @@ -1255,7 +1255,7 @@ void executor::http_json_report(std::string& out) if(i != 0) cn_error.append(1, ','); snprintf(buffer, sizeof(buffer), sJsonApiConnectionError, - int_port(duration_cast(vMineResults[i].time.time_since_epoch()).count()), + int_port(duration_cast(vSocketLog[i].time.time_since_epoch()).count()), vSocketLog[i].msg.c_str()); cn_error.append(buffer); } diff --git a/xmrstak/misc/telemetry.cpp b/xmrstak/misc/telemetry.cpp index 5642f6b66..197da8eca 100644 --- a/xmrstak/misc/telemetry.cpp +++ b/xmrstak/misc/telemetry.cpp @@ -49,6 +49,7 @@ telemetry::telemetry(size_t iThd) double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread) { + std::unique_lock lk(mtx); uint64_t iTimeNow = get_timestamp_ms(); uint64_t iEarliestHashCnt = 0; @@ -98,6 +99,7 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread) void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp) { + std::unique_lock lk(mtx); size_t iTop = iBucketTop[iThd]; ppHashCounts[iThd][iTop] = iHashCount; ppTimestamps[iThd][iTop] = iTimestamp; diff --git a/xmrstak/misc/telemetry.hpp b/xmrstak/misc/telemetry.hpp index 309fd6d06..1813c00e6 100644 --- a/xmrstak/misc/telemetry.hpp +++ b/xmrstak/misc/telemetry.hpp @@ -2,6 +2,7 @@ #include #include +#include namespace xmrstak { @@ -14,6 +15,7 @@ class telemetry double calc_telemetry_data(size_t iLastMillisec, size_t iThread); private: + mutable std::mutex mtx; constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations constexpr static size_t iBucketMask = iBucketSize - 1; uint32_t* iBucketTop; diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp index 9fce9b7e5..d20ba082f 100644 --- a/xmrstak/net/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -685,6 +685,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes case cryptonight_monero: algo_name = "cryptonight_v7"; break; + case cryptonight_monero_v8: + algo_name = "cryptonight_v8"; + break; case cryptonight_aeon: algo_name = "cryptonight_lite_v7"; break; diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 6960d63bb..e86e2a537 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -1,4 +1,5 @@ -R"===( +R"===(// generated by XMRSTAK_VERSION + /* * pool_address - Pool address should be in the form "pool.supportxmr.com:3333". Only stratum pools are supported. * wallet_address - Your wallet, or pool login. @@ -22,12 +23,13 @@ POOLCONF], * * aeon7 (use this for Aeon's new PoW) * bbscoin (automatic switch with block version 3 to cryptonight_v7) - * bittube (uses cyrptonight_bittube2 algorithm) + * bittube (uses cryptonight_bittube2 algorithm) * graft * haven (automatic switch with block version 3 to cryptonight_haven) * intense * masari - * monero7 (use this for Monero's new PoW) + * monero (use this to support Monero's Oct 2018 fork) + * qrl - Quantum Resistant Ledger * ryo * turtlecoin * @@ -40,13 +42,12 @@ POOLCONF], * # 2MiB scratchpad memory * cryptonight * cryptonight_v7 + * cryptonight_v8 * # 4MiB scratchpad memory - * cyrptonight_bittube2 + * cryptonight_bittube2 * cryptonight_haven * cryptonight_heavy */ "currency" : "CURRENCY", - )===" - diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index cd8bb52f5..80c25c6b8 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -18,7 +18,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.4.7" +#define XMR_STAK_VERSION "2.5.0" #if defined(_WIN32) #define OS_TYPE "win"