diff --git a/CMakeLists.txt b/CMakeLists.txt
index a642b385d..b714ee0ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,6 +100,11 @@ if(CUDA_ENABLE)
list(APPEND DEFAULT_CUDA_ARCH "70")
endif()
endif()
+ # add Turing support for CUDA >= 10.0
+ if(NOT CUDA_VERSION VERSION_LESS 10.0)
+ list(APPEND DEFAULT_CUDA_ARCH "75")
+ endif()
+
set(CUDA_ARCH "${DEFAULT_CUDA_ARCH}" CACHE STRING "Set GPU architecture (semicolon separated list, e.g. '-DCUDA_ARCH=20;35;60')")
# generate comma separated list with architectures
@@ -186,7 +191,10 @@ if(CUDA_ENABLE)
endif()
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" AND
- (CUDA_VERSION VERSION_EQUAL 9.0 OR CUDA_VERSION VERSION_EQUAL 9.1 OR CUDA_VERSION VERSION_EQUAL 9.2)
+ (CUDA_VERSION VERSION_EQUAL 9.0 OR
+ CUDA_VERSION VERSION_EQUAL 9.1 OR
+ CUDA_VERSION VERSION_EQUAL 9.2 OR
+ CUDA_VERSION VERSION_EQUAL 10.0)
)
# workaround find_package(CUDA) is using the wrong path to the CXX host compiler
# overwrite the CUDA host compiler variable with the used CXX MSVC
@@ -435,6 +443,11 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
endif()
endif()
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
+ set(CMAKE_CXX_FLAGS "-Wl,-z,noexecstack ${CMAKE_CXX_FLAGS}")
+ set(CMAKE_C_FLAGS "-Wl,-z,noexecstack ${CMAKE_C_FLAGS}")
+endif()
+
# activate static libgcc and libstdc++ linking
if(CMAKE_LINK_STATIC)
set(BUILD_SHARED_LIBRARIES OFF)
@@ -445,6 +458,27 @@ if(CMAKE_LINK_STATIC)
endif()
endif()
+if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+ # asm optimized monero v8 code
+ enable_language(ASM_MASM)
+ set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" PROPERTY ASM_MASM)
+ add_library(xmr-stak-asm
+ STATIC
+ "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm"
+ )
+else()
+ # asm optimized monero v8 code
+ enable_language(ASM)
+ set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY CPP)
+ set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
+ add_library(xmr-stak-asm
+ STATIC
+ "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S"
+ )
+endif()
+
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
+
# compile C files
file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c")
@@ -456,7 +490,7 @@ set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99)
if(MICROHTTPD_ENABLE)
target_link_libraries(xmr-stak-c ${MHTD})
endif()
-target_link_libraries(xmr-stak-c ${LIBS})
+target_link_libraries(xmr-stak-c ${LIBS} xmr-stak-asm)
# compile generic backend files
file(GLOB BACKEND_CPP
@@ -472,7 +506,7 @@ add_library(xmr-stak-backend
STATIC
${BACKEND_CPP}
)
-target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS})
+target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm)
# compile CUDA backend
if(CUDA_FOUND)
@@ -499,7 +533,7 @@ if(CUDA_FOUND)
)
endif()
target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES})
- target_link_libraries(xmrstak_cuda_backend xmr-stak-backend)
+ target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm)
endif()
# compile AMD backend
@@ -512,7 +546,7 @@ if(OpenCL_FOUND)
${OPENCLSRCFILES}
)
target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} )
- target_link_libraries(xmrstak_opencl_backend xmr-stak-backend)
+ target_link_libraries(xmrstak_opencl_backend xmr-stak-backend xmr-stak-asm)
endif()
# compile final binary
@@ -528,7 +562,7 @@ endif()
set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}")
set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}")
-target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend)
+target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm)
################################################################################
# Install
diff --git a/README.md b/README.md
index 887bc5cf3..046a930e1 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
###### fireice-uk's and psychocrypt's
# XMR-Stak - Cryptonight All-in-One Mining Software
-XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA gpus and can be used to mine the crypto currencys Monero, Aeon and many more Cryptonight coins.
+XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins.
## HTML reports
@@ -28,7 +28,7 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV
- supports algorithm cryptonight for Monero (XMR) and cryptonight-light (AEON)
- easy to use
- guided start (no need to edit a config file for the first start)
- - auto configuration for each backend
+ - auto-configuration for each backend
- open source software (GPLv3)
- TLS support
- [HTML statistics](doc/usage.md#html-and-json-api-report-configuraton)
@@ -45,10 +45,14 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
- [Haven](https://havenprotocol.com)
- [Intense](https://intensecoin.com)
- [Masari](https://getmasari.org)
-- [Ryo](https://ryo-currency.com)
+- [QRL](https://theqrl.org)
+- **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo**
- [TurtleCoin](https://turtlecoin.lol)
-If your prefered coin is not listed, you can chose one of the following algorithms:
+Ryo currency is a way for us to implement the ideas that we were unable to in
+Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
+
+If your prefered coin is not listed, you can choose one of the following algorithms:
- 1MiB scratchpad memory
- cryptonight_lite
@@ -59,11 +63,12 @@ If your prefered coin is not listed, you can chose one of the following algorith
- cryptonight_masari
- cryptonight_v7
- cryptonight_v7_stellite
+ - cryptonight_v8
- 4MiB scratchpad memory
- cryptonight_haven
- cryptonight_heavy
-Please note, this list is not complete, and is not an endorsement.
+Please note, this list is not complete and is not an endorsement.
## Download
@@ -71,7 +76,7 @@ You can find the latest releases and precompiled binaries on GitHub under [Relea
## Default Developer Donation
-By default the miner will donate 2% of the hashpower (2 minute in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries.
+By default, the miner will donate 2% of the hashpower (2 minutes in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries.
If you want to donate directly to support further development, here is my wallet
diff --git a/doc/FAQ.md b/doc/FAQ.md
index 2fccdd8a4..f744e3d24 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -9,8 +9,8 @@
* [Virus Protection Alert](#virus-protection-alert)
* [Change Currency to Mine](#change-currency-to-mine)
* [How can I mine Monero](#how-can-i-mine-monero)
-* [Why is Monero named monero7](#why-is-monero-named-monero7)
* [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed)
+* [Internal compiler error: Killed (program cc1plus)](#internal-compiler-error)
## "Obtaining SeLockMemoryPrivilege failed."
@@ -44,20 +44,35 @@ Download and install this [runtime package](https://go.microsoft.com/fwlink/?Lin
## Error: MEMORY ALLOC FAILED: mmap failed
-On Linux you will need to configure large page support and increase your ulimit -l.
+On Linux you will need to configure large page support and increase your memlock limit (`ulimit -l`).
-To set large page support, add the following lines to `/etc/sysctl.conf` (`/etc/sysctl.d/xmr-stak.conf` for [Arch Linux](https://www.archlinux.org/news/deprecation-of-etcsysctlconf/) and its derivatives):
+Never put settings directly into `/etc/sysctl.conf` or `/etc/security/limits.conf` as those are system defaults and can be replaced in upgrades, and custom settings in that file are deprecated in all distros since at least wheezy/trusty (has been illegal in RedHat based distros for longer than that), and will be even more deprecated with systemd (it no longer even reads sysctl.conf, ONLY sysctl.d files, for example - there is a link to the old `/etc/sysctl.conf` for backward compatibility but that can go away at any time). Also adding to `/etc/rc.local` is extra incorrect, systemd does not even use that file anymore (once the sysvinit compatibility layer is gone, rc.local will no longer work).
+
+To check current settings, run `/sbin/sysctl vm.nr_hugepages ; ulimit -l` as whatever user you will run `xmr-stak` as (example shows bad/low sample defaults):
+
+ $ /sbin/sysctl vm.nr_hugepages ; ulimit -l
+ vm.nr_hugepages = 0
+ 16
+
+To set large page support, add the following lines to `/etc/sysctl.d/60-hugepages.conf`:
vm.nr_hugepages=128
-To increase the ulimit, add following lines to `/etc/security/limits.conf`:
+You WILL need to run `sudo sysctl --system` for these settings to take effect on your system (or reboot). In some cases (many threads, very large CPU, etc) you may need more than 128 (try 256 if there are still complaints from thread inits)
- * soft memlock 262144
- * hard memlock 262144
+To increase the memlock (ulimit -l), add following lines to `/etc/security/limits.d/60-memlock.conf`:
+
+ * - memlock 262144
+ root - memlock 262144
You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session).
+Recheck after completing these steps to validate:
+
+ $ /sbin/sysctl vm.nr_hugepages ; ulimit -l
+ vm.nr_hugepages = 128
+ 262144
-You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons.
+You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons. Also running as root does not properly get around the `ulimit -l` being large enough (and limits `*` does not apply to `root` either, it must be specified explicitly).
## Illegal Instruction
@@ -72,18 +87,18 @@ If your antivirus software flags **xmr-stak**, it will likely move it to its qua
If the miner is compiled for Monero and Aeon than you can change
- the value `currency` in the config *or*
- - start the miner with the [command line option](usage.md) `--currency monero7` or `--currency aeon7`
+ - start the miner with the [command line option](usage.md) `--currency monero` or `--currency aeon7`
- run `xmr-stak --help` to see all supported currencies and algorithms
## How can I mine Monero
-Set the value `currency` in `pools.txt` to `monero7`.
-
-## Why is Monero named monero7
-
-To avoid configuration conflicts after the hard fork of Monero to the new POW with our old naming schema where all cryptonight currencies was selected by choosing `monero` as currency we decided to switch to the name `monero7`.
+Set the value `currency` in `pools.txt` to `monero`.
## Which currency must be chosen if my fork coin is not listed
If your coin you want to mine is not listed please check the documentation of the coin and try to find out if `cryptonight` or `cryptonight-lite` is the used algorithm.
Select one of these generic coin algorithms.
+
+## Internal compiler error
+
+Seeing `g++: internal compiler error: Killed (program cc1plus)` is probably related to not enough RAM to compile. 1 Gb RAM should be enough (it is on clean Ubuntu 16.04).
diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md
index 072402ff7..79e036ef7 100644
--- a/doc/compile_Linux.md
+++ b/doc/compile_Linux.md
@@ -4,7 +4,7 @@
### AMD APP SDK 3.0 (only needed to use AMD GPUs)
-- download and install the latest version from https://www.dropbox.com/sh/mpg882ekirnsfa7/AADWz5X-TgVdsmWt0QwMgTWLa/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2?dl=0
+- download and install the latest version from http://debian.nullivex.com/amd/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2 (see https://github.com/fireice-uk/xmr-stak/issues/1511#issuecomment-385120692)
(do not wonder why it is a link to a dropbox but AMD has removed the SDK downloads, see https://community.amd.com/thread/228059)
### Cuda 8.0+ (only needed to use NVIDIA GPUs)
@@ -105,6 +105,8 @@ In that case you can force CUDA to use an older compiler in the following way:
cmake -DCUDA_HOST_COMPILER=/usr/bin/gcc-5 ..
```
+- You need 1 Gb RAM to compile (a bit less might be enough, 512 Mb isn't).
+
### To do a generic and static build for a system without gcc 5.1+
```
cmake -DCMAKE_LINK_STATIC=ON -DXMR-STAK_COMPILE=generic .
diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md
index 802d5c5ab..add5fbfd0 100644
--- a/doc/compile_Windows.md
+++ b/doc/compile_Windows.md
@@ -32,8 +32,8 @@
### AMD APP SDK 3.0 (only needed for AMD GPUs)
-- Download and install the latest version from https://www.dropbox.com/s/gq8vqhelq0m6gj4/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe
- (do not wonder why it is a link to a dropbox but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855)
+- Download and install the latest version from http://amd-dev.wpengine.netdna-cdn.com/app-sdk/installers/APPSDKInstaller/3.0.130.135-GA/full/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe
+ (do not wonder why it is a link to a netdna-cdn.com but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855)
### Dependencies OpenSSL/Hwloc and Microhttpd
- For CUDA 8*:
diff --git a/doc/tuning.md b/doc/tuning.md
index 6bf036e9f..2673d68d9 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -9,6 +9,7 @@
* [AMD Backend](#amd-backend)
* [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
* [Add more GPUs](#add-more-gpus)
+ * [Two Threads per GPU](two-threads-per-gpu)
* [disable comp_mode](#disable-comp_mode)
* [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern)
* [Increase Memory Pool](#increase-memory-pool)
@@ -55,10 +56,10 @@ To add a new GPU you need to add a new config set to `gpu_threads_conf`.
"gpu_threads_conf" :
[
{ "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0,
- "affine_to_cpu" : false, "sync_mode" : 3,
+ "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
},
{ "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0,
- "affine_to_cpu" : false, "sync_mode" : 3,
+ "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
},
],
```
@@ -82,11 +83,37 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to
```
"gpu_threads_conf" :
[
- { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
- "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true
+ {
+ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+ "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
},
- { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
- "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true
+ {
+ "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+ "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+ },
+],
+
+"platform_index" : 0,
+```
+
+### Two Threads per GPU
+
+Some GPUs like AMD Vega can mine faster if two threads are using the same GPU.
+Use the auto generated config as base and repeat the config entry for a GPU.
+If the attribute `index` is used twice than two threads will use one GPU.
+Take care that the required memory usage on the GPU will also double.
+Therefore adjust your intensity by hand.
+
+```
+"gpu_threads_conf" :
+[
+ {
+ "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
+ "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+ },
+ {
+ "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
+ "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
},
],
diff --git a/doc/usage.md b/doc/usage.md
index 886c1b319..a371f0e67 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -5,6 +5,7 @@
* [Usage on Windows](#usage-on-windows)
* [Usage on Linux](#usage-on-linux)
* [Command Line Options](#command-line-options)
+* [Use different backends](#use-different-backends)
* [HTML and JSON API report configuraton](#html-and-json-api-report-configuraton)
## Configurations
@@ -34,6 +35,33 @@ Note: If the pool is ignoring the option `rig_id` in `pools.txt` to name your wo
The miner allow to overwrite some of the settings via command line options.
Run `xmr-stak --help` to show all available command line options.
+## Use Different Backends
+
+On linux and OSX please add `./` before the binary name `xmr-stak`.
+
+### CPU Only:
+```
+xmr-stak --noAMD --noNVIDIA
+```
+
+### NVIDIA/AMD Only:
+
+The miner will automatically detect if CUDA (for NVIDIA GPUs) or OpenCL (for AMD GPUs) is available.
+
+```
+xmr-stak --noCPU
+```
+**CUDA** is currently not supported. I am currently try to get some performance out it.
+
+### NVIDIA via OpenCL
+
+It is possible to use the OpenCl backend which is originally created for AMD GPUs with NVIDIA GPus.
+Some NVIDIA GPUs can reach better performance with this backend.
+
+```
+xmr-stak --openCLVendor NVIDIA --noNVIDIA
+```
+
## Docker image usage
You can run the Docker image the following way:
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 87721ac8f..7c7aff788 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -17,6 +17,7 @@
#include "xmrstak/jconf.hpp"
#include "xmrstak/picosha2/picosha2.hpp"
#include "xmrstak/params.hpp"
+#include "xmrstak/version.hpp"
#include
#include
@@ -375,6 +376,13 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
return ERR_OCL_API;
}
+ std::vector openCLDriverVer(1024);
+ if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx );
+ return ERR_OCL_API;
+ }
+
xmrstak_algo miner_algo[2] = {
::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(),
::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()
@@ -388,11 +396,31 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
int threadMemMask = cn_select_mask(miner_algo[ii]);
int hashIterations = cn_select_iter(miner_algo[ii]);
- char options[512];
- snprintf(options, sizeof(options),
- "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
- hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0,
- int_port(hashMemSize), int(miner_algo[ii]));
+ size_t mem_chunk_exp = 1u << ctx->memChunk;
+ size_t strided_index = ctx->stridedIndex;
+ /* Adjust the config settings to a valid combination
+ * this is required if the dev pool is mining monero
+ * but the user tuned there settings for another currency
+ */
+ if(miner_algo[ii] == cryptonight_monero_v8)
+ {
+ if(ctx->memChunk < 2)
+ mem_chunk_exp = 1u << 2;
+ if(strided_index == 1)
+ strided_index = 0;
+ }
+
+ std::string options;
+ options += " -DITERATIONS=" + std::to_string(hashIterations);
+ options += " -DMASK=" + std::to_string(threadMemMask);
+ options += " -DWORKSIZE=" + std::to_string(ctx->workSize);
+ options += " -DSTRIDED_INDEX=" + std::to_string(strided_index);
+ options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp);
+ options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u);
+ options += " -DMEMORY=" + std::to_string(hashMemSize);
+ options += " -DALGO=" + std::to_string(miner_algo[ii]);
+ options += " -DCN_UNROLL=" + std::to_string(ctx->unroll);
+
/* create a hash for the compile time cache
* used data:
* - source code
@@ -402,6 +430,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
std::string src_str(source_code);
src_str += options;
src_str += devNameVec.data();
+ src_str += get_version_str();
+ src_str += openCLDriverVer.data();
+
std::string hash_hex_str;
picosha2::hash256_hex_string(src_str, hash_hex_str);
@@ -418,7 +449,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
return ERR_OCL_API;
}
- ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options, NULL, NULL);
+ ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options.c_str(), NULL, NULL);
if(ret != CL_SUCCESS)
{
size_t len;
@@ -594,27 +625,6 @@ const char* const attributeNames[] = {
#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))
-void PrintDeviceInfo(cl_device_id device)
-{
- char queryBuffer[1024];
- int queryInt;
- cl_int clError;
- clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL);
- printf(" CL_DEVICE_NAME: %s\n", queryBuffer);
- queryBuffer[0] = '\0';
- clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL);
- printf(" CL_DEVICE_VENDOR: %s\n", queryBuffer);
- queryBuffer[0] = '\0';
- clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
- printf(" CL_DRIVER_VERSION: %s\n", queryBuffer);
- queryBuffer[0] = '\0';
- clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
- printf(" CL_DEVICE_VERSION: %s\n", queryBuffer);
- queryBuffer[0] = '\0';
- clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL);
- printf(" CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt);
-}
-
uint32_t getNumPlatforms()
{
cl_uint num_platforms = 0;
@@ -885,6 +895,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
//char* source_code = LoadTextFile(sSourcePath);
+ const char *fastIntMathV2CL =
+ #include "./opencl/fast_int_math_v2.cl"
+ ;
const char *cryptonightCL =
#include "./opencl/cryptonight.cl"
;
@@ -905,6 +918,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
;
std::string source_code(cryptonightCL);
+ source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
@@ -914,13 +928,20 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
// create a directory for the OpenCL compile cache
create_directory(get_home() + "/.openclcache");
+ // check if cryptonight_monero_v8 is selected for the user or dev pool
+ bool useCryptonight_v8 =
+ ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+ ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
+ ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
+ ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
for(int i = 0; i < num_gpus; ++i)
{
+ const std::string backendName = xmrstak::params::inst().openCLVendor;
if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
{
size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
ctx[i].rawIntensity = reduced_intensity;
- const std::string backendName = xmrstak::params::inst().openCLVendor;
printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity));
}
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 5ab80b82a..63c5029d7 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -27,6 +27,7 @@ struct GpuContext
size_t workSize;
int stridedIndex;
int memChunk;
+ int unroll = 0;
bool isNVIDIA = false;
int compMode;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 002472d3a..9c9bcd08e 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -35,8 +35,8 @@ R"===(
inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
{
uint2 result;
- result.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2));
- result.s1 = (uint) (((((long)src0.s1) << 32) | (long)src1.s1) >> (src2));
+ result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
+ result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
return result;
}
#endif
@@ -78,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)
}
#endif
+//#include "opencl/fast_int_math_v2.cl"
+XMRSTAK_INCLUDE_FAST_INT_MATH_V2
//#include "opencl/wolf-aes.cl"
XMRSTAK_INCLUDE_WOLF_AES
//#include "opencl/wolf-skein.cl"
@@ -416,6 +418,9 @@ void AESExpandKey256(uint *keybuf)
}
}
+)==="
+R"===(
+
#define MEM_CHUNK (1<> 8));
+ /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA)
+ * handle get_global_id and get_global_offset as signed long long int and add
+ * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1`
+ * (even if it is correct casted to unsigned on the host)
+ */
+ ((uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
for(int i = 11; i < 25; ++i) State[i] = 0x00UL;
@@ -551,7 +561,15 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
-
+
+// cryptonight_monero_v8 && NVIDIA
+#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
+# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
+#else
+# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)])
+#endif
+
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
@@ -560,9 +578,29 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
#endif
)
{
- ulong a[2], b[2];
+ ulong a[2];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+ ulong b[4];
+ uint4 b_x[2];
+// NVIDIA
+# ifdef __NV_CL_C_VERSION
+ __local uint16 scratchpad_line_buf[WORKSIZE];
+ __local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
+# endif
+#else
+ ulong b[2];
+ uint4 b_x[1];
+#endif
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
+// cryptonight_monero_v8
+#if(ALGO==11)
+ __local uint RCP[256];
+ uint2 division_result;
+ uint sqrt_result;
+#endif
const ulong gIdx = getIdx();
for(int i = get_local_id(0); i < 256; i += WORKSIZE)
@@ -572,6 +610,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
AES1[i] = rotate(tmp, 8U);
AES2[i] = rotate(tmp, 16U);
AES3[i] = rotate(tmp, 24U);
+// cryptonight_monero_v8
+#if(ALGO==11)
+ RCP[i] = RCP_C[i];
+#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -579,7 +621,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
uint2 tweak1_2;
#endif
- uint4 b_x;
+
#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
@@ -599,13 +641,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];
- b_x = ((uint4 *)b)[0];
+ b_x[0] = ((uint4 *)b)[0];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+ a[1] = states[1] ^ states[5];
+ b[2] = states[8] ^ states[10];
+ b[3] = states[9] ^ states[11];
+ b_x[1] = ((uint4 *)b)[1];
+ division_result = as_uint2(states[12]);
+ sqrt_result = as_uint2(states[13]).s0;
+#endif
// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
tweak1_2 = as_uint2(input[4]);
tweak1_2.s0 >>= 24;
tweak1_2.s0 |= tweak1_2.s1 << 8;
- tweak1_2.s1 = get_global_id(0);
+ tweak1_2.s1 = (uint)get_global_id(0);
tweak1_2 ^= as_uint2(states[24]);
#endif
}
@@ -617,39 +669,102 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
if(gIdx < Threads)
#endif
{
- ulong idx0 = a[0];
+ ulong idx0 = a[0] & MASK;
- #pragma unroll 8
+ #pragma unroll CN_UNROLL
for(int i = 0; i < ITERATIONS; ++i)
{
ulong c[2];
+// cryptonight_monero_v8 && NVIDIA
+#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+ ulong idxS = idx0 & 0x30;
+ *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
+#endif
- ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
+ ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
// cryptonight_bittube2
#if(ALGO == 10)
((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
#else
((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
#endif
- b_x ^= ((uint4 *)c)[0];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+ {
+ ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+ ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+ ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+ SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+ SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+ SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+ }
+#endif
+
// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
uint table = 0x75310U;
+ b_x[0] ^= ((uint4 *)c)[0];
// cryptonight_stellite
# if(ALGO == 7)
- uint index = ((b_x.s2 >> 27) & 12) | ((b_x.s2 >> 23) & 2);
+ uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2);
+# else
+ uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2);
+# endif
+ b_x[0].s2 ^= ((table >> index) & 0x30U) << 24;
+ SCRATCHPAD_CHUNK(0) = b_x[0];
+ idx0 = c[0] & MASK;
+// cryptonight_monero_v8
+#elif(ALGO==11)
+ SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
+# ifdef __NV_CL_C_VERSION
+ // flush shuffled data
+ SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
+ idx0 = c[0] & MASK;
+ idxS = idx0 & 0x30;
+ *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
# else
- uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2);
+ idx0 = c[0] & MASK;
# endif
- b_x.s2 ^= ((table >> index) & 0x30U) << 24;
+#else
+ b_x[0] ^= ((uint4 *)c)[0];
+ SCRATCHPAD_CHUNK(0) = b_x[0];
+ idx0 = c[0] & MASK;
#endif
- Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x;
-
uint4 tmp;
- tmp = Scratchpad[IDX((c[0] & MASK) >> 4)];
-
- a[1] += c[0] * as_ulong2(tmp).s0;
- a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+ tmp = SCRATCHPAD_CHUNK(0);
+// cryptonight_monero_v8
+#if(ALGO==11)
+ // Use division and square root results from the _previous_ iteration to hide the latency
+ tmp.s0 ^= division_result.s0;
+ tmp.s1 ^= division_result.s1 ^ sqrt_result;
+ // Most and least significant bits in the divisor are set to 1
+ // to make sure we don't divide by a small or even number,
+ // so there are no shortcuts for such cases
+ const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL;
+ // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
+ // We drop the highest bit to fit both quotient and remainder in 32 bits
+ division_result = fast_div_v2(RCP, c[1], d);
+ // Use division_result as an input for the square root to prevent parallel implementation in hardware
+ sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
+#endif
+ ulong2 result_mul;
+ result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0);
+ result_mul.s1 = c[0] * as_ulong2(tmp).s0;
+// cryptonight_monero_v8
+#if(ALGO==11)
+ {
+ ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul;
+ ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+ result_mul ^= chunk2;
+ ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+ SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+ SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+ SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+ }
+#endif
+ a[1] += result_mul.s1;
+ a[0] += result_mul.s0;
// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
@@ -658,44 +773,55 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
# if(ALGO == 6 || ALGO == 10)
uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0];
((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
- Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+ SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
# else
((uint2 *)&(a[1]))[0] ^= tweak1_2;
- Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+ SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
((uint2 *)&(a[1]))[0] ^= tweak1_2;
# endif
#else
- Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+ SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
#endif
((uint4 *)a)[0] ^= tmp;
- idx0 = a[0];
- b_x = ((uint4 *)c)[0];
+// cryptonight_monero_v8
+#if (ALGO == 11)
+# if defined(__NV_CL_C_VERSION)
+ // flush shuffled data
+ SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
+# endif
+ b_x[1] = b_x[0];
+#endif
+ b_x[0] = ((uint4 *)c)[0];
+ idx0 = a[0] & MASK;
// cryptonight_heavy || cryptonight_bittube2
#if (ALGO == 4 || ALGO == 10)
- long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
- int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
+ long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
+ int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
long q = n / (d | 0x5);
- *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
- idx0 = d ^ q;
-#endif
+ *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
+ idx0 = (d ^ q) & MASK;
// cryptonight_haven
-#if (ALGO == 9)
- long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
- int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
+#elif (ALGO == 9)
+ long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
+ int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
long q = n / (d | 0x5);
- *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
- idx0 = (~d) ^ q;
+ *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
+ idx0 = ((~d) ^ q) & MASK;
#endif
+
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
+)==="
+R"===(
+
__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
{
@@ -918,7 +1044,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
{
ulong outIdx = atomic_inc(output + 0xFF);
if(outIdx < 0xFF)
- output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+ output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
@@ -994,7 +1120,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
{
ulong outIdx = atomic_inc(output + 0xFF);
if(outIdx < 0xFF)
- output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+ output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
}
}
}
@@ -1072,7 +1198,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
{
ulong outIdx = atomic_inc(output + 0xFF);
if(outIdx < 0xFF)
- output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+ output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
}
}
}
@@ -1095,7 +1221,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
#pragma unroll 4
for(uint i = 0; i < 4; ++i)
{
- ulong H[8], M[8];
+ volatile ulong H[8], M[8];
if(i < 3)
{
@@ -1133,7 +1259,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
{
ulong outIdx = atomic_inc(output + 0xFF);
if(outIdx < 0xFF)
- output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+ output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
}
}
}
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
new file mode 100644
index 000000000..607806b7a
--- /dev/null
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -0,0 +1,108 @@
+R"===(
+/*
+ * @author SChernykh
+ */
+static const __constant uint RCP_C[256] =
+{
+ 0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu,
+ 0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au,
+ 0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu,
+ 0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u,
+ 0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u,
+ 0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u,
+ 0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u,
+ 0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u,
+ 0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u,
+ 0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u,
+ 0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u,
+ 0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u,
+ 0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au,
+ 0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u,
+ 0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u,
+ 0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu,
+ 0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u,
+ 0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u,
+ 0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u,
+ 0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u,
+ 0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu,
+ 0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u,
+ 0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u,
+ 0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u,
+ 0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u,
+ 0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu,
+ 0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau,
+ 0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u,
+ 0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u,
+ 0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu,
+ 0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u,
+ 0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u,
+};
+
+inline uint get_reciprocal(const __local uchar *RCP, uint a)
+{
+ const uint index1 = (a & 0x7F000000U) >> 21;
+ const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768;
+
+ const uint r1 = *(const __local uint*)(RCP + index1);
+
+ uint r2_0 = *(const __local uint*)(RCP + index1 + 4);
+ if (index2 > 0) r2_0 >>= 16;
+ const int r2 = r2_0 & 0xFFFFU;
+
+ const uint r = r1 - (uint)(mul24(r2, index2) >> 6);
+
+ const ulong lo0 = (ulong)(r) * a;
+ ulong lo = lo0 + ((ulong)(a) << 32);
+
+ a >>= 1;
+ const bool b = (a >= lo) || (lo >= lo0);
+ lo = a - lo;
+
+ const ulong k = mul_hi(as_uint2(lo).s0, r) + ((ulong)(r) * as_uint2(lo).s1) + lo;
+ return as_uint2(k).s1 + (b ? r : 0);
+}
+
+inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b)
+{
+ const uint r = get_reciprocal((const __local uchar *)RCP, b);
+ const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
+
+ ulong q;
+ ((uint*)&q)[0] = as_uint2(k).s1;;
+ ((uint*)&q)[1] = (k < a) ? 1 : 0;
+
+ const long tmp = a - q * b;
+ const bool overshoot = (tmp < 0);
+ const bool undershoot = (tmp >= b);
+
+ return (uint2)(
+ as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U),
+ as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U)
+ );
+}
+
+inline uint fast_sqrt_v2(const ulong n1)
+{
+ float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23));
+
+ float x1 = native_rsqrt(x);
+ x = native_sqrt(x);
+
+ // The following line does x1 *= 4294967296.0f;
+ x1 = as_float(as_uint(x1) + (32U << 23));
+
+ const uint x0 = as_uint(x) - (158U << 23);
+ const long delta0 = n1 - (((long)(x0) * x0) << 18);
+ const float delta = convert_float_rte(as_int2(delta0).s1) * x1;
+
+ uint result = (x0 << 10) + convert_int_rte(delta);
+ const uint s = result >> 1;
+ const uint b = result & 1;
+
+ const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1;
+ if ((long)(x2 + b) > 0) --result;
+ if ((long)(x2 + 0x100000000UL + s) < 0) ++result;
+
+ return result;
+}
+)==="
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index d6acec971..c5b331c87 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -127,6 +127,24 @@ class autoAdjust
minFreeMem = 512u * byteToMiB;
}
+ // check if cryptonight_monero_v8 is selected for the user or dev pool
+ bool useCryptonight_v8 =
+ ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+ ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
+ ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
+ ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
+ // set strided index to default
+ ctx.stridedIndex = 1;
+
+ // nvidia performance is very bad if the scratchpad is not contiguous
+ if(ctx.isNVIDIA)
+ ctx.stridedIndex = 0;
+
+ // use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2`
+ if(useCryptonight_v8)
+ ctx.stridedIndex = 2;
+
// increase all intensity limits by two for aeon
if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
maxThreads *= 2u;
@@ -153,8 +171,8 @@ class autoAdjust
// set 8 threads per block (this is a good value for the most gpus)
conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
- " \"affine_to_cpu\" : false, \"strided_index\" : " + (ctx.isNVIDIA ? "0" : "1") + ", \"mem_chunk\" : 2,\n"
- " \"comp_mode\" : true\n" +
+ " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
+ " \"unroll\" : 8, \"comp_mode\" : true\n" +
" },\n";
}
else
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 28855f070..421e0ed4b 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
/*
* GPU configuration. You should play around with intensity and worksize as the fastest settings will vary.
* index - GPU index number usually starts from 0
@@ -9,17 +10,20 @@ R"===(
* 2 = chunked memory, chunk size is controlled by 'mem_chunk'
* required: intensity must be a multiple of worksize
* 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
+ * (for cryptonight_v8 and monero it is equal to strided_index = 0)
* 0 or false = use a contiguous block of memory per thread
* mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk
* this value is only used if 'strided_index' == 2
* element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
+ * unroll - allow to control how often the POW main loop is unrolled; valid range [1;128) - for most OpenCL implementations it must be a power of two.
* comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows
* to use a intensity which is not the multiple of the worksize.
* If you set false and the intensity is not multiple of the worksize the miner can crash:
* in this case set the intensity to a multiple of the worksize or activate comp_mode.
* "gpu_threads_conf" :
* [
- * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true },
+ * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+ * "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
* ],
* If you do not wish to mine with your AMD GPU(s) then use:
* "gpu_threads_conf" :
@@ -34,5 +38,4 @@ GPUCONFIG
* Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel.
*/
"platform_index" : PLATFORMINDEX,
-
)==="
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 0f39ff2b9..152f8add4 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -106,17 +106,18 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;
- const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode;
+ const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
idx = GetObjectMember(oThdConf, "index");
intensity = GetObjectMember(oThdConf, "intensity");
w_size = GetObjectMember(oThdConf, "worksize");
aff = GetObjectMember(oThdConf, "affine_to_cpu");
stridedIndex = GetObjectMember(oThdConf, "strided_index");
memChunk = GetObjectMember(oThdConf, "mem_chunk");
+ unroll = GetObjectMember(oThdConf, "unroll");
compMode = GetObjectMember(oThdConf, "comp_mode");
if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
- stridedIndex == nullptr || compMode == nullptr)
+ stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
return false;
if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -142,13 +143,20 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
return false;
}
+ if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18 )
+ {
+ printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18");
+ return false;
+ }
+
cfg.memChunk = (int)memChunk->GetInt64();
- if(!idx->IsUint64() || cfg.memChunk > 18 )
+ if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 || (int)unroll->GetInt64() == 0)
{
- printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18");
+ printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and not zero");
return false;
}
+ cfg.unroll = (int)unroll->GetInt64();
if(!compMode->IsBool())
return false;
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index 580b69fe7..b852c5940 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -28,6 +28,7 @@ class jconf
long long cpu_aff;
int stridedIndex;
int memChunk;
+ int unroll;
bool compMode;
};
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index f7b47249e..5e70f25a6 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -99,6 +99,7 @@ bool minethd::init_gpus()
vGpuData[i].stridedIndex = cfg.stridedIndex;
vGpuData[i].memChunk = cfg.memChunk;
vGpuData[i].compMode = cfg.compMode;
+ vGpuData[i].unroll = cfg.unroll;
}
return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
@@ -173,6 +174,11 @@ void minethd::work_main()
cryptonight_ctx* cpu_ctx;
cpu_ctx = cpu::minethd::minethd_alloc_ctx();
+ if(cpu_ctx == nullptr)
+ {
+ printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory, miner will be stopped.");
+ win_exit(1);
+ }
// start with root algorithm and switch later if fork version is reached
auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
@@ -235,7 +241,7 @@ void minethd::work_main()
if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
break;
}
-
+
cl_uint results[0x100];
memset(results,0,sizeof(cl_uint)*(0x100));
@@ -252,7 +258,7 @@ void minethd::work_main()
*(uint32_t*)(bWorkBlob + 39) = results[i];
- hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx);
+ hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx);
if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
else
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 3142117c5..32e66ec87 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -24,14 +24,14 @@ class minethd : public iBackend
static bool init_gpus();
private:
- typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+ typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg);
void work_main();
uint64_t iJobNo;
-
+
miner_work oWork;
std::promise order_fix;
diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp
index 525413fd5..92bb01506 100644
--- a/xmrstak/backend/backendConnector.cpp
+++ b/xmrstak/backend/backendConnector.cpp
@@ -63,10 +63,35 @@ std::vector* BackendConnector::thread_starter(miner_work& pWork)
#ifndef CONF_NO_CUDA
if(params::inst().useNVIDIA)
{
- plugin nvidiaplugin("NVIDIA", "xmrstak_cuda_backend");
- std::vector* nvidiaThreads = nvidiaplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst());
- pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads));
- if(nvidiaThreads->size() == 0)
+ plugin nvidiaplugin;
+ std::vector* nvidiaThreads;
+ std::vector libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"};
+ size_t numWorkers = 0u;
+
+ for( const auto & name : libNames)
+ {
+ printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str());
+ nvidiaplugin.load("NVIDIA", name);
+ std::vector* nvidiaThreads = nvidiaplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst());
+ if(nvidiaThreads != nullptr)
+ {
+ pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads));
+ numWorkers = nvidiaThreads->size();
+ delete nvidiaThreads;
+ }
+ else
+ {
+ // remove the plugin if we have found no GPUs
+ nvidiaplugin.unload();
+ }
+ // we found at leat one working GPU
+ if(numWorkers != 0)
+ {
+ printer::inst()->print_msg(L0, "NVIDIA: use library '%s'", name.c_str());
+ break;
+ }
+ }
+ if(numWorkers == 0)
printer::inst()->print_msg(L0, "WARNING: backend NVIDIA disabled.");
}
#endif
@@ -75,10 +100,17 @@ std::vector* BackendConnector::thread_starter(miner_work& pWork)
if(params::inst().useAMD)
{
const std::string backendName = xmrstak::params::inst().openCLVendor;
- plugin amdplugin(backendName, "xmrstak_opencl_backend");
+ plugin amdplugin;
+ amdplugin.load(backendName, "xmrstak_opencl_backend");
std::vector* amdThreads = amdplugin.startBackend(static_cast(pvThreads->size()), pWork, environment::inst());
- pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads));
- if(amdThreads->size() == 0)
+ size_t numWorkers = 0u;
+ if(amdThreads != nullptr)
+ {
+ pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads));
+ numWorkers = amdThreads->size();
+ delete amdThreads;
+ }
+ if(numWorkers == 0)
printer::inst()->print_msg(L0, "WARNING: backend %s (OpenCL) disabled.", backendName.c_str());
}
#endif
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 57dbef053..e7f3e9148 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -7,6 +7,7 @@
#include "xmrstak/misc/configEditor.hpp"
#include "xmrstak/params.hpp"
#include "xmrstak/backend/cryptonight.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
#include
#ifdef _WIN32
@@ -20,14 +21,6 @@ namespace xmrstak
{
namespace cpu
{
-// Mask bits between h and l and return the value
-// This enables us to put in values exactly like in the manual
-// For example EBX[31:22] is get_masked(cpu_info[1], 31, 22)
-inline int32_t get_masked(int32_t val, int32_t h, int32_t l)
-{
- val &= (0x7FFFFFFF >> (31-(h-l))) << l;
- return val >> l;
-}
class autoAdjust
{
@@ -58,7 +51,7 @@ class autoAdjust
if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048))
printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size);
- conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
+ conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n");
printer::inst()->print_msg(L0, "Autoconf FAILED. Create config for a single thread. Please try to add new ones until the hashrate slows down.");
}
else
@@ -82,7 +75,7 @@ class autoAdjust
conf += std::string(" { \"low_power_mode\" : ");
conf += std::string(double_mode ? "true" : "false");
- conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+ conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
conf += std::to_string(aff_id);
conf += std::string(" },\n");
@@ -143,7 +136,8 @@ class autoAdjust
L3KB_size = get_masked(cpu_info[3], 31, 18) * 512;
::jconf::cpuid(1, 0, cpu_info);
- if(get_masked(cpu_info[0], 11, 8) < 0x17) //0x17h is Zen
+
+ if(getModel().family < 0x17) //0x17h is Zen
old_amd = true;
return true;
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index 01d2280d8..b61582588 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -70,7 +70,7 @@ class autoAdjust
{
conf += std::string(" { \"low_power_mode\" : ");
conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
- conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+ conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
conf += std::to_string(id & 0x7FFFFFF);
conf += std::string(" },\n");
}
@@ -78,7 +78,8 @@ class autoAdjust
catch(const std::runtime_error& err)
{
// \todo add fallback to default auto adjust
- conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
+ conf += std::string(" { \"low_power_mode\" : false");
+ conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n");
printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what());
}
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index 2fc9a47ec..1a64860e4 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
/*
* Thread configuration for each thread. Make sure it matches the number above.
* low_power_mode - This can either be a boolean (true or false), or a number between 1 to 5. When set to true,
@@ -7,10 +8,16 @@ R"===(
* the maximum performance. When set to a number N greater than 1, this mode will increase the
* cache usage and single thread performance by N times.
*
- * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
+ * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
* things slower.
*
- * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
+ * asm - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, off, intel_avx, amd_avx]
+ * - auto: xmr-stak will automatically detect the asm type (default)
+ * - off: disable the usage of optimized assembler
+ * - intel_avx: supports Intel cpus with avx instructions e.g. Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx
+ * - amd_avx: supports AMD cpus with avx instructions e.g. AMD Ryzen 1xxx and 2xxx series
+ *
+ * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
* systems it is better to assign threads to physical cores. On Windows this usually means selecting
* even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4
* physical core CPU you should select cpu numbers 0-3.
@@ -21,8 +28,8 @@ R"===(
* A filled out configuration should look like this:
* "cpu_threads_conf" :
* [
- * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 },
- * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 },
+ * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 },
+ * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 },
* ],
* If you do not wish to mine with your CPU(s) then use:
* "cpu_threads_conf" :
@@ -33,5 +40,4 @@ R"===(
[
CPUCONFIG
],
-
)==="
diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp
new file mode 100644
index 000000000..5959b75cc
--- /dev/null
+++ b/xmrstak/backend/cpu/cpuType.cpp
@@ -0,0 +1,79 @@
+
+#include "xmrstak/backend/cpu/cpuType.hpp"
+
+#include
+#include
+#include
+
+#ifdef _WIN32
+#define strcasecmp _stricmp
+#include
+#else
+#include
+#endif
+
+namespace xmrstak
+{
+namespace cpu
+{
+ void cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
+ {
+ std::memset(val, 0, sizeof(int32_t)*4);
+
+ #ifdef _WIN32
+ __cpuidex(val, eax, ecx);
+ #else
+ __cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]);
+ #endif
+ }
+
+ int32_t get_masked(int32_t val, int32_t h, int32_t l)
+ {
+ val &= (0x7FFFFFFF >> (31-(h-l))) << l;
+ return val >> l;
+ }
+
+ bool has_feature(int32_t val, int32_t bit)
+ {
+ int32_t mask = 1 << bit;
+ return (val & mask) != 0u;
+
+ }
+
+ Model getModel()
+ {
+ int32_t cpu_info[4];
+ char cpustr[13] = {0};
+
+ cpuid(0, 0, cpu_info);
+ std::memcpy(cpustr, &cpu_info[1], 4);
+ std::memcpy(cpustr+4, &cpu_info[3], 4);
+ std::memcpy(cpustr+8, &cpu_info[2], 4);
+
+ Model result;
+
+ cpuid(1, 0, cpu_info);
+
+ result.family = get_masked(cpu_info[0], 12, 8);
+ result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4;
+ result.type_name = cpustr;
+
+ // feature bits https://en.wikipedia.org/wiki/CPUID
+ // sse2
+ result.sse2 = has_feature(cpu_info[3], 26);
+ // aes-ni
+ result.aes = has_feature(cpu_info[2], 25);
+ // avx
+ result.avx = has_feature(cpu_info[2], 28);
+
+ if(strcmp(cpustr, "AuthenticAMD") == 0)
+ {
+ if(result.family == 0xF)
+ result.family += get_masked(cpu_info[0], 28, 20);
+ }
+
+ return result;
+ }
+
+} // namespace cpu
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/cpuType.hpp b/xmrstak/backend/cpu/cpuType.hpp
new file mode 100644
index 000000000..7f6bfaf51
--- /dev/null
+++ b/xmrstak/backend/cpu/cpuType.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include
+#include
+
+
+namespace xmrstak
+{
+namespace cpu
+{
+ struct Model
+ {
+ uint32_t family = 0u;
+ uint32_t model = 0u;
+ bool aes = false;
+ bool sse2 = false;
+ bool avx = false;
+ std::string type_name = "unknown";
+ };
+
+ Model getModel();
+
+ /** Mask bits between h and l and return the value
+ *
+ * This enables us to put in values exactly like in the manual
+ * For example EBX[30:22] is get_masked(cpu_info[1], 31, 22)
+ */
+ int32_t get_masked(int32_t val, int32_t h, int32_t l);
+
+
+} // namespace cpu
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
new file mode 100644
index 000000000..79adab671
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
@@ -0,0 +1,410 @@
+ mov rax, rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+ sub rsp, 184
+
+ stmxcsr DWORD PTR [rsp+272]
+ mov DWORD PTR [rsp+276], 24448
+ ldmxcsr DWORD PTR [rsp+276]
+
+ mov r13, QWORD PTR [rcx+224]
+ mov r9, rdx
+ mov r10, QWORD PTR [rcx+32]
+ mov r8, rcx
+ xor r10, QWORD PTR [rcx]
+ mov r14d, 524288
+ mov r11, QWORD PTR [rcx+40]
+ xor r11, QWORD PTR [rcx+8]
+ mov rsi, QWORD PTR [rdx+224]
+ mov rdx, QWORD PTR [rcx+56]
+ xor rdx, QWORD PTR [rcx+24]
+ mov rdi, QWORD PTR [r9+32]
+ xor rdi, QWORD PTR [r9]
+ mov rbp, QWORD PTR [r9+40]
+ xor rbp, QWORD PTR [r9+8]
+ movq xmm0, rdx
+ movaps XMMWORD PTR [rax-88], xmm6
+ movaps XMMWORD PTR [rax-104], xmm7
+ movaps XMMWORD PTR [rax-120], xmm8
+ movaps XMMWORD PTR [rsp+112], xmm9
+ movaps XMMWORD PTR [rsp+96], xmm10
+ movaps XMMWORD PTR [rsp+80], xmm11
+ movaps XMMWORD PTR [rsp+64], xmm12
+ movaps XMMWORD PTR [rsp+48], xmm13
+ movaps XMMWORD PTR [rsp+32], xmm14
+ movaps XMMWORD PTR [rsp+16], xmm15
+ mov rdx, r10
+ movq xmm4, QWORD PTR [r8+96]
+ and edx, 2097136
+ mov rax, QWORD PTR [rcx+48]
+ xorps xmm13, xmm13
+ xor rax, QWORD PTR [rcx+16]
+ mov rcx, QWORD PTR [rcx+88]
+ xor rcx, QWORD PTR [r8+72]
+ movq xmm5, QWORD PTR [r8+104]
+ movq xmm7, rax
+
+ mov eax, 1
+ shl rax, 52
+ movq xmm14, rax
+ punpcklqdq xmm14, xmm14
+
+ mov eax, 1023
+ shl rax, 52
+ movq xmm12, rax
+ punpcklqdq xmm12, xmm12
+
+ mov rax, QWORD PTR [r8+80]
+ xor rax, QWORD PTR [r8+64]
+ punpcklqdq xmm7, xmm0
+ movq xmm0, rcx
+ mov rcx, QWORD PTR [r9+56]
+ xor rcx, QWORD PTR [r9+24]
+ movq xmm3, rax
+ mov rax, QWORD PTR [r9+48]
+ xor rax, QWORD PTR [r9+16]
+ punpcklqdq xmm3, xmm0
+ movq xmm0, rcx
+ mov QWORD PTR [rsp], r13
+ mov rcx, QWORD PTR [r9+88]
+ xor rcx, QWORD PTR [r9+72]
+ movq xmm6, rax
+ mov rax, QWORD PTR [r9+80]
+ xor rax, QWORD PTR [r9+64]
+ punpcklqdq xmm6, xmm0
+ movq xmm0, rcx
+ mov QWORD PTR [rsp+256], r10
+ mov rcx, rdi
+ mov QWORD PTR [rsp+264], r11
+ movq xmm8, rax
+ and ecx, 2097136
+ punpcklqdq xmm8, xmm0
+ movq xmm0, QWORD PTR [r9+96]
+ punpcklqdq xmm4, xmm0
+ movq xmm0, QWORD PTR [r9+104]
+ lea r8, QWORD PTR [rcx+rsi]
+ movdqu xmm11, XMMWORD PTR [r8]
+ punpcklqdq xmm5, xmm0
+ lea r9, QWORD PTR [rdx+r13]
+ movdqu xmm15, XMMWORD PTR [r9]
+
+ALIGN 16
+main_loop_double_sandybridge:
+ movdqu xmm9, xmm15
+ mov eax, edx
+ mov ebx, edx
+ xor eax, 16
+ xor ebx, 32
+ xor edx, 48
+
+ movq xmm0, r11
+ movq xmm2, r10
+ punpcklqdq xmm2, xmm0
+ aesenc xmm9, xmm2
+
+ movdqu xmm0, XMMWORD PTR [rax+r13]
+ movdqu xmm1, XMMWORD PTR [rbx+r13]
+ paddq xmm0, xmm7
+ paddq xmm1, xmm2
+ movdqu XMMWORD PTR [rbx+r13], xmm0
+ movdqu xmm0, XMMWORD PTR [rdx+r13]
+ movdqu XMMWORD PTR [rdx+r13], xmm1
+ paddq xmm0, xmm3
+ movdqu XMMWORD PTR [rax+r13], xmm0
+
+ movq r11, xmm9
+ mov edx, r11d
+ and edx, 2097136
+ movdqa xmm0, xmm9
+ pxor xmm0, xmm7
+ movdqu XMMWORD PTR [r9], xmm0
+
+ lea rbx, QWORD PTR [rdx+r13]
+ mov r10, QWORD PTR [rdx+r13]
+
+ movdqu xmm10, xmm11
+ movq xmm0, rbp
+ movq xmm11, rdi
+ punpcklqdq xmm11, xmm0
+ aesenc xmm10, xmm11
+
+ mov eax, ecx
+ mov r12d, ecx
+ xor eax, 16
+ xor r12d, 32
+ xor ecx, 48
+
+ movdqu xmm0, XMMWORD PTR [rax+rsi]
+ paddq xmm0, xmm6
+ movdqu xmm1, XMMWORD PTR [r12+rsi]
+ movdqu XMMWORD PTR [r12+rsi], xmm0
+ paddq xmm1, xmm11
+ movdqu xmm0, XMMWORD PTR [rcx+rsi]
+ movdqu XMMWORD PTR [rcx+rsi], xmm1
+ paddq xmm0, xmm8
+ movdqu XMMWORD PTR [rax+rsi], xmm0
+
+ movq rcx, xmm10
+ and ecx, 2097136
+
+ movdqa xmm0, xmm10
+ pxor xmm0, xmm6
+ movdqu XMMWORD PTR [r8], xmm0
+ mov r12, QWORD PTR [rcx+rsi]
+
+ mov r9, QWORD PTR [rbx+8]
+
+ xor edx, 16
+ mov r8d, edx
+ mov r15d, edx
+
+ movq rdx, xmm5
+ shl rdx, 32
+ movq rax, xmm4
+ xor rdx, rax
+ xor r10, rdx
+ mov rax, r10
+ mul r11
+ mov r11d, r8d
+ xor r11d, 48
+ movq xmm0, rdx
+ xor rdx, [r11+r13]
+ movq xmm1, rax
+ xor rax, [r11+r13+8]
+ punpcklqdq xmm0, xmm1
+
+ pxor xmm0, XMMWORD PTR [r8+r13]
+ xor r8d, 32
+ movdqu xmm1, XMMWORD PTR [r11+r13]
+ paddq xmm0, xmm7
+ paddq xmm1, xmm2
+ movdqu XMMWORD PTR [r11+r13], xmm0
+ movdqu xmm0, XMMWORD PTR [r8+r13]
+ movdqu XMMWORD PTR [r8+r13], xmm1
+ paddq xmm0, xmm3
+ movdqu XMMWORD PTR [r15+r13], xmm0
+
+ mov r11, QWORD PTR [rsp+256]
+ add r11, rdx
+ mov rdx, QWORD PTR [rsp+264]
+ add rdx, rax
+ mov QWORD PTR [rbx], r11
+ xor r11, r10
+ mov QWORD PTR [rbx+8], rdx
+ xor rdx, r9
+ mov QWORD PTR [rsp+256], r11
+ and r11d, 2097136
+ mov QWORD PTR [rsp+264], rdx
+ mov QWORD PTR [rsp+8], r11
+ lea r15, QWORD PTR [r11+r13]
+ movdqu xmm15, XMMWORD PTR [r11+r13]
+ lea r13, QWORD PTR [rsi+rcx]
+ movdqa xmm0, xmm5
+ psrldq xmm0, 8
+ movaps xmm2, xmm13
+ movq r10, xmm0
+ psllq xmm5, 1
+ shl r10, 32
+ movdqa xmm0, xmm9
+ psrldq xmm0, 8
+ movdqa xmm1, xmm10
+ movq r11, xmm0
+ psrldq xmm1, 8
+ movq r8, xmm1
+ psrldq xmm4, 8
+ movaps xmm0, xmm13
+ movq rax, xmm4
+ xor r10, rax
+ movaps xmm1, xmm13
+ xor r10, r12
+ lea rax, QWORD PTR [r11+1]
+ shr rax, 1
+ movdqa xmm3, xmm9
+ punpcklqdq xmm3, xmm10
+ paddq xmm5, xmm3
+ movq rdx, xmm5
+ psrldq xmm5, 8
+ cvtsi2sd xmm2, rax
+ or edx, -2147483647
+ lea rax, QWORD PTR [r8+1]
+ shr rax, 1
+ movq r9, xmm5
+ cvtsi2sd xmm0, rax
+ or r9d, -2147483647
+ cvtsi2sd xmm1, rdx
+ unpcklpd xmm2, xmm0
+ movaps xmm0, xmm13
+ cvtsi2sd xmm0, r9
+ unpcklpd xmm1, xmm0
+ divpd xmm2, xmm1
+ paddq xmm2, xmm14
+ cvttsd2si rax, xmm2
+ psrldq xmm2, 8
+ mov rbx, rax
+ imul rax, rdx
+ sub r11, rax
+ js div_fix_1_sandybridge
+div_fix_1_ret_sandybridge:
+
+ cvttsd2si rdx, xmm2
+ mov rax, rdx
+ imul rax, r9
+ movd xmm2, r11d
+ movd xmm4, ebx
+ sub r8, rax
+ js div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+ movd xmm1, r8d
+ movd xmm0, edx
+ punpckldq xmm2, xmm1
+ punpckldq xmm4, xmm0
+ punpckldq xmm4, xmm2
+ paddq xmm3, xmm4
+ movdqa xmm0, xmm3
+ psrlq xmm0, 12
+ paddq xmm0, xmm12
+ sqrtpd xmm1, xmm0
+ movq r9, xmm1
+ movdqa xmm5, xmm1
+ psrlq xmm5, 19
+ test r9, 524287
+ je sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+ movq r9, xmm10
+ psrldq xmm1, 8
+ movq r8, xmm1
+ test r8, 524287
+ je sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+ mov r12d, ecx
+ mov r8d, ecx
+ xor r12d, 16
+ xor r8d, 32
+ xor ecx, 48
+ mov rax, r10
+ mul r9
+ movq xmm0, rax
+ movq xmm3, rdx
+ punpcklqdq xmm3, xmm0
+
+ movdqu xmm0, XMMWORD PTR [r12+rsi]
+ pxor xmm0, xmm3
+ movdqu xmm1, XMMWORD PTR [r8+rsi]
+ xor rdx, [r8+rsi]
+ xor rax, [r8+rsi+8]
+ movdqu xmm3, XMMWORD PTR [rcx+rsi]
+ paddq xmm0, xmm6
+ paddq xmm1, xmm11
+ paddq xmm3, xmm8
+ movdqu XMMWORD PTR [r8+rsi], xmm0
+ movdqu XMMWORD PTR [rcx+rsi], xmm1
+ movdqu XMMWORD PTR [r12+rsi], xmm3
+
+ add rdi, rdx
+ mov QWORD PTR [r13], rdi
+ xor rdi, r10
+ mov ecx, edi
+ and ecx, 2097136
+ lea r8, QWORD PTR [rcx+rsi]
+
+ mov rdx, QWORD PTR [r13+8]
+ add rbp, rax
+ mov QWORD PTR [r13+8], rbp
+ movdqu xmm11, XMMWORD PTR [rcx+rsi]
+ xor rbp, rdx
+ mov r13, QWORD PTR [rsp]
+ movdqa xmm3, xmm7
+ mov rdx, QWORD PTR [rsp+8]
+ movdqa xmm8, xmm6
+ mov r10, QWORD PTR [rsp+256]
+ movdqa xmm7, xmm9
+ mov r11, QWORD PTR [rsp+264]
+ movdqa xmm6, xmm10
+ mov r9, r15
+ dec r14d
+ jne main_loop_double_sandybridge
+
+ ldmxcsr DWORD PTR [rsp+272]
+ movaps xmm13, XMMWORD PTR [rsp+48]
+ lea r11, QWORD PTR [rsp+184]
+ movaps xmm6, XMMWORD PTR [r11-24]
+ movaps xmm7, XMMWORD PTR [r11-40]
+ movaps xmm8, XMMWORD PTR [r11-56]
+ movaps xmm9, XMMWORD PTR [r11-72]
+ movaps xmm10, XMMWORD PTR [r11-88]
+ movaps xmm11, XMMWORD PTR [r11-104]
+ movaps xmm12, XMMWORD PTR [r11-120]
+ movaps xmm14, XMMWORD PTR [rsp+32]
+ movaps xmm15, XMMWORD PTR [rsp+16]
+ mov rsp, r11
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ jmp cnv2_double_mainloop_asm_sandybridge_endp
+
+div_fix_1_sandybridge:
+ dec rbx
+ add r11, rdx
+ jmp div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:
+ dec rdx
+ add r8, r9
+ jmp div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:
+ movq r8, xmm3
+ movdqa xmm0, xmm5
+ psrldq xmm0, 8
+ dec r9
+ mov r11d, -1022
+ shl r11, 32
+ mov rax, r9
+ shr r9, 19
+ shr rax, 20
+ mov rdx, r9
+ sub rdx, rax
+ lea rdx, [rdx+r11+1]
+ add rax, r11
+ imul rdx, rax
+ sub rdx, r8
+ adc r9, 0
+ movq xmm5, r9
+ punpcklqdq xmm5, xmm0
+ jmp sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:
+ psrldq xmm3, 8
+ movq r11, xmm3
+ dec r8
+ mov ebx, -1022
+ shl rbx, 32
+ mov rax, r8
+ shr r8, 19
+ shr rax, 20
+ mov rdx, r8
+ sub rdx, rax
+ lea rdx, [rdx+rbx+1]
+ add rax, rbx
+ imul rdx, rax
+ sub rdx, r11
+ adc r8, 0
+ movq xmm0, r8
+ punpcklqdq xmm5, xmm0
+ jmp sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
new file mode 100644
index 000000000..ad8f18233
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
@@ -0,0 +1,410 @@
+ mov rax, rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+ sub rsp, 184
+
+ stmxcsr DWORD PTR [rsp+272]
+ mov DWORD PTR [rsp+276], 24448
+ ldmxcsr DWORD PTR [rsp+276]
+
+ mov r13, QWORD PTR [rcx+224]
+ mov r9, rdx
+ mov r10, QWORD PTR [rcx+32]
+ mov r8, rcx
+ xor r10, QWORD PTR [rcx]
+ mov r14d, 524288
+ mov r11, QWORD PTR [rcx+40]
+ xor r11, QWORD PTR [rcx+8]
+ mov rsi, QWORD PTR [rdx+224]
+ mov rdx, QWORD PTR [rcx+56]
+ xor rdx, QWORD PTR [rcx+24]
+ mov rdi, QWORD PTR [r9+32]
+ xor rdi, QWORD PTR [r9]
+ mov rbp, QWORD PTR [r9+40]
+ xor rbp, QWORD PTR [r9+8]
+ movd xmm0, rdx
+ movaps XMMWORD PTR [rax-88], xmm6
+ movaps XMMWORD PTR [rax-104], xmm7
+ movaps XMMWORD PTR [rax-120], xmm8
+ movaps XMMWORD PTR [rsp+112], xmm9
+ movaps XMMWORD PTR [rsp+96], xmm10
+ movaps XMMWORD PTR [rsp+80], xmm11
+ movaps XMMWORD PTR [rsp+64], xmm12
+ movaps XMMWORD PTR [rsp+48], xmm13
+ movaps XMMWORD PTR [rsp+32], xmm14
+ movaps XMMWORD PTR [rsp+16], xmm15
+ mov rdx, r10
+ movq xmm4, QWORD PTR [r8+96]
+ and edx, 2097136
+ mov rax, QWORD PTR [rcx+48]
+ xorps xmm13, xmm13
+ xor rax, QWORD PTR [rcx+16]
+ mov rcx, QWORD PTR [rcx+88]
+ xor rcx, QWORD PTR [r8+72]
+ movq xmm5, QWORD PTR [r8+104]
+ movd xmm7, rax
+
+ mov eax, 1
+ shl rax, 52
+ movd xmm14, rax
+ punpcklqdq xmm14, xmm14
+
+ mov eax, 1023
+ shl rax, 52
+ movd xmm12, rax
+ punpcklqdq xmm12, xmm12
+
+ mov rax, QWORD PTR [r8+80]
+ xor rax, QWORD PTR [r8+64]
+ punpcklqdq xmm7, xmm0
+ movd xmm0, rcx
+ mov rcx, QWORD PTR [r9+56]
+ xor rcx, QWORD PTR [r9+24]
+ movd xmm3, rax
+ mov rax, QWORD PTR [r9+48]
+ xor rax, QWORD PTR [r9+16]
+ punpcklqdq xmm3, xmm0
+ movd xmm0, rcx
+ mov QWORD PTR [rsp], r13
+ mov rcx, QWORD PTR [r9+88]
+ xor rcx, QWORD PTR [r9+72]
+ movd xmm6, rax
+ mov rax, QWORD PTR [r9+80]
+ xor rax, QWORD PTR [r9+64]
+ punpcklqdq xmm6, xmm0
+ movd xmm0, rcx
+ mov QWORD PTR [rsp+256], r10
+ mov rcx, rdi
+ mov QWORD PTR [rsp+264], r11
+ movd xmm8, rax
+ and ecx, 2097136
+ punpcklqdq xmm8, xmm0
+ movd xmm0, QWORD PTR [r9+96]
+ punpcklqdq xmm4, xmm0
+ movd xmm0, QWORD PTR [r9+104]
+ lea r8, QWORD PTR [rcx+rsi]
+ movdqu xmm11, XMMWORD PTR [r8]
+ punpcklqdq xmm5, xmm0
+ lea r9, QWORD PTR [rdx+r13]
+ movdqu xmm15, XMMWORD PTR [r9]
+
+ ALIGN 64
+main_loop_double_sandybridge:
+ movdqu xmm9, xmm15
+ mov eax, edx
+ mov ebx, edx
+ xor eax, 16
+ xor ebx, 32
+ xor edx, 48
+
+ movd xmm0, r11
+ movd xmm2, r10
+ punpcklqdq xmm2, xmm0
+ aesenc xmm9, xmm2
+
+ movdqu xmm0, XMMWORD PTR [rax+r13]
+ movdqu xmm1, XMMWORD PTR [rbx+r13]
+ paddq xmm0, xmm7
+ paddq xmm1, xmm2
+ movdqu XMMWORD PTR [rbx+r13], xmm0
+ movdqu xmm0, XMMWORD PTR [rdx+r13]
+ movdqu XMMWORD PTR [rdx+r13], xmm1
+ paddq xmm0, xmm3
+ movdqu XMMWORD PTR [rax+r13], xmm0
+
+ movd r11, xmm9
+ mov edx, r11d
+ and edx, 2097136
+ movdqa xmm0, xmm9
+ pxor xmm0, xmm7
+ movdqu XMMWORD PTR [r9], xmm0
+
+ lea rbx, QWORD PTR [rdx+r13]
+ mov r10, QWORD PTR [rdx+r13]
+
+ movdqu xmm10, xmm11
+ movd xmm0, rbp
+ movd xmm11, rdi
+ punpcklqdq xmm11, xmm0
+ aesenc xmm10, xmm11
+
+ mov eax, ecx
+ mov r12d, ecx
+ xor eax, 16
+ xor r12d, 32
+ xor ecx, 48
+
+ movdqu xmm0, XMMWORD PTR [rax+rsi]
+ paddq xmm0, xmm6
+ movdqu xmm1, XMMWORD PTR [r12+rsi]
+ movdqu XMMWORD PTR [r12+rsi], xmm0
+ paddq xmm1, xmm11
+ movdqu xmm0, XMMWORD PTR [rcx+rsi]
+ movdqu XMMWORD PTR [rcx+rsi], xmm1
+ paddq xmm0, xmm8
+ movdqu XMMWORD PTR [rax+rsi], xmm0
+
+ movd rcx, xmm10
+ and ecx, 2097136
+
+ movdqa xmm0, xmm10
+ pxor xmm0, xmm6
+ movdqu XMMWORD PTR [r8], xmm0
+ mov r12, QWORD PTR [rcx+rsi]
+
+ mov r9, QWORD PTR [rbx+8]
+
+ xor edx, 16
+ mov r8d, edx
+ mov r15d, edx
+
+ movd rdx, xmm5
+ shl rdx, 32
+ movd rax, xmm4
+ xor rdx, rax
+ xor r10, rdx
+ mov rax, r10
+ mul r11
+ mov r11d, r8d
+ xor r11d, 48
+ movd xmm0, rdx
+ xor rdx, [r11+r13]
+ movd xmm1, rax
+ xor rax, [r11+r13+8]
+ punpcklqdq xmm0, xmm1
+
+ pxor xmm0, XMMWORD PTR [r8+r13]
+ xor r8d, 32
+ movdqu xmm1, XMMWORD PTR [r11+r13]
+ paddq xmm0, xmm7
+ paddq xmm1, xmm2
+ movdqu XMMWORD PTR [r11+r13], xmm0
+ movdqu xmm0, XMMWORD PTR [r8+r13]
+ movdqu XMMWORD PTR [r8+r13], xmm1
+ paddq xmm0, xmm3
+ movdqu XMMWORD PTR [r15+r13], xmm0
+
+ mov r11, QWORD PTR [rsp+256]
+ add r11, rdx
+ mov rdx, QWORD PTR [rsp+264]
+ add rdx, rax
+ mov QWORD PTR [rbx], r11
+ xor r11, r10
+ mov QWORD PTR [rbx+8], rdx
+ xor rdx, r9
+ mov QWORD PTR [rsp+256], r11
+ and r11d, 2097136
+ mov QWORD PTR [rsp+264], rdx
+ mov QWORD PTR [rsp+8], r11
+ lea r15, QWORD PTR [r11+r13]
+ movdqu xmm15, XMMWORD PTR [r11+r13]
+ lea r13, QWORD PTR [rsi+rcx]
+ movdqa xmm0, xmm5
+ psrldq xmm0, 8
+ movaps xmm2, xmm13
+ movd r10, xmm0
+ psllq xmm5, 1
+ shl r10, 32
+ movdqa xmm0, xmm9
+ psrldq xmm0, 8
+ movdqa xmm1, xmm10
+ movd r11, xmm0
+ psrldq xmm1, 8
+ movd r8, xmm1
+ psrldq xmm4, 8
+ movaps xmm0, xmm13
+ movd rax, xmm4
+ xor r10, rax
+ movaps xmm1, xmm13
+ xor r10, r12
+ lea rax, QWORD PTR [r11+1]
+ shr rax, 1
+ movdqa xmm3, xmm9
+ punpcklqdq xmm3, xmm10
+ paddq xmm5, xmm3
+ movd rdx, xmm5
+ psrldq xmm5, 8
+ cvtsi2sd xmm2, rax
+ or edx, -2147483647
+ lea rax, QWORD PTR [r8+1]
+ shr rax, 1
+ movd r9, xmm5
+ cvtsi2sd xmm0, rax
+ or r9d, -2147483647
+ cvtsi2sd xmm1, rdx
+ unpcklpd xmm2, xmm0
+ movaps xmm0, xmm13
+ cvtsi2sd xmm0, r9
+ unpcklpd xmm1, xmm0
+ divpd xmm2, xmm1
+ paddq xmm2, xmm14
+ cvttsd2si rax, xmm2
+ psrldq xmm2, 8
+ mov rbx, rax
+ imul rax, rdx
+ sub r11, rax
+ js div_fix_1_sandybridge
+div_fix_1_ret_sandybridge:
+
+ cvttsd2si rdx, xmm2
+ mov rax, rdx
+ imul rax, r9
+ movd xmm2, r11d
+ movd xmm4, ebx
+ sub r8, rax
+ js div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+ movd xmm1, r8d
+ movd xmm0, edx
+ punpckldq xmm2, xmm1
+ punpckldq xmm4, xmm0
+ punpckldq xmm4, xmm2
+ paddq xmm3, xmm4
+ movdqa xmm0, xmm3
+ psrlq xmm0, 12
+ paddq xmm0, xmm12
+ sqrtpd xmm1, xmm0
+ movd r9, xmm1
+ movdqa xmm5, xmm1
+ psrlq xmm5, 19
+ test r9, 524287
+ je sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+ movd r9, xmm10
+ psrldq xmm1, 8
+ movd r8, xmm1
+ test r8, 524287
+ je sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+ mov r12d, ecx
+ mov r8d, ecx
+ xor r12d, 16
+ xor r8d, 32
+ xor ecx, 48
+ mov rax, r10
+ mul r9
+ movd xmm0, rax
+ movd xmm3, rdx
+ punpcklqdq xmm3, xmm0
+
+ movdqu xmm0, XMMWORD PTR [r12+rsi]
+ pxor xmm0, xmm3
+ movdqu xmm1, XMMWORD PTR [r8+rsi]
+ xor rdx, [r8+rsi]
+ xor rax, [r8+rsi+8]
+ movdqu xmm3, XMMWORD PTR [rcx+rsi]
+ paddq xmm0, xmm6
+ paddq xmm1, xmm11
+ paddq xmm3, xmm8
+ movdqu XMMWORD PTR [r8+rsi], xmm0
+ movdqu XMMWORD PTR [rcx+rsi], xmm1
+ movdqu XMMWORD PTR [r12+rsi], xmm3
+
+ add rdi, rdx
+ mov QWORD PTR [r13], rdi
+ xor rdi, r10
+ mov ecx, edi
+ and ecx, 2097136
+ lea r8, QWORD PTR [rcx+rsi]
+
+ mov rdx, QWORD PTR [r13+8]
+ add rbp, rax
+ mov QWORD PTR [r13+8], rbp
+ movdqu xmm11, XMMWORD PTR [rcx+rsi]
+ xor rbp, rdx
+ mov r13, QWORD PTR [rsp]
+ movdqa xmm3, xmm7
+ mov rdx, QWORD PTR [rsp+8]
+ movdqa xmm8, xmm6
+ mov r10, QWORD PTR [rsp+256]
+ movdqa xmm7, xmm9
+ mov r11, QWORD PTR [rsp+264]
+ movdqa xmm6, xmm10
+ mov r9, r15
+ dec r14d
+ jne main_loop_double_sandybridge
+
+ ldmxcsr DWORD PTR [rsp+272]
+ movaps xmm13, XMMWORD PTR [rsp+48]
+ lea r11, QWORD PTR [rsp+184]
+ movaps xmm6, XMMWORD PTR [r11-24]
+ movaps xmm7, XMMWORD PTR [r11-40]
+ movaps xmm8, XMMWORD PTR [r11-56]
+ movaps xmm9, XMMWORD PTR [r11-72]
+ movaps xmm10, XMMWORD PTR [r11-88]
+ movaps xmm11, XMMWORD PTR [r11-104]
+ movaps xmm12, XMMWORD PTR [r11-120]
+ movaps xmm14, XMMWORD PTR [rsp+32]
+ movaps xmm15, XMMWORD PTR [rsp+16]
+ mov rsp, r11
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ jmp cnv2_double_mainloop_asm_sandybridge_endp
+
+div_fix_1_sandybridge:
+ dec rbx
+ add r11, rdx
+ jmp div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:
+ dec rdx
+ add r8, r9
+ jmp div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:
+ movd r8, xmm3
+ movdqa xmm0, xmm5
+ psrldq xmm0, 8
+ dec r9
+ mov r11d, -1022
+ shl r11, 32
+ mov rax, r9
+ shr r9, 19
+ shr rax, 20
+ mov rdx, r9
+ sub rdx, rax
+ lea rdx, [rdx+r11+1]
+ add rax, r11
+ imul rdx, rax
+ sub rdx, r8
+ adc r9, 0
+ movd xmm5, r9
+ punpcklqdq xmm5, xmm0
+ jmp sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:
+ psrldq xmm3, 8
+ movd r11, xmm3
+ dec r8
+ mov ebx, -1022
+ shl rbx, 32
+ mov rax, r8
+ shr r8, 19
+ shr rax, 20
+ mov rdx, r8
+ sub rdx, rax
+ lea rdx, [rdx+rbx+1]
+ add rax, rbx
+ imul rdx, rax
+ sub rdx, r11
+ adc r8, 0
+ movd xmm0, r8
+ punpcklqdq xmm5, xmm0
+ jmp sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
new file mode 100644
index 000000000..c0a3d0b41
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
@@ -0,0 +1,37 @@
+#define ALIGN .align
+.intel_syntax noprefix
+#ifdef __APPLE__
+# define FN_PREFIX(fn) _ ## fn
+.text
+#else
+# define FN_PREFIX(fn) fn
+.section .text
+#endif
+.global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm)
+.global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm)
+.global FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm)
+
+ALIGN 8
+FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm):
+ sub rsp, 48
+ mov rcx, rdi
+ #include "cryptonight_v8_main_loop_ivybridge_linux.inc"
+ add rsp, 48
+ ret 0
+
+ALIGN 8
+FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm):
+ sub rsp, 48
+ mov rcx, rdi
+ #include "cryptonight_v8_main_loop_ryzen_linux.inc"
+ add rsp, 48
+ ret 0
+
+ALIGN 16
+FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm):
+ sub rsp, 48
+ mov rcx, rdi
+ mov rdx, rsi
+ #include "cryptonight_v8_double_main_loop_sandybridge_linux.inc"
+ add rsp, 48
+ ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
new file mode 100644
index 000000000..1f3d2e15c
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
@@ -0,0 +1,25 @@
+_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
+PUBLIC cryptonight_v8_mainloop_ivybridge_asm
+PUBLIC cryptonight_v8_mainloop_ryzen_asm
+PUBLIC cryptonight_v8_double_mainloop_sandybridge_asm
+
+ALIGN 8
+cryptonight_v8_mainloop_ivybridge_asm PROC
+ INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc
+ ret 0
+cryptonight_v8_mainloop_ivybridge_asm ENDP
+
+ALIGN 8
+cryptonight_v8_mainloop_ryzen_asm PROC
+ INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc
+ ret 0
+cryptonight_v8_mainloop_ryzen_asm ENDP
+
+ALIGN 8
+cryptonight_v8_double_mainloop_sandybridge_asm PROC
+ INCLUDE cryptonight_v8_double_main_loop_sandybridge_win64.inc
+ ret 0
+cryptonight_v8_double_mainloop_sandybridge_asm ENDP
+
+_TEXT_CNV8_MAINLOOP ENDS
+END
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
new file mode 100644
index 000000000..cbe43b0d3
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
@@ -0,0 +1,187 @@
+ mov QWORD PTR [rsp+24], rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+ sub rsp, 80
+
+ stmxcsr DWORD PTR [rsp]
+ mov DWORD PTR [rsp+4], 24448
+ ldmxcsr DWORD PTR [rsp+4]
+
+ mov rax, QWORD PTR [rcx+48]
+ mov r9, rcx
+ xor rax, QWORD PTR [rcx+16]
+ mov esi, 524288
+ mov r8, QWORD PTR [rcx+32]
+ mov r13d, -2147483647
+ xor r8, QWORD PTR [rcx]
+ mov r11, QWORD PTR [rcx+40]
+ mov r10, r8
+ mov rdx, QWORD PTR [rcx+56]
+ movq xmm4, rax
+ xor rdx, QWORD PTR [rcx+24]
+ xor r11, QWORD PTR [rcx+8]
+ mov rbx, QWORD PTR [rcx+224]
+ mov rax, QWORD PTR [r9+80]
+ xor rax, QWORD PTR [r9+64]
+ movq xmm0, rdx
+ mov rcx, QWORD PTR [rcx+88]
+ xor rcx, QWORD PTR [r9+72]
+ movq xmm3, QWORD PTR [r9+104]
+ movaps XMMWORD PTR [rsp+64], xmm6
+ movaps XMMWORD PTR [rsp+48], xmm7
+ movaps XMMWORD PTR [rsp+32], xmm8
+ and r10d, 2097136
+ movq xmm5, rax
+
+ xor eax, eax
+ mov QWORD PTR [rsp+16], rax
+
+ mov ax, 1023
+ shl rax, 52
+ movq xmm8, rax
+ mov r15, QWORD PTR [r9+96]
+ punpcklqdq xmm4, xmm0
+ movq xmm0, rcx
+ punpcklqdq xmm5, xmm0
+ movdqu xmm6, XMMWORD PTR [r10+rbx]
+
+ ALIGN 8
+main_loop_ivybridge:
+ lea rdx, QWORD PTR [r10+rbx]
+ mov ecx, r10d
+ mov eax, r10d
+ mov rdi, r15
+ xor ecx, 16
+ xor eax, 32
+ xor r10d, 48
+ movq xmm0, r11
+ movq xmm7, r8
+ punpcklqdq xmm7, xmm0
+ aesenc xmm6, xmm7
+ movq rbp, xmm6
+ mov r9, rbp
+ and r9d, 2097136
+ movdqu xmm2, XMMWORD PTR [rcx+rbx]
+ movdqu xmm1, XMMWORD PTR [rax+rbx]
+ movdqu xmm0, XMMWORD PTR [r10+rbx]
+ paddq xmm1, xmm7
+ paddq xmm0, xmm5
+ paddq xmm2, xmm4
+ movdqu XMMWORD PTR [rcx+rbx], xmm0
+ movdqu XMMWORD PTR [rax+rbx], xmm2
+ movdqu XMMWORD PTR [r10+rbx], xmm1
+ mov r10, r9
+ xor r10d, 32
+ movq rcx, xmm3
+ mov rax, rcx
+ shl rax, 32
+ xor rdi, rax
+ movdqa xmm0, xmm6
+ pxor xmm0, xmm4
+ movdqu XMMWORD PTR [rdx], xmm0
+ xor rdi, QWORD PTR [r9+rbx]
+ lea r14, QWORD PTR [r9+rbx]
+ mov r12, QWORD PTR [r14+8]
+ xor edx, edx
+ lea r9d, DWORD PTR [ecx+ecx]
+ add r9d, ebp
+ movdqa xmm0, xmm6
+ psrldq xmm0, 8
+ or r9d, r13d
+ movq rax, xmm0
+ div r9
+ xorps xmm3, xmm3
+ mov eax, eax
+ shl rdx, 32
+ add rdx, rax
+ lea r9, QWORD PTR [rdx+rbp]
+ mov r15, rdx
+ mov rax, r9
+ shr rax, 12
+ movq xmm0, rax
+ paddq xmm0, xmm8
+ sqrtsd xmm3, xmm0
+ psubq xmm3, XMMWORD PTR [rsp+16]
+ movq rdx, xmm3
+ test edx, 524287
+ je sqrt_fixup_ivybridge
+ psrlq xmm3, 19
+ psubq xmm3, XMMWORD PTR [rsp+16]
+sqrt_fixup_ivybridge_ret:
+
+ mov ecx, r10d
+ mov rax, rdi
+ mul rbp
+ movq xmm2, rdx
+ xor rdx, [rcx+rbx]
+ add r8, rdx
+ mov QWORD PTR [r14], r8
+ xor r8, rdi
+ mov edi, r8d
+ and edi, 2097136
+ movq xmm0, rax
+ xor rax, [rcx+rbx+8]
+ add r11, rax
+ mov QWORD PTR [r14+8], r11
+ punpcklqdq xmm2, xmm0
+
+ mov r9d, r10d
+ xor r9d, 48
+ xor r10d, 16
+ pxor xmm2, XMMWORD PTR [r9+rbx]
+ movdqu xmm0, XMMWORD PTR [r10+rbx]
+ paddq xmm0, xmm5
+ movdqu xmm1, XMMWORD PTR [rcx+rbx]
+ paddq xmm2, xmm4
+ paddq xmm1, xmm7
+ movdqa xmm5, xmm4
+ movdqu XMMWORD PTR [r9+rbx], xmm0
+ movdqa xmm4, xmm6
+ movdqu XMMWORD PTR [rcx+rbx], xmm2
+ movdqu XMMWORD PTR [r10+rbx], xmm1
+ movdqu xmm6, [rdi+rbx]
+ mov r10d, edi
+ xor r11, r12
+ dec rsi
+ jne main_loop_ivybridge
+
+ ldmxcsr DWORD PTR [rsp]
+ mov rbx, QWORD PTR [rsp+160]
+ movaps xmm6, XMMWORD PTR [rsp+64]
+ movaps xmm7, XMMWORD PTR [rsp+48]
+ movaps xmm8, XMMWORD PTR [rsp+32]
+ add rsp, 80
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ jmp cnv2_main_loop_ivybridge_endp
+
+sqrt_fixup_ivybridge:
+ dec rdx
+ mov r13d, -1022
+ shl r13, 32
+ mov rax, rdx
+ shr rdx, 19
+ shr rax, 20
+ mov rcx, rdx
+ sub rcx, rax
+ add rax, r13
+ not r13
+ sub rcx, r13
+ mov r13d, -2147483647
+ imul rcx, rax
+ sub rcx, r9
+ adc rdx, 0
+ movq xmm3, rdx
+ jmp sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
new file mode 100755
index 000000000..8d49c5db7
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
@@ -0,0 +1,187 @@
+ mov QWORD PTR [rsp+24], rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+ sub rsp, 80
+
+ stmxcsr DWORD PTR [rsp]
+ mov DWORD PTR [rsp+4], 24448
+ ldmxcsr DWORD PTR [rsp+4]
+
+ mov rax, QWORD PTR [rcx+48]
+ mov r9, rcx
+ xor rax, QWORD PTR [rcx+16]
+ mov esi, 524288
+ mov r8, QWORD PTR [rcx+32]
+ mov r13d, -2147483647
+ xor r8, QWORD PTR [rcx]
+ mov r11, QWORD PTR [rcx+40]
+ mov r10, r8
+ mov rdx, QWORD PTR [rcx+56]
+ movd xmm4, rax
+ xor rdx, QWORD PTR [rcx+24]
+ xor r11, QWORD PTR [rcx+8]
+ mov rbx, QWORD PTR [rcx+224]
+ mov rax, QWORD PTR [r9+80]
+ xor rax, QWORD PTR [r9+64]
+ movd xmm0, rdx
+ mov rcx, QWORD PTR [rcx+88]
+ xor rcx, QWORD PTR [r9+72]
+ movq xmm3, QWORD PTR [r9+104]
+ movaps XMMWORD PTR [rsp+64], xmm6
+ movaps XMMWORD PTR [rsp+48], xmm7
+ movaps XMMWORD PTR [rsp+32], xmm8
+ and r10d, 2097136
+ movd xmm5, rax
+
+ xor eax, eax
+ mov QWORD PTR [rsp+16], rax
+
+ mov ax, 1023
+ shl rax, 52
+ movd xmm8, rax
+ mov r15, QWORD PTR [r9+96]
+ punpcklqdq xmm4, xmm0
+ movd xmm0, rcx
+ punpcklqdq xmm5, xmm0
+ movdqu xmm6, XMMWORD PTR [r10+rbx]
+
+ ALIGN 8
+main_loop_ivybridge:
+ lea rdx, QWORD PTR [r10+rbx]
+ mov ecx, r10d
+ mov eax, r10d
+ mov rdi, r15
+ xor ecx, 16
+ xor eax, 32
+ xor r10d, 48
+ movd xmm0, r11
+ movd xmm7, r8
+ punpcklqdq xmm7, xmm0
+ aesenc xmm6, xmm7
+ movd rbp, xmm6
+ mov r9, rbp
+ and r9d, 2097136
+ movdqu xmm2, XMMWORD PTR [rcx+rbx]
+ movdqu xmm1, XMMWORD PTR [rax+rbx]
+ movdqu xmm0, XMMWORD PTR [r10+rbx]
+ paddq xmm1, xmm7
+ paddq xmm0, xmm5
+ paddq xmm2, xmm4
+ movdqu XMMWORD PTR [rcx+rbx], xmm0
+ movdqu XMMWORD PTR [rax+rbx], xmm2
+ movdqu XMMWORD PTR [r10+rbx], xmm1
+ mov r10, r9
+ xor r10d, 32
+ movd rcx, xmm3
+ mov rax, rcx
+ shl rax, 32
+ xor rdi, rax
+ movdqa xmm0, xmm6
+ pxor xmm0, xmm4
+ movdqu XMMWORD PTR [rdx], xmm0
+ xor rdi, QWORD PTR [r9+rbx]
+ lea r14, QWORD PTR [r9+rbx]
+ mov r12, QWORD PTR [r14+8]
+ xor edx, edx
+ lea r9d, DWORD PTR [ecx+ecx]
+ add r9d, ebp
+ movdqa xmm0, xmm6
+ psrldq xmm0, 8
+ or r9d, r13d
+ movd rax, xmm0
+ div r9
+ xorps xmm3, xmm3
+ mov eax, eax
+ shl rdx, 32
+ add rdx, rax
+ lea r9, QWORD PTR [rdx+rbp]
+ mov r15, rdx
+ mov rax, r9
+ shr rax, 12
+ movd xmm0, rax
+ paddq xmm0, xmm8
+ sqrtsd xmm3, xmm0
+ psubq xmm3, XMMWORD PTR [rsp+16]
+ movd rdx, xmm3
+ test edx, 524287
+ je sqrt_fixup_ivybridge
+ psrlq xmm3, 19
+ psubq xmm3, XMMWORD PTR [rsp+16]
+sqrt_fixup_ivybridge_ret:
+
+ mov ecx, r10d
+ mov rax, rdi
+ mul rbp
+ movd xmm2, rdx
+ xor rdx, [rcx+rbx]
+ add r8, rdx
+ mov QWORD PTR [r14], r8
+ xor r8, rdi
+ mov edi, r8d
+ and edi, 2097136
+ movd xmm0, rax
+ xor rax, [rcx+rbx+8]
+ add r11, rax
+ mov QWORD PTR [r14+8], r11
+ punpcklqdq xmm2, xmm0
+
+ mov r9d, r10d
+ xor r9d, 48
+ xor r10d, 16
+ pxor xmm2, XMMWORD PTR [r9+rbx]
+ movdqu xmm0, XMMWORD PTR [r10+rbx]
+ paddq xmm0, xmm5
+ movdqu xmm1, XMMWORD PTR [rcx+rbx]
+ paddq xmm2, xmm4
+ paddq xmm1, xmm7
+ movdqa xmm5, xmm4
+ movdqu XMMWORD PTR [r9+rbx], xmm0
+ movdqa xmm4, xmm6
+ movdqu XMMWORD PTR [rcx+rbx], xmm2
+ movdqu XMMWORD PTR [r10+rbx], xmm1
+ movdqu xmm6, [rdi+rbx]
+ mov r10d, edi
+ xor r11, r12
+ dec rsi
+ jne main_loop_ivybridge
+
+ ldmxcsr DWORD PTR [rsp]
+ mov rbx, QWORD PTR [rsp+160]
+ movaps xmm6, XMMWORD PTR [rsp+64]
+ movaps xmm7, XMMWORD PTR [rsp+48]
+ movaps xmm8, XMMWORD PTR [rsp+32]
+ add rsp, 80
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ jmp cnv2_main_loop_ivybridge_endp
+
+sqrt_fixup_ivybridge:
+ dec rdx
+ mov r13d, -1022
+ shl r13, 32
+ mov rax, rdx
+ shr rdx, 19
+ shr rax, 20
+ mov rcx, rdx
+ sub rcx, rax
+ add rax, r13
+ not r13
+ sub rcx, r13
+ mov r13d, -2147483647
+ imul rcx, rax
+ sub rcx, r9
+ adc rdx, 0
+ movd xmm3, rdx
+ jmp sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
new file mode 100644
index 000000000..cd8b43477
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
@@ -0,0 +1,179 @@
+ mov QWORD PTR [rsp+16], rbx
+ mov QWORD PTR [rsp+24], rbp
+ mov QWORD PTR [rsp+32], rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+ sub rsp, 64
+
+ stmxcsr DWORD PTR [rsp]
+ mov DWORD PTR [rsp+4], 24448
+ ldmxcsr DWORD PTR [rsp+4]
+
+ mov rax, QWORD PTR [rcx+48]
+ mov r9, rcx
+ xor rax, QWORD PTR [rcx+16]
+ mov ebp, 524288
+ mov r8, QWORD PTR [rcx+32]
+ xor r8, QWORD PTR [rcx]
+ mov r11, QWORD PTR [rcx+40]
+ mov r10, r8
+ mov rdx, QWORD PTR [rcx+56]
+ movq xmm3, rax
+ xor rdx, QWORD PTR [rcx+24]
+ xor r11, QWORD PTR [rcx+8]
+ mov rbx, QWORD PTR [rcx+224]
+ mov rax, QWORD PTR [r9+80]
+ xor rax, QWORD PTR [r9+64]
+ movq xmm0, rdx
+ mov rcx, QWORD PTR [rcx+88]
+ xor rcx, QWORD PTR [r9+72]
+ mov rdi, QWORD PTR [r9+104]
+ and r10d, 2097136
+ movaps XMMWORD PTR [rsp+48], xmm6
+ movq xmm4, rax
+ movaps XMMWORD PTR [rsp+32], xmm7
+ movaps XMMWORD PTR [rsp+16], xmm8
+ xorps xmm8, xmm8
+ mov ax, 1023
+ shl rax, 52
+ movq xmm7, rax
+ mov r15, QWORD PTR [r9+96]
+ punpcklqdq xmm3, xmm0
+ movq xmm0, rcx
+ punpcklqdq xmm4, xmm0
+
+ ALIGN 8
+main_loop_ryzen:
+ movdqa xmm5, XMMWORD PTR [r10+rbx]
+ movq xmm0, r11
+ movq xmm6, r8
+ punpcklqdq xmm6, xmm0
+ lea rdx, QWORD PTR [r10+rbx]
+ lea r9, QWORD PTR [rdi+rdi]
+ shl rdi, 32
+
+ mov ecx, r10d
+ mov eax, r10d
+ xor ecx, 16
+ xor eax, 32
+ xor r10d, 48
+ aesenc xmm5, xmm6
+ movdqa xmm2, XMMWORD PTR [rcx+rbx]
+ movdqa xmm1, XMMWORD PTR [rax+rbx]
+ movdqa xmm0, XMMWORD PTR [r10+rbx]
+ paddq xmm2, xmm3
+ paddq xmm1, xmm6
+ paddq xmm0, xmm4
+ movdqa XMMWORD PTR [rcx+rbx], xmm0
+ movdqa XMMWORD PTR [rax+rbx], xmm2
+ movdqa XMMWORD PTR [r10+rbx], xmm1
+
+ movaps xmm1, xmm8
+ mov rsi, r15
+ xor rsi, rdi
+ movq r14, xmm5
+ movdqa xmm0, xmm5
+ pxor xmm0, xmm3
+ mov r10, r14
+ and r10d, 2097136
+ movdqa XMMWORD PTR [rdx], xmm0
+ xor rsi, QWORD PTR [r10+rbx]
+ lea r12, QWORD PTR [r10+rbx]
+ mov r13, QWORD PTR [r10+rbx+8]
+
+ add r9d, r14d
+ or r9d, -2147483647
+ xor edx, edx
+ movdqa xmm0, xmm5
+ psrldq xmm0, 8
+ movq rax, xmm0
+
+ div r9
+ movq xmm0, rax
+ movq xmm1, rdx
+ punpckldq xmm0, xmm1
+ movq r15, xmm0
+ paddq xmm0, xmm5
+ movdqa xmm2, xmm0
+ psrlq xmm0, 12
+ paddq xmm0, xmm7
+ sqrtsd xmm1, xmm0
+ movq rdi, xmm1
+ test rdi, 524287
+ je sqrt_fixup_ryzen
+ shr rdi, 19
+
+sqrt_fixup_ryzen_ret:
+ mov rax, rsi
+ mul r14
+ movq xmm1, rax
+ movq xmm0, rdx
+ punpcklqdq xmm0, xmm1
+
+ mov r9d, r10d
+ mov ecx, r10d
+ xor r9d, 16
+ xor ecx, 32
+ xor r10d, 48
+ movdqa xmm1, XMMWORD PTR [rcx+rbx]
+ xor rdx, [rcx+rbx]
+ xor rax, [rcx+rbx+8]
+ movdqa xmm2, XMMWORD PTR [r9+rbx]
+ pxor xmm2, xmm0
+ paddq xmm4, XMMWORD PTR [r10+rbx]
+ paddq xmm2, xmm3
+ paddq xmm1, xmm6
+ movdqa XMMWORD PTR [r9+rbx], xmm4
+ movdqa XMMWORD PTR [rcx+rbx], xmm2
+ movdqa XMMWORD PTR [r10+rbx], xmm1
+
+ movdqa xmm4, xmm3
+ add r8, rdx
+ add r11, rax
+ mov QWORD PTR [r12], r8
+ xor r8, rsi
+ mov QWORD PTR [r12+8], r11
+ mov r10, r8
+ xor r11, r13
+ and r10d, 2097136
+ movdqa xmm3, xmm5
+ dec ebp
+ jne main_loop_ryzen
+
+ ldmxcsr DWORD PTR [rsp]
+ movaps xmm6, XMMWORD PTR [rsp+48]
+ lea r11, QWORD PTR [rsp+64]
+ mov rbx, QWORD PTR [r11+56]
+ mov rbp, QWORD PTR [r11+64]
+ mov rsi, QWORD PTR [r11+72]
+ movaps xmm8, XMMWORD PTR [r11-48]
+ movaps xmm7, XMMWORD PTR [rsp+32]
+ mov rsp, r11
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ jmp cnv2_main_loop_ryzen_endp
+
+sqrt_fixup_ryzen:
+ movq r9, xmm2
+ dec rdi
+ mov edx, -1022
+ shl rdx, 32
+ mov rax, rdi
+ shr rdi, 19
+ shr rax, 20
+ mov rcx, rdi
+ sub rcx, rax
+ lea rcx, [rcx+rdx+1]
+ add rax, rdx
+ imul rcx, rax
+ sub rcx, r9
+ adc rdi, 0
+ jmp sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
new file mode 100755
index 000000000..d103cc2ee
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
@@ -0,0 +1,179 @@
+ mov QWORD PTR [rsp+16], rbx
+ mov QWORD PTR [rsp+24], rbp
+ mov QWORD PTR [rsp+32], rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+ sub rsp, 64
+
+ stmxcsr DWORD PTR [rsp]
+ mov DWORD PTR [rsp+4], 24448
+ ldmxcsr DWORD PTR [rsp+4]
+
+ mov rax, QWORD PTR [rcx+48]
+ mov r9, rcx
+ xor rax, QWORD PTR [rcx+16]
+ mov ebp, 524288
+ mov r8, QWORD PTR [rcx+32]
+ xor r8, QWORD PTR [rcx]
+ mov r11, QWORD PTR [rcx+40]
+ mov r10, r8
+ mov rdx, QWORD PTR [rcx+56]
+ movd xmm3, rax
+ xor rdx, QWORD PTR [rcx+24]
+ xor r11, QWORD PTR [rcx+8]
+ mov rbx, QWORD PTR [rcx+224]
+ mov rax, QWORD PTR [r9+80]
+ xor rax, QWORD PTR [r9+64]
+ movd xmm0, rdx
+ mov rcx, QWORD PTR [rcx+88]
+ xor rcx, QWORD PTR [r9+72]
+ mov rdi, QWORD PTR [r9+104]
+ and r10d, 2097136
+ movaps XMMWORD PTR [rsp+48], xmm6
+ movd xmm4, rax
+ movaps XMMWORD PTR [rsp+32], xmm7
+ movaps XMMWORD PTR [rsp+16], xmm8
+ xorps xmm8, xmm8
+ mov ax, 1023
+ shl rax, 52
+ movd xmm7, rax
+ mov r15, QWORD PTR [r9+96]
+ punpcklqdq xmm3, xmm0
+ movd xmm0, rcx
+ punpcklqdq xmm4, xmm0
+
+ ALIGN 8
+main_loop_ryzen:
+ movdqa xmm5, XMMWORD PTR [r10+rbx]
+ movd xmm0, r11
+ movd xmm6, r8
+ punpcklqdq xmm6, xmm0
+ lea rdx, QWORD PTR [r10+rbx]
+ lea r9, QWORD PTR [rdi+rdi]
+ shl rdi, 32
+
+ mov ecx, r10d
+ mov eax, r10d
+ xor ecx, 16
+ xor eax, 32
+ xor r10d, 48
+ aesenc xmm5, xmm6
+ movdqa xmm2, XMMWORD PTR [rcx+rbx]
+ movdqa xmm1, XMMWORD PTR [rax+rbx]
+ movdqa xmm0, XMMWORD PTR [r10+rbx]
+ paddq xmm2, xmm3
+ paddq xmm1, xmm6
+ paddq xmm0, xmm4
+ movdqa XMMWORD PTR [rcx+rbx], xmm0
+ movdqa XMMWORD PTR [rax+rbx], xmm2
+ movdqa XMMWORD PTR [r10+rbx], xmm1
+
+ movaps xmm1, xmm8
+ mov rsi, r15
+ xor rsi, rdi
+ movd r14, xmm5
+ movdqa xmm0, xmm5
+ pxor xmm0, xmm3
+ mov r10, r14
+ and r10d, 2097136
+ movdqa XMMWORD PTR [rdx], xmm0
+ xor rsi, QWORD PTR [r10+rbx]
+ lea r12, QWORD PTR [r10+rbx]
+ mov r13, QWORD PTR [r10+rbx+8]
+
+ add r9d, r14d
+ or r9d, -2147483647
+ xor edx, edx
+ movdqa xmm0, xmm5
+ psrldq xmm0, 8
+ movd rax, xmm0
+
+ div r9
+ movd xmm0, rax
+ movd xmm1, rdx
+ punpckldq xmm0, xmm1
+ movd r15, xmm0
+ paddq xmm0, xmm5
+ movdqa xmm2, xmm0
+ psrlq xmm0, 12
+ paddq xmm0, xmm7
+ sqrtsd xmm1, xmm0
+ movd rdi, xmm1
+ test rdi, 524287
+ je sqrt_fixup_ryzen
+ shr rdi, 19
+
+sqrt_fixup_ryzen_ret:
+ mov rax, rsi
+ mul r14
+ movd xmm1, rax
+ movd xmm0, rdx
+ punpcklqdq xmm0, xmm1
+
+ mov r9d, r10d
+ mov ecx, r10d
+ xor r9d, 16
+ xor ecx, 32
+ xor r10d, 48
+ movdqa xmm1, XMMWORD PTR [rcx+rbx]
+ xor rdx, [rcx+rbx]
+ xor rax, [rcx+rbx+8]
+ movdqa xmm2, XMMWORD PTR [r9+rbx]
+ pxor xmm2, xmm0
+ paddq xmm4, XMMWORD PTR [r10+rbx]
+ paddq xmm2, xmm3
+ paddq xmm1, xmm6
+ movdqa XMMWORD PTR [r9+rbx], xmm4
+ movdqa XMMWORD PTR [rcx+rbx], xmm2
+ movdqa XMMWORD PTR [r10+rbx], xmm1
+
+ movdqa xmm4, xmm3
+ add r8, rdx
+ add r11, rax
+ mov QWORD PTR [r12], r8
+ xor r8, rsi
+ mov QWORD PTR [r12+8], r11
+ mov r10, r8
+ xor r11, r13
+ and r10d, 2097136
+ movdqa xmm3, xmm5
+ dec ebp
+ jne main_loop_ryzen
+
+ ldmxcsr DWORD PTR [rsp]
+ movaps xmm6, XMMWORD PTR [rsp+48]
+ lea r11, QWORD PTR [rsp+64]
+ mov rbx, QWORD PTR [r11+56]
+ mov rbp, QWORD PTR [r11+64]
+ mov rsi, QWORD PTR [r11+72]
+ movaps xmm8, XMMWORD PTR [r11-48]
+ movaps xmm7, XMMWORD PTR [rsp+32]
+ mov rsp, r11
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ jmp cnv2_main_loop_ryzen_endp
+
+sqrt_fixup_ryzen:
+ movd r9, xmm2
+ dec rdi
+ mov edx, -1022
+ shl rdx, 32
+ mov rax, rdi
+ shr rdi, 19
+ shr rax, 20
+ mov rcx, rdi
+ sub rcx, rax
+ lea rcx, [rcx+rdx+1]
+ add rax, rdx
+ imul rcx, rax
+ sub rcx, r9
+ adc rdi, 0
+ jmp sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 9f70bcfa7..2b1741764 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -19,6 +19,8 @@
#include "xmrstak/backend/cryptonight.hpp"
#include
#include
+#include
+#include
#ifdef __GNUC__
#include
@@ -151,15 +153,15 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i
inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
{
- __m128i tmp0 = x0;
- x0 = _mm_xor_si128(x0, x1);
- x1 = _mm_xor_si128(x1, x2);
- x2 = _mm_xor_si128(x2, x3);
- x3 = _mm_xor_si128(x3, x4);
- x4 = _mm_xor_si128(x4, x5);
- x5 = _mm_xor_si128(x5, x6);
- x6 = _mm_xor_si128(x6, x7);
- x7 = _mm_xor_si128(x7, tmp0);
+ __m128i tmp0 = x0;
+ x0 = _mm_xor_si128(x0, x1);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_xor_si128(x2, x3);
+ x3 = _mm_xor_si128(x3, x4);
+ x4 = _mm_xor_si128(x4, x5);
+ x5 = _mm_xor_si128(x5, x6);
+ x6 = _mm_xor_si128(x6, x7);
+ x7 = _mm_xor_si128(x7, tmp0);
}
template
@@ -422,6 +424,29 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
_mm_store_si128(output + 11, xout7);
}
+inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0)
+{
+ __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52)));
+ x = _mm_sqrt_sd(_mm_setzero_pd(), x);
+ uint64_t r = static_cast(_mm_cvtsi128_si64(_mm_castpd_si128(x)));
+
+ const uint64_t s = r >> 20;
+ r >>= 19;
+
+ uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
+
+#ifdef __INTEL_COMPILER
+ _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned __int64*)&x2), r, 0, (unsigned __int64*)&r);
+#elif defined(_MSC_VER) || (__GNUC__ >= 7)
+ _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
+#else
+ // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence
+ // Fallback to simpler code
+ if (x2 < n0) ++r;
+#endif
+ return r;
+}
+
inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
{
alignas(16) uint32_t k[4];
@@ -467,712 +492,511 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
}
-template
-void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
+/** optimal type for sqrt
+ *
+ * Depending on the number of hashes calculated the optimal type for the sqrt value will be selected.
+ *
+ * @tparam N number of hashes per thread
+ */
+template
+struct GetOptimalSqrtType
{
- constexpr size_t MASK = cn_select_mask();
- constexpr size_t ITERATIONS = cn_select_iter();
- constexpr size_t MEM = cn_select_memory();
-
- if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
- {
- memset(output, 0, 32);
- return;
- }
-
- keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
+ using type = __m128i;
+};
- uint64_t monero_const;
- if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
- {
- monero_const = *reinterpret_cast(reinterpret_cast(input) + 35);
- monero_const ^= *(reinterpret_cast(ctx0->hash_state) + 24);
- }
-
- // Optim - 99% time boundary
- cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
-
- uint8_t* l0 = ctx0->long_state;
- uint64_t* h0 = (uint64_t*)ctx0->hash_state;
-
- uint64_t al0 = h0[0] ^ h0[4];
- uint64_t ah0 = h0[1] ^ h0[5];
- __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-
- uint64_t idx0 = h0[0] ^ h0[4];
-
- // Optim - 90% time boundary
- for(size_t i = 0; i < ITERATIONS; i++)
- {
- __m128i cx;
- cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]);
+template<>
+struct GetOptimalSqrtType<1u>
+{
+ using type = uint64_t;
+};
+template
+using GetOptimalSqrtType_t = typename GetOptimalSqrtType::type;
+
+/** assign a value and convert if necessary
+ *
+ * @param output output type
+ * @param input value which is assigned to output
+ * @{
+ */
+inline void assign(__m128i& output, const uint64_t input)
+{
+ output = _mm_cvtsi64_si128(input);
+}
- if (ALGO == cryptonight_bittube2)
- {
- cx = aes_round_bittube2(cx, _mm_set_epi64x(ah0, al0));
- }
- else
- {
- if(SOFT_AES)
- cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
- else
- cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
- }
+inline void assign(uint64_t& output, const uint64_t input)
+{
+ output = input;
+}
- if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
- cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
- else
- _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+inline void assign(uint64_t& output, const __m128i& input)
+{
+ output = _mm_cvtsi128_si64(input);
+}
+/** @} */
- idx0 = _mm_cvtsi128_si64(cx);
+inline void set_float_rounding_mode()
+{
+#ifdef _MSC_VER
+ _control87(RC_DOWN, MCW_RC);
+#else
+ std::fesetround(FE_DOWNWARD);
+#endif
+}
- if(PREFETCH)
- _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
- bx0 = cx;
+#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \
+ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+ if(ALGO == cryptonight_monero_v8) \
+ { \
+ const uint64_t idx1 = idx0 & MASK; \
+ const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
+ const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+ const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+ }
- uint64_t hi, lo, cl, ch;
- cl = ((uint64_t*)&l0[idx0 & MASK])[0];
- ch = ((uint64_t*)&l0[idx0 & MASK])[1];
+#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \
+ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+ if(ALGO == cryptonight_monero_v8) \
+ { \
+ const uint64_t idx1 = idx0 & MASK; \
+ const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
+ const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+ hi ^= ((uint64_t*)&chunk2)[0]; \
+ lo ^= ((uint64_t*)&chunk2)[1]; \
+ const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+ }
- lo = _umul128(idx0, cl, &hi);
+#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
+ if(ALGO == cryptonight_monero_v8) \
+ { \
+ uint64_t sqrt_result_tmp; \
+ assign(sqrt_result_tmp, sqrt_result); \
+ /* Use division and square root results from the _previous_ iteration to hide the latency */ \
+ const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \
+ cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \
+ const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \
+ /* Most and least significant bits in the divisor are set to 1 \
+ * to make sure we don't divide by a small or even number, \
+ * so there are no shortcuts for such cases \
+ * \
+ * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \
+ * We drop the highest bit to fit both quotient and remainder in 32 bits \
+ */ \
+ /* Compiler will optimize it to a single div instruction */ \
+ const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
+ const uint64_t division_result = static_cast(cx_s / d) + ((cx_s % d) << 32); \
+ division_result_xmm = _mm_cvtsi64_si128(static_cast(division_result)); \
+ /* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \
+ assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \
+ }
- al0 += hi;
- ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
- al0 ^= cl;
- if(PREFETCH)
- _mm_prefetch((const char*)&l0[al0 & MASK], _MM_HINT_T0);
- ah0 += lo;
+#define CN_INIT_SINGLE \
+ if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \
+ { \
+ memset(output, 0, 32 * N); \
+ return; \
+ }
- if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) {
- if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
- ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const ^ ((uint64_t*)&l0[idx0 & MASK])[0];
- else
- ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const;
- }
- else
- ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
- ah0 ^= ch;
+#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \
+ keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \
+ uint64_t monero_const; \
+ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+ { \
+ monero_const = *reinterpret_cast(reinterpret_cast(input) + len * n + 35); \
+ monero_const ^= *(reinterpret_cast(ctx[n]->hash_state) + 24); \
+ } \
+ /* Optim - 99% time boundary */ \
+ cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state); \
+ \
+ __m128i ax0; \
+ uint64_t idx0; \
+ __m128i bx0; \
+ uint8_t* l0 = ctx[n]->long_state; \
+ /* BEGIN cryptonight_monero_v8 variables */ \
+ __m128i bx1; \
+ __m128i division_result_xmm; \
+ GetOptimalSqrtType_t sqrt_result; \
+ /* END cryptonight_monero_v8 variables */ \
+ { \
+ uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \
+ idx0 = h0[0] ^ h0[4]; \
+ ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
+ bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
+ if(ALGO == cryptonight_monero_v8) \
+ { \
+ bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
+ division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
+ assign(sqrt_result, h0[13]); \
+ set_float_rounding_mode(); \
+ } \
+ } \
+ __m128i *ptr0
- idx0 = al0;
+#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \
+ __m128i cx; \
+ ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+ cx = _mm_load_si128(ptr0); \
+ if (ALGO == cryptonight_bittube2) \
+ { \
+ cx = aes_round_bittube2(cx, ax0); \
+ } \
+ else \
+ { \
+ if(SOFT_AES) \
+ cx = soft_aesenc(cx, ax0); \
+ else \
+ cx = _mm_aesenc_si128(cx, ax0); \
+ } \
+ CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1)
- if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
- {
- int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
- int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
- int64_t q = n / (d | 0x5);
+#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
+ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+ cryptonight_monero_tweak((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \
+ else \
+ _mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \
+ idx0 = _mm_cvtsi128_si64(cx); \
+ \
+ ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+ if(PREFETCH) \
+ _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
+ if(ALGO != cryptonight_monero_v8) \
+ bx0 = cx
+
+#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \
+ uint64_t lo, cl, ch; \
+ uint64_t al0 = _mm_cvtsi128_si64(ax0); \
+ uint64_t ah0 = ((uint64_t*)&ax0)[1]; \
+ cl = ((uint64_t*)ptr0)[0]; \
+ ch = ((uint64_t*)ptr0)[1]; \
+ CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \
+ { \
+ uint64_t hi; \
+ lo = _umul128(idx0, cl, &hi); \
+ CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \
+ ah0 += lo; \
+ al0 += hi; \
+ } \
+ if(ALGO == cryptonight_monero_v8) \
+ { \
+ bx1 = bx0; \
+ bx0 = cx; \
+ } \
+ ((uint64_t*)ptr0)[0] = al0; \
+ if(PREFETCH) \
+ _mm_prefetch((const char*)ptr0, _MM_HINT_T0)
- ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
- idx0 = d ^ q;
- }
- else if(ALGO == cryptonight_haven)
- {
- int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
- int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
- int64_t q = n / (d | 0x5);
+#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
+ if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+ { \
+ if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \
+ ((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \
+ else \
+ ((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \
+ } \
+ else \
+ ((uint64_t*)ptr0)[1] = ah0; \
+ al0 ^= cl; \
+ ah0 ^= ch; \
+ ax0 = _mm_set_epi64x(ah0, al0); \
+ idx0 = al0;
- ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
- idx0 = (~d) ^ q;
- }
+#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \
+ if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \
+ { \
+ ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+ int64_t u = ((int64_t*)ptr0)[0]; \
+ int32_t d = ((int32_t*)ptr0)[2]; \
+ int64_t q = u / (d | 0x5); \
+ \
+ ((int64_t*)ptr0)[0] = u ^ q; \
+ idx0 = d ^ q; \
+ } \
+ else if(ALGO == cryptonight_haven) \
+ { \
+ ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+ int64_t u = ((int64_t*)ptr0)[0]; \
+ int32_t d = ((int32_t*)ptr0)[2]; \
+ int64_t q = u / (d | 0x5); \
+ \
+ ((int64_t*)ptr0)[0] = u ^ q; \
+ idx0 = (~d) ^ q; \
}
- // Optim - 90% time boundary
- cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
-
- // Optim - 99% time boundary
+#define CN_FINALIZE(n) \
+ /* Optim - 90% time boundary */ \
+ cn_implode_scratchpad((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state); \
+ /* Optim - 99% time boundary */ \
+ keccakf((uint64_t*)ctx[n]->hash_state, 24); \
+ extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n)
- keccakf((uint64_t*)ctx0->hash_state, 24);
- extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output);
-}
+//! defer the evaluation of an macro
+#ifndef _MSC_VER
+# define CN_DEFER(...) __VA_ARGS__
+#else
+# define CN_EMPTY(...)
+# define CN_DEFER(...) __VA_ARGS__ CN_EMPTY()
+#endif
-// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
-// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
-// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
-template
-void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+//! execute the macro f with the passed arguments
+#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__)
+
+/** add append n to all arguments and keeps n as first argument
+ *
+ * @param n number which is appended to the arguments (expect the first argument n)
+ *
+ * @code{.cpp}
+ * CN_ENUM_2(1, foo, bar)
+ * // is transformed to
+ * 1, foo1, bar1
+ * @endcode
+ */
+#define CN_ENUM_0(n, ...) n
+#define CN_ENUM_1(n, x1) n, x1 ## n
+#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n
+#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n
+#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n
+#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n
+#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n
+#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n
+#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n
+#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n
+#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n
+#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n
+#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n
+#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n
+#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n
+#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n
+
+/** repeat a macro call multiple times
+ *
+ * @param n number of arguments followed after f
+ * @param f name of the macro which should be executed
+ * @param ... n parameter which name will get appended by a unique number
+ *
+ * @code{.cpp}
+ * REPEAT_2(2, f, foo, bar)
+ * // is transformed to
+ * f(0, foo0, bar); f(1, foo1, bar1)
+ * @endcode
+ */
+#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__))
+#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__))
+#define REPEAT_3(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__))
+#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__))
+#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__))
+
+template< size_t N>
+struct Cryptonight_hash;
+
+template< >
+struct Cryptonight_hash<1>
{
- constexpr size_t MASK = cn_select_mask();
- constexpr size_t ITERATIONS = cn_select_iter();
- constexpr size_t MEM = cn_select_memory();
+ static constexpr size_t N = 1;
- if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
+ template
+ static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
- memset(output, 0, 64);
- return;
- }
-
- keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
- keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200);
-
- uint64_t monero_const_0, monero_const_1;
- if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
- {
- monero_const_0 = *reinterpret_cast(reinterpret_cast(input) + 35);
- monero_const_0 ^= *(reinterpret_cast(ctx[0]->hash_state) + 24);
- monero_const_1 = *reinterpret_cast(reinterpret_cast(input) + len + 35);
- monero_const_1 ^= *(reinterpret_cast(ctx[1]->hash_state) + 24);
- }
-
- // Optim - 99% time boundary
- cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
- cn_explode_scratchpad((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
+ constexpr size_t MASK = cn_select_mask();
+ constexpr size_t ITERATIONS = cn_select_iter();
+ constexpr size_t MEM = cn_select_memory();
- uint8_t* l0 = ctx[0]->long_state;
- uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
- uint8_t* l1 = ctx[1]->long_state;
- uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+ CN_INIT_SINGLE;
+ REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
- uint64_t axl0 = h0[0] ^ h0[4];
- uint64_t axh0 = h0[1] ^ h0[5];
- __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
- uint64_t axl1 = h1[0] ^ h1[4];
- uint64_t axh1 = h1[1] ^ h1[5];
- __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-
- uint64_t idx0 = h0[0] ^ h0[4];
- uint64_t idx1 = h1[0] ^ h1[4];
-
- // Optim - 90% time boundary
- for (size_t i = 0; i < ITERATIONS; i++)
- {
- __m128i cx;
- cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]);
-
- if (ALGO == cryptonight_bittube2)
- {
- cx = aes_round_bittube2(cx, _mm_set_epi64x(axh0, axl0));
- }
- else
+ // Optim - 90% time boundary
+ for(size_t i = 0; i < ITERATIONS; i++)
{
- if(SOFT_AES)
- cx = soft_aesenc(cx, _mm_set_epi64x(axh0, axl0));
- else
- cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh0, axl0));
+ REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+ REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+ REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+ REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+ REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
}
- if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
- cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
- else
- _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+ REPEAT_1(0, CN_FINALIZE);
+ }
+};
- idx0 = _mm_cvtsi128_si64(cx);
- bx0 = cx;
+template< >
+struct Cryptonight_hash<2>
+{
+ static constexpr size_t N = 2;
- if(PREFETCH)
- _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
+ template
+ static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+ {
+ constexpr size_t MASK = cn_select_mask();
+ constexpr size_t ITERATIONS = cn_select_iter();
+ constexpr size_t MEM = cn_select_memory();
- cx = _mm_load_si128((__m128i *)&l1[idx1 & MASK]);
+ CN_INIT_SINGLE;
+ REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
- if (ALGO == cryptonight_bittube2)
- {
- cx = aes_round_bittube2(cx, _mm_set_epi64x(axh1, axl1));
- }
- else
+ // Optim - 90% time boundary
+ for(size_t i = 0; i < ITERATIONS; i++)
{
- if(SOFT_AES)
- cx = soft_aesenc(cx, _mm_set_epi64x(axh1, axl1));
- else
- cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh1, axl1));
+ REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+ REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+ REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+ REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+ REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
}
- if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
- cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
- else
- _mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
-
- idx1 = _mm_cvtsi128_si64(cx);
- bx1 = cx;
-
- if(PREFETCH)
- _mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0);
-
- uint64_t hi, lo, cl, ch;
- cl = ((uint64_t*)&l0[idx0 & MASK])[0];
- ch = ((uint64_t*)&l0[idx0 & MASK])[1];
-
- lo = _umul128(idx0, cl, &hi);
+ REPEAT_2(0, CN_FINALIZE);
+ }
+};
- axl0 += hi;
- axh0 += lo;
- ((uint64_t*)&l0[idx0 & MASK])[0] = axl0;
+template< >
+struct Cryptonight_hash<3>
+{
+ static constexpr size_t N = 3;
- if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) {
- if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
- ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0 ^ ((uint64_t*)&l0[idx0 & MASK])[0];
- else
- ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0;
- } else
- ((uint64_t*)&l0[idx0 & MASK])[1] = axh0;
+ template
+ static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+ {
+ constexpr size_t MASK = cn_select_mask();
+ constexpr size_t ITERATIONS = cn_select_iter();
+ constexpr size_t MEM = cn_select_memory();
- axh0 ^= ch;
- axl0 ^= cl;
- idx0 = axl0;
+ CN_INIT_SINGLE;
+ REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
- if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
+ // Optim - 90% time boundary
+ for(size_t i = 0; i < ITERATIONS; i++)
{
- int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
- int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
- int64_t q = n / (d | 0x5);
-
- ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
- idx0 = d ^ q;
+ REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+ REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+ REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+ REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+ REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
}
- else if(ALGO == cryptonight_haven)
- {
- int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
- int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
- int64_t q = n / (d | 0x5);
-
- ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
- idx0 = (~d) ^ q;
- }
-
- if(PREFETCH)
- _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
-
- cl = ((uint64_t*)&l1[idx1 & MASK])[0];
- ch = ((uint64_t*)&l1[idx1 & MASK])[1];
- lo = _umul128(idx1, cl, &hi);
+ REPEAT_3(0, CN_FINALIZE);
+ }
+};
- axl1 += hi;
- axh1 += lo;
- ((uint64_t*)&l1[idx1 & MASK])[0] = axl1;
+template< >
+struct Cryptonight_hash<4>
+{
+ static constexpr size_t N = 4;
- if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) {
- if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
- ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1 ^ ((uint64_t*)&l1[idx1 & MASK])[0];
- else
- ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1;
- } else
- ((uint64_t*)&l1[idx1 & MASK])[1] = axh1;
+ template
+ static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+ {
+ constexpr size_t MASK = cn_select_mask();
+ constexpr size_t ITERATIONS = cn_select_iter();
+ constexpr size_t MEM = cn_select_memory();
- axh1 ^= ch;
- axl1 ^= cl;
- idx1 = axl1;
+ CN_INIT_SINGLE;
+ REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
- if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
+ // Optim - 90% time boundary
+ for(size_t i = 0; i < ITERATIONS; i++)
{
- int64_t n = ((int64_t*)&l1[idx1 & MASK])[0];
- int32_t d = ((int32_t*)&l1[idx1 & MASK])[2];
- int64_t q = n / (d | 0x5);
-
- ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
- idx1 = d ^ q;
+ REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+ REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+ REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+ REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+ REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
}
- else if(ALGO == cryptonight_haven)
- {
- int64_t n = ((int64_t*)&l1[idx1 & MASK])[0];
- int32_t d = ((int32_t*)&l1[idx1 & MASK])[2];
- int64_t q = n / (d | 0x5);
- ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
- idx1 = (~d) ^ q;
- }
-
- if(PREFETCH)
- _mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0);
+ REPEAT_4(0, CN_FINALIZE);
}
+};
- // Optim - 90% time boundary
- cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
- cn_implode_scratchpad((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
-
- // Optim - 99% time boundary
-
- keccakf((uint64_t*)ctx[0]->hash_state, 24);
- extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
- keccakf((uint64_t*)ctx[1]->hash_state, 24);
- extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32);
-}
+template< >
+struct Cryptonight_hash<5>
+{
+ static constexpr size_t N = 5;
-#define CN_STEP1(a, b, c, l, ptr, idx) \
- ptr = (__m128i *)&l[idx & MASK]; \
- if(PREFETCH) \
- _mm_prefetch((const char*)ptr, _MM_HINT_T0); \
- c = _mm_load_si128(ptr);
-
-#define CN_STEP2(a, b, c, l, ptr, idx) \
- if (ALGO == cryptonight_bittube2) \
- { \
- c = aes_round_bittube2(c, a); \
- } \
- else \
- { \
- if(SOFT_AES) \
- c = soft_aesenc(c, a); \
- else \
- c = _mm_aesenc_si128(c, a); \
- } \
- b = _mm_xor_si128(b, c); \
- if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
- cryptonight_monero_tweak((uint64_t*)ptr, b); \
- else \
- _mm_store_si128(ptr, b);\
-
-#define CN_STEP3(a, b, c, l, ptr, idx) \
- idx = _mm_cvtsi128_si64(c); \
- ptr = (__m128i *)&l[idx & MASK]; \
- if(PREFETCH) \
- _mm_prefetch((const char*)ptr, _MM_HINT_T0); \
- b = _mm_load_si128(ptr);
-
-#define CN_STEP4(a, b, c, l, mc, ptr, idx) \
- lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \
- a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \
- if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
- { \
- _mm_store_si128(ptr, _mm_xor_si128(a, mc)); \
- if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \
- ((uint64_t*)ptr)[1] ^= ((uint64_t*)ptr)[0];\
- } \
- else \
- _mm_store_si128(ptr, a);\
- a = _mm_xor_si128(a, b); \
- idx = _mm_cvtsi128_si64(a); \
- if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \
- { \
- int64_t n = ((int64_t*)&l[idx & MASK])[0]; \
- int32_t d = ((int32_t*)&l[idx & MASK])[2]; \
- int64_t q = n / (d | 0x5); \
- ((int64_t*)&l[idx & MASK])[0] = n ^ q; \
- idx = d ^ q; \
- } \
- else if(ALGO == cryptonight_haven) \
- { \
- int64_t n = ((int64_t*)&l[idx & MASK])[0]; \
- int32_t d = ((int32_t*)&l[idx & MASK])[2]; \
- int64_t q = n / (d | 0x5); \
- ((int64_t*)&l[idx & MASK])[0] = n ^ q; \
- idx = (~d) ^ q; \
- }
+ template
+ static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+ {
+ constexpr size_t MASK = cn_select_mask();
+ constexpr size_t ITERATIONS = cn_select_iter();
+ constexpr size_t MEM = cn_select_memory();
-#define CONST_INIT(ctx, n) \
- __m128i mc##n = _mm_set_epi64x(*reinterpret_cast(reinterpret_cast(input) + n * len + 35) ^ \
- *(reinterpret_cast((ctx)->hash_state) + 24), 0);
+ CN_INIT_SINGLE;
+ REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
-// This lovelier creation will do 3 cn hashes at a time.
-template
-void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
-{
- constexpr size_t MASK = cn_select_mask();
- constexpr size_t ITERATIONS = cn_select_iter();
- constexpr size_t MEM = cn_select_memory();
+ // Optim - 90% time boundary
+ for(size_t i = 0; i < ITERATIONS; i++)
+ {
+ REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+ REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+ REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+ REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+ REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
+ }
- if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
- {
- memset(output, 0, 32 * 3);
- return;
+ REPEAT_5(0, CN_FINALIZE);
}
+};
- for (size_t i = 0; i < 3; i++)
- {
- keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
- cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
- }
+extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
- CONST_INIT(ctx[0], 0);
- CONST_INIT(ctx[1], 1);
- CONST_INIT(ctx[2], 2);
-
- uint8_t* l0 = ctx[0]->long_state;
- uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
- uint8_t* l1 = ctx[1]->long_state;
- uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
- uint8_t* l2 = ctx[2]->long_state;
- uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
-
- __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
- __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
- __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
- __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
- __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
- __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
- __m128i cx0 = _mm_set_epi64x(0, 0);
- __m128i cx1 = _mm_set_epi64x(0, 0);
- __m128i cx2 = _mm_set_epi64x(0, 0);
-
- uint64_t idx0, idx1, idx2;
- idx0 = _mm_cvtsi128_si64(ax0);
- idx1 = _mm_cvtsi128_si64(ax1);
- idx2 = _mm_cvtsi128_si64(ax2);
-
- for (size_t i = 0; i < ITERATIONS/2; i++)
- {
- uint64_t hi, lo;
- __m128i *ptr0, *ptr1, *ptr2;
-
- // EVEN ROUND
- CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
-
- CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
-
- CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
-
- CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
- CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
- CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
-
- // ODD ROUND
- CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
-
- CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
-
- CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
-
- CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
- CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
- CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
- }
- for (size_t i = 0; i < 3; i++)
- {
- cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
- keccakf((uint64_t*)ctx[i]->hash_state, 24);
- extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
- }
-}
+template< size_t N, size_t asm_version>
+struct Cryptonight_hash_asm;
-// This even lovelier creation will do 4 cn hashes at a time.
-template
-void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+template
+struct Cryptonight_hash_asm<1, asm_version>
{
- constexpr size_t MASK = cn_select_mask();
- constexpr size_t ITERATIONS = cn_select_iter();
- constexpr size_t MEM = cn_select_memory();
+ static constexpr size_t N = 1;
- if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
+ template
+ static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
- memset(output, 0, 32 * 4);
- return;
- }
+ constexpr size_t MEM = cn_select_memory();
- for (size_t i = 0; i < 4; i++)
- {
- keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
- cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
- }
+ keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+ cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
- CONST_INIT(ctx[0], 0);
- CONST_INIT(ctx[1], 1);
- CONST_INIT(ctx[2], 2);
- CONST_INIT(ctx[3], 3);
-
- uint8_t* l0 = ctx[0]->long_state;
- uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
- uint8_t* l1 = ctx[1]->long_state;
- uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
- uint8_t* l2 = ctx[2]->long_state;
- uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
- uint8_t* l3 = ctx[3]->long_state;
- uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
-
- __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
- __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
- __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
- __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
- __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
- __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
- __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
- __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
- __m128i cx0 = _mm_set_epi64x(0, 0);
- __m128i cx1 = _mm_set_epi64x(0, 0);
- __m128i cx2 = _mm_set_epi64x(0, 0);
- __m128i cx3 = _mm_set_epi64x(0, 0);
-
- uint64_t idx0, idx1, idx2, idx3;
- idx0 = _mm_cvtsi128_si64(ax0);
- idx1 = _mm_cvtsi128_si64(ax1);
- idx2 = _mm_cvtsi128_si64(ax2);
- idx3 = _mm_cvtsi128_si64(ax3);
-
- for (size_t i = 0; i < ITERATIONS/2; i++)
- {
- uint64_t hi, lo;
- __m128i *ptr0, *ptr1, *ptr2, *ptr3;
-
- // EVEN ROUND
- CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
-
- CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
-
- CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
-
- CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
- CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
- CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
- CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3);
-
- // ODD ROUND
- CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
-
- CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
-
- CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
-
- CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
- CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
- CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
- CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3);
- }
+ if(asm_version == 0)
+ cryptonight_v8_mainloop_ivybridge_asm(ctx[0]);
+ else if(asm_version == 1)
+ cryptonight_v8_mainloop_ryzen_asm(ctx[0]);
- for (size_t i = 0; i < 4; i++)
- {
- cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
- keccakf((uint64_t*)ctx[i]->hash_state, 24);
- extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+ keccakf((uint64_t*)ctx[0]->hash_state, 24);
+ extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
}
-}
+};
-// This most lovely creation will do 5 cn hashes at a time.
-template
-void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+// double hash only for intel
+template< >
+struct Cryptonight_hash_asm<2, 0>
{
- constexpr size_t MASK = cn_select_mask();
- constexpr size_t ITERATIONS = cn_select_iter();
- constexpr size_t MEM = cn_select_memory();
+ static constexpr size_t N = 2;
- if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
+ template
+ static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
- memset(output, 0, 32 * 5);
- return;
- }
+ constexpr size_t MEM = cn_select_memory();
- for (size_t i = 0; i < 5; i++)
- {
- keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
- cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
- }
+ for(size_t i = 0; i < N; ++i)
+ {
+ keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+ /* Optim - 99% time boundary */
+ cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ }
- CONST_INIT(ctx[0], 0);
- CONST_INIT(ctx[1], 1);
- CONST_INIT(ctx[2], 2);
- CONST_INIT(ctx[3], 3);
- CONST_INIT(ctx[4], 4);
-
- uint8_t* l0 = ctx[0]->long_state;
- uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
- uint8_t* l1 = ctx[1]->long_state;
- uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
- uint8_t* l2 = ctx[2]->long_state;
- uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
- uint8_t* l3 = ctx[3]->long_state;
- uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
- uint8_t* l4 = ctx[4]->long_state;
- uint64_t* h4 = (uint64_t*)ctx[4]->hash_state;
-
- __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
- __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
- __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
- __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
- __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
- __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
- __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
- __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
- __m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]);
- __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
- __m128i cx0 = _mm_set_epi64x(0, 0);
- __m128i cx1 = _mm_set_epi64x(0, 0);
- __m128i cx2 = _mm_set_epi64x(0, 0);
- __m128i cx3 = _mm_set_epi64x(0, 0);
- __m128i cx4 = _mm_set_epi64x(0, 0);
-
- uint64_t idx0, idx1, idx2, idx3, idx4;
- idx0 = _mm_cvtsi128_si64(ax0);
- idx1 = _mm_cvtsi128_si64(ax1);
- idx2 = _mm_cvtsi128_si64(ax2);
- idx3 = _mm_cvtsi128_si64(ax3);
- idx4 = _mm_cvtsi128_si64(ax4);
-
- for (size_t i = 0; i < ITERATIONS/2; i++)
- {
- uint64_t hi, lo;
- __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4;
-
- // EVEN ROUND
- CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
- CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4);
-
- CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
- CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4);
-
- CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
- CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4);
-
- CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
- CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
- CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
- CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3);
- CN_STEP4(ax4, bx4, cx4, l4, mc4, ptr4, idx4);
-
- // ODD ROUND
- CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
- CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4);
-
- CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
- CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4);
-
- CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
- CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4);
-
- CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
- CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
- CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
- CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3);
- CN_STEP4(ax4, cx4, bx4, l4, mc4, ptr4, idx4);
- }
+ cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);
- for (size_t i = 0; i < 5; i++)
- {
- cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
- keccakf((uint64_t*)ctx[i]->hash_state, 24);
- extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ for(size_t i = 0; i < N; ++i)
+ {
+ /* Optim - 90% time boundary */
+ cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ /* Optim - 99% time boundary */
+ keccakf((uint64_t*)ctx[i]->hash_state, 24);
+ extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+ }
}
-}
+};
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
index a478c9b2a..a7e4696a8 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
@@ -216,6 +216,8 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
ptr->long_state = (uint8_t*)_mm_malloc(hashMemSize, hashMemSize);
ptr->ctx_info[0] = 0;
ptr->ctx_info[1] = 0;
+ if(ptr->long_state == NULL)
+ printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte",std::to_string(hashMemSize).c_str());
return ptr;
}
@@ -243,25 +245,25 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
return ptr;
}
#else
-
+//http://man7.org/linux/man-pages/man2/mmap.2.html
#if defined(__APPLE__)
- ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+ ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
#elif defined(__FreeBSD__)
- ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+ ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0);
#elif defined(__OpenBSD__)
- ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+ ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANON, -1, 0);
#else
- ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0);
+ ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
#endif
if (ptr->long_state == MAP_FAILED)
{
_mm_free(ptr);
- msg->warning = "mmap failed";
+ msg->warning = "mmap failed, check attribute 'use_slow_memory' in 'config.txt'";
return NULL;
}
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index 49da7ae2d..a14be1732 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -108,12 +108,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;
- const Value *mode, *no_prefetch, *aff;
+ const Value *mode, *no_prefetch, *aff, *asm_version;
mode = GetObjectMember(oThdConf, "low_power_mode");
no_prefetch = GetObjectMember(oThdConf, "no_prefetch");
aff = GetObjectMember(oThdConf, "affine_to_cpu");
+ asm_version = GetObjectMember(oThdConf, "asm");
- if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
+ if(mode == nullptr || no_prefetch == nullptr || aff == nullptr || asm_version == nullptr)
return false;
if(!mode->IsBool() && !mode->IsNumber())
@@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
else
cfg.iCpuAff = -1;
+ if(!asm_version->IsString())
+ return false;
+ cfg.asm_version_str = asm_version->GetString();
+
return true;
}
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index be855036e..4ec9165d5 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -24,6 +24,7 @@ class jconf
struct thd_cfg {
int iMultiway;
bool bNoPrefetch;
+ std::string asm_version_str;
long long iCpuAff;
};
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 2e7169ef7..a9f18d1b0 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -27,6 +27,7 @@
#include "xmrstak/backend/iBackend.hpp"
#include "xmrstak/backend/globalStates.hpp"
#include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
#include "xmrstak/params.hpp"
#include "jconf.hpp"
@@ -104,7 +105,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
#endif
}
-minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity)
+minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version)
{
this->backendType = iBackend::CPU;
oWork = pWork;
@@ -113,6 +114,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
iJobNo = 0;
bNoPrefetch = no_prefetch;
this->affinity = affinity;
+ asm_version_str = asm_version;
std::unique_lock lck(thd_aff_set);
std::future order_guard = order_fix.get_future();
@@ -224,6 +226,7 @@ bool minethd::self_test()
{
if ((ctx[i] = minethd_alloc_ctx()) == nullptr)
{
+ printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
for (int j = 0; j < i; j++)
cryptonight_free_ctx(ctx[j]);
return false;
@@ -232,92 +235,174 @@ bool minethd::self_test()
bool bResult = true;
- if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight)
- {
- unsigned char out[32 * MAX_N];
- cn_hash_fun hashf;
- cn_hash_fun_multi hashf_multi;
-
- hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
- hashf("This is a test", 14, out, ctx[0]);
- bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
-
- hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
- hashf("This is a test", 14, out, ctx[0]);
- bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
-
- hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
- hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
- bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
- "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
-
- hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
- hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
- bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
- "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
-
- hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
- hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
- bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
- "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
- "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
-
- hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
- hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
- bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
- "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
- "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
- "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
-
- hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
- hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
- bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
- "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
- "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
- "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
- "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
- }
- else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
- {
- }
- else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero)
- {
- }
- else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_aeon)
- {
- }
- else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_ipbc)
- {
- }
- else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_stellite)
- {
- }
- else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_masari)
- {
- }
- else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_bittube2)
+ unsigned char out[32 * MAX_N];
+ cn_hash_fun hashf;
+ cn_hash_fun hashf_multi;
+
+ xmrstak_algo algo = xmrstak_algo::invalid_algo;
+
+ for(int algo_idx = 0; algo_idx < 2; ++algo_idx)
{
- unsigned char out[32 * MAX_N];
- cn_hash_fun hashf;
+ if(algo_idx == 0)
+ algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo();
+ else
+ algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
+
+ if(algo == cryptonight)
+ {
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+ hashf("This is a test", 14, out, ctx);
+ bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
+ hashf("This is a test", 14, out, ctx);
+ bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
+ hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+ hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
+ bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+ "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+ hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
+ hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
+ bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+ "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+ hashf_multi = func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+ hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
+ bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
+
+ hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+ hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+ bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
+
+ hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+ hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+ bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+ "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
+ }
+ else if(algo == cryptonight_lite)
+ {
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite);
+ bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+ }
+ else if(algo == cryptonight_monero)
+ {
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+ }
+ else if(algo == cryptonight_monero_v8)
+ {
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult &= memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
+ }
+ else if(algo == cryptonight_aeon)
+ {
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+ }
+ else if(algo == cryptonight_ipbc)
+ {
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
+ }
+ else if(algo == cryptonight_stellite)
+ {
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+ }
+ else if(algo == cryptonight_masari)
+ {
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+ }
+ else if(algo == cryptonight_heavy)
+ {
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+ }
+ else if(algo == cryptonight_haven)
+ {
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven);
+ hashf("This is a test This is a test This is a test", 44, out, ctx);
+ bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+ }
+ else if(algo == cryptonight_bittube2)
+ {
+ unsigned char out[32 * MAX_N];
+ cn_hash_fun hashf;
+
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2);
- hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2);
+ hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx);
+ bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
- hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx[0]);
- bResult = memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
+ hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx);
+ bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
- hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx[0]);
- bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
+ hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx);
+ bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
+ }
- hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx[0]);
- bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
+ if(!bResult)
+ printer::inst()->print_msg(L0,
+ "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
}
+
for (int i = 0; i < MAX_N; i++)
cryptonight_free_ctx(ctx[i]);
- if(!bResult)
- printer::inst()->print_msg(L0,
- "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
-
return bResult;
}
@@ -359,15 +444,41 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work
else
printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway);
- minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff);
+ minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str);
pvThreads.push_back(thd);
}
return pvThreads;
}
-minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
+/** get the supported asm name
+ *
+ * @return asm type based on the number of hashes per thread the internal
+ * evaluated cpu type
+ */
+static std::string getAsmName(const uint32_t num_hashes)
{
+ std::string asm_type = "off";
+ if(num_hashes != 0)
+ {
+ auto cpu_model = getModel();
+
+ if(cpu_model.avx && cpu_model.aes)
+ {
+ if(cpu_model.type_name.find("Intel") != std::string::npos)
+ asm_type = "intel_avx";
+ else if(cpu_model.type_name.find("AMD") != std::string::npos && num_hashes == 1)
+ asm_type = "amd_avx";
+ }
+ }
+ return asm_type;
+}
+
+template
+minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str)
+{
+ static_assert(N >= 1, "number of threads must be >= 1" );
+
// We have two independent flag bits in the functions
// therefore we will build a binary digit and select the
// function as a two digit binary
@@ -405,388 +516,119 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmr
case cryptonight_bittube2:
algv = 9;
break;
+ case cryptonight_monero_v8:
+ algv = 10;
+ break;
default:
algv = 2;
break;
}
static const cn_hash_fun func_table[] = {
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash,
+ Cryptonight_hash::template hash
};
std::bitset<2> digit;
digit.set(0, !bHaveAes);
digit.set(1, !bNoPrefetch);
- return func_table[ algv << 2 | digit.to_ulong() ];
-}
-
-void minethd::work_main()
-{
- if(affinity >= 0) //-1 means no affinity
- bindMemoryToNUMANode(affinity);
-
- order_fix.set_value();
- std::unique_lock lck(thd_aff_set);
- lck.release();
- std::this_thread::yield();
-
- cryptonight_ctx* ctx;
- uint64_t iCount = 0;
- uint64_t* piHashVal;
- uint32_t* piNonce;
- job_result result;
-
- // start with root algorithm and switch later if fork version is reached
- auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
- cn_hash_fun hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
- ctx = minethd_alloc_ctx();
+ auto selected_function = func_table[ algv << 2 | digit.to_ulong() ];
- piHashVal = (uint64_t*)(result.bResult + 24);
- piNonce = (uint32_t*)(oWork.bWorkBlob + 39);
- result.iThreadId = iThreadNo;
- uint8_t version = 0;
- size_t lastPoolId = 0;
-
- while (bQuit == 0)
+ // check for asm optimized version for cryptonight_v8
+ if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes)
{
- if (oWork.bStall)
- {
- /* We are stalled here because the executor didn't find a job for us yet,
- * either because of network latency, or a socket problem. Since we are
- * raison d'etre of this software it us sensible to just wait until we have something
- */
-
- while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
- std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
- globalStates::inst().consume_work(oWork, iJobNo);
- continue;
- }
-
- size_t nonce_ctr = 0;
- constexpr size_t nonce_chunk = 4096; // Needs to be a power of 2
-
- assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
- memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID));
-
- if(oWork.bNiceHash)
- result.iNonce = *piNonce;
-
- uint8_t new_version = oWork.getVersion();
- if(new_version != version || oWork.iPoolId != lastPoolId)
- {
- coinDescription coinDesc = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(oWork.iPoolId);
- if(new_version >= coinDesc.GetMiningForkVersion())
- {
- miner_algo = coinDesc.GetMiningAlgo();
- hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
- }
- else
- {
- miner_algo = coinDesc.GetMiningAlgoRoot();
- hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
- }
- result.algorithm = miner_algo;
- lastPoolId = oWork.iPoolId;
- version = new_version;
- }
+ std::string selected_asm = asm_version_str;
+ if(selected_asm == "auto")
+ selected_asm = cpu::getAsmName(N);
- while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+ if(selected_asm != "off")
{
- if ((iCount++ & 0xF) == 0) //Store stats every 16 hashes
+ if(selected_asm == "intel_avx")
{
- uint64_t iStamp = get_timestamp_ms();
- iHashCount.store(iCount, std::memory_order_relaxed);
- iTimestamp.store(iStamp, std::memory_order_relaxed);
+ // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+ if(N == 1)
+ selected_function = Cryptonight_hash_asm<1u, 0u>::template hash